weka.associations.Apriori Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Apriori.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.associations;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;

import weka.core.AttributeStats;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WekaEnumeration;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Class implementing an Apriori-type algorithm.
 * Iteratively reduces the minimum support until it finds the required number of
 * rules with the given minimum confidence.<br/>
 * The algorithm has an option to mine class association rules. It is adapted as
 * explained in the second reference.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * R. Agrawal, R. Srikant: Fast Algorithms for Mining Association Rules in Large
 * Databases. In: 20th International Conference on Very Large Data Bases,
 * 478-499, 1994.<br/>
 * <br/>
 * Bing Liu, Wynne Hsu, Yiming Ma: Integrating Classification and Association
 * Rule Mining. In: Fourth International Conference on Knowledge Discovery and
 * Data Mining, 80-86, 1998.
 * 
 * 
 * 
 * BibTeX:
 * 
 * <pre>
 * &#64;inproceedings{Agrawal1994,
 *    author = {R. Agrawal and R. Srikant},
 *    booktitle = {20th International Conference on Very Large Data Bases},
 *    pages = {478-499},
 *    publisher = {Morgan Kaufmann, Los Altos, CA},
 *    title = {Fast Algorithms for Mining Association Rules in Large Databases},
 *    year = {1994}
 * }
 * 
 * &#64;inproceedings{Liu1998,
 *    author = {Bing Liu and Wynne Hsu and Yiming Ma},
 *    booktitle = {Fourth International Conference on Knowledge Discovery and Data Mining},
 *    pages = {80-86},
 *    publisher = {AAAI Press},
 *    title = {Integrating Classification and Association Rule Mining},
 *    year = {1998}
 * }
 * </pre>
 * 
 * 
 * 
 * Valid options are:
 * <p/>
 * 
 * <pre>
 * -N &lt;required number of rules output&gt;
 *  The required number of rules. (default = 10)
 * </pre>
 * 
 * <pre>
 * -T &lt;0=confidence | 1=lift | 2=leverage | 3=Conviction&gt;
 *  The metric type by which to rank rules. (default = confidence)
 * </pre>
 * 
 * <pre>
 * -C &lt;minimum metric score of a rule&gt;
 *  The minimum confidence of a rule. (default = 0.9)
 * </pre>
 * 
 * <pre>
 * -D &lt;delta for minimum support&gt;
 *  The delta by which the minimum support is decreased in
 *  each iteration. (default = 0.05)
 * </pre>
 * 
 * <pre>
 * -U &lt;upper bound for minimum support&gt;
 *  Upper bound for minimum support. (default = 1.0)
 * </pre>
 * 
 * <pre>
 * -M &lt;lower bound for minimum support&gt;
 *  The lower bound for the minimum support. (default = 0.1)
 * </pre>
 * 
 * <pre>
 * -S &lt;significance level&gt;
 *  If used, rules are tested for significance at
 *  the given level. Slower. (default = no significance testing)
 * </pre>
 * 
 * <pre>
 * -I
 *  If set the itemsets found are also output. (default = no)
 * </pre>
 * 
 * <pre>
 * -R
 *  Remove columns that contain all missing values (default = no)
 * </pre>
 * 
 * <pre>
 * -V
 *  Report progress iteratively. (default = no)
 * </pre>
 * 
 * <pre>
 * -A
 *  If set class association rules are mined. (default = no)
 * </pre>
 * 
 * <pre>
 * -Z
 *  Treat zero (i.e. first value of nominal attributes) as missing
 * </pre>
 * 
 * <pre>
 * -B &lt;toString delimiters&gt;
 *  If used, two characters to use as rule delimiters
 *  in the result of toString: the first to delimit fields,
 *  the second to delimit items within fields.
 *  (default = traditional toString result)
 * </pre>
 * 
 * <pre>
 * -c &lt;the class index&gt;
 *  The class index. (default = last)
 * </pre>
 * 
 * 
 * 
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Stefan Mutter (mutter@cs.waikato.ac.nz)
 * @version $Revision: 12014 $
 */
public class Apriori extends AbstractAssociator implements OptionHandler,
  AssociationRulesProducer, CARuleMiner, TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 3277498842319212687L;

  /** The minimum support. */
  protected double m_minSupport;

  /** The upper bound on the support */
  protected double m_upperBoundMinSupport;

  /** The lower bound for the minimum support. */
  protected double m_lowerBoundMinSupport;

  /** Metric type: Confidence */
  protected static final int CONFIDENCE = 0;
  /** Metric type: Lift */
  protected static final int LIFT = 1;
  /** Metric type: Leverage */
  protected static final int LEVERAGE = 2;
  /** Metric type: Conviction */
  protected static final int CONVICTION = 3;
  /** Metric types. */
  public static final Tag[] TAGS_SELECTION = {
    new Tag(CONFIDENCE, "Confidence"), new Tag(LIFT, "Lift"),
    new Tag(LEVERAGE, "Leverage"), new Tag(CONVICTION, "Conviction") };

  /** The selected metric type. */
  protected int m_metricType = CONFIDENCE;

  /** The minimum metric score. */
  protected double m_minMetric;

  /** The maximum number of rules that are output. */
  protected int m_numRules;

  /** Delta by which m_minSupport is decreased in each iteration. */
  protected double m_delta;

  /** Significance level for optional significance test. */
  protected double m_significanceLevel;

  /** Number of cycles used before the required number of rules was found. */
  protected int m_cycles;

  /** The set of all sets of itemsets L. */
  protected ArrayList> m_Ls;

  /** The same information stored in hash tables. */
  protected ArrayList> m_hashtables;

  /** The list of all generated rules. */
  protected ArrayList