All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.associations.Apriori Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.

There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    Apriori.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.associations;

import java.util.*;

import weka.core.AttributeStats;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WekaEnumeration;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 *  Class implementing an Apriori-type algorithm.
 * Iteratively reduces the minimum support until it finds the required number of
 * rules with the given minimum confidence.
 *
 * The algorithm has an option to mine class association rules. It is adapted as
 * explained in the second reference.
 *
 * For more information see:
 *
 * R. Agrawal, R. Srikant: Fast Algorithms for Mining Association Rules in Large
 * Databases. In: 20th International Conference on Very Large Data Bases,
 * 478-499, 1994.
 *
 * Bing Liu, Wynne Hsu, Yiming Ma: Integrating Classification and Association
 * Rule Mining. In: Fourth International Conference on Knowledge Discovery and
 * Data Mining, 80-86, 1998.
 *
 * BibTeX:
 *

 * @inproceedings{Agrawal1994,
 *    author = {R. Agrawal and R. Srikant},
 *    booktitle = {20th International Conference on Very Large Data Bases},
 *    pages = {478-499},
 *    publisher = {Morgan Kaufmann, Los Altos, CA},
 *    title = {Fast Algorithms for Mining Association Rules in Large Databases},
 *    year = {1994}
 * }
 * 
 * @inproceedings{Liu1998,
 *    author = {Bing Liu and Wynne Hsu and Yiming Ma},
 *    booktitle = {Fourth International Conference on Knowledge Discovery and Data Mining},
 *    pages = {80-86},
 *    publisher = {AAAI Press},
 *    title = {Integrating Classification and Association Rule Mining},
 *    year = {1998}
 * }
 * 
*

 *
 * Valid options are:
 *

* *

 * -N <required number of rules output>
 *  The required number of rules. (default = 10)
 * 
* *
 * -T <0=confidence | 1=lift | 2=leverage | 3=Conviction>
 *  The metric type by which to rank rules. (default = confidence)
 * 
* *
 * -C <minimum metric score of a rule>
 *  The minimum confidence of a rule. (default = 0.9)
 * 
* *
 * -D <delta for minimum support>
 *  The delta by which the minimum support is decreased in
 *  each iteration. (default = 0.05)
 * 
* *
 * -U <upper bound for minimum support>
 *  Upper bound for minimum support. (default = 1.0)
 * 
* *
 * -M <lower bound for minimum support>
 *  The lower bound for the minimum support. (default = 0.1)
 * 
* *
 * -S <significance level>
 *  If used, rules are tested for significance at
 *  the given level. Slower. (default = no significance testing)
 * 
* *
 * -I
 *  If set the itemsets found are also output. (default = no)
 * 
* *
 * -R
 *  Remove columns that contain all missing values (default = no)
 * 
* *
 * -V
 *  Report progress iteratively. (default = no)
 * 
* *
 * -A
 *  If set class association rules are mined. (default = no)
 * 
* *
 * -Z
 *  Treat zero (i.e. first value of nominal attributes) as missing
 * 
* *
 * -B <toString delimiters>
 *  If used, two characters to use as rule delimiters
 *  in the result of toString: the first to delimit fields,
 *  the second to delimit items within fields.
 *  (default = traditional toString result)
 * 
* *
 * -c <the class index>
 *  The class index. (default = last)
 * 
* * * * @author Eibe Frank ([email protected]) * @author Mark Hall ([email protected]) * @author Stefan Mutter ([email protected]) * @version $Revision: 15519 $ */ public class Apriori extends AbstractAssociator implements OptionHandler, AssociationRulesProducer, CARuleMiner, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 3277498842319212687L; /** The minimum support. */ protected double m_minSupport; /** The upper bound on the support */ protected double m_upperBoundMinSupport; /** The lower bound for the minimum support. */ protected double m_lowerBoundMinSupport; /** Metric type: Confidence */ protected static final int CONFIDENCE = 0; /** Metric type: Lift */ protected static final int LIFT = 1; /** Metric type: Leverage */ protected static final int LEVERAGE = 2; /** Metric type: Conviction */ protected static final int CONVICTION = 3; /** Metric types. */ public static final Tag[] TAGS_SELECTION = { new Tag(CONFIDENCE, "Confidence"), new Tag(LIFT, "Lift"), new Tag(LEVERAGE, "Leverage"), new Tag(CONVICTION, "Conviction") }; /** The selected metric type. */ protected int m_metricType = CONFIDENCE; /** The minimum metric score. */ protected double m_minMetric; /** The maximum number of rules that are output. */ protected int m_numRules; /** Delta by which m_minSupport is decreased in each iteration. */ protected double m_delta; /** Significance level for optional significance test. */ protected double m_significanceLevel; /** Number of cycles used before required number of rules was one. */ protected int m_cycles; /** The set of all sets of itemsets L. */ protected ArrayList> m_Ls; /** The same information stored in hash tables. */ protected ArrayList> m_hashtables; /** The list of all generated rules. */ protected ArrayList[] m_allTheRules; /** * The instances (transactions) to be used for generating the association * rules. */ protected Instances m_instances; /** Output itemsets found? 
*/ protected boolean m_outputItemSets; /** Remove columns with all missing values */ protected boolean m_removeMissingCols; /** Report progress iteratively */ protected boolean m_verbose; /** Only the class attribute of all Instances. */ protected Instances m_onlyClass; /** The class index. */ protected int m_classIndex; /** Flag indicating whether class association rules are mined. */ protected boolean m_car; /** * Treat zeros as missing (rather than a value in their own right) */ protected boolean m_treatZeroAsMissing = false; /** * ToString delimiters, if any */ protected String m_toStringDelimiters = null; /** * Returns a string describing this associator * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Class implementing an Apriori-type algorithm. Iteratively reduces " + "the minimum support until it finds the required number of rules with " + "the given minimum confidence.\n" + "The algorithm has an option to mine class association rules. It is " + "adapted as explained in the second reference.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "R. Agrawal and R. 
Srikant"); result.setValue(Field.TITLE, "Fast Algorithms for Mining Association Rules in Large Databases"); result.setValue(Field.BOOKTITLE, "20th International Conference on Very Large Data Bases"); result.setValue(Field.YEAR, "1994"); result.setValue(Field.PAGES, "478-499"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann, Los Altos, CA"); additional = result.add(Type.INPROCEEDINGS); additional.setValue(Field.AUTHOR, "Bing Liu and Wynne Hsu and Yiming Ma"); additional.setValue(Field.TITLE, "Integrating Classification and Association Rule Mining"); additional.setValue(Field.BOOKTITLE, "Fourth International Conference on Knowledge Discovery and Data Mining"); additional.setValue(Field.YEAR, "1998"); additional.setValue(Field.PAGES, "80-86"); additional.setValue(Field.PUBLISHER, "AAAI Press"); return result; } /** * Constructor that allows to sets default values for the minimum confidence * and the maximum number of rules the minimum confidence. */ public Apriori() { resetOptions(); } /** * Resets the options to the default values. 
*/ public void resetOptions() { m_removeMissingCols = false; m_verbose = false; m_delta = 0.05; m_minMetric = 0.90; m_numRules = 10; m_lowerBoundMinSupport = 0.1; m_upperBoundMinSupport = 1.0; m_significanceLevel = -1; m_outputItemSets = false; m_car = false; m_classIndex = -1; m_treatZeroAsMissing = false; m_metricType = CONFIDENCE; } /** * Removes columns that are all missing from the data * * @param instances the instances * @return a new set of instances with all missing columns removed * @throws Exception if something goes wrong */ protected Instances removeMissingColumns(Instances instances) throws Exception { int numInstances = instances.numInstances(); StringBuffer deleteString = new StringBuffer(); int removeCount = 0; boolean first = true; int maxCount = 0; for (int i = 0; i < instances.numAttributes(); i++) { AttributeStats as = instances.attributeStats(i); if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { // see if we can decrease this by looking for the most frequent value int[] counts = as.nominalCounts; if (counts[Utils.maxIndex(counts)] > maxCount) { maxCount = counts[Utils.maxIndex(counts)]; } } if (as.missingCount == numInstances) { if (first) { deleteString.append((i + 1)); first = false; } else { deleteString.append("," + (i + 1)); } removeCount++; } } if (m_verbose) { System.err.println("Removed : " + removeCount + " columns with all missing " + "values."); } if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { m_upperBoundMinSupport = (double) maxCount / (double) numInstances; if (m_verbose) { System.err.println("Setting upper bound min support to : " + m_upperBoundMinSupport); } } if (deleteString.toString().length() > 0) { Remove af = new Remove(); af.setAttributeIndices(deleteString.toString()); af.setInvertSelection(false); af.setInputFormat(instances); Instances newInst = Filter.useFilter(instances, af); return newInst; } return instances; } /** * Returns default capabilities of the classifier. 
* * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // enable what we can handle // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class (can handle a nominal class if CAR rules are selected). This result.enable(Capability.NO_CLASS); result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); return result; } /** * Method that generates all large itemsets with a minimum support, and from * these all association rules with a minimum confidence. * * @param instances the instances to be used for generating the associations * @throws Exception if rules can't be built successfully */ @SuppressWarnings("unchecked") @Override public void buildAssociations(Instances instances) throws Exception { double[] confidences, supports; int[] indices; ArrayList[] sortedRuleSet; double necSupport = 0; instances = new Instances(instances); if (m_removeMissingCols) { instances = removeMissingColumns(instances); } if (m_car && m_metricType != CONFIDENCE) { throw new Exception("For CAR-Mining metric type has to be confidence!"); } // only set class index if CAR is requested if (m_car) { if (m_classIndex == -1) { instances.setClassIndex(instances.numAttributes() - 1); } else if (m_classIndex <= instances.numAttributes() && m_classIndex > 0) { instances.setClassIndex(m_classIndex - 1); } else { throw new Exception("Invalid class index."); } } // can associator handle the data? getCapabilities().testWithFail(instances); m_cycles = 0; // make sure that the lower bound is equal to at least one instance double lowerBoundMinSupportToUse = (m_lowerBoundMinSupport * instances.numInstances() < 1.0) ? 
1.0 / instances.numInstances() : m_lowerBoundMinSupport; if (m_car) { // m_instances does not contain the class attribute m_instances = LabeledItemSet.divide(instances, false); // m_onlyClass contains only the class attribute m_onlyClass = LabeledItemSet.divide(instances, true); } else { m_instances = instances; } if (m_car && m_numRules == Integer.MAX_VALUE) { // Set desired minimum support m_minSupport = lowerBoundMinSupportToUse; } else { // Decrease minimum support until desired number of rules found. // m_minSupport = m_upperBoundMinSupport - m_delta; m_minSupport = 1.0 - m_delta; m_minSupport = (m_minSupport < lowerBoundMinSupportToUse) ? lowerBoundMinSupportToUse : m_minSupport; } do { // Reserve space for variables m_Ls = new ArrayList>(); m_hashtables = new ArrayList>(); m_allTheRules = new ArrayList[6]; m_allTheRules[0] = new ArrayList(); m_allTheRules[1] = new ArrayList(); m_allTheRules[2] = new ArrayList(); // if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { m_allTheRules[3] = new ArrayList(); m_allTheRules[4] = new ArrayList(); m_allTheRules[5] = new ArrayList(); // } sortedRuleSet = new ArrayList[6]; sortedRuleSet[0] = new ArrayList(); sortedRuleSet[1] = new ArrayList(); sortedRuleSet[2] = new ArrayList(); // if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { sortedRuleSet[3] = new ArrayList(); sortedRuleSet[4] = new ArrayList(); sortedRuleSet[5] = new ArrayList(); // } if (!m_car) { // Find large itemsets and rules findLargeItemSets(); if (m_significanceLevel != -1 || m_metricType != CONFIDENCE) { findRulesBruteForce(); } else { findRulesQuickly(); } } else { findLargeCarItemSets(); findCarRulesQuickly(); } // prune rules for upper bound min support if (m_upperBoundMinSupport < 1.0) { pruneRulesForUpperBoundSupport(); } // Sort rules according to their support /* * supports = new double[m_allTheRules[2].size()]; for (int i = 0; i < * m_allTheRules[2].size(); i++) supports[i] = * 
(double)((AprioriItemSet)m_allTheRules[1].elementAt(i)).support(); * indices = Utils.stableSort(supports); for (int i = 0; i < * m_allTheRules[2].size(); i++) { * sortedRuleSet[0].add(m_allTheRules[0].get(indices[i])); * sortedRuleSet[1].add(m_allTheRules[1].get(indices[i])); * sortedRuleSet[2].add(m_allTheRules[2].get(indices[i])); if * (m_metricType != CONFIDENCE || m_significanceLevel != -1) { * sortedRuleSet[3].add(m_allTheRules[3].get(indices[i])); * sortedRuleSet[4].add(m_allTheRules[4].get(indices[i])); * sortedRuleSet[5].add(m_allTheRules[5].get(indices[i])); } } */ int j = m_allTheRules[2].size() - 1; supports = new double[m_allTheRules[2].size()]; for (int i = 0; i < (j + 1); i++) { supports[j - i] = ((double) ((ItemSet) m_allTheRules[1].get(j - i)) .support()) * (-1); } indices = Utils.stableSort(supports); for (int i = 0; i < (j + 1); i++) { sortedRuleSet[0].add(m_allTheRules[0].get(indices[j - i])); sortedRuleSet[1].add(m_allTheRules[1].get(indices[j - i])); sortedRuleSet[2].add(m_allTheRules[2].get(indices[j - i])); if (!m_car) { // if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { sortedRuleSet[3].add(m_allTheRules[3].get(indices[j - i])); sortedRuleSet[4].add(m_allTheRules[4].get(indices[j - i])); sortedRuleSet[5].add(m_allTheRules[5].get(indices[j - i])); } // } } // Sort rules according to their confidence m_allTheRules[0].clear(); m_allTheRules[1].clear(); m_allTheRules[2].clear(); // if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { m_allTheRules[3].clear(); m_allTheRules[4].clear(); m_allTheRules[5].clear(); // } confidences = new double[sortedRuleSet[2].size()]; int sortType = 2 + m_metricType; for (int i = 0; i < sortedRuleSet[2].size(); i++) { confidences[i] = ((Double) sortedRuleSet[sortType].get(i)) .doubleValue(); } indices = Utils.stableSort(confidences); for (int i = sortedRuleSet[0].size() - 1; (i >= (sortedRuleSet[0].size() - m_numRules)) && (i >= 0); i--) { 
m_allTheRules[0].add(sortedRuleSet[0].get(indices[i])); m_allTheRules[1].add(sortedRuleSet[1].get(indices[i])); m_allTheRules[2].add(sortedRuleSet[2].get(indices[i])); // if (m_metricType != CONFIDENCE || m_significanceLevel != -1) { if (!m_car) { m_allTheRules[3].add(sortedRuleSet[3].get(indices[i])); m_allTheRules[4].add(sortedRuleSet[4].get(indices[i])); m_allTheRules[5].add(sortedRuleSet[5].get(indices[i])); } // } } if (m_verbose) { if (m_Ls.size() > 1) { System.out.println(toString()); } } if (m_minSupport == lowerBoundMinSupportToUse || m_minSupport - m_delta > lowerBoundMinSupportToUse) { m_minSupport -= m_delta; } else { m_minSupport = lowerBoundMinSupportToUse; } necSupport = Math.rint(m_minSupport * m_instances.numInstances()); m_cycles++; } while ((m_allTheRules[0].size() < m_numRules) && (Utils.grOrEq(m_minSupport, lowerBoundMinSupportToUse)) /* (necSupport >= lowerBoundNumInstancesSupport) */ /* (Utils.grOrEq(m_minSupport, m_lowerBoundMinSupport)) */&& (necSupport >= 1)); m_minSupport += m_delta; } private void pruneRulesForUpperBoundSupport() { int necMaxSupport = (int) (m_upperBoundMinSupport * m_instances.numInstances() + 0.5); @SuppressWarnings("unchecked") ArrayList[] prunedRules = new ArrayList[6]; for (int i = 0; i < 6; i++) { prunedRules[i] = new ArrayList(); } for (int i = 0; i < m_allTheRules[0].size(); i++) { if (((ItemSet) m_allTheRules[1].get(i)).support() <= necMaxSupport) { prunedRules[0].add(m_allTheRules[0].get(i)); prunedRules[1].add(m_allTheRules[1].get(i)); prunedRules[2].add(m_allTheRules[2].get(i)); if (!m_car) { prunedRules[3].add(m_allTheRules[3].get(i)); prunedRules[4].add(m_allTheRules[4].get(i)); prunedRules[5].add(m_allTheRules[5].get(i)); } } } m_allTheRules[0] = prunedRules[0]; m_allTheRules[1] = prunedRules[1]; m_allTheRules[2] = prunedRules[2]; m_allTheRules[3] = prunedRules[3]; m_allTheRules[4] = prunedRules[4]; m_allTheRules[5] = prunedRules[5]; } /** * Method that mines all class association rules with minimum 
support and with a minimum confidence.
   * <p>
   * Switches this associator into class-association-rule mode (m_car is set
   * to true and is not reset afterwards) and then runs buildAssociations on
   * the given data.
   *
   * @return the array of rule lists (m_allTheRules) produced by
   *         buildAssociations, containing the rules and metric information
   * @param data the instances for which class association rules should be mined
   * @throws Exception if rules can't be built successfully
   */
  @Override
  public ArrayList[] mineCARs(Instances data) throws Exception {

    m_car = true;
    buildAssociations(data);
    return m_allTheRules;
  }

  /**
   * Gets the instances without the class attribute.
   *
   * @return the instances without the class attribute.
   */
  @Override
  public Instances getInstancesNoClass() {

    return m_instances;
  }

  /**
   * Gets only the class attribute of the instances.
   *
   * @return the class attribute of all instances.
   */
  @Override
  public Instances getInstancesOnlyClass() {

    return m_onlyClass;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration