All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.associations.PredictiveApriori Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 *    PredictiveApriori.java
 *    Copyright (C) 2004 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.associations;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.TreeSet;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WekaEnumeration;

/**
 *  Class implementing the predictive apriori algorithm
 * to mine association rules.
* It searches with an increasing support threshold for the best 'n' rules * concerning a support-based corrected confidence value.
*
* For more information see:
*
* Tobias Scheffer: Finding Association Rules That Trade Support Optimally * against Confidence. In: 5th European Conference on Principles of Data Mining * and Knowledge Discovery, 424-435, 2001.
*
* The implementation follows the paper expect for adding a rule to the output * of the 'n' best rules. A rule is added if:
* the expected predictive accuracy of this rule is among the 'n' best and it is * not subsumed by a rule with at least the same expected predictive accuracy * (out of an unpublished manuscript from T. Scheffer). *

* * * BibTeX: * *

 * @inproceedings{Scheffer2001,
 *    author = {Tobias Scheffer},
 *    booktitle = {5th European Conference on Principles of Data Mining and Knowledge Discovery},
 *    pages = {424-435},
 *    publisher = {Springer},
 *    title = {Finding Association Rules That Trade Support Optimally against Confidence},
 *    year = {2001}
 * }
 * 
*

* * * Valid options are: *

* *

 * -N <required number of rules output>
 *  The required number of rules. (default = 100)
 * 
* *
 * -A
 *  If set class association rules are mined. (default = no)
 * 
* *
 * -c <the class index>
 *  The class index. (default = last)
 * 
* * * * @author Stefan Mutter ([email protected]) * @version $Revision: 11047 $ */ public class PredictiveApriori extends AbstractAssociator implements OptionHandler, CARuleMiner, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 8109088846865075341L; /** The minimum support. */ protected int m_premiseCount; /** The maximum number of rules that are output. */ protected int m_numRules; /** The number of rules created for the prior estimation. */ protected static final int m_numRandRules = 1000; /** The number of intervals used for the prior estimation. */ protected static final int m_numIntervals = 100; /** The set of all sets of itemsets. */ protected ArrayList> m_Ls; /** The same information stored in hash tables. */ protected ArrayList> m_hashtables; /** The list of all generated rules. */ protected ArrayList[] m_allTheRules; /** * The instances (transactions) to be used for generating the association * rules. */ protected Instances m_instances; /** The hashtable containing the prior probabilities. */ protected Hashtable m_priors; /** The mid points of the intervals used for the prior estimation. */ protected double[] m_midPoints; /** * The expected predictive accuracy a rule needs to be a candidate for the * output. */ protected double m_expectation; /** The n best rules. */ protected TreeSet m_best; /** Flag keeping track if the list of the n best rules has changed. */ protected boolean m_bestChanged; /** Counter for the time of generation for an association rule. */ protected int m_count; /** The prior estimator. */ protected PriorEstimation m_priorEstimator; /** The class index. */ protected int m_classIndex; /** Flag indicating whether class association rules are mined. */ protected boolean m_car; /** * Returns a string describing this associator * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Class implementing the predictive apriori algorithm to mine " + "association rules.\n" + "It searches with an increasing support threshold for the best 'n' " + "rules concerning a support-based corrected confidence value.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString() + "\n\n" + "The implementation follows the paper expect for adding a rule to the " + "output of the 'n' best rules. A rule is added if:\n" + "the expected predictive accuracy of this rule is among the 'n' best " + "and it is not subsumed by a rule with at least the same expected " + "predictive accuracy (out of an unpublished manuscript from T. " + "Scheffer)."; } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Tobias Scheffer"); result .setValue(Field.TITLE, "Finding Association Rules That Trade Support Optimally against Confidence"); result .setValue(Field.BOOKTITLE, "5th European Conference on Principles of Data Mining and Knowledge Discovery"); result.setValue(Field.YEAR, "2001"); result.setValue(Field.PAGES, "424-435"); result.setValue(Field.PUBLISHER, "Springer"); return result; } /** * Constructor that allows to sets default values for the minimum confidence * and the maximum number of rules the minimum confidence. */ public PredictiveApriori() { resetOptions(); } /** * Resets the options to the default values. */ public void resetOptions() { m_numRules = 105; m_premiseCount = 1; m_best = new TreeSet(); m_bestChanged = false; m_expectation = 0; m_count = 1; m_car = false; m_classIndex = -1; m_priors = new Hashtable(); } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NO_CLASS); result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); return result; } /** * Method that generates all large itemsets with a minimum support, and from * these all association rules. * * @param instances the instances to be used for generating the associations * @throws Exception if rules can't be built successfully */ @SuppressWarnings("unchecked") @Override public void buildAssociations(Instances instances) throws Exception { int temp = m_premiseCount, exactNumber = m_numRules - 5; m_premiseCount = 1; m_best = new TreeSet(); m_bestChanged = false; m_expectation = 0; m_count = 1; m_instances = new Instances(instances); if (m_classIndex == -1) { m_instances.setClassIndex(m_instances.numAttributes() - 1); } else if (m_classIndex <= m_instances.numAttributes() && m_classIndex > 0) { m_instances.setClassIndex(m_classIndex - 1); } else { throw new Exception("Invalid class index."); } // can associator handle the data? getCapabilities().testWithFail(m_instances); // prior estimation m_priorEstimator = new PriorEstimation(m_instances, m_numRandRules, m_numIntervals, m_car); m_priors = m_priorEstimator.estimatePrior(); m_midPoints = m_priorEstimator.getMidPoints(); m_Ls = new ArrayList>(); m_hashtables = new ArrayList>(); for (int i = 1; i < m_instances.numAttributes(); i++) { m_bestChanged = false; if (!m_car) { // find large item sets findLargeItemSets(i); // find association rules (rule generation procedure) findRulesQuickly(); } else { findLargeCarItemSets(i); findCaRulesQuickly(); } if (m_bestChanged) { temp = m_premiseCount; while (RuleGeneration.expectation(m_premiseCount, m_premiseCount, m_midPoints, m_priors) <= m_expectation) { m_premiseCount++; if (m_premiseCount > m_instances.numInstances()) { break; } } } if (m_premiseCount > m_instances.numInstances()) { // Reserve space for variables m_allTheRules = new ArrayList[3]; m_allTheRules[0] = new ArrayList(); m_allTheRules[1] = new ArrayList(); m_allTheRules[2] = new ArrayList(); int k = 0; while (m_best.size() > 0 && exactNumber > 0) { m_allTheRules[0].add(k, m_best.last().premise()); m_allTheRules[1].add(k, m_best.last().consequence()); m_allTheRules[2].add(k, new Double(m_best.last().accuracy())); m_best.remove(m_best.last()); k++; exactNumber--; } return; } if (temp != m_premiseCount && m_Ls.size() > 0) { ArrayList kSets = m_Ls.get(m_Ls.size() - 1); m_Ls.remove(m_Ls.size() - 1); kSets = ItemSet .deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE); m_Ls.add(kSets); } } // Reserve space for variables m_allTheRules = new ArrayList[3]; m_allTheRules[0] = new ArrayList(); m_allTheRules[1] = new ArrayList(); m_allTheRules[2] = new ArrayList(); int k = 0; while (m_best.size() > 0 && exactNumber > 0) { m_allTheRules[0].add(k, m_best.last().premise()); m_allTheRules[1].add(k, m_best.last().consequence()); m_allTheRules[2].add(k, new Double(m_best.last().accuracy())); m_best.remove(m_best.last()); k++; exactNumber--; } } /** * Method that mines the n best class association rules. * * @return an sorted array of FastVector (depending on the expected predictive * accuracy) containing the rules and metric information * @param data the instances for which class association rules should be mined * @throws Exception if rules can't be built successfully */ @SuppressWarnings("unchecked") @Override public ArrayList[] mineCARs(Instances data) throws Exception { m_car = true; m_best = new TreeSet(); m_premiseCount = 1; m_bestChanged = false; m_expectation = 0; m_count = 1; buildAssociations(data); ArrayList[] allCARRules = new ArrayList[3]; allCARRules[0] = new ArrayList(); allCARRules[1] = new ArrayList(); allCARRules[2] = new ArrayList(); for (int k = 0; k < m_allTheRules[0].size(); k++) { int[] newPremiseArray = new int[m_instances.numAttributes() - 1]; int help = 0; for (int j = 0; j < m_instances.numAttributes(); j++) { if (j != m_instances.classIndex()) { newPremiseArray[help] = ((ItemSet) m_allTheRules[0].get(k)).itemAt(j); help++; } } ItemSet newPremise = new ItemSet(m_instances.numInstances(), newPremiseArray); newPremise.setCounter(((ItemSet) m_allTheRules[0].get(k)).counter()); allCARRules[0].add(newPremise); int[] newConsArray = new int[1]; newConsArray[0] = ((ItemSet) m_allTheRules[1].get(k)).itemAt(m_instances .classIndex()); ItemSet newCons = new ItemSet(m_instances.numInstances(), newConsArray); newCons.setCounter(((ItemSet) m_allTheRules[1].get(k)).counter()); allCARRules[1].add(newCons); allCARRules[2].add(m_allTheRules[2].get(k)); } return allCARRules; } /** * Gets the instances without the class attribute * * @return instances without class attribute */ @Override public Instances getInstancesNoClass() { Instances noClass = null; try { noClass = LabeledItemSet.divide(m_instances, false); } catch (Exception e) { e.printStackTrace(); System.out.println("\n" + e.getMessage()); } // System.out.println(noClass); return noClass; } /** * Gets the class attribute of all instances * * @return Instances containing only the class attribute */ @Override public Instances getInstancesOnlyClass() { Instances onlyClass = null; try { onlyClass = LabeledItemSet.divide(m_instances, true); } catch (Exception e) { e.printStackTrace(); System.out.println("\n" + e.getMessage()); } return onlyClass; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration