All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.associations.FPGrowth Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    FPGrowth.java
 *    Copyright (C) 2009-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.associations;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.SparseInstance;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;

/**
 *  Class implementing the FP-growth algorithm for
 * finding large item sets without candidate generation. Iteratively reduces the
 * minimum support until it finds the required number of rules with the given
 * minimum metric. For more information see:
 * 
 * J. Han, J.Pei, Y. Yin: Mining frequent patterns without candidate generation.
 * In: Proceedings of the 2000 ACM-SIGMID International Conference on Management
 * of Data, 1-12, 2000.
 * 
 * BibTeX:
 * 

 * @inproceedings{Han2000,
 *    author = {J. Han and J.Pei and Y. Yin},
 *    booktitle = {Proceedings of the 2000 ACM-SIGMID International Conference on Management of Data},
 *    pages = {1-12},
 *    title = {Mining frequent patterns without candidate generation},
 *    year = {2000}
 * }
 * 
 * 
 * Valid options are:
 * 
 * -P <attribute index of positive value>
 *  Set the index of the attribute value to consider as 'positive'
 *  for binary attributes in normal dense instances. Index 2 is always
 *  used for sparse instances. (default = 2)
 * 
* *
 * -I <max items>
 *  The maximum number of items to include in large items sets (and rules). (default = -1, i.e. no limit.)
 * 
* *
 * -N <require number of rules>
 *  The required number of rules. (default = 10)
 * 
* *
 * -T <0=confidence | 1=lift | 2=leverage | 3=Conviction>
 *  The metric by which to rank rules. (default = confidence)
 * 
* *
 * -C <minimum metric score of a rule>
 *  The minimum metric score of a rule. (default = 0.9)
 * 
* *
 * -U <upper bound for minimum support>
 *  Upper bound for minimum support. (default = 1.0)
 * 
* *
 * -M <lower bound for minimum support>
 *  The lower bound for the minimum support. (default = 0.1)
 * 
* *
 * -D <delta for minimum support>
 *  The delta by which the minimum support is decreased in
 *  each iteration. (default = 0.05)
 * 
* *
 * -S
 *  Find all rules that meet the lower bound on
 *  minimum support and the minimum metric constraint.
 *  Turning this mode on will disable the iterative support reduction
 *  procedure to find the specified number of rules.
 * 
* *
 * -transactions <comma separated list of attribute names>
 *  Only consider transactions that contain these items (default = no restriction)
 * 
* *
 * -rules <comma separated list of attribute names>
 *  Only print rules that contain these items. (default = no restriction)
 * 
* *
 * -use-or
 *  Use OR instead of AND for must contain list(s). Use in conjunction
 *  with -transactions and/or -rules
 * 
* * * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision: 10172 $ */ public class FPGrowth extends AbstractAssociator implements AssociationRulesProducer, OptionHandler, TechnicalInformationHandler { /** For serialization */ private static final long serialVersionUID = 3620717108603442911L; /** * Class for maintaining a frequent item set. */ protected static class FrequentBinaryItemSet implements Serializable, Cloneable { /** For serialization */ private static final long serialVersionUID = -6543815873565829448L; /** The list of items in the item set */ protected ArrayList m_items = new ArrayList(); /** the support of this item set **/ protected int m_support; /** * Constructor * * @param items the items that make up the frequent item set. * @param support the support of this item set. */ public FrequentBinaryItemSet(ArrayList items, int support) { m_items = items; m_support = support; Collections.sort(m_items); } /** * Add an item to this item set. * * @param i the item to add. */ public void addItem(BinaryItem i) { m_items.add(i); Collections.sort(m_items); } /** * Set the support for this item set. * * @param support the support for this item set. */ public void setSupport(int support) { m_support = support; } /** * Get the support of this item set. * * @return the support of this item set. */ public int getSupport() { return m_support; } /** * Get the items in this item set. * * @return the items in this item set. */ public Collection getItems() { return m_items; } /** * Get a particular item from this item set. * * @param index the index of the item to get. * @return the item. */ public BinaryItem getItem(int index) { return m_items.get(index); } /** * Get the number of items in this item set. * * @return the number of items in this item set. */ public int numberOfItems() { return m_items.size(); } /** * Get a textual description of this item set. * * @return a textual description of this item set. 
*/ @Override public String toString() { StringBuffer buff = new StringBuffer(); Iterator i = m_items.iterator(); while (i.hasNext()) { buff.append(i.next().toString() + " "); } buff.append(": " + m_support); return buff.toString(); } /** * Make a copy of this item set. * * @return a copy of this item set. */ @Override public Object clone() { ArrayList items = new ArrayList(m_items); return new FrequentBinaryItemSet(items, m_support); } } /** * Maintains a list of frequent item sets. */ protected static class FrequentItemSets implements Serializable { /** For serialization */ private static final long serialVersionUID = 4173606872363973588L; /** The list of frequent item sets */ protected ArrayList m_sets = new ArrayList(); /** The total number of transactions in the data */ protected int m_numberOfTransactions; /** * Constructor. * * @param numTransactions the total number of transactions in the data. */ public FrequentItemSets(int numTransactions) { m_numberOfTransactions = numTransactions; } /** * Get an item set. * * @param index the index of the item set to get. * @return an item set. */ public FrequentBinaryItemSet getItemSet(int index) { return m_sets.get(index); } /** * Get an iterator that can be used to access all the item sets. * * @return an iterator. */ public Iterator iterator() { return m_sets.iterator(); } /** * Get the total number of transactions in the data that the item sets were * derived from. * * @return the total number of transactions in the data. */ public int getNumberOfTransactions() { return m_numberOfTransactions; } /** * Add an item set. * * @param setToAdd the item set to add. */ public void addItemSet(FrequentBinaryItemSet setToAdd) { m_sets.add(setToAdd); } /** * Sort the item sets according to the supplied comparator. * * @param comp the comparator to use. */ public void sort(Comparator comp) { Collections.sort(m_sets, comp); } /** * Get the number of item sets. * * @return the number of item sets. 
*/ public int size() { return m_sets.size(); } /** * Sort the item sets. Sorts by item set length. Ties are broken by * comparing the items in the two item sets. */ public void sort() { Comparator compF = new Comparator() { @Override public int compare(FrequentBinaryItemSet one, FrequentBinaryItemSet two) { Collection compOne = one.getItems(); Collection compTwo = two.getItems(); // if (one.getSupport() == two.getSupport()) { // if supports are equal then list shorter item sets before longer // ones if (compOne.size() < compTwo.size()) { return -1; } else if (compOne.size() > compTwo.size()) { return 1; } else { // compare items Iterator twoIterator = compTwo.iterator(); for (BinaryItem oneI : compOne) { BinaryItem twoI = twoIterator.next(); int result = oneI.compareTo(twoI); if (result != 0) { return result; } } return 0; // equal } // return 0; /* * } else if (one.getSupport() > two.getSupport()) { // reverse * ordering (i.e. descending by support) return -1; } */ // return 1; } }; sort(compF); } /** * Get a textual description of this list of item sets. * * @param numSets the number of item sets to display. * @return a textual description of the item sets. */ public String toString(int numSets) { if (m_sets.size() == 0) { return "No frequent items sets found!"; } StringBuffer result = new StringBuffer(); result.append("" + m_sets.size() + " frequent item sets found"); if (numSets > 0) { result.append(" , displaying " + numSets); } result.append(":\n\n"); int count = 0; for (FrequentBinaryItemSet i : m_sets) { if (numSets > 0 && count > numSets) { break; } result.append(i.toString() + "\n"); count++; } return result.toString(); } } /** * This class holds the counts for projected tree nodes and header lists. 
*/ protected static class ShadowCounts implements Serializable { /** For serialization */ private static final long serialVersionUID = 4435433714185969155L; /** Holds the counts at different recursion levels */ private final ArrayList m_counts = new ArrayList(); /** * Get the count at the specified recursion depth. * * @param recursionLevel the depth of the recursion. * @return the count. */ public int getCount(int recursionLevel) { if (recursionLevel >= m_counts.size()) { return 0; } else { return m_counts.get(recursionLevel); } } /** * Increase the count at a given recursion level. * * @param recursionLevel the level at which to increase the count. * @param incr the amount by which to increase the count. */ public void increaseCount(int recursionLevel, int incr) { // basically treat the list like a stack where we // can add a new element, or increment the element // at the top if (recursionLevel == m_counts.size()) { // new element m_counts.add(incr); } else if (recursionLevel == m_counts.size() - 1) { // otherwise increment the top int n = m_counts.get(recursionLevel).intValue(); m_counts.set(recursionLevel, (n + incr)); } } /** * Remove the count at the given recursion level. * * @param recursionLevel the level at which to remove the count. */ public void removeCount(int recursionLevel) { if (recursionLevel < m_counts.size()) { m_counts.remove(recursionLevel); } } } /** * A node in the FP-tree. 
*/
  protected static class FPTreeNode implements Serializable {

    /** For serialization */
    private static final long serialVersionUID = 4396315323673737660L;

    /** link to another sibling at this level in the tree */
    protected FPTreeNode m_levelSibling;

    /** link to the parent node */
    protected FPTreeNode m_parent;

    /** item at this node */
    protected BinaryItem m_item;

    /** ID (for graphing the tree) */
    protected int m_ID;

    /** the children of this node, keyed by the item they hold */
    protected Map<BinaryItem, FPTreeNode> m_children =
      new HashMap<BinaryItem, FPTreeNode>();

    /** counts associated with projected versions of this node */
    protected ShadowCounts m_projectedCounts = new ShadowCounts();

    /**
     * Construct a new node with the given parent link and item.
     * 
     * @param parent a pointer to the parent of this node.
     * @param item the item at this node.
     */
    public FPTreeNode(FPTreeNode parent, BinaryItem item) {
      m_parent = parent;
      m_item = item;
    }

    /**
     * Insert an item set into the tree at this node. Removes the first item
     * from the supplied item set and makes a recursive call to insert the
     * remaining items. Note that the supplied collection is modified: it is
     * empty when the call returns.
     * 
     * @param itemSet the item set to insert.
     * @param headerTable the header table for the tree.
     * @param incr the amount by which to increase counts.
     */
    public void addItemSet(Collection<BinaryItem> itemSet,
      Map<BinaryItem, FPTreeRoot.Header> headerTable, int incr) {

      Iterator<BinaryItem> i = itemSet.iterator();
      if (!i.hasNext()) {
        return; // nothing left to insert
      }

      BinaryItem first = i.next();
      FPTreeNode aChild;
      if (!m_children.containsKey(first)) {
        // not in the tree, so add it.
        aChild = new FPTreeNode(this, first);
        m_children.put(first, aChild);

        // update the header
        if (!headerTable.containsKey(first)) {
          headerTable.put(first, new FPTreeRoot.Header());
        }

        // append new node to header list
        headerTable.get(first).addToList(aChild);
      } else {
        // get the appropriate child node
        aChild = m_children.get(first);
      }

      // update counts in header table
      headerTable.get(first).getProjectedCounts().increaseCount(0, incr);

      // increase the child's count
      aChild.increaseProjectedCount(0, incr);

      // proceed recursively
      itemSet.remove(first);
      aChild.addItemSet(itemSet, headerTable, incr);
    }

    /**
     * Increase the projected count at the given recursion level at this node
     * 
     * @param recursionLevel the recursion level to increase the node count at.
     * @param incr the amount by which to increase the count.
     */
    public void increaseProjectedCount(int recursionLevel, int incr) {
      m_projectedCounts.increaseCount(recursionLevel, incr);
    }

    /**
     * Remove the projected count at the given recursion level for this node.
     * 
     * @param recursionLevel the recursion level at which to remove the count.
     */
    public void removeProjectedCount(int recursionLevel) {
      m_projectedCounts.removeCount(recursionLevel);
    }

    /**
     * Get the projected count at the given recursion level for this node.
     * 
     * @param recursionLevel the recursion level at which to get the count.
     * @return the count.
     */
    public int getProjectedCount(int recursionLevel) {
      return m_projectedCounts.getCount(recursionLevel);
    }

    /**
     * Get the parent node.
     * 
     * @return the parent node.
     */
    public FPTreeNode getParent() {
      return m_parent;
    }

    /**
     * Get the item at this node.
     * 
     * @return the item at this node.
     */
    public BinaryItem getItem() {
      return m_item;
    }

    /**
     * Return a textual description of this node for a given recursion level.
     * 
     * @param recursionLevel the recursion depth to use.
     * @return a textual description of this node.
     */
    public String toString(int recursionLevel) {
      return toString("", recursionLevel);
    }

    /**
     * Return a textual description of this node (and, recursively, of its
     * children) for a given recursion level.
     * 
     * @param prefix a prefix string to prepend.
     * @param recursionLevel the recursion level to use.
     * @return a textual description of this node.
     */
    public String toString(String prefix, int recursionLevel) {
      StringBuffer buffer = new StringBuffer();
      buffer.append(prefix);
      buffer.append("|  ");
      buffer.append(m_item.toString());
      buffer.append(" (");
      buffer.append(m_projectedCounts.getCount(recursionLevel));
      buffer.append(")\n");

      for (FPTreeNode node : m_children.values()) {
        buffer.append(node.toString(prefix + "|  ", recursionLevel));
      }
      return buffer.toString();
    }

    /**
     * Assign IDs to this node and, recursively, to all nodes below it. This
     * node receives {@code lastID + 1}; each child subtree continues from the
     * last ID handed out so far.
     * 
     * @param lastID the last ID that has already been assigned.
     * @return the last ID assigned within this subtree.
     */
    protected int assignIDs(int lastID) {
      int currentLastID = lastID + 1;
      m_ID = currentLastID;
      if (m_children != null) {
        Collection<FPTreeNode> kids = m_children.values();
        for (FPTreeNode n : kids) {
          currentLastID = n.assignIDs(currentLastID);
        }
      }
      return currentLastID;
    }

    /**
     * Generate a dot graph description string for the tree.
     * 
     * @param text a StringBuffer to store the graph description in.
*/ public void graphFPTree(StringBuffer text) { if (m_children != null) { Collection kids = m_children.values(); for (FPTreeNode n : kids) { text.append("N" + n.m_ID); text.append(" [label=\""); text.append(n.getItem().toString() + " (" + n.getProjectedCount(0) + ")\\n"); text.append("\"]\n"); n.graphFPTree(text); text.append("N" + m_ID + "->" + "N" + n.m_ID + "\n"); } } } } /** * Root of the FPTree */ private static class FPTreeRoot extends FPTreeNode { /** For serialization */ private static final long serialVersionUID = 632150939785333297L; /** * Stores a header entry for an FPTree */ protected static class Header implements Serializable { /** For serialization */ private static final long serialVersionUID = -6583156284891368909L; /** The list of pointers into the tree structure */ protected List m_headerList = new LinkedList(); /** Projected header counts for this entry */ protected ShadowCounts m_projectedHeaderCounts = new ShadowCounts(); /** * Add a tree node into the list for this header entry. * * @param toAdd the node to add. */ public void addToList(FPTreeNode toAdd) { m_headerList.add(toAdd); } /** * Get the list of nodes for this header entry. * * @return the list of nodes for this header entry. */ public List getHeaderList() { return m_headerList; } /** * Get the projected counts for this header entry. * * @return the projected counts for this header entry. */ public ShadowCounts getProjectedCounts() { return m_projectedHeaderCounts; } } /** Stores the header table as mapped Header entries */ protected Map m_headerTable = new HashMap(); /** * Create a new FPTreeRoot. */ public FPTreeRoot() { super(null, null); } /** * Insert an item set into the tree. * * @param itemSet the item set to insert into the tree. * @param incr the increment by which to increase counters. */ public void addItemSet(Collection itemSet, int incr) { super.addItemSet(itemSet, m_headerTable, incr); } /** * Get the header table for this tree. 
* * @return the header table for this tree. */ public Map getHeaderTable() { return m_headerTable; } public boolean isEmpty(int recursionLevel) { for (FPTreeNode c : m_children.values()) { if (c.getProjectedCount(recursionLevel) > 0) { return false; } } return true; } /** * Get a textual description of the tree at a given recursion (projection) * level. * * @param pad the string to use as a prefix for indenting nodes. * @param recursionLevel the recursion level (projection) to use. * @return the textual description of the tree. */ @Override public String toString(String pad, int recursionLevel) { StringBuffer result = new StringBuffer(); result.append(pad); result.append("+ ROOT\n"); for (FPTreeNode node : m_children.values()) { result.append(node.toString(pad + "| ", recursionLevel)); } return result.toString(); } } private static void nextSubset(boolean[] subset) { for (int i = 0; i < subset.length; i++) { if (!subset[i]) { subset[i] = true; break; } else { subset[i] = false; } } } private static Collection getPremise(FrequentBinaryItemSet fis, boolean[] subset) { boolean ok = false; for (int i = 0; i < subset.length; i++) { if (!subset[i]) { ok = true; break; } } if (!ok) { return null; } List premise = new ArrayList(); ArrayList items = new ArrayList(fis.getItems()); for (int i = 0; i < subset.length; i++) { if (subset[i]) { premise.add(items.get(i)); } } return premise; } private static Collection getConsequence(FrequentBinaryItemSet fis, boolean[] subset) { List consequence = new ArrayList(); ArrayList items = new ArrayList(fis.getItems()); for (int i = 0; i < subset.length; i++) { if (!subset[i]) { consequence.add(items.get(i)); } } return consequence; } /** * Generate all association rules, from the supplied frequet item sets, that * meet a given minimum metric threshold. Uses a brute force approach. 
* * @param largeItemSets the set of frequent item sets * @param metricToUse the metric to use * @param metricThreshold the threshold value that a rule must meet * @param upperBoundMinSuppAsInstances the upper bound on the support in order * to accept the rule * @param lowerBoundMinSuppAsInstances the lower bound on the support in order * to accept the rule * @param totalTransactions the total number of transactions in the data * @return a list of association rules */ public static List generateRulesBruteForce( FrequentItemSets largeItemSets, DefaultAssociationRule.METRIC_TYPE metricToUse, double metricThreshold, int upperBoundMinSuppAsInstances, int lowerBoundMinSuppAsInstances, int totalTransactions) { List rules = new ArrayList(); largeItemSets.sort(); Map, Integer> frequencyLookup = new HashMap, Integer>(); Iterator setI = largeItemSets.iterator(); // process each large item set while (setI.hasNext()) { FrequentBinaryItemSet fis = setI.next(); frequencyLookup.put(fis.getItems(), fis.getSupport()); if (fis.getItems().size() > 1) { // generate all the possible subsets for the premise boolean[] subset = new boolean[fis.getItems().size()]; Collection premise = null; Collection consequence = null; while ((premise = getPremise(fis, subset)) != null) { if (premise.size() > 0 && premise.size() < fis.getItems().size()) { consequence = getConsequence(fis, subset); int totalSupport = fis.getSupport(); int supportPremise = frequencyLookup.get(premise).intValue(); int supportConsequence = frequencyLookup.get(consequence) .intValue(); // a candidate rule DefaultAssociationRule candidate = new DefaultAssociationRule( premise, consequence, metricToUse, supportPremise, supportConsequence, totalSupport, totalTransactions); if (candidate.getPrimaryMetricValue() > metricThreshold && candidate.getTotalSupport() >= lowerBoundMinSuppAsInstances && candidate.getTotalSupport() <= upperBoundMinSuppAsInstances) { // accept this rule rules.add(candidate); } } nextSubset(subset); } } } 
return rules; } public static List pruneRules( List rulesToPrune, ArrayList itemsToConsider, boolean useOr) { ArrayList result = new ArrayList(); for (AssociationRule r : rulesToPrune) { if (r.containsItems(itemsToConsider, useOr)) { result.add(r); } } return result; } /** The number of rules to find */ protected int m_numRulesToFind = 10; // protected double m_upperBoundMinSupport = 0.36; /** The upper bound on the minimum support */ protected double m_upperBoundMinSupport = 1.0; /** The lower bound on minimum support */ protected double m_lowerBoundMinSupport = 0.1; /** The amount by which to decrease the support in each iteration */ protected double m_delta = 0.05; /** The number of instances in the data */ protected int m_numInstances; /** * When processing data off of disk report progress this frequently (number of * instances). */ protected int m_offDiskReportingFrequency = 10000; /** * If true, just all rules meeting the lower bound on the minimum support will * be found. The number of rules to find will be ignored and the iterative * reduction of support will not be done. 
*/ protected boolean m_findAllRulesForSupportLevel = false; // protected double m_lowerBoundMinSupport = 0.0; /** The index (1 based) of binary attributes to treat as the positive value */ protected int m_positiveIndex = 2; protected DefaultAssociationRule.METRIC_TYPE m_metric = DefaultAssociationRule.METRIC_TYPE.CONFIDENCE; protected double m_metricThreshold = 0.9; /** Holds the large item sets found */ protected FrequentItemSets m_largeItemSets; /** Holds the rules */ protected List m_rules; // maximum number of items in a large item set (zero means no limit) protected int m_maxItems = -1; /** * If set, limit the transactions (instances) input to the algorithm to those * that contain these items */ protected String m_transactionsMustContain = ""; /** Use OR rather than AND when considering must contain lists */ protected boolean m_mustContainOR = false; /** If set, then only output rules containing these itmes */ protected String m_rulesMustContain = ""; /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // enable what we can handle // attributes result.enable(Capability.UNARY_ATTRIBUTES); result.enable(Capability.BINARY_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Returns a string describing this associator * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Class implementing the FP-growth algorithm for finding" + " large item sets without candidate generation. Iteratively" + " reduces the minimum support until it finds the required" + " number of rules with the given minimum metric." 
+ " For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "J. Han and J.Pei and Y. Yin"); result.setValue(Field.TITLE, "Mining frequent patterns without candidate generation"); result.setValue(Field.BOOKTITLE, "Proceedings of the 2000 ACM-SIGMID International" + " Conference on Management of Data"); result.setValue(Field.YEAR, "2000"); result.setValue(Field.PAGES, "1-12"); return result; } private boolean passesMustContain(Instance inst, boolean[] transactionsMustContainIndexes, int numInTransactionsMustContainList) { boolean result = false; if (inst instanceof SparseInstance) { int containsCount = 0; for (int i = 0; i < inst.numValues(); i++) { int attIndex = inst.index(i); if (m_mustContainOR) { if (transactionsMustContainIndexes[attIndex]) { // break here since the operator is OR and this // instance contains at least one of the items return true; } } else { if (transactionsMustContainIndexes[attIndex]) { containsCount++; } } } if (!m_mustContainOR) { if (containsCount == numInTransactionsMustContainList) { return true; } } } else { int containsCount = 0; for (int i = 0; i < transactionsMustContainIndexes.length; i++) { if (transactionsMustContainIndexes[i]) { if ((int) inst.value(i) == m_positiveIndex - 1) { if (m_mustContainOR) { // break here since the operator is OR and // this instance contains at least one of the // requested items return true; } else { containsCount++; } } } } if (!m_mustContainOR) { if (containsCount == numInTransactionsMustContainList) { return true; } } } return result; } private void 
processSingleton(Instance current, ArrayList singletons) throws Exception { if (current instanceof SparseInstance) { for (int j = 0; j < current.numValues(); j++) { int attIndex = current.index(j); singletons.get(attIndex).increaseFrequency(); } } else { for (int j = 0; j < current.numAttributes(); j++) { if (!current.isMissing(j)) { if (current.attribute(j).numValues() == 1 || current.value(j) == m_positiveIndex - 1) { singletons.get(j).increaseFrequency(); } } } } } /** * Get the singleton items in the data * * @param source the source of the data (either Instances or an ArffLoader). * @return a list of singleton item sets * @throws Exception if the singletons can't be found for some reason */ protected ArrayList getSingletons(Object source) throws Exception { ArrayList singletons = new ArrayList(); Instances data = null; if (source instanceof Instances) { data = (Instances) source; } else if (source instanceof weka.core.converters.ArffLoader) { data = ((weka.core.converters.ArffLoader) source).getStructure(); } for (int i = 0; i < data.numAttributes(); i++) { singletons.add(new BinaryItem(data.attribute(i), m_positiveIndex - 1)); } if (source instanceof Instances) { // set the number of instances m_numInstances = data.numInstances(); for (int i = 0; i < data.numInstances(); i++) { Instance current = data.instance(i); processSingleton(current, singletons); } } else if (source instanceof weka.core.converters.ArffLoader) { weka.core.converters.ArffLoader loader = (weka.core.converters.ArffLoader) source; Instance current = null; int count = 0; while ((current = loader.getNextInstance(data)) != null) { processSingleton(current, singletons); count++; if (count % m_offDiskReportingFrequency == 0) { System.err.println("Singletons: done " + count); } } // set the number of instances m_numInstances = count; loader.reset(); } return singletons; } /** * Get the singleton items in the data * * @param data the Instances to process * @return a list of singleton item sets * 
@throws Exception if the singletons can't be found for some reason */ protected ArrayList getSingletons(Instances data) throws Exception { return getSingletons((Object) data); /* * ArrayList singletons = new ArrayList(); * * for (int i = 0; i < data.numAttributes(); i++) { singletons.add(new * BinaryItem(data.attribute(i), m_positiveIndex - 1)); } * * for (int i = 0; i < data.numInstances(); i++) { Instance current = * data.instance(i); if (current instanceof SparseInstance) { for (int j = * 0; j < current.numValues(); j++) { int attIndex = current.index(j); * singletons.get(attIndex).increaseFrequency(); } } else { for (int j = 0; * j < data.numAttributes(); j++) { if (!current.isMissing(j)) { if * (current.attribute(j).numValues() == 1 || current.value(j) == * m_positiveIndex - 1) { singletons.get(j).increaseFrequency(); } } } } } * * return singletons; */ } /* * protected ArrayList getFrequent(ArrayList items, * int minSupport) { ArrayList frequent = new * ArrayList(); for (BinaryItem b : items) { if (b.getFrequency() * > minSupport) { frequent.add(b); } } * * // sort in descending order of support Collections.sort(frequent); return * frequent; } */ /** * Inserts a single instance into the FPTree. 
* * @param current the instance to insert * @param singletons the singleton item sets * @param tree the tree to insert into * @param minSupport the minimum support threshold */ private void insertInstance(Instance current, ArrayList singletons, FPTreeRoot tree, int minSupport) { ArrayList transaction = new ArrayList(); if (current instanceof SparseInstance) { for (int j = 0; j < current.numValues(); j++) { int attIndex = current.index(j); if (singletons.get(attIndex).getFrequency() >= minSupport) { transaction.add(singletons.get(attIndex)); } } Collections.sort(transaction); tree.addItemSet(transaction, 1); } else { for (int j = 0; j < current.numAttributes(); j++) { if (!current.isMissing(j)) { if (current.attribute(j).numValues() == 1 || current.value(j) == m_positiveIndex - 1) { if (singletons.get(j).getFrequency() >= minSupport) { transaction.add(singletons.get(j)); } } } } Collections.sort(transaction); tree.addItemSet(transaction, 1); } } /** * Construct the frequent pattern tree by inserting each transaction in the * data into the tree. Only those items from each transaction that meet the * minimum support threshold are inserted. 
* * @param singletons the singleton item sets * @param data the Instances containing the transactions * @param minSupport the minimum support * @return the root of the tree */ protected FPTreeRoot buildFPTree(ArrayList singletons, Object dataSource, int minSupport) throws Exception { FPTreeRoot tree = new FPTreeRoot(); Instances data = null; if (dataSource instanceof Instances) { data = (Instances) dataSource; } else if (dataSource instanceof weka.core.converters.ArffLoader) { data = ((weka.core.converters.ArffLoader) dataSource).getStructure(); } if (dataSource instanceof Instances) { for (int i = 0; i < data.numInstances(); i++) { insertInstance(data.instance(i), singletons, tree, minSupport); } } else if (dataSource instanceof weka.core.converters.ArffLoader) { weka.core.converters.ArffLoader loader = (weka.core.converters.ArffLoader) dataSource; Instance current = null; int count = 0; while ((current = loader.getNextInstance(data)) != null) { insertInstance(current, singletons, tree, minSupport); count++; if (count % m_offDiskReportingFrequency == 0) { System.err.println("build tree done: " + count); } } } return tree; } /** * Construct the frequent pattern tree by inserting each transaction in the * data into the tree. Only those items from each transaction that meet the * minimum support threshold are inserted. 
* * @param singletons the singleton item sets * @param data the Instances containing the transactions * @param minSupport the minimum support * @return the root of the tree */ /* * protected FPTreeRoot buildFPTree(ArrayList singletons, * Instances data, int minSupport) { * * FPTreeRoot tree = new FPTreeRoot(); * * for (int i = 0; i < data.numInstances(); i++) { Instance current = * data.instance(i); ArrayList transaction = new * ArrayList(); if (current instanceof SparseInstance) { for (int * j = 0; j < current.numValues(); j++) { int attIndex = current.index(j); if * (singletons.get(attIndex).getFrequency() >= minSupport) { * transaction.add(singletons.get(attIndex)); } } * Collections.sort(transaction); tree.addItemSet(transaction, 1); } else { * for (int j = 0; j < data.numAttributes(); j++) { if (!current.isMissing(j)) * { if (current.attribute(j).numValues() == 1 || current.value(j) == * m_positiveIndex - 1) { if (singletons.get(j).getFrequency() >= minSupport) * { transaction.add(singletons.get(j)); } } } } * Collections.sort(transaction); tree.addItemSet(transaction, 1); } } * * return tree; } */ /** * Find large item sets in the FP-tree. 
* * @param tree the root of the tree to mine * @param largeItemSets holds the large item sets found * @param recursionLevel the recursion level for the current projected counts * @param conditionalItems the current set of items that the current * (projected) tree is conditional on * @param minSupport the minimum acceptable support */ protected void mineTree(FPTreeRoot tree, FrequentItemSets largeItemSets, int recursionLevel, FrequentBinaryItemSet conditionalItems, int minSupport) { if (!tree.isEmpty(recursionLevel)) { if (m_maxItems > 0 && recursionLevel >= m_maxItems) { // don't mine any further return; } Map headerTable = tree.getHeaderTable(); Set keys = headerTable.keySet(); // System.err.println("Number of freq item sets collected " + // largeItemSets.size()); Iterator i = keys.iterator(); while (i.hasNext()) { BinaryItem item = i.next(); FPTreeRoot.Header itemHeader = headerTable.get(item); // check for minimum support at this level int support = itemHeader.getProjectedCounts().getCount(recursionLevel); if (support >= minSupport) { // process header list at this recursion level for (FPTreeNode n : itemHeader.getHeaderList()) { // push count up path to root int currentCount = n.getProjectedCount(recursionLevel); if (currentCount > 0) { FPTreeNode temp = n.getParent(); while (temp != tree) { // set/increase for the node temp.increaseProjectedCount(recursionLevel + 1, currentCount); // set/increase for the header table headerTable.get(temp.getItem()).getProjectedCounts() .increaseCount(recursionLevel + 1, currentCount); temp = temp.getParent(); } } } FrequentBinaryItemSet newConditional = (FrequentBinaryItemSet) conditionalItems .clone(); // this item gets added to the conditional items newConditional.addItem(item); newConditional.setSupport(support); // now add this conditional item set to the list of large item sets largeItemSets.addItemSet(newConditional); // now recursively process the new tree mineTree(tree, largeItemSets, recursionLevel + 1, 
newConditional, minSupport); // reverse the propagated counts for (FPTreeNode n : itemHeader.getHeaderList()) { FPTreeNode temp = n.getParent(); while (temp != tree) { temp.removeProjectedCount(recursionLevel + 1); temp = temp.getParent(); } } // reverse the propagated counts in the header list // at this recursion level for (FPTreeRoot.Header h : headerTable.values()) { h.getProjectedCounts().removeCount(recursionLevel + 1); } } } } } /** * Construct a new FPGrowth object. */ public FPGrowth() { resetOptions(); } /** * Reset all options to their default values. */ public void resetOptions() { m_delta = 0.05; m_metricThreshold = 0.9; m_numRulesToFind = 10; m_lowerBoundMinSupport = 0.1; m_upperBoundMinSupport = 1.0; // m_minSupport = -1; m_positiveIndex = 2; m_transactionsMustContain = ""; m_rulesMustContain = ""; m_mustContainOR = false; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String positiveIndexTipText() { return "Set the index of binary valued attributes that is to be considered" + " the positive index. Has no effect for sparse data (in this case" + " the first index (i.e. non-zero values) is always treated as " + " positive. Also has no effect for unary valued attributes (i.e." + " when using the Weka Apriori-style format for market basket data," + " which uses missing value \"?\" to indicate" + " absence of an item."; } /** * Set the index of the attribute value to consider as positive for binary * attributes in normal dense instances. Index 1 is always used for sparse * instances. * * @param index the index to use for positive values in binary attributes. */ public void setPositiveIndex(int index) { m_positiveIndex = index; } /** * Get the index of the attribute value to consider as positive for binary * attributes in normal dense instances. Index 1 is always used for sparse * instances. * * @return the index to use for positive values in binary attributes. 
*/ public int getPositiveIndex() { return m_positiveIndex; } /** * Set the desired number of rules to find. * * @param numR the number of rules to find. */ public void setNumRulesToFind(int numR) { m_numRulesToFind = numR; } /** * Get the number of rules to find. * * @return the number of rules to find. */ public int getNumRulesToFind() { return m_numRulesToFind; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String numRulesToFindTipText() { return "The number of rules to output"; } /** * Set the metric type to use. * * @param d the metric type */ public void setMetricType(SelectedTag d) { int ordinal = d.getSelectedTag().getID(); for (DefaultAssociationRule.METRIC_TYPE m : DefaultAssociationRule.METRIC_TYPE .values()) { if (m.ordinal() == ordinal) { m_metric = m; break; } } } /** * Set the maximum number of items to include in large items sets. * * @param max the maxim number of items to include in large item sets. */ public void setMaxNumberOfItems(int max) { m_maxItems = max; } /** * Gets the maximum number of items to be included in large item sets. * * @return the maximum number of items to be included in large items sets. */ public int getMaxNumberOfItems() { return m_maxItems; } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String maxNumberOfItemsTipText() { return "The maximum number of items to include in frequent item sets. -1 " + "means no limit."; } /** * Get the metric type to use. * * @return the metric type to use. */ public SelectedTag getMetricType() { return new SelectedTag(m_metric.ordinal(), DefaultAssociationRule.TAGS_SELECTION); } /** * Tip text for this property suitable for displaying in the GUI. * * @return the tip text for this property. */ public String metricTypeTipText() { return "Set the type of metric by which to rank rules. 
Confidence is " + "the proportion of the examples covered by the premise that are also " + "covered by the consequence(Class association rules can only be mined using confidence). Lift is confidence divided by the " + "proportion of all examples that are covered by the consequence. This " + "is a measure of the importance of the association that is independent " + "of support. Leverage is the proportion of additional examples covered " + "by both the premise and consequence above those expected if the " + "premise and consequence were independent of each other. The total " + "number of examples that this represents is presented in brackets " + "following the leverage. Conviction is " + "another measure of departure from independence."; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String minMetricTipText() { return "Minimum metric score. Consider only rules with scores higher than " + "this value."; } /** * Get the value of minConfidence. * * @return Value of minConfidence. */ public double getMinMetric() { return m_metricThreshold; } /** * Set the value of minConfidence. * * @param v Value to assign to minConfidence. */ public void setMinMetric(double v) { m_metricThreshold = v; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String transactionsMustContainTipText() { return "Limit input to FPGrowth to those transactions (instances)" + " that contain these items. Provide a comma separated" + " list of attribute names."; } /** * Set the comma separated list of items that transactions must contain in * order to be considered for large item sets and rules. * * @param list a comma separated list of items (empty string indicates no * restriction on the transactions). 
*/
  public void setTransactionsMustContain(String list) {
    this.m_transactionsMustContain = list;
  }

  /**
   * Returns the comma separated list of items a transaction has to contain
   * in order to be considered for large item sets and rules.
   * 
   * @return the comma separated list of items that transactions must
   *         contain.
   */
  public String getTransactionsMustContain() {
    return this.m_transactionsMustContain;
  }

  /**
   * Tip text for the "rules must contain" property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String rulesMustContainTipText() {
    final String tip = "Only print rules that contain these items. Provide "
      + "a comma separated list of attribute names.";
    return tip;
  }

  /**
   * Stores the comma separated list of items a rule has to contain in order
   * to be output.
   * 
   * @param list a comma separated list of items (empty string indicates no
   *          restriction on the rules).
   */
  public void setRulesMustContain(String list) {
    this.m_rulesMustContain = list;
  }

  /**
   * Returns the comma separated list of items a rule has to contain in order
   * to be output.
   * 
   * @return the comma separated list of items that rules must contain in
   *         order to be output.
   */
  public String getRulesMustContain() {
    return this.m_rulesMustContain;
  }

  /**
   * Tip text for the "use OR for must contain list" property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String useORForMustContainListTipText() {
    final String tip =
      "Use OR instead of AND for transactions/rules must contain lists.";
    return tip;
  }

  /**
   * Chooses between OR and AND semantics for the must contain lists.
   * 
   * @param b true if OR should be used instead of AND when considering
   *          transaction and rules must contain lists.
   */
  public void setUseORForMustContainList(boolean b) {
    this.m_mustContainOR = b;
  }

  /**
   * Gets whether OR is to be used rather than AND when considering must contain
   * lists.
   * 
   * @return true if OR is used instead of AND.
*/
  public boolean getUseORForMustContainList() {
    return this.m_mustContainOR;
  }

  /**
   * Tip text for the delta property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String deltaTipText() {
    final String tip =
      "Iteratively decrease support by this factor. Reduces support "
        + "until min support is reached or required number of rules has been "
        + "generated.";
    return tip;
  }

  /**
   * Returns the current delta.
   * 
   * @return Value of delta.
   */
  public double getDelta() {
    return this.m_delta;
  }

  /**
   * Stores a new delta.
   * 
   * @param v Value to assign to delta.
   */
  public void setDelta(double v) {
    this.m_delta = v;
  }

  /**
   * Tip text for the lower bound on minimum support.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String lowerBoundMinSupportTipText() {
    final String tip =
      "Lower bound for minimum support as a fraction or number of instances.";
    return tip;
  }

  /**
   * Returns the lower bound on minimum support.
   * 
   * @return Value of lowerBoundMinSupport.
   */
  public double getLowerBoundMinSupport() {
    return this.m_lowerBoundMinSupport;
  }

  /**
   * Stores the lower bound on minimum support.
   * 
   * @param v Value to assign to lowerBoundMinSupport.
   */
  public void setLowerBoundMinSupport(double v) {
    this.m_lowerBoundMinSupport = v;
  }

  /**
   * Tip text for the upper bound on minimum support.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String upperBoundMinSupportTipText() {
    final String tip =
      "Upper bound for minimum support as a fraction or number of instances. "
        + "Start iteratively decreasing " + "minimum support from this value.";
    return tip;
  }

  /**
   * Returns the upper bound on minimum support.
   * 
   * @return Value of upperBoundMinSupport.
   */
  public double getUpperBoundMinSupport() {
    return this.m_upperBoundMinSupport;
  }

  /**
   * Set the value of upperBoundMinSupport.
   * 
   * @param v Value to assign to upperBoundMinSupport.
*/
  public void setUpperBoundMinSupport(double v) {
    this.m_upperBoundMinSupport = v;
  }

  /**
   * Tip text for the "find all rules for support level" property.
   * 
   * @return the tip text for this property.
   */
  public String findAllRulesForSupportLevelTipText() {
    final String tip = "Find all rules that meet "
      + "the lower bound on minimum support and the minimum metric constraint. "
      + "Turning this mode on will disable the iterative support reduction "
      + "procedure to find the specified number of rules.";
    return tip;
  }

  /**
   * Switches between the iterative support reduction procedure (which looks
   * for a fixed number of rules) and returning every rule that meets the
   * lower bound on minimum support and the minimum metric.
   * 
   * @param s true if all rules meeting the lower bound on the support and
   *          minimum metric thresholds are to be found.
   */
  public void setFindAllRulesForSupportLevel(boolean s) {
    this.m_findAllRulesForSupportLevel = s;
  }

  /**
   * Reports whether all rules meeting the lower bound on min support and the
   * minimum metric threshold are to be found.
   * 
   * @return true if all rules meeting the lower bound on min support and the
   *         min metric threshold are to be found.
   */
  public boolean getFindAllRulesForSupportLevel() {
    return this.m_findAllRulesForSupportLevel;
  }

  /**
   * Sets how often progress is reported while the data is being read
   * incrementally off of the disk rather than loaded into memory.
   * 
   * @param freq the frequency to print progress.
   */
  public void setOffDiskReportingFrequency(int freq) {
    this.m_offDiskReportingFrequency = freq;
  }

  /*
   * public void setMinimumSupport(double minSupp) { m_minSupport = minSupp; }
   * 
   * public double getMinimumSupport() { return m_minSupport; }
   */

  /**
   * Gets the list of mined association rules.
   * 
   * @return the list of association rules discovered during mining. Returns
   *         null if mining hasn't been performed yet.
*/
  @Override
  public AssociationRules getAssociationRules() {
    if (m_rules == null) {
      // honour the documented contract instead of failing with a
      // NullPointerException when no rules have been mined yet
      return null;
    }

    List<AssociationRule> rulesToReturn = new ArrayList<AssociationRule>();

    int count = 0;
    for (AssociationRule r : m_rules) {
      rulesToReturn.add(r);
      count++;
      // unless all rules were requested, stop once the requested number of
      // rules has been collected
      if (!m_findAllRulesForSupportLevel && count == m_numRulesToFind) {
        break;
      }
    }

    return new AssociationRules(rulesToReturn, this);
  }

  /**
   * Gets a list of the names of the metrics output for each rule. This list
   * should be the same (in terms of the names and order thereof) as that
   * produced by AssociationRule.getMetricNamesForRule().
   * 
   * @return an array of the names of the metrics available for each rule
   *         learned by this producer.
   */
  @Override
  public String[] getRuleMetricNames() {
    String[] metricNames =
      new String[DefaultAssociationRule.TAGS_SELECTION.length];

    for (int i = 0; i < DefaultAssociationRule.TAGS_SELECTION.length; i++) {
      metricNames[i] = DefaultAssociationRule.TAGS_SELECTION[i].getReadable();
    }

    return metricNames;
  }

  /**
   * Returns true if this AssociationRulesProducer can actually produce rules.
   * Most implementing classes will always return true from this method
   * (obviously :-)). However, an implementing class that actually acts as a
   * wrapper around things that may or may not implement
   * AssociationRulesProducer will want to return false if the thing they wrap
   * can't produce rules.
   * 
   * @return true if this producer can produce rules in its current
   *         configuration
   */
  @Override
  public boolean canProduceRules() {
    return true;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration




© 2015 - 2025 Weber Informatics LLC | Privacy Policy