weka.associations.FPGrowth Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-stable Show documentation
Show all versions of weka-stable Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This is the stable version. Apart from bugfixes, this version
does not receive any other updates.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* FPGrowth.java
* Copyright (C) 2009-2012 University of Waikato, Hamilton, New Zealand
*
*/
package weka.associations;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.SparseInstance;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
/**
* Class implementing the FP-growth algorithm for
* finding large item sets without candidate generation. Iteratively reduces the
* minimum support until it finds the required number of rules with the given
* minimum metric. For more information see:
*
* J. Han, J. Pei, Y. Yin: Mining frequent patterns without candidate generation.
* In: Proceedings of the 2000 ACM-SIGMOD International Conference on Management
* of Data, 1-12, 2000.
*
*
*
* BibTeX:
*
*
* @inproceedings{Han2000,
* author = {J. Han and J.Pei and Y. Yin},
* booktitle = {Proceedings of the 2000 ACM-SIGMOD International Conference on Management of Data},
* pages = {1-12},
* title = {Mining frequent patterns without candidate generation},
* year = {2000}
* }
*
*
*
*
* Valid options are:
*
*
*
* -P <attribute index of positive value>
* Set the index of the attribute value to consider as 'positive'
* for binary attributes in normal dense instances. Index 2 is always
* used for sparse instances. (default = 2)
*
*
*
* -I <max items>
* The maximum number of items to include in large items sets (and rules). (default = -1, i.e. no limit.)
*
*
*
* -N <require number of rules>
* The required number of rules. (default = 10)
*
*
*
* -T <0=confidence | 1=lift | 2=leverage | 3=Conviction>
* The metric by which to rank rules. (default = confidence)
*
*
*
* -C <minimum metric score of a rule>
* The minimum metric score of a rule. (default = 0.9)
*
*
*
* -U <upper bound for minimum support>
* Upper bound for minimum support. (default = 1.0)
*
*
*
* -M <lower bound for minimum support>
* The lower bound for the minimum support. (default = 0.1)
*
*
*
* -D <delta for minimum support>
* The delta by which the minimum support is decreased in
* each iteration. (default = 0.05)
*
*
*
* -S
* Find all rules that meet the lower bound on
* minimum support and the minimum metric constraint.
* Turning this mode on will disable the iterative support reduction
* procedure to find the specified number of rules.
*
*
*
* -transactions <comma separated list of attribute names>
* Only consider transactions that contain these items (default = no restriction)
*
*
*
* -rules <comma separated list of attribute names>
* Only print rules that contain these items. (default = no restriction)
*
*
*
* -use-or
* Use OR instead of AND for must contain list(s). Use in conjunction
* with -transactions and/or -rules
*
*
*
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 10172 $
*/
public class FPGrowth extends AbstractAssociator implements
AssociationRulesProducer, OptionHandler, TechnicalInformationHandler {
/** For serialization */
private static final long serialVersionUID = 3620717108603442911L;
/**
 * A frequent item set: a list of binary items together with the support
 * (count) recorded for that combination of items. The item list is re-sorted
 * into the natural ordering of {@code BinaryItem} whenever it changes.
 */
protected static class FrequentBinaryItemSet implements Serializable,
  Cloneable {

  /** For serialization */
  private static final long serialVersionUID = -6543815873565829448L;

  /** The list of items in the item set */
  protected ArrayList<BinaryItem> m_items = new ArrayList<BinaryItem>();

  /** The support of this item set */
  protected int m_support;

  /**
   * Constructor. The supplied list is stored directly (not copied) and is
   * sorted in place.
   *
   * @param items the items that make up the frequent item set.
   * @param support the support of this item set.
   */
  public FrequentBinaryItemSet(ArrayList<BinaryItem> items, int support) {
    m_items = items;
    m_support = support;
    Collections.sort(m_items);
  }

  /**
   * Add an item to this item set and restore the sort order.
   *
   * @param i the item to add.
   */
  public void addItem(BinaryItem i) {
    m_items.add(i);
    Collections.sort(m_items);
  }

  /**
   * Set the support for this item set.
   *
   * @param support the support for this item set.
   */
  public void setSupport(int support) {
    m_support = support;
  }

  /**
   * Get the support of this item set.
   *
   * @return the support of this item set.
   */
  public int getSupport() {
    return m_support;
  }

  /**
   * Get the items in this item set. The returned collection is the internal
   * list itself, not a copy.
   *
   * @return the items in this item set.
   */
  public Collection<BinaryItem> getItems() {
    return m_items;
  }

  /**
   * Get a particular item from this item set.
   *
   * @param index the index of the item to get.
   * @return the item.
   */
  public BinaryItem getItem(int index) {
    return m_items.get(index);
  }

  /**
   * Get the number of items in this item set.
   *
   * @return the number of items in this item set.
   */
  public int numberOfItems() {
    return m_items.size();
  }

  /**
   * Get a textual description of this item set: each item followed by a
   * space, then ": " and the support.
   *
   * @return a textual description of this item set.
   */
  @Override
  public String toString() {
    StringBuilder buff = new StringBuilder();
    for (BinaryItem item : m_items) {
      buff.append(item.toString()).append(" ");
    }
    buff.append(": ").append(m_support);
    return buff.toString();
  }

  /**
   * Make a copy of this item set. The item list is copied; the items
   * themselves are shared with the original.
   *
   * @return a copy of this item set.
   */
  @Override
  public Object clone() {
    ArrayList<BinaryItem> items = new ArrayList<BinaryItem>(m_items);
    return new FrequentBinaryItemSet(items, m_support);
  }
}
/**
 * Maintains a list of frequent item sets along with the total number of
 * transactions in the data they were derived from.
 */
protected static class FrequentItemSets implements Serializable {

  /** For serialization */
  private static final long serialVersionUID = 4173606872363973588L;

  /** The list of frequent item sets */
  protected ArrayList<FrequentBinaryItemSet> m_sets =
    new ArrayList<FrequentBinaryItemSet>();

  /** The total number of transactions in the data */
  protected int m_numberOfTransactions;

  /**
   * Constructor.
   *
   * @param numTransactions the total number of transactions in the data.
   */
  public FrequentItemSets(int numTransactions) {
    m_numberOfTransactions = numTransactions;
  }

  /**
   * Get an item set.
   *
   * @param index the index of the item set to get.
   * @return an item set.
   */
  public FrequentBinaryItemSet getItemSet(int index) {
    return m_sets.get(index);
  }

  /**
   * Get an iterator that can be used to access all the item sets.
   *
   * @return an iterator.
   */
  public Iterator<FrequentBinaryItemSet> iterator() {
    return m_sets.iterator();
  }

  /**
   * Get the total number of transactions in the data that the item sets were
   * derived from.
   *
   * @return the total number of transactions in the data.
   */
  public int getNumberOfTransactions() {
    return m_numberOfTransactions;
  }

  /**
   * Add an item set.
   *
   * @param setToAdd the item set to add.
   */
  public void addItemSet(FrequentBinaryItemSet setToAdd) {
    m_sets.add(setToAdd);
  }

  /**
   * Sort the item sets according to the supplied comparator.
   *
   * @param comp the comparator to use.
   */
  public void sort(Comparator<FrequentBinaryItemSet> comp) {
    Collections.sort(m_sets, comp);
  }

  /**
   * Get the number of item sets.
   *
   * @return the number of item sets.
   */
  public int size() {
    return m_sets.size();
  }

  /**
   * Sort the item sets. Sorts by item set length (shorter first). Ties are
   * broken by comparing the items of the two sets pairwise, in iteration
   * order.
   */
  public void sort() {
    Comparator<FrequentBinaryItemSet> compF =
      new Comparator<FrequentBinaryItemSet>() {
        @Override
        public int compare(FrequentBinaryItemSet one,
          FrequentBinaryItemSet two) {
          Collection<BinaryItem> compOne = one.getItems();
          Collection<BinaryItem> compTwo = two.getItems();

          if (compOne.size() < compTwo.size()) {
            return -1;
          }
          if (compOne.size() > compTwo.size()) {
            return 1;
          }

          // same length: the first differing pair of items decides
          Iterator<BinaryItem> twoIterator = compTwo.iterator();
          for (BinaryItem oneI : compOne) {
            BinaryItem twoI = twoIterator.next();
            int result = oneI.compareTo(twoI);
            if (result != 0) {
              return result;
            }
          }
          return 0; // equal
        }
      };

    sort(compF);
  }

  /**
   * Get a textual description of this list of item sets.
   *
   * @param numSets the maximum number of item sets to display; a value of
   *          zero or less displays all of them.
   * @return a textual description of the item sets.
   */
  public String toString(int numSets) {
    if (m_sets.size() == 0) {
      return "No frequent items sets found!";
    }

    StringBuilder result = new StringBuilder();
    result.append("" + m_sets.size() + " frequent item sets found");
    if (numSets > 0) {
      result.append(" , displaying " + numSets);
    }
    result.append(":\n\n");

    int count = 0;
    for (FrequentBinaryItemSet i : m_sets) {
      // stop once numSets sets have been printed (the previous "count >
      // numSets" test let one extra set through)
      if (numSets > 0 && count >= numSets) {
        break;
      }
      result.append(i.toString() + "\n");
      count++;
    }

    return result.toString();
  }
}
/**
* This class holds the counts for projected tree nodes and header lists.
*/
protected static class ShadowCounts implements Serializable {
/** For serialization */
private static final long serialVersionUID = 4435433714185969155L;
/** Holds the counts at different recursion levels */
private final ArrayList m_counts = new ArrayList();
/**
* Get the count at the specified recursion depth.
*
* @param recursionLevel the depth of the recursion.
* @return the count.
*/
public int getCount(int recursionLevel) {
if (recursionLevel >= m_counts.size()) {
return 0;
} else {
return m_counts.get(recursionLevel);
}
}
/**
* Increase the count at a given recursion level.
*
* @param recursionLevel the level at which to increase the count.
* @param incr the amount by which to increase the count.
*/
public void increaseCount(int recursionLevel, int incr) {
// basically treat the list like a stack where we
// can add a new element, or increment the element
// at the top
if (recursionLevel == m_counts.size()) {
// new element
m_counts.add(incr);
} else if (recursionLevel == m_counts.size() - 1) {
// otherwise increment the top
int n = m_counts.get(recursionLevel).intValue();
m_counts.set(recursionLevel, (n + incr));
}
}
/**
* Remove the count at the given recursion level.
*
* @param recursionLevel the level at which to remove the count.
*/
public void removeCount(int recursionLevel) {
if (recursionLevel < m_counts.size()) {
m_counts.remove(recursionLevel);
}
}
}
/**
 * A node in the FP-tree. Each node holds an item, a link to its parent, its
 * children keyed by item, and a set of per-recursion-level (projected)
 * counts.
 */
protected static class FPTreeNode implements Serializable {

  /** For serialization */
  private static final long serialVersionUID = 4396315323673737660L;

  /** link to another sibling at this level in the tree */
  protected FPTreeNode m_levelSibling;

  /** link to the parent node */
  protected FPTreeNode m_parent;

  /** item at this node */
  protected BinaryItem m_item;

  /** ID (for graphing the tree) */
  protected int m_ID;

  /** the children of this node */
  protected Map<BinaryItem, FPTreeNode> m_children =
    new HashMap<BinaryItem, FPTreeNode>();

  /** counts associated with projected versions of this node */
  protected ShadowCounts m_projectedCounts = new ShadowCounts();

  /**
   * Construct a new node with the given parent link and item.
   *
   * @param parent a pointer to the parent of this node.
   * @param item the item at this node.
   */
  public FPTreeNode(FPTreeNode parent, BinaryItem item) {
    m_parent = parent;
    m_item = item;
  }

  /**
   * Insert an item set into the tree at this node. Removes the first item
   * from the supplied item set and makes a recursive call to insert the
   * remaining items. Note that the supplied collection is consumed: it is
   * empty when the outermost call returns.
   *
   * @param itemSet the item set to insert.
   * @param headerTable the header table for the tree.
   * @param incr the amount by which to increase counts.
   */
  public void addItemSet(Collection<BinaryItem> itemSet,
    Map<BinaryItem, FPTreeRoot.Header> headerTable, int incr) {

    Iterator<BinaryItem> i = itemSet.iterator();
    if (!i.hasNext()) {
      return;
    }

    BinaryItem first = i.next();
    FPTreeNode aChild;
    if (!m_children.containsKey(first)) {
      // not in the tree, so add it.
      aChild = new FPTreeNode(this, first);
      m_children.put(first, aChild);

      // update the header
      if (!headerTable.containsKey(first)) {
        headerTable.put(first, new FPTreeRoot.Header());
      }

      // append new node to header list
      headerTable.get(first).addToList(aChild);
    } else {
      // get the appropriate child node
      aChild = m_children.get(first);
    }

    // update counts in header table
    headerTable.get(first).getProjectedCounts().increaseCount(0, incr);

    // increase the child's count
    aChild.increaseProjectedCount(0, incr);

    // proceed recursively with the remaining items
    itemSet.remove(first);
    aChild.addItemSet(itemSet, headerTable, incr);
  }

  /**
   * Increase the projected count at the given recursion level at this node
   *
   * @param recursionLevel the recursion level to increase the node count at.
   * @param incr the amount by which to increase the count.
   */
  public void increaseProjectedCount(int recursionLevel, int incr) {
    m_projectedCounts.increaseCount(recursionLevel, incr);
  }

  /**
   * Remove the projected count at the given recursion level for this node.
   *
   * @param recursionLevel the recursion level at which to remove the count.
   */
  public void removeProjectedCount(int recursionLevel) {
    m_projectedCounts.removeCount(recursionLevel);
  }

  /**
   * Get the projected count at the given recursion level for this node.
   *
   * @param recursionLevel the recursion level at which to get the count.
   * @return the count.
   */
  public int getProjectedCount(int recursionLevel) {
    return m_projectedCounts.getCount(recursionLevel);
  }

  /**
   * Get the parent node.
   *
   * @return the parent node.
   */
  public FPTreeNode getParent() {
    return m_parent;
  }

  /**
   * Get the item at this node.
   *
   * @return the item at this node.
   */
  public BinaryItem getItem() {
    return m_item;
  }

  /**
   * Return a textual description of this node for a given recursion level.
   *
   * @param recursionLevel the recursion depth to use.
   * @return a textual description of this node.
   */
  public String toString(int recursionLevel) {
    return toString("", recursionLevel);
  }

  /**
   * Return a textual description of this node (and, recursively, of its
   * children) for a given recursion level.
   *
   * @param prefix a prefix string to prepend.
   * @param recursionLevel the recursion level to use.
   * @return a textual description of this node.
   */
  public String toString(String prefix, int recursionLevel) {
    StringBuilder buffer = new StringBuilder();
    buffer.append(prefix);
    buffer.append("|  ");
    buffer.append(m_item.toString());
    buffer.append(" (");
    buffer.append(m_projectedCounts.getCount(recursionLevel));
    buffer.append(")\n");

    for (FPTreeNode node : m_children.values()) {
      buffer.append(node.toString(prefix + "|  ", recursionLevel));
    }
    return buffer.toString();
  }

  /**
   * Assign consecutive IDs to this node and all of its descendants.
   *
   * @param lastID the last ID handed out so far; this node receives
   *          {@code lastID + 1}.
   * @return the last ID assigned within this subtree.
   */
  protected int assignIDs(int lastID) {
    int currentLastID = lastID + 1;
    m_ID = currentLastID;
    if (m_children != null) {
      for (FPTreeNode n : m_children.values()) {
        currentLastID = n.assignIDs(currentLastID);
      }
    }
    return currentLastID;
  }

  /**
   * Generate a dot graph description string for the tree below this node.
   * For every child a node declaration is emitted, then the child's own
   * subtree, then the edge from this node to the child.
   *
   * @param text a StringBuffer to store the graph description in.
   */
  public void graphFPTree(StringBuffer text) {
    if (m_children != null) {
      for (FPTreeNode n : m_children.values()) {
        text.append("N" + n.m_ID);
        text.append(" [label=\"");
        text.append(n.getItem().toString() + " (" + n.getProjectedCount(0)
          + ")\\n");
        text.append("\"]\n");
        n.graphFPTree(text);
        text.append("N" + m_ID + "->" + "N" + n.m_ID + "\n");
      }
    }
  }
}
/**
 * Root of the FPTree. In addition to the children inherited from
 * {@code FPTreeNode} it owns the header table, which maps each item to the
 * list of tree nodes carrying that item.
 */
private static class FPTreeRoot extends FPTreeNode {

  /** For serialization */
  private static final long serialVersionUID = 632150939785333297L;

  /**
   * Stores a header entry for an FPTree
   */
  protected static class Header implements Serializable {

    /** For serialization */
    private static final long serialVersionUID = -6583156284891368909L;

    /** The list of pointers into the tree structure */
    protected List<FPTreeNode> m_headerList = new LinkedList<FPTreeNode>();

    /** Projected header counts for this entry */
    protected ShadowCounts m_projectedHeaderCounts = new ShadowCounts();

    /**
     * Add a tree node into the list for this header entry.
     *
     * @param toAdd the node to add.
     */
    public void addToList(FPTreeNode toAdd) {
      m_headerList.add(toAdd);
    }

    /**
     * Get the list of nodes for this header entry.
     *
     * @return the list of nodes for this header entry.
     */
    public List<FPTreeNode> getHeaderList() {
      return m_headerList;
    }

    /**
     * Get the projected counts for this header entry.
     *
     * @return the projected counts for this header entry.
     */
    public ShadowCounts getProjectedCounts() {
      return m_projectedHeaderCounts;
    }
  }

  /** Stores the header table as mapped Header entries */
  protected Map<BinaryItem, Header> m_headerTable =
    new HashMap<BinaryItem, Header>();

  /**
   * Create a new FPTreeRoot. The root has neither a parent nor an item.
   */
  public FPTreeRoot() {
    super(null, null);
  }

  /**
   * Insert an item set into the tree, using this root's header table.
   *
   * @param itemSet the item set to insert into the tree.
   * @param incr the increment by which to increase counters.
   */
  public void addItemSet(Collection<BinaryItem> itemSet, int incr) {
    super.addItemSet(itemSet, m_headerTable, incr);
  }

  /**
   * Get the header table for this tree.
   *
   * @return the header table for this tree.
   */
  public Map<BinaryItem, Header> getHeaderTable() {
    return m_headerTable;
  }

  /**
   * Check whether the tree is empty at the given recursion (projection)
   * level, i.e. whether no direct child of the root has a positive projected
   * count at that level.
   *
   * @param recursionLevel the recursion level (projection) to check.
   * @return true if no child has a count greater than zero at that level.
   */
  public boolean isEmpty(int recursionLevel) {
    for (FPTreeNode c : m_children.values()) {
      if (c.getProjectedCount(recursionLevel) > 0) {
        return false;
      }
    }
    return true;
  }

  /**
   * Get a textual description of the tree at a given recursion (projection)
   * level.
   *
   * @param pad the string to use as a prefix for indenting nodes.
   * @param recursionLevel the recursion level (projection) to use.
   * @return the textual description of the tree.
   */
  @Override
  public String toString(String pad, int recursionLevel) {
    StringBuilder result = new StringBuilder();
    result.append(pad);
    result.append("+ ROOT\n");
    for (FPTreeNode node : m_children.values()) {
      result.append(node.toString(pad + "|  ", recursionLevel));
    }
    return result.toString();
  }
}
/**
 * Advance a subset mask to the next subset, treating the array as a binary
 * counter whose least significant bit is at index 0. A mask that is all true
 * wraps around to all false.
 *
 * @param subset the membership mask to advance in place.
 */
private static void nextSubset(boolean[] subset) {
  int pos = 0;
  // clear the run of leading set flags (the "carry")
  while (pos < subset.length && subset[pos]) {
    subset[pos] = false;
    pos++;
  }
  // set the first flag that was clear, if there is one
  if (pos < subset.length) {
    subset[pos] = true;
  }
}
private static Collection- getPremise(FrequentBinaryItemSet fis,
boolean[] subset) {
boolean ok = false;
for (int i = 0; i < subset.length; i++) {
if (!subset[i]) {
ok = true;
break;
}
}
if (!ok) {
return null;
}
List
- premise = new ArrayList
- ();
ArrayList
- items = new ArrayList
- (fis.getItems());
for (int i = 0; i < subset.length; i++) {
if (subset[i]) {
premise.add(items.get(i));
}
}
return premise;
}
private static Collection
- getConsequence(FrequentBinaryItemSet fis,
boolean[] subset) {
List
- consequence = new ArrayList
- ();
ArrayList
- items = new ArrayList
- (fis.getItems());
for (int i = 0; i < subset.length; i++) {
if (!subset[i]) {
consequence.add(items.get(i));
}
}
return consequence;
}
/**
* Generate all association rules, from the supplied frequet item sets, that
* meet a given minimum metric threshold. Uses a brute force approach.
*
* @param largeItemSets the set of frequent item sets
* @param metricToUse the metric to use
* @param metricThreshold the threshold value that a rule must meet
* @param upperBoundMinSuppAsInstances the upper bound on the support in order
* to accept the rule
* @param lowerBoundMinSuppAsInstances the lower bound on the support in order
* to accept the rule
* @param totalTransactions the total number of transactions in the data
* @return a list of association rules
*/
public static List
generateRulesBruteForce(
FrequentItemSets largeItemSets,
DefaultAssociationRule.METRIC_TYPE metricToUse, double metricThreshold,
int upperBoundMinSuppAsInstances, int lowerBoundMinSuppAsInstances,
int totalTransactions) {
List rules = new ArrayList();
largeItemSets.sort();
Map, Integer> frequencyLookup = new HashMap, Integer>();
Iterator setI = largeItemSets.iterator();
// process each large item set
while (setI.hasNext()) {
FrequentBinaryItemSet fis = setI.next();
frequencyLookup.put(fis.getItems(), fis.getSupport());
if (fis.getItems().size() > 1) {
// generate all the possible subsets for the premise
boolean[] subset = new boolean[fis.getItems().size()];
Collection- premise = null;
Collection
- consequence = null;
while ((premise = getPremise(fis, subset)) != null) {
if (premise.size() > 0 && premise.size() < fis.getItems().size()) {
consequence = getConsequence(fis, subset);
int totalSupport = fis.getSupport();
int supportPremise = frequencyLookup.get(premise).intValue();
int supportConsequence = frequencyLookup.get(consequence)
.intValue();
// a candidate rule
DefaultAssociationRule candidate = new DefaultAssociationRule(
premise, consequence, metricToUse, supportPremise,
supportConsequence, totalSupport, totalTransactions);
if (candidate.getPrimaryMetricValue() > metricThreshold
&& candidate.getTotalSupport() >= lowerBoundMinSuppAsInstances
&& candidate.getTotalSupport() <= upperBoundMinSuppAsInstances) {
// accept this rule
rules.add(candidate);
}
}
nextSubset(subset);
}
}
}
return rules;
}
public static List
pruneRules(
List rulesToPrune, ArrayList- itemsToConsider,
boolean useOr) {
ArrayList
result = new ArrayList();
for (AssociationRule r : rulesToPrune) {
if (r.containsItems(itemsToConsider, useOr)) {
result.add(r);
}
}
return result;
}
/** The number of rules to find */
protected int m_numRulesToFind = 10;
// protected double m_upperBoundMinSupport = 0.36;
/** The upper bound on the minimum support */
protected double m_upperBoundMinSupport = 1.0;
/** The lower bound on minimum support */
protected double m_lowerBoundMinSupport = 0.1;
/** The amount by which to decrease the support in each iteration */
protected double m_delta = 0.05;
/** The number of instances in the data */
protected int m_numInstances;
/**
* When processing data off of disk, report progress this frequently (number
* of instances).
*/
protected int m_offDiskReportingFrequency = 10000;
/**
* If true, all rules meeting the lower bound on the minimum support will be
* found. The number of rules to find will be ignored and the iterative
* reduction of support will not be done.
*/
protected boolean m_findAllRulesForSupportLevel = false;
// protected double m_lowerBoundMinSupport = 0.0;
/** The index (1 based) of binary attributes to treat as the positive value */
protected int m_positiveIndex = 2;
/** The metric by which to rank rules (confidence by default) */
protected DefaultAssociationRule.METRIC_TYPE m_metric = DefaultAssociationRule.METRIC_TYPE.CONFIDENCE;
/** The minimum metric score of a rule */
protected double m_metricThreshold = 0.9;
/** Holds the large item sets found */
protected FrequentItemSets m_largeItemSets;
/** Holds the rules */
protected List m_rules;
/**
* Maximum number of items in a large item set. The default of -1 means no
* limit (see the -I option in the class documentation).
*/
protected int m_maxItems = -1;
/**
* If set, limit the transactions (instances) input to the algorithm to those
* that contain these items
*/
protected String m_transactionsMustContain = "";
/** Use OR rather than AND when considering must contain lists */
protected boolean m_mustContainOR = false;
/** If set, then only output rules containing these items */
protected String m_rulesMustContain = "";
/**
 * Returns the capabilities of this associator. Everything inherited from the
 * superclass is switched off first; then only what this scheme handles is
 * re-enabled: unary and binary attributes, missing values, and data without
 * a class attribute.
 *
 * @return the capabilities of this associator
 */
@Override
public Capabilities getCapabilities() {
  final Capability[] supported = {
    Capability.UNARY_ATTRIBUTES,
    Capability.BINARY_ATTRIBUTES,
    Capability.MISSING_VALUES,
    Capability.NO_CLASS
  };

  Capabilities caps = super.getCapabilities();
  caps.disableAll();
  for (Capability capability : supported) {
    caps.enable(capability);
  }
  return caps;
}
/**
 * Returns a string describing this associator: a short summary of the
 * algorithm followed by the textual form of its technical reference.
 *
 * @return a description of the associator suitable for displaying in the
 *         explorer/experimenter gui
 */
public String globalInfo() {
  StringBuilder info = new StringBuilder();
  info.append("Class implementing the FP-growth algorithm for finding");
  info.append(" large item sets without candidate generation. Iteratively");
  info.append(" reduces the minimum support until it finds the required");
  info.append(" number of rules with the given minimum metric.");
  info.append(" For more information see:\n\n");
  info.append(getTechnicalInformation().toString());
  return info.toString();
}
/**
 * Returns an instance of a TechnicalInformation object, containing detailed
 * information about the technical background of this class, e.g., paper
 * reference or book this class is based on.
 *
 * @return the technical information about this class
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  TechnicalInformation result = new TechnicalInformation(Type.INPROCEEDINGS);

  result.setValue(Field.AUTHOR, "J. Han and J.Pei and Y. Yin");
  result.setValue(Field.TITLE,
    "Mining frequent patterns without candidate generation");
  // the venue is the ACM SIGMOD conference (previously misspelled "SIGMID")
  result.setValue(Field.BOOKTITLE,
    "Proceedings of the 2000 ACM-SIGMOD International"
      + " Conference on Management of Data");
  result.setValue(Field.YEAR, "2000");
  result.setValue(Field.PAGES, "1-12");

  return result;
}
/**
 * Check whether a transaction satisfies the "must contain" item list.
 * <p>
 * With OR semantics ({@code m_mustContainOR}) the instance passes as soon as
 * it contains any one of the flagged items. With AND semantics it passes
 * when the number of flagged items it contains equals
 * {@code numInTransactionsMustContainList}.
 * <p>
 * For sparse instances every stored value counts as "item present". For
 * dense instances an item is present under the same rule used when counting
 * singletons: the value is not missing, and the attribute is either unary or
 * holds the value at the positive index.
 *
 * @param inst the instance (transaction) to check
 * @param transactionsMustContainIndexes one flag per attribute index; true
 *          marks an attribute that is in the must-contain list
 * @param numInTransactionsMustContainList the number of flagged attributes
 * @return true if the instance satisfies the must-contain constraint
 */
private boolean passesMustContain(Instance inst,
  boolean[] transactionsMustContainIndexes,
  int numInTransactionsMustContainList) {

  int containsCount = 0;

  if (inst instanceof SparseInstance) {
    for (int i = 0; i < inst.numValues(); i++) {
      int attIndex = inst.index(i);
      if (transactionsMustContainIndexes[attIndex]) {
        if (m_mustContainOR) {
          // one match is enough under OR
          return true;
        }
        containsCount++;
      }
    }
  } else {
    for (int i = 0; i < transactionsMustContainIndexes.length; i++) {
      if (!transactionsMustContainIndexes[i]) {
        continue;
      }
      // a missing value is NaN, which casts to 0 and would otherwise be
      // mistaken for the first nominal value
      if (inst.isMissing(i)) {
        continue;
      }
      boolean present = inst.attribute(i).numValues() == 1
        || (int) inst.value(i) == m_positiveIndex - 1;
      if (present) {
        if (m_mustContainOR) {
          // one match is enough under OR
          return true;
        }
        containsCount++;
      }
    }
  }

  // under OR nothing matched; under AND every listed item must be present
  return !m_mustContainOR
    && containsCount == numInTransactionsMustContainList;
}
/**
 * Update the singleton item frequencies with the items found in one
 * instance.
 * <p>
 * For a sparse instance the frequency of every stored attribute is
 * increased. For a dense instance the frequency of attribute {@code j} is
 * increased when its value is not missing and the attribute is either unary
 * or holds the value at the positive index.
 *
 * @param current the instance to process
 * @param singletons the singleton items, indexed by attribute index
 * @throws Exception declared for callers; nothing in this method throws a
 *           checked exception directly
 */
private void processSingleton(Instance current,
  ArrayList<BinaryItem> singletons) throws Exception {

  if (current instanceof SparseInstance) {
    for (int j = 0; j < current.numValues(); j++) {
      int attIndex = current.index(j);
      singletons.get(attIndex).increaseFrequency();
    }
    return;
  }

  for (int j = 0; j < current.numAttributes(); j++) {
    if (current.isMissing(j)) {
      continue;
    }
    if (current.attribute(j).numValues() == 1
      || current.value(j) == m_positiveIndex - 1) {
      singletons.get(j).increaseFrequency();
    }
  }
}
/**
 * Get the singleton items in the data. One {@code BinaryItem} is created per
 * attribute and its frequency is accumulated over all instances. As a side
 * effect {@code m_numInstances} is set to the number of instances seen.
 * <p>
 * When reading from an ArffLoader the instances are pulled incrementally,
 * progress is reported on standard error every
 * {@code m_offDiskReportingFrequency} instances, and the loader is reset
 * afterwards.
 *
 * @param source the source of the data (either Instances or an ArffLoader).
 * @return a list of singleton items, indexed by attribute index
 * @throws IllegalArgumentException if the source is neither Instances nor an
 *           ArffLoader
 * @throws Exception if the singletons can't be found for some reason
 */
protected ArrayList<BinaryItem> getSingletons(Object source)
  throws Exception {

  Instances data;
  weka.core.converters.ArffLoader loader = null;
  if (source instanceof Instances) {
    data = (Instances) source;
  } else if (source instanceof weka.core.converters.ArffLoader) {
    loader = (weka.core.converters.ArffLoader) source;
    data = loader.getStructure();
  } else {
    // previously this fell through and failed with a bare
    // NullPointerException on the first use of data
    throw new IllegalArgumentException(
      "Data source must be Instances or an ArffLoader but was: "
        + (source == null ? "null" : source.getClass().getName()));
  }

  ArrayList<BinaryItem> singletons = new ArrayList<BinaryItem>();
  for (int i = 0; i < data.numAttributes(); i++) {
    singletons.add(new BinaryItem(data.attribute(i), m_positiveIndex - 1));
  }

  if (loader == null) {
    // in-memory data: set the number of instances
    m_numInstances = data.numInstances();
    for (int i = 0; i < data.numInstances(); i++) {
      processSingleton(data.instance(i), singletons);
    }
  } else {
    Instance current = null;
    int count = 0;
    while ((current = loader.getNextInstance(data)) != null) {
      processSingleton(current, singletons);
      count++;
      if (count % m_offDiskReportingFrequency == 0) {
        System.err.println("Singletons: done " + count);
      }
    }

    // set the number of instances
    m_numInstances = count;
    loader.reset();
  }

  return singletons;
}
/**
 * Get the singleton items in the data. Convenience overload that delegates
 * to {@code getSingletons(Object)}.
 *
 * @param data the Instances to process
 * @return a list of singleton items
 * @throws Exception if the singletons can't be found for some reason
 */
protected ArrayList<BinaryItem> getSingletons(Instances data)
  throws Exception {
  // the cast selects the Object overload; without it this call would recurse
  return getSingletons((Object) data);
}
/*
* protected ArrayList getFrequent(ArrayList items,
* int minSupport) { ArrayList frequent = new
* ArrayList(); for (BinaryItem b : items) { if (b.getFrequency()
* > minSupport) { frequent.add(b); } }
*
* // sort in descending order of support Collections.sort(frequent); return
* frequent; }
*/
/**
 * Inserts a single instance into the FPTree. The items present in the
 * instance whose singleton frequency is at least {@code minSupport} are
 * collected into a transaction, sorted into the natural ordering of
 * {@code BinaryItem}, and added to the tree with an increment of 1.
 *
 * @param current the instance to insert
 * @param singletons the singleton items, indexed by attribute index
 * @param tree the tree to insert into
 * @param minSupport the minimum support threshold
 */
private void insertInstance(Instance current,
  ArrayList<BinaryItem> singletons, FPTreeRoot tree, int minSupport) {

  ArrayList<BinaryItem> transaction = new ArrayList<BinaryItem>();

  if (current instanceof SparseInstance) {
    // every stored value of a sparse instance counts as an item
    for (int j = 0; j < current.numValues(); j++) {
      BinaryItem item = singletons.get(current.index(j));
      if (item.getFrequency() >= minSupport) {
        transaction.add(item);
      }
    }
  } else {
    for (int j = 0; j < current.numAttributes(); j++) {
      if (current.isMissing(j)) {
        continue;
      }
      boolean present = current.attribute(j).numValues() == 1
        || current.value(j) == m_positiveIndex - 1;
      if (present && singletons.get(j).getFrequency() >= minSupport) {
        transaction.add(singletons.get(j));
      }
    }
  }

  Collections.sort(transaction);
  tree.addItemSet(transaction, 1);
}
/**
 * Construct the frequent pattern tree by inserting each transaction in the
 * data into the tree. Only those items from each transaction that meet the
 * minimum support threshold are inserted.
 * <p>
 * When reading from an ArffLoader the instances are pulled incrementally and
 * progress is reported on standard error every
 * {@code m_offDiskReportingFrequency} instances. A data source of any other
 * type yields an empty tree.
 *
 * @param singletons the singleton items, indexed by attribute index
 * @param dataSource the source of the transactions (either Instances or an
 *          ArffLoader)
 * @param minSupport the minimum support
 * @return the root of the tree
 * @throws Exception if reading from the data source fails
 */
protected FPTreeRoot buildFPTree(ArrayList<BinaryItem> singletons,
  Object dataSource, int minSupport) throws Exception {

  FPTreeRoot tree = new FPTreeRoot();

  if (dataSource instanceof Instances) {
    Instances data = (Instances) dataSource;
    for (int i = 0; i < data.numInstances(); i++) {
      insertInstance(data.instance(i), singletons, tree, minSupport);
    }
  } else if (dataSource instanceof weka.core.converters.ArffLoader) {
    weka.core.converters.ArffLoader loader =
      (weka.core.converters.ArffLoader) dataSource;
    Instances structure = loader.getStructure();
    Instance current = null;
    int count = 0;
    while ((current = loader.getNextInstance(structure)) != null) {
      insertInstance(current, singletons, tree, minSupport);
      count++;
      if (count % m_offDiskReportingFrequency == 0) {
        System.err.println("build tree done: " + count);
      }
    }
  }

  return tree;
}
/**
* Construct the frequent pattern tree by inserting each transaction in the
* data into the tree. Only those items from each transaction that meet the
* minimum support threshold are inserted.
*
* @param singletons the singleton item sets
* @param data the Instances containing the transactions
* @param minSupport the minimum support
* @return the root of the tree
*/
/*
* protected FPTreeRoot buildFPTree(ArrayList singletons,
* Instances data, int minSupport) {
*
* FPTreeRoot tree = new FPTreeRoot();
*
* for (int i = 0; i < data.numInstances(); i++) { Instance current =
* data.instance(i); ArrayList transaction = new
* ArrayList(); if (current instanceof SparseInstance) { for (int
* j = 0; j < current.numValues(); j++) { int attIndex = current.index(j); if
* (singletons.get(attIndex).getFrequency() >= minSupport) {
* transaction.add(singletons.get(attIndex)); } }
* Collections.sort(transaction); tree.addItemSet(transaction, 1); } else {
* for (int j = 0; j < data.numAttributes(); j++) { if (!current.isMissing(j))
* { if (current.attribute(j).numValues() == 1 || current.value(j) ==
* m_positiveIndex - 1) { if (singletons.get(j).getFrequency() >= minSupport)
* { transaction.add(singletons.get(j)); } } } }
* Collections.sort(transaction); tree.addItemSet(transaction, 1); } }
*
* return tree; }
*/
/**
* Find large item sets in the FP-tree.
*
* @param tree the root of the tree to mine
* @param largeItemSets holds the large item sets found
* @param recursionLevel the recursion level for the current projected counts
* @param conditionalItems the current set of items that the current
* (projected) tree is conditional on
* @param minSupport the minimum acceptable support
*/
protected void mineTree(FPTreeRoot tree, FrequentItemSets largeItemSets,
  int recursionLevel, FrequentBinaryItemSet conditionalItems, int minSupport) {
  if (tree.isEmpty(recursionLevel)) {
    return;
  }
  if (m_maxItems > 0 && recursionLevel >= m_maxItems) {
    // don't mine any further
    return;
  }

  // typed view of the header table; entries give the item and its header in
  // one step instead of a key lookup per item
  Map<BinaryItem, FPTreeRoot.Header> headerTable = tree.getHeaderTable();

  for (Map.Entry<BinaryItem, FPTreeRoot.Header> entry : headerTable.entrySet()) {
    BinaryItem item = entry.getKey();
    FPTreeRoot.Header itemHeader = entry.getValue();

    // check for minimum support at this level
    int support = itemHeader.getProjectedCounts().getCount(recursionLevel);
    if (support < minSupport) {
      continue;
    }

    // process header list at this recursion level
    for (FPTreeNode n : itemHeader.getHeaderList()) {
      // push count up path to root
      int currentCount = n.getProjectedCount(recursionLevel);
      if (currentCount > 0) {
        FPTreeNode temp = n.getParent();
        while (temp != tree) {
          // set/increase for the node
          temp.increaseProjectedCount(recursionLevel + 1, currentCount);

          // set/increase for the header table
          headerTable.get(temp.getItem()).getProjectedCounts()
            .increaseCount(recursionLevel + 1, currentCount);

          temp = temp.getParent();
        }
      }
    }

    // this item gets added to a copy of the conditional items
    FrequentBinaryItemSet newConditional = (FrequentBinaryItemSet) conditionalItems
      .clone();
    newConditional.addItem(item);
    newConditional.setSupport(support);

    // now add this conditional item set to the list of large item sets
    largeItemSets.addItemSet(newConditional);

    // now recursively process the projection one level deeper
    mineTree(tree, largeItemSets, recursionLevel + 1, newConditional,
      minSupport);

    // reverse the propagated counts on the tree nodes
    for (FPTreeNode n : itemHeader.getHeaderList()) {
      FPTreeNode temp = n.getParent();
      while (temp != tree) {
        temp.removeProjectedCount(recursionLevel + 1);
        temp = temp.getParent();
      }
    }

    // reverse the propagated counts in the header table for the deeper
    // level, so the next item starts from a clean projection
    for (FPTreeRoot.Header h : headerTable.values()) {
      h.getProjectedCounts().removeCount(recursionLevel + 1);
    }
  }
}
/**
* Construct a new FPGrowth object.
*/
public FPGrowth() {
  // a new instance starts out with the default option values
  this.resetOptions();
}
/**
* Reset all options to their default values.
*/
public void resetOptions() {
  // support bounds and the step used between them
  this.m_upperBoundMinSupport = 1.0;
  this.m_lowerBoundMinSupport = 0.1;
  this.m_delta = 0.05;

  // rule selection
  this.m_metricThreshold = 0.9;
  this.m_numRulesToFind = 10;

  // value index treated as positive for binary attributes
  this.m_positiveIndex = 2;

  // "must contain" filters are switched off
  this.m_transactionsMustContain = "";
  this.m_rulesMustContain = "";
  this.m_mustContainOR = false;
}
/**
* Tip text for this property suitable for displaying in the GUI.
*
* @return the tip text for this property.
*/
public String positiveIndexTipText() {
  // Text fix only: both parenthesised remarks are now closed and the doubled
  // space before "positive" is gone.
  return "Set the index of binary valued attributes that is to be considered"
    + " the positive index. Has no effect for sparse data (in this case"
    + " the first index (i.e. non-zero values) is always treated as"
    + " positive). Also has no effect for unary valued attributes (i.e."
    + " when using the Weka Apriori-style format for market basket data,"
    + " which uses missing value \"?\" to indicate absence of an item).";
}
/**
* Set the index of the attribute value to consider as positive for binary
* attributes in normal dense instances. Index 1 is always used for sparse
* instances.
*
* @param index the index to use for positive values in binary attributes.
*/
public void setPositiveIndex(int index) {
  // stored as supplied; no range check is applied here
  this.m_positiveIndex = index;
}
/**
* Get the index of the attribute value to consider as positive for binary
* attributes in normal dense instances. Index 1 is always used for sparse
* instances.
*
* @return the index to use for positive values in binary attributes.
*/
public int getPositiveIndex() {
  // current positive-value index for binary attributes
  return this.m_positiveIndex;
}
/**
* Set the desired number of rules to find.
*
* @param numR the number of rules to find.
*/
public void setNumRulesToFind(int numR) {
  // stored as supplied
  this.m_numRulesToFind = numR;
}
/**
* Get the number of rules to find.
*
* @return the number of rules to find.
*/
public int getNumRulesToFind() {
  // currently configured number of rules
  return this.m_numRulesToFind;
}
/**
* Tip text for this property suitable for displaying in the GUI.
*
* @return the tip text for this property.
*/
public String numRulesToFindTipText() {
  final String tip = "The number of rules to output";
  return tip;
}
/**
* Set the metric type to use.
*
* @param d the metric type
*/
public void setMetricType(SelectedTag d) {
  DefaultAssociationRule.METRIC_TYPE[] candidates = DefaultAssociationRule.METRIC_TYPE
    .values();
  int selectedId = d.getSelectedTag().getID();

  // the tag id is matched against the enum ordinal, i.e. its position in
  // values(); an id outside that range leaves the current metric unchanged
  if (selectedId >= 0 && selectedId < candidates.length) {
    m_metric = candidates[selectedId];
  }
}
/**
* Set the maximum number of items to include in large items sets.
*
* @param max the maximum number of items to include in large item sets.
*/
public void setMaxNumberOfItems(int max) {
  // stored as supplied
  this.m_maxItems = max;
}
/**
* Gets the maximum number of items to be included in large item sets.
*
* @return the maximum number of items to be included in large items sets.
*/
public int getMaxNumberOfItems() {
  // currently configured item limit
  return this.m_maxItems;
}
/**
* Tip text for this property suitable for displaying in the GUI.
*
* @return the tip text for this property.
*/
public String maxNumberOfItemsTipText() {
  // one literal per sentence
  final String what = "The maximum number of items to include in frequent item sets. ";
  final String unlimited = "-1 means no limit.";
  return what + unlimited;
}
/**
* Get the metric type to use.
*
* @return the metric type to use.
*/
public SelectedTag getMetricType() {
  // the ordinal of the current metric is used as the id of the selected tag
  int selectedId = m_metric.ordinal();
  return new SelectedTag(selectedId, DefaultAssociationRule.TAGS_SELECTION);
}
/**
* Tip text for this property suitable for displaying in the GUI.
*
* @return the tip text for this property.
*/
public String metricTypeTipText() {
  // assembled sentence by sentence; the resulting text is unchanged
  StringBuilder tip = new StringBuilder();
  tip.append("Set the type of metric by which to rank rules. ");
  tip.append("Confidence is the proportion of the examples covered by the ");
  tip.append("premise that are also covered by the consequence");
  tip.append("(Class association rules can only be mined using confidence). ");
  tip.append("Lift is confidence divided by the proportion of all examples ");
  tip.append("that are covered by the consequence. ");
  tip.append("This is a measure of the importance of the association that is ");
  tip.append("independent of support. ");
  tip.append("Leverage is the proportion of additional examples covered by ");
  tip.append("both the premise and consequence above those expected if the ");
  tip.append("premise and consequence were independent of each other. ");
  tip.append("The total number of examples that this represents is presented ");
  tip.append("in brackets following the leverage. ");
  tip.append("Conviction is another measure of departure from independence.");
  return tip.toString();
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String minMetricTipText() {
  // one literal per sentence
  final String what = "Minimum metric score. ";
  final String effect = "Consider only rules with scores higher than this value.";
  return what + effect;
}
/**
* Get the minimum metric score (the metric threshold).
*
* @return the minimum metric score.
*/
public double getMinMetric() {
  // the threshold is kept in m_metricThreshold
  return this.m_metricThreshold;
}
/**
* Set the minimum metric score (the metric threshold).
*
* @param v the minimum metric score to use.
*/
public void setMinMetric(double v) {
  // stored as supplied
  this.m_metricThreshold = v;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String transactionsMustContainTipText() {
  // one literal per sentence
  final String effect = "Limit input to FPGrowth to those transactions (instances) that contain these items. ";
  final String format = "Provide a comma separated list of attribute names.";
  return effect + format;
}
/**
* Set the comma separated list of items that transactions must contain in
* order to be considered for large item sets and rules.
*
* @param list a comma separated list of items (empty string indicates no
* restriction on the transactions).
*/
public void setTransactionsMustContain(String list) {
  // the list is stored verbatim; it is not parsed here
  this.m_transactionsMustContain = list;
}
/**
* Gets the comma separated list of items that transactions must contain in
* order to be considered for large item sets and rules.
*
* @return return the comma separated list of items that transactions must
* contain.
*/
public String getTransactionsMustContain() {
  // returned exactly as it was stored
  return this.m_transactionsMustContain;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String rulesMustContainTipText() {
  // one literal per sentence
  final String effect = "Only print rules that contain these items. ";
  final String format = "Provide a comma separated list of attribute names.";
  return effect + format;
}
/**
* Set the comma separated list of items that rules must contain in order to
* be output.
*
* @param list a comma separated list of items (empty string indicates no
* restriction on the rules).
*/
public void setRulesMustContain(String list) {
  // the list is stored verbatim; it is not parsed here
  this.m_rulesMustContain = list;
}
/**
* Get the comma separated list of items that rules must contain in order to
* be output.
*
* @return the comma separated list of items that rules must contain in order
* to be output.
*/
public String getRulesMustContain() {
  // returned exactly as it was stored
  return this.m_rulesMustContain;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String useORForMustContainListTipText() {
  final String tip = "Use OR instead of AND for transactions/rules must contain lists.";
  return tip;
}
/**
* Set whether to use OR rather than AND when considering must contain lists.
*
* @param b true if OR should be used instead of AND when considering
* transaction and rules must contain lists.
*/
public void setUseORForMustContainList(boolean b) {
  // true selects OR, false selects AND
  this.m_mustContainOR = b;
}
/**
* Gets whether OR is to be used rather than AND when considering must contain
* lists.
*
* @return true if OR is used instead of AND.
*/
public boolean getUseORForMustContainList() {
  // true when OR is in effect for the must-contain lists
  return this.m_mustContainOR;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String deltaTipText() {
  // one literal per sentence
  final String what = "Iteratively decrease support by this factor. ";
  final String stop = "Reduces support until min support is reached or required number of rules has been generated.";
  return what + stop;
}
/**
* Get the value of delta.
*
* @return Value of delta.
*/
public double getDelta() {
  // current support-reduction step
  return this.m_delta;
}
/**
* Set the value of delta.
*
* @param v Value to assign to delta.
*/
public void setDelta(double v) {
  // stored as supplied
  this.m_delta = v;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String lowerBoundMinSupportTipText() {
  final String tip = "Lower bound for minimum support as a fraction or number of instances.";
  return tip;
}
/**
* Get the value of lowerBoundMinSupport.
*
* @return Value of lowerBoundMinSupport.
*/
public double getLowerBoundMinSupport() {
  // current lower bound on minimum support
  return this.m_lowerBoundMinSupport;
}
/**
* Set the value of lowerBoundMinSupport.
*
* @param v Value to assign to lowerBoundMinSupport.
*/
public void setLowerBoundMinSupport(double v) {
  // stored as supplied
  this.m_lowerBoundMinSupport = v;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for displaying in the
* explorer/experimenter gui
*/
public String upperBoundMinSupportTipText() {
  // one literal per sentence
  final String what = "Upper bound for minimum support as a fraction or number of instances. ";
  final String start = "Start iteratively decreasing minimum support from this value.";
  return what + start;
}
/**
* Get the value of upperBoundMinSupport.
*
* @return Value of upperBoundMinSupport.
*/
public double getUpperBoundMinSupport() {
  // current upper bound on minimum support
  return this.m_upperBoundMinSupport;
}
/**
* Set the value of upperBoundMinSupport.
*
* @param v Value to assign to upperBoundMinSupport.
*/
public void setUpperBoundMinSupport(double v) {
  // stored as supplied
  this.m_upperBoundMinSupport = v;
}
/**
* Tip text for this property suitable for displaying in the GUI.
*
* @return the tip text for this property.
*/
public String findAllRulesForSupportLevelTipText() {
  // one literal per sentence
  final String what = "Find all rules that meet the lower bound on minimum support and the minimum metric constraint. ";
  final String consequence = "Turning this mode on will disable the iterative support reduction procedure to find the specified number of rules.";
  return what + consequence;
}
/**
* If true then turn off the iterative support reduction method of finding x
* rules that meet the minimum support and metric thresholds and just return
* all the rules that meet the lower bound on minimum support and the minimum
* metric.
*
* @param s true if all rules meeting the lower bound on the support and
* minimum metric thresholds are to be found.
*/
public void setFindAllRulesForSupportLevel(boolean s) {
  // stored as supplied
  this.m_findAllRulesForSupportLevel = s;
}
/**
* Get whether all rules meeting the lower bound on min support and the
* minimum metric threshold are to be found.
*
* @return true if all rules meeting the lower bound on min support and the
* min metric threshold are to be found.
*/
public boolean getFindAllRulesForSupportLevel() {
  // current state of the find-all-rules flag
  return this.m_findAllRulesForSupportLevel;
}
/**
* Set how often to report some progress when the data is being read
* incrementally off of the disk rather than loaded into memory.
*
* @param freq the frequency to print progress.
*/
public void setOffDiskReportingFrequency(int freq) {
  // stored as supplied; no validation is applied here
  this.m_offDiskReportingFrequency = freq;
}
/*
* public void setMinimumSupport(double minSupp) { m_minSupport = minSupp; }
*
* public double getMinimumSupport() { return m_minSupport; }
*/
/**
* Gets the list of mined association rules.
*
* @return the list of association rules discovered during mining. Returns
* null if mining hasn't been performed yet.
*/
@Override
public AssociationRules getAssociationRules() {
  // The documented contract is to return null when no mining has been done.
  // Without this guard the loop below would dereference a null rule list.
  // NOTE(review): assumes m_rules is still null before mining - confirm
  // against the field declaration.
  if (m_rules == null) {
    return null;
  }

  List<AssociationRule> rulesToReturn = new ArrayList<AssociationRule>();
  int count = 0;
  for (AssociationRule r : m_rules) {
    rulesToReturn.add(r);
    count++;
    // unless all rules were requested, stop once the configured number of
    // rules has been collected
    if (!m_findAllRulesForSupportLevel && count == m_numRulesToFind) {
      break;
    }
  }
  return new AssociationRules(rulesToReturn, this);
}
/**
* Gets a list of the names of the metrics output for each rule. This list
* should be the same (in terms of the names and order thereof) as that
* produced by AssociationRule.getMetricNamesForRule().
*
* @return an array of the names of the metrics available for each rule
* learned by this producer.
*/
@Override
public String[] getRuleMetricNames() {
  // one name per selectable metric tag, in tag order
  final int numMetrics = DefaultAssociationRule.TAGS_SELECTION.length;
  String[] names = new String[numMetrics];
  int slot = 0;
  while (slot < numMetrics) {
    names[slot] = DefaultAssociationRule.TAGS_SELECTION[slot].getReadable();
    slot++;
  }
  return names;
}
/**
* Returns true if this AssociationRulesProducer can actually produce rules.
* Most implementing classes will always return true from this method
* (obviously :-)). However, an implementing class that actually acts as a
* wrapper around things that may or may not implement
* AssociationRulesProducer will want to return false if the thing they wrap
* can't produce rules.
*
* @return true if this producer can produce rules in its current
* configuration
*/
@Override
public boolean canProduceRules() {
  // this method answers true unconditionally
  final boolean producesRules = true;
  return producesRules;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
@Override
public Enumeration