All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.association.FPGrowth Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.association;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;

import smile.association.FPTree.HeaderTableItem;
import smile.association.FPTree.Node;
import smile.util.MulticoreExecutor;

/**
 * Frequent item set mining based on the FP-growth (frequent pattern growth)
 * algorithm, which employs an extended prefix-tree (FP-tree) structure to
 * store the database in a compressed form. The FP-growth algorithm is
 * currently one of the fastest approaches to discover frequent item sets.
 * FP-growth adopts a divide-and-conquer approach to decompose both the mining
 * tasks and the databases. It uses a pattern fragment growth method to avoid
 * the costly process of candidate generation and testing used by Apriori.
 * 

* The basic idea of the FP-growth algorithm can be described as a * recursive elimination scheme: in a preprocessing step delete * all items from the transactions that are not frequent individually, * i.e., do not appear in a user-specified minimum * number of transactions. Then select all transactions that * contain the least frequent item (least frequent among those * that are frequent) and delete this item from them. Recurse * to process the obtained reduced (also known as projected) * database, remembering that the item sets found in the recursion * share the deleted item as a prefix. On return, remove * the processed item from the database of all transactions * and start over, i.e., process the second frequent item etc. In * these processing steps the prefix tree, which is enhanced by * links between the branches, is exploited to quickly find the * transactions containing a given item and also to remove this * item from the transactions after it has been processed. * *

References

*
    *
  1. Jiawei Han, Jian Pei, Yiwen Yin, and Runying Mao. Mining frequent patterns without candidate generation. Data Mining and Knowledge Discovery 8:53-87, 2004.
  2. *
  3. Gosta Grahne and Jianfei Zhu. Fast algorithms for frequent itemset mining using FP-trees. IEEE TRANS. ON KNOWLEDGE AND DATA ENGINEERING 17(10):1347-1362, 2005.
  4. *
  5. Christian Borgelt. An Implementation of the FP-growth Algorithm. OSDM, 1-5, 2005.
  6. *
* * @author Haifeng Li */ public class FPGrowth { /** * The required minimum support of item sets. */ private int minSupport; /** * FP-tree. */ private FPTree T0; /** * Constructor. This is for mining frequent item sets by scanning database * twice. The user first scans the database to obtains the frequency of * single items and calls this constructor. Then the user add item sets to * the object by {@link #add(int[])} during the second scan of the database. * In this way, we don't need load the whole database into the main memory. * In the database, the item identifiers have to be in [0, n), where n is * the number of items. * @param frequency the frequency of single items. * @param minSupport the required minimum support of item sets in terms * of frequency. */ public FPGrowth(int[] frequency, int minSupport) { this.minSupport = minSupport; T0 = new FPTree(frequency, minSupport); } /** * Constructor. This is a one-step construction of FP-tree if the database * is available in main memory. * @param itemsets the item set dataset. Each row is a item set, which * may have different length. The item identifiers have to be in [0, n), * where n is the number of items. * @param minSupport the required minimum support of item sets in terms * of percentage. */ public FPGrowth(int[][] itemsets, double minSupport) { this(itemsets, (int) Math.ceil(itemsets.length * minSupport)); } /** * Constructor. This is a one-step construction of FP-tree if the database * is available in main memory. * @param itemsets the item set database. Each row is a item set, which * may have different length. The item identifiers have to be in [0, n), * where n is the number of items. Item set should NOT contain duplicated * items. Note that it is reordered after the call. * @param minSupport the required minimum support of item sets in terms * of frequency. */ public FPGrowth(int[][] itemsets, int minSupport) { this.minSupport = minSupport; T0 = new FPTree(itemsets, minSupport); } /** * Add an item set into the object. * @param itemset an item set, which should NOT contain duplicated items. * Note that it is reordered after the call. */ public void add(int[] itemset) { T0.add(itemset); } /** * Returns the number transactions in the database. * @return the number transactions in the database */ public int size() { return T0.size(); } /** * Mines the frequent item sets. The discovered frequent item sets * will be returned in a list. * @return the list of frequent item sets */ public List learn() { List list = new ArrayList(); learn(null, list, null); return list; } /** * Mines the frequent item sets. The discovered frequent item sets * will be printed out to the provided stream. * @param out a print stream for output of frequent item sets. * @return the number of discovered frequent item sets. */ public long learn(PrintStream out) { return learn(out, null, null); } /** * Mines the frequent item sets. The discovered frequent item sets * will be stored in a total support tree. */ TotalSupportTree buildTotalSupportTree() { TotalSupportTree ttree = new TotalSupportTree(minSupport, T0.numFreqItems, T0.order); learn(null, null, ttree); return ttree; } /** * Mines the frequent item sets. The discovered frequent item sets * will be printed out to the provided stream. * @param out a print stream for output of frequent item sets. * @return the number of discovered frequent item sets. */ private long learn(PrintStream out, List list, TotalSupportTree ttree) { if (MulticoreExecutor.getThreadPoolSize() > 1) { return grow(out, list, ttree, T0, null, null, null); } else { return grow(out, list, ttree, T0, null); } } /** * FP-Growth task to execute on each frequent item in the header table. */ class FPGrowthTask implements Callable { /** * The header table item to start. */ List headers; /** * A print stream for output of frequent item sets */ PrintStream out; /** * A list to store frequent item sets. */ List list; /** * Total support tree to store frequent item sets. Used later for * association rule generation. */ TotalSupportTree ttree; /** * A temporary buffer to store prefix of current item set during FP-growth. */ int[] prefixItemset = null; /** * The local item support to generate conditional FP-tree. */ int[] localItemSupport = null; /** * Constructor. */ FPGrowthTask(List headers, PrintStream out, List list, TotalSupportTree ttree) { this.headers = headers; this.out = out; this.list = list; this.ttree = ttree; prefixItemset = new int[T0.maxItemSetSize]; localItemSupport = new int[T0.numItems]; } @Override public Long call() { long n = 0; for (HeaderTableItem header : headers) { n += grow(out, list, ttree, header, null, localItemSupport, prefixItemset); } return n; } } /** * Mines frequent item sets. Start with the bottom of the header table and * work upwards. For each available FP tree node: *
    *
  1. Count the support. *
  2. Build up item set sofar. *
  3. Add to supported sets. *
  4. Build a new FP tree: (i) create a new local root, (ii) create a * new local header table and (iii) populate with ancestors. *
  5. If new local FP tree is not empty repeat mining operation. *
* Otherwise end. * @param header the header table item to start. * @param itemset the current item sets as generated so far (null at start). */ private long grow(PrintStream out, List list, TotalSupportTree ttree, FPTree fptree, int[] itemset) { long n = 0; int[] prefixItemset = new int[T0.maxItemSetSize]; int[] localItemSupport = new int[T0.numItems]; // Loop through header table from end to start, item by item for (int i = fptree.headerTable.length; i-- > 0;) { n += grow(out, list, ttree, fptree.headerTable[i], itemset, localItemSupport, prefixItemset); } return n; } /** * Mines frequent item sets. Start with the bottom of the header table and * work upwards. For each available FP tree node: *
    *
  1. Count the support. *
  2. Build up item set sofar. *
  3. Add to supported sets. *
  4. Build a new FP tree: (i) create a new local root, (ii) create a * new local header table and (iii) populate with ancestors. *
  5. If new local FP tree is not empty repeat mining operation. *
* Otherwise end. * @param header the header table item to start. * @param itemset the current item sets as generated so far (null at start). */ private long grow(PrintStream out, List list, TotalSupportTree ttree, FPTree fptree, int[] itemset, int[] localItemSupport, int[] prefixItemset) { if (fptree == T0) { int nprocs = MulticoreExecutor.getThreadPoolSize(); List> headers = new ArrayList>(); for (int i = 0; i < 2*nprocs; i++) { headers.add(new ArrayList()); } for (int i = fptree.headerTable.length; i-- > 0;) { headers.get(i % headers.size()).add(fptree.headerTable[i]); } List tasks = new ArrayList(); // Loop through header table from end to start, item by item for (int i = 0; i < headers.size(); i++) { // process trail of links from header table element tasks.add(new FPGrowthTask(headers.get(i), out, list, ttree)); } long n = 0; try { List results = MulticoreExecutor.run(tasks); for (long i : results) { n += i; } } catch (Exception e) { System.err.println(e.getMessage()); } return n; } else { long n = 0; // Loop through header table from end to start, item by item for (int i = fptree.headerTable.length; i-- > 0;) { n += grow(out, list, ttree, fptree.headerTable[i], itemset, localItemSupport, prefixItemset); } return n; } } /** * Mines FP-tree with respect to a single element in the header table. * @param header the header table item of interest. * @param itemset the item set represented by the current FP-tree. */ private long grow(PrintStream out, List list, TotalSupportTree ttree, HeaderTableItem header, int[] itemset, int[] localItemSupport, int[] prefixItemset) { long n = 1; int support = header.count; int item = header.id; itemset = insert(itemset, item); if (list != null) { synchronized (list) { list.add(new ItemSet(itemset, support)); } } if (out != null) { synchronized (out) { for (int i = 0; i < itemset.length; i++) { out.format("%d ", itemset[i]); } out.format("(%d)\n", support); } } if (ttree != null) { synchronized (ttree) { ttree.add(itemset, support); } } if (header.node.next == null) { FPTree.Node node = header.node; while (node != null) { FPTree.Node parent = node.parent; int[] newItemset = itemset; while (parent != null) { n++; newItemset = insert(newItemset, parent.id); if (list != null) { synchronized (list) { list.add(new ItemSet(newItemset, support)); } } if (out != null) { synchronized (out) { for (int i = 0; i < newItemset.length; i++) { out.format("%d ", newItemset[i]); } out.format("(%d)\n", support); } } if (ttree != null) { synchronized (ttree) { ttree.add(newItemset, support); } } parent = parent.parent; } node = node.parent; } } else { // Count singles in linked list if (getLocalItemSupport(header.node, localItemSupport)) { // Create local FP tree FPTree fptree = getLocalFPTree(header.node, localItemSupport, prefixItemset); // Mine new FP-tree n += grow(out, list, ttree, fptree, itemset, localItemSupport, prefixItemset); } } return n; } /** * Counts the supports of single items in ancestor item sets linked list. * @return true if there are condition patterns given this node */ private boolean getLocalItemSupport(FPTree.Node node, int[] localItemSupport) { boolean end = true; Arrays.fill(localItemSupport, 0); while (node != null) { int support = node.count; Node parent = node.parent; while (parent != null) { localItemSupport[parent.id] += support; parent = parent.parent; end = false; } node = node.next; } return !end; } /** * Generates a local FP tree * @param node the conditional patterns given this node to construct the local FP-tree. * @rerurn the local FP-tree. */ private FPTree getLocalFPTree(FPTree.Node node, int[] localItemSupport, int[] prefixItemset) { FPTree tree = new FPTree(localItemSupport, minSupport); while (node != null) { Node parent = node.parent; int i = prefixItemset.length; while (parent != null) { if (localItemSupport[parent.id] >= minSupport) { prefixItemset[--i] = parent.id; } parent = parent.parent; } if (i < prefixItemset.length) { tree.add(i, prefixItemset.length, prefixItemset, node.count); } node = node.next; } return tree; } /** * Insert a item to the front of an item set. * @param itemset the original item set. * @param item the new item to be inserted. * @return the combined item set */ static int[] insert(int[] itemset, int item) { if (itemset == null) { int[] newItemset = {item}; return newItemset; } else { int n = itemset.length + 1; int[] newItemset = new int[n]; newItemset[0] = item; System.arraycopy(itemset, 0, newItemset, 1, n - 1); return newItemset; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy