org.conqat.lib.commons.datamining.FrequentItemSetMiner Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of teamscale-lib-commons Show documentation
Show all versions of teamscale-lib-commons Show documentation
Provides common utility functions
/*
* Copyright (c) CQSE GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.conqat.lib.commons.datamining;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;
/**
* A-priori algorithm for mining frequent item sets from shopping baskets. See
* http://en.wikipedia.org/wiki/Apriori_algorithm
*/
public class FrequentItemSetMiner {
/** Threshold for support */
private final double supportThreshold;
/**
* Constructs a new {@link FrequentItemSetMiner}.
*
* @param supportThreshold
* [0..1] denotes in what fraction of baskets an item set must occur to be considered
* frequent.
*/
public FrequentItemSetMiner(double supportThreshold) {
CCSMAssert.isTrue(supportThreshold >= 0 && supportThreshold <= 1, "supportThreshold must be in [0,1]");
this.supportThreshold = supportThreshold;
}
/**
* Mines frequent item sets from the given shopping baskets. The support threshold is the fraction
* of baskets from which a frequent item set is a subset. The commodity factor is used to ignore
* items that are purchased extremely often. If an item occurs in the fraction commodityFactor of
* the baskets, it is ignored for identifying frequent item sets. Elements in baskets must be
* hashable.
*
* @param baskets
* the baskets to be analyzed.
*/
public Set> mineFrequentItemSets(Set> baskets) {
Set> result = new HashSet<>();
// The choice of the names for the identifiers of the local variables
// are intended and originate from the Wikipedia page (see class
// comment).
Map>> L_ks = new HashMap<>();
// compute all item sets of size 1 with support >= supportThreshold
HashSet> singletonItemSets = new HashSet<>();
L_ks.put(1, singletonItemSets);
for (T item : CollectionUtils.unionSetAll(baskets)) {
Set singletonItemSet = Collections.singleton(item);
double support = support(singletonItemSet, baskets);
if (support >= supportThreshold) {
singletonItemSets.add(singletonItemSet);
result.add(new FrequentItemSet<>(singletonItemSet, support));
}
}
int k = 1;
while (true) {
// generate frequent item sets of size k+1 from frequent item sets
// of size k
Set> candidates = apriori_gen(L_ks.get(k), k + 1);
Set> L_k = new HashSet<>();
L_ks.put(k + 1, L_k);
for (Set candidate : candidates) {
double support = support(candidate, baskets);
if (support >= supportThreshold) {
L_k.add(candidate);
result.add(new FrequentItemSet(candidate, support));
}
}
if (L_k.isEmpty()) {
// We're done.
break;
}
k++;
}
return result;
}
/** Generate candidate item sets */
private Set> apriori_gen(Set> L_k_1, int k) {
Set> C_k = new HashSet<>();
List> asList = new ArrayList<>(L_k_1);
for (int i = 0; i < asList.size(); i++) {
inner: for (int j = i + 1; j < asList.size(); j++) {
Set union = CollectionUtils.unionSet(asList.get(i), asList.get(j));
if (union.size() == k) {
for (T item : union) {
Set check = new HashSet<>(union);
check.remove(item);
if (!L_k_1.contains(check)) {
continue inner;
}
}
C_k.add(union);
}
}
}
return C_k;
}
/** Returns the support of itemSet within the baskets. */
private static double support(Set itemSet, Set> baskets) {
int count = 0;
for (Set basket : baskets) {
if (basket.containsAll(itemSet)) {
count++;
}
}
return (double) count / baskets.size();
}
}