
// stream.counter.LossyCounting Maven / Gradle / Ivy
package stream.counter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import stream.learner.Model;
/**
*
* {@link Model}-part of the implementation of the Lossy Counting algorithm
* described in the paper "Approximate Frequency Counts over Data Streams"
* written by 'Rajeev Motwani' and 'Gurmeet Singh Manku'.
*
*
* @author Marcin Skirzynski (main work), Benedikt Kulmann (modifications)
* @see LossyCounting
*
* @param
*/
public class LossyCounting implements
DynamicFrequentItemModel {
private transient static final Logger logger = LoggerFactory
.getLogger(LossyCounting.class);
private static final long serialVersionUID = 1L;
/**
* The data structures which holds all counting information.
*/
private Map> dataStructure;
/**
* The total count of all counted elements in the stream so far.
*/
private long elementsCounted;
/**
* The maximum error set be the user at the beginning.
*/
private double maxError;
public LossyCounting(double maxError) {
this.elementsCounted = 0;
this.maxError = maxError;
this.dataStructure = new ConcurrentHashMap>();
}
public boolean containsItem(T item) {
return dataStructure.containsKey(item);
}
/**
* @see stream.counter.Counter#count(java.lang.Object)
*/
@Override
public void count(T item) {
dataStructure.get(item).frequency++;
elementsCounted++;
}
void insertNewItem(T item, long initialFrequency, long maxError) {
dataStructure.put(item, new CountEntryWithMaxError(item,
initialFrequency, maxError));
elementsCounted++;
}
Map> getDataStructure() {
return dataStructure;
}
/**
* {@inheritDoc}
*/
@Override
public Long getTotalCount() {
return elementsCounted;
}
/**
* {@inheritDoc}
*/
@Override
public Collection getFrequentItems(double minSupport) {
if (!(maxError < minSupport)) {
logger.warn(
"LossyCounting strongly recommends that the maximum error is much lower than the min-support; currently set: error={}, min-support={}",
maxError, minSupport);
}
Collection result = new ArrayList();
for (T element : dataStructure.keySet()) {
CountEntry entry = dataStructure.get(element);
if (entry.frequency >= (minSupport - maxError) * elementsCounted) {
result.add(element);
}
}
return result;
}
/**
*
* Returns the estimated frequency of the given element.
*
*
*
* The LossyCounting algorithm compresses the internal data structure which
* means that an element will be deleted if it doesn't emerge frequently
* enough. That means that even when the element appeared in the stream the
* estimated frequency can be 0.
*
*
* @param item
* the item for which the estimated frequency will be returned
* @return the estimated frequency of the given item
*/
@Override
public Long getCount(T item) {
if (dataStructure.containsKey(item)) {
return dataStructure.get(item).frequency;
}
return 0L;
}
@Override
public Set keySet() {
return dataStructure.keySet();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy