
// stream.counter.CountSketch Maven / Gradle / Ivy
package stream.counter;
import java.util.Set;
import stream.counter.hashing.SimpleHashFactory;
import stream.learner.Learner;
/**
 * {@link Learner}-part of the implementation of the CountSketch algorithm from
 * the paper 'Finding frequent items in data streams' written by 'Charikar, M.,
 * Chen, K., and Farach-Colton, M. (2002)'.
 * <p>
 * All counting state lives in a {@link CountSketchModel}; this class feeds
 * items into that model and, when the top-k variant is enabled, keeps the
 * model's top-k bookkeeping up to date.
 *
 * @author Marcin Skirzynski (main work), Benedikt Kulmann (modifications)
 * @see CountSketchModel
 *
 * @param <T>
 *            the type of the items being counted
 */
public class CountSketch<T> implements Counter<T> {

    private static final long serialVersionUID = 1L;

    /** Backing model; every public method of this class delegates to it. */
    protected CountSketchModel<T> model;

    /**
     * Constructor of the CountSketch algorithm. This construction can take
     * quite a long time since the construction of the hash functions is rather
     * time-consuming.
     *
     * @param domain
     *            The (estimated) domain, i.e. how many different items are
     *            expected
     * @param numberOfHashFunctions
     *            The number of hash functions which determine a bucket
     * @param numberOfBuckets
     *            The number of buckets where a counter will be maintained
     * @param k
     *            parameter for the top-k variant. If you want to disable the
     *            top-k overhead, then set k to 0 or lower
     */
    public CountSketch(int domain, int numberOfHashFunctions,
            int numberOfBuckets, int k) {
        model = new CountSketchModel<T>(domain, numberOfHashFunctions,
                numberOfBuckets, k, new SimpleHashFactory<T>());
    }

    /**
     * Same as {@link #CountSketch(int, int, int, int)} but with disabled top-k.
     *
     * @param domain
     *            The (estimated) domain, i.e. how many different items are
     *            expected
     * @param numberOfHashFunctions
     *            The number of hash functions which determine a bucket
     * @param numberOfBuckets
     *            The number of buckets where a counter will be maintained
     */
    public CountSketch(int domain, int numberOfHashFunctions,
            int numberOfBuckets) {
        this(domain, numberOfHashFunctions, numberOfBuckets, 0);
    }

    /**
     * Leaves {@link #model} unset so that a class extending this class can
     * install another model type. Such a subclass must assign the model before
     * any other method of this class is used, since all of them dereference it.
     */
    protected CountSketch() {
        // a class extending this class might want to use another model type...
    }

    /**
     * Intentionally empty: this class has no initialization step of its own.
     */
    public void init() {
    }

    /**
     * Counts the item by passing it to the internal data structure.
     * <p>
     * If a k greater than zero was set, the top-k map will be maintained also.
     *
     * @param item
     *            The item to count
     */
    @Override
    public void count(T item) {
        // The model reports whether top-k tracking is enabled (k > 0).
        boolean kGreaterZero = model.updateData(item);
        if (!kGreaterZero) {
            return;
        }
        updateTopItems(item);
    }

    /**
     * Keeps the model's top-k bookkeeping in sync after {@code item} has been
     * counted. Only called when top-k tracking is enabled.
     *
     * @param item
     *            The item that was just counted
     */
    private void updateTopItems(T item) {
        if (model.isTopItem(item)) {
            model.incrementCount(item);
            return;
        }
        if (model.notYetKItems()) {
            model.insertTopItem(item, 1L);
            return;
        }
        // Remove the item with the lowest frequency if the new item has a
        // higher (estimated) frequency.
        CountEntry<T> lowFreqItem = model.getItemWithLowestCount();
        long estimatedFreq = model.estimateFrequency(item);
        if (lowFreqItem.frequency < estimatedFreq) {
            model.removeTopItem(lowFreqItem.item);
            model.insertTopItem(item, estimatedFreq);
        }
    }

    /**
     * Returns the total count as reported by the underlying model.
     *
     * @see stream.counter.Counter#getTotalCount()
     */
    @Override
    public Long getTotalCount() {
        return model.getTotalCount();
    }

    /**
     * Returns the key set as reported by the underlying model.
     *
     * @see stream.counter.Counter#keySet()
     */
    @Override
    public Set<T> keySet() {
        return model.keySet();
    }

    /**
     * Returns the count the underlying model reports for {@code element}.
     *
     * @param element
     *            The item whose count is requested
     * @see stream.counter.Counter#getCount(java.lang.Object)
     */
    @Override
    public Long getCount(T element) {
        return model.getCount(element);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy