All downloads are free. The search and download functionality uses the official Maven repository.

stream.counter.LossyCounting Maven / Gradle / Ivy

package stream.counter;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import stream.learner.Model;

/**
 * 

* {@link Model}-part of the implementation of the Lossy Counting algorithm * described in the paper "Approximate Frequency Counts over Data Streams" * written by 'Rajeev Motwani' and 'Gurmeet Singh Manku'. *

* * @author Marcin Skirzynski (main work), Benedikt Kulmann (modifications) * @see LossyCounting * * @param */ public class LossyCounting implements DynamicFrequentItemModel { private transient static final Logger logger = LoggerFactory .getLogger(LossyCounting.class); private static final long serialVersionUID = 1L; /** * The data structures which holds all counting information. */ private Map> dataStructure; /** * The total count of all counted elements in the stream so far. */ private long elementsCounted; /** * The maximum error set be the user at the beginning. */ private double maxError; public LossyCounting(double maxError) { this.elementsCounted = 0; this.maxError = maxError; this.dataStructure = new ConcurrentHashMap>(); } public boolean containsItem(T item) { return dataStructure.containsKey(item); } /** * @see stream.counter.Counter#count(java.lang.Object) */ @Override public void count(T item) { dataStructure.get(item).frequency++; elementsCounted++; } void insertNewItem(T item, long initialFrequency, long maxError) { dataStructure.put(item, new CountEntryWithMaxError(item, initialFrequency, maxError)); elementsCounted++; } Map> getDataStructure() { return dataStructure; } /** * {@inheritDoc} */ @Override public Long getTotalCount() { return elementsCounted; } /** * {@inheritDoc} */ @Override public Collection getFrequentItems(double minSupport) { if (!(maxError < minSupport)) { logger.warn( "LossyCounting strongly recommends that the maximum error is much lower than the min-support; currently set: error={}, min-support={}", maxError, minSupport); } Collection result = new ArrayList(); for (T element : dataStructure.keySet()) { CountEntry entry = dataStructure.get(element); if (entry.frequency >= (minSupport - maxError) * elementsCounted) { result.add(element); } } return result; } /** *

* Returns the estimated frequency of the given element. *

* *

* The LossyCounting algorithm compresses the internal data structure which * means that an element will be deleted if it doesn't emerge frequently * enough. That means that even when the element appeared in the stream the * estimated frequency can be 0. *

* * @param item * the item for which the estimated frequency will be returned * @return the estimated frequency of the given item */ @Override public Long getCount(T item) { if (dataStructure.containsKey(item)) { return dataStructure.get(item).frequency; } return 0L; } @Override public Set keySet() { return dataStructure.keySet(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy