All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.clearspring.analytics.stream.StreamSummary Maven / Gradle / Ivy

Go to download

A library for summarizing data in streams for which it is infeasible to store all events

The newest version!
/*
 * Copyright (C) 2011 Clearspring Technologies, Inc. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.clearspring.analytics.stream;

import java.io.ByteArrayInputStream;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.clearspring.analytics.util.DoublyLinkedList;
import com.clearspring.analytics.util.ExternalizableUtil;
import com.clearspring.analytics.util.ListNode2;
import com.clearspring.analytics.util.Pair;

/**
 * Based on the Space-Saving algorithm and the Stream-Summary
 * data structure as described in:
 * Efficient Computation of Frequent and Top-k Elements in Data Streams
 * by Metwally, Agrawal, and Abbadi
 *
 * @param  type of data in the stream to be summarized
 */
public class StreamSummary implements ITopK, Externalizable {

    protected class Bucket {

        protected DoublyLinkedList> counterList;

        private long count;

        public Bucket(long count) {
            this.count = count;
            this.counterList = new DoublyLinkedList>();
        }
    }

    protected int capacity;
    private HashMap>> counterMap;
    protected DoublyLinkedList bucketList;

    /**
     * @param capacity maximum size (larger capacities improve accuracy)
     */
    public StreamSummary(int capacity) {
        this.capacity = capacity;
        counterMap = new HashMap>>();
        bucketList = new DoublyLinkedList();
    }

    public int getCapacity() {
        return capacity;
    }

    /**
     * Algorithm: Space-Saving
     *
     * @param item stream element (e)
     * @return false if item was already in the stream summary, true otherwise
     */
    @Override
    public boolean offer(T item) {
        return offer(item, 1);
    }

    /**
     * Algorithm: Space-Saving
     *
     * @param item stream element (e)
     * @return false if item was already in the stream summary, true otherwise
     */
    @Override
    public boolean offer(T item, int incrementCount) {
        return offerReturnAll(item, incrementCount).left;
    }

    /**
     * @param item stream element (e)
     * @return item dropped from summary if an item was dropped, null otherwise
     */
    public T offerReturnDropped(T item, int incrementCount) {
        return offerReturnAll(item, incrementCount).right;
    }

    /**
     * @param item stream element (e)
     * @return Pair where isNewItem is the return value of offer() and itemDropped is null if no item was dropped
     */
    public Pair offerReturnAll(T item, int incrementCount) {
        ListNode2> counterNode = counterMap.get(item);
        boolean isNewItem = (counterNode == null);
        T droppedItem = null;
        if (isNewItem) {

            if (size() < capacity) {
                counterNode = bucketList.enqueue(new Bucket(0)).getValue().counterList.add(new Counter(bucketList.tail(), item));
            } else {
                Bucket min = bucketList.first();
                counterNode = min.counterList.tail();
                Counter counter = counterNode.getValue();
                droppedItem = counter.item;
                counterMap.remove(droppedItem);
                counter.item = item;
                counter.error = min.count;
            }
            counterMap.put(item, counterNode);
        }

        incrementCounter(counterNode, incrementCount);

        return new Pair(isNewItem, droppedItem);
    }

    protected void incrementCounter(ListNode2> counterNode, int incrementCount) {
        Counter counter = counterNode.getValue();       // count_i
        ListNode2 oldNode = counter.bucketNode;
        Bucket bucket = oldNode.getValue();         // Let Bucket_i be the bucket of count_i
        bucket.counterList.remove(counterNode);            // Detach count_i from Bucket_i's child-list
        counter.count = counter.count + incrementCount;

        // Finding the right bucket for count_i
        // Because we allow a single call to increment count more than once, this may not be the adjacent bucket. 
        ListNode2 bucketNodePrev = oldNode;
        ListNode2 bucketNodeNext = bucketNodePrev.getNext();
        while (bucketNodeNext != null) {
            Bucket bucketNext = bucketNodeNext.getValue(); // Let Bucket_i^+ be Bucket_i's neighbor of larger value
            if (counter.count == bucketNext.count) {
                bucketNext.counterList.add(counterNode);    // Attach count_i to Bucket_i^+'s child-list
                break;
            } else if (counter.count > bucketNext.count) {
                bucketNodePrev = bucketNodeNext;
                bucketNodeNext = bucketNodePrev.getNext();  // Continue hunting for an appropriate bucket
            } else {
                // A new bucket has to be created
                bucketNodeNext = null;
            }
        }

        if (bucketNodeNext == null) {
            Bucket bucketNext = new Bucket(counter.count);
            bucketNext.counterList.add(counterNode);
            bucketNodeNext = bucketList.addAfter(bucketNodePrev, bucketNext);
        }
        counter.bucketNode = bucketNodeNext;

        //Cleaning up
        if (bucket.counterList.isEmpty())           // If Bucket_i's child-list is empty
        {
            bucketList.remove(oldNode);         // Detach Bucket_i from the Stream-Summary
        }
    }

    @Override
    public List peek(int k) {
        List topK = new ArrayList(k);

        for (ListNode2 bNode = bucketList.head(); bNode != null; bNode = bNode.getPrev()) {
            Bucket b = bNode.getValue();
            for (Counter c : b.counterList) {
                if (topK.size() == k) {
                    return topK;
                }
                topK.add(c.item);
            }
        }

        return topK;
    }

    public List> topK(int k) {
        List> topK = new ArrayList>(k);

        for (ListNode2 bNode = bucketList.head(); bNode != null; bNode = bNode.getPrev()) {
            Bucket b = bNode.getValue();
            for (Counter c : b.counterList) {
                if (topK.size() == k) {
                    return topK;
                }
                topK.add(c);
            }
        }

        return topK;
    }

    /**
     * @return number of items stored
     */
    public int size() {
        return counterMap.size();
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append('[');
        for (ListNode2 bNode = bucketList.head(); bNode != null; bNode = bNode.getPrev()) {
            Bucket b = bNode.getValue();
            sb.append('{');
            sb.append(b.count);
            sb.append(":[");
            for (Counter c : b.counterList) {
                sb.append('{');
                sb.append(c.item);
                sb.append(':');
                sb.append(c.error);
                sb.append("},");
            }
            if (b.counterList.size() > 0) {
                sb.deleteCharAt(sb.length() - 1);
            }
            sb.append("]},");
        }
        if (bucketList.size() > 0) {
            sb.deleteCharAt(sb.length() - 1);
        }
        sb.append(']');
        return sb.toString();
    }

    @SuppressWarnings("unchecked")
    @Override
    public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
        this.bucketList = new DoublyLinkedList();
        this.capacity = in.readInt();

        int size = in.readInt();
        this.counterMap = new HashMap>>(size);

        Bucket currentBucket = null;
        ListNode2 currentBucketNode = null;
        for (int i = 0; i < size; i++) {
            Counter c = (Counter) in.readObject();
            if (currentBucket == null || c.count != currentBucket.count) {
                currentBucket = new Bucket(c.count);
                currentBucketNode = bucketList.add(currentBucket);
            }
            c.bucketNode = currentBucketNode;
            counterMap.put(c.item, currentBucket.counterList.add(c));
        }
    }

    @Override
    public void writeExternal(ObjectOutput out) throws IOException {
        out.writeInt(this.capacity);
        out.writeInt(this.size());
        for (ListNode2 bNode = bucketList.tail(); bNode != null; bNode = bNode.getNext()) {
            Bucket b = bNode.getValue();
            for (Counter c : b.counterList) {
                out.writeObject(c);
            }
        }
    }

    /**
     * For de-serialization
     */
    public StreamSummary() {
    }

    /**
     * For de-serialization
     *
     * @param bytes
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public StreamSummary(byte[] bytes) throws IOException, ClassNotFoundException {
        fromBytes(bytes);
    }

    public void fromBytes(byte[] bytes) throws IOException, ClassNotFoundException {
        readExternal(new ObjectInputStream(new ByteArrayInputStream(bytes)));
    }

    public byte[] toBytes() throws IOException {
        return ExternalizableUtil.toBytes(this);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy