org.apache.hadoop.hbase.util.LossyCounting — from the hbase-server artifact (server functionality for HBase).
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* LossyCounting utility, bounded data structure that maintains approximate high frequency
* elements in data stream.
*
* Bucket size is 1 / error rate. (Error rate is 0.02 by default)
 * Lemma: If an element does not appear in the set, then its frequency is less than e * N
 * (where N is the total element count so far.)
* Based on paper:
* http://www.vldb.org/conf/2002/S10P03.pdf
*/
@InterfaceAudience.Private
public class LossyCounting {
  private static final Logger LOG = LoggerFactory.getLogger(LossyCounting.class);
  /** Number of additions per sweep epoch: ceil(1 / errorRate). */
  private final long bucketSize;
  /** Current epoch number: ceil(totalDataCount / bucketSize). */
  private long currentTerm;
  private final double errorRate;
  /** Approximate per-key counts; concurrent so readers never see a broken map. */
  private final Map<String, Integer> data;
  /** Total number of addByOne() calls so far. */
  private long totalDataCount;
  /** Human-readable name used in log messages. */
  private final String name;

  /**
   * @param errorRate maximum relative error e; bucket size is ceil(1/e)
   * @param name name of this counter, used only for logging
   * @throws IllegalArgumentException if errorRate is outside [0, 1]
   */
  public LossyCounting(double errorRate, String name) {
    // Validate before touching any state so a bad rate cannot produce a
    // half-initialized instance (e.g. a zero/overflowed bucketSize).
    if (errorRate < 0.0 || errorRate > 1.0) {
      throw new IllegalArgumentException(" Lossy Counting error rate should be within range [0,1]");
    }
    this.errorRate = errorRate;
    this.name = name;
    this.bucketSize = (long) Math.ceil(1 / errorRate);
    this.currentTerm = 1;
    this.totalDataCount = 0;
    this.data = new ConcurrentHashMap<>();
    calculateCurrentTerm();
  }

  /** Creates a counter with the configured default error rate (0.02 if unset). */
  public LossyCounting(String name) {
    this(HBaseConfiguration.create().getDouble(HConstants.DEFAULT_LOSSY_COUNTING_ERROR_RATE, 0.02),
      name);
  }

  /**
   * Increments the count for {@code key} by one, sweeping low-frequency
   * entries at every bucket boundary.
   * @param key element to count
   * @return names of elements swept by this call (empty if no sweep ran)
   */
  public Set<String> addByOne(String key) {
    // merge() is an atomic read-modify-write on ConcurrentHashMap, unlike
    // the get-then-put pattern which can lose updates under contention.
    data.merge(key, 1, Integer::sum);
    totalDataCount++;
    calculateCurrentTerm();
    Set<String> dataToBeSwept = new HashSet<>();
    if (totalDataCount % bucketSize == 0) {
      dataToBeSwept = sweep();
    }
    return dataToBeSwept;
  }

  /**
   * Sweeps low-frequency data: removes every entry whose count plus the
   * error allowance is below the current term.
   * @return names of elements that got swept
   */
  private Set<String> sweep() {
    Set<String> dataToBeSwept = new HashSet<>();
    for (Map.Entry<String, Integer> entry : data.entrySet()) {
      if (entry.getValue() + errorRate < currentTerm) {
        dataToBeSwept.add(entry.getKey());
      }
    }
    // Remove after iteration; ConcurrentHashMap tolerates concurrent
    // removal, but a separate pass keeps the decision set explicit.
    for (String key : dataToBeSwept) {
      data.remove(key);
    }
    // Parameterized logging: no String.format cost when TRACE is disabled.
    LOG.trace("{} swept {} elements.", name, dataToBeSwept.size());
    return dataToBeSwept;
  }

  /**
   * Calculate and set current term. Uses a long (not int) cast so very
   * large data counts do not overflow.
   */
  private void calculateCurrentTerm() {
    this.currentTerm = (long) Math.ceil(1.0 * totalDataCount / bucketSize);
  }

  public long getBucketSize() {
    return bucketSize;
  }

  public long getDataSize() {
    return data.size();
  }

  public boolean contains(String key) {
    return data.containsKey(key);
  }

  public long getCurrentTerm() {
    return currentTerm;
  }
}