All downloads are free. Search and download functionality uses the official Maven repository.

com.tencent.angel.utils.HLLC Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
/*
 * Tencent is pleased to support the open source community by making Angel available.
 *
 * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 
 * compliance with the License. You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */


package com.tencent.angel.utils;

import it.unimi.dsi.fastutil.longs.Long2DoubleMap;
import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import net.agkn.hll.HLL;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

/**
 * A distinct estimate tool
 */
public class HLLC {
  private final static Log LOG = LogFactory.getLog(HLLC.class);

  public static long distinct(Long2DoubleOpenHashMap[] maps) {
    DistinctOp op = new DistinctOp(maps, 0, maps.length);
    ForkJoinPool pool = new ForkJoinPool(16);
    pool.execute(op);
    long value = op.join().cardinality();
    pool.shutdownNow();
    return value;
  }

  static class DistinctOp extends RecursiveTask {
    private final Long2DoubleOpenHashMap[] splits;
    private final int startPos;
    private final int endPos;

    public DistinctOp(Long2DoubleOpenHashMap[] splits, int startPos, int endPos) {
      this.splits = splits;
      this.startPos = startPos;
      this.endPos = endPos;
    }

    @Override protected HLL compute() {
      if (endPos <= startPos) {
        return new HLL(13, 5);
      }

      if (endPos - startPos == 1) {
        if (splits[startPos] != null) {
          return genHll(splits[startPos]);
        } else {
          return new HLL(13, 5);
        }
      } else {
        int middle = (startPos + endPos) / 2;
        DistinctOp opLeft = new DistinctOp(splits, startPos, middle);
        DistinctOp opRight = new DistinctOp(splits, middle, endPos);
        invokeAll(opLeft, opRight);

        try {
          HLL left = opLeft.get();
          left.union(opRight.get());
          return left;
        } catch (InterruptedException | ExecutionException e) {
          LOG.error("DistinctOp failed " + e.getMessage());
          return new HLL(13, 5);
        }
      }
    }

    private HLL genHll(Long2DoubleOpenHashMap map) {
      HLL hll = new HLL(13, 5);
      ObjectIterator iter = map.long2DoubleEntrySet().fastIterator();
      while (iter.hasNext()) {
        hll.addRaw(hash(iter.next().getLongKey()));
      }
      return hll;
    }

    private static long hash(long value) {
      String str = String.valueOf(value);
      byte[] data = str.getBytes();
      return MurmurHash3.murmurhash3_x64_64(data, data.length, 123456);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy