All downloads are free. Search and download functionality uses the official Maven repository.

co.cask.cdap.dq.functions.HistogramWithBucketing Maven / Gradle / Ivy

/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.dq.functions;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.dq.DataQualityWritable;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

import java.lang.reflect.Type;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * Aggregation function creates a histogram with custom bucketing
 * for numbers - no categorical data
 */
public class HistogramWithBucketing implements BasicAggregationFunction, Integer>> {
  private static final Gson GSON = new Gson();
  private static final Type TOKEN_TYPE_MAP_MAP_ENTRY_DOUBLE_DOUBLE_LONG =
    new TypeToken, Long>>() { }.getType();

  private ArrayList values = new ArrayList<>();
  private Double max = Double.MIN_VALUE;
  private Double min = Double.MAX_VALUE;
  public Map, Long> histogram = new HashMap<>();

  @Override
  public void add(DataQualityWritable value) {
    Double newValue = Double.parseDouble(value.get().toString());
    max = newValue > max ? newValue : max;
    min = newValue < min ? newValue : min;
    values.add(newValue);
  }

  @Override
  public byte[] aggregate() {
    Bucketing bucketing = new Bucketing("automatic", null);
    bucketing.doBucketing();
    for (Double value : values) {
      for (Map.Entry, Long> bucketMapEntry : histogram.entrySet()) {
        if (value >= bucketMapEntry.getKey().getKey() && value <= bucketMapEntry.getKey().getValue()) {
          bucketMapEntry.setValue(bucketMapEntry.getValue() + 1);
          break;
        }
      }
    }
    String aggregationJSON = GSON.toJson(histogram);
    return Bytes.toBytes(aggregationJSON);
  }

  @Override
  public Map, Integer> deserialize(byte[] serializedValue) {
    String valueJSON = Bytes.toString(serializedValue);
    return GSON.fromJson(valueJSON, TOKEN_TYPE_MAP_MAP_ENTRY_DOUBLE_DOUBLE_LONG);
  }

  private class Bucketing {
    String bucketingStrategy;
    Integer maxBucketSize;
    private Bucketing(String bucketingStrategy, Integer maxBucketSize) {
      this.maxBucketSize = maxBucketSize == null ? 10 : maxBucketSize;
      this.bucketingStrategy = bucketingStrategy;
    }

    private void doBucketing() {
      if ("automatic".equals(bucketingStrategy)) {
        automaticallyGenerateBuckets();
      } else {
        if (maxBucketSize > 0) {
          manuallyGenerateBuckets(maxBucketSize);
        }
      }
    }

    /**
     * Generates buckets using the Freedman-Diaconis rule
     * Which says: Bin size = 2 * IQR(x) n^(-1/3)
     */
    private void automaticallyGenerateBuckets() {
      Collections.sort(values);
      long valuesListSize = (long) values.size();
      long quartile = (long) Math.floor(valuesListSize / 4.0);
      Double firstQuartile = values.get((int) quartile);
      Double thirdQuartile = values.get((int) quartile * 3);
      Double interquartileRange = thirdQuartile - firstQuartile;
      Long maxBucketSize = (long) Math.ceil(2 * interquartileRange * Math.pow(valuesListSize, -1 / 3));
      if (maxBucketSize == 0L) {
        maxBucketSize = 1L;
      }
      for (double i = min; i < max; i += maxBucketSize) {
        Map.Entry mapEntry =
          new AbstractMap.SimpleEntry<>(i, i + maxBucketSize);
        histogram.put(mapEntry, 0L);
      }
    }

    /**
     * Generates buckets by simply allowing the max span of a
     * bucket to be maxBucketSize
     */
    private void manuallyGenerateBuckets(long maxBucketSize) {
      Collections.sort(values);
      for (double i = min; i < max; i += maxBucketSize) {
        Map.Entry mapEntry =
          new AbstractMap.SimpleEntry<>(i, i + maxBucketSize);
        histogram.put(mapEntry, 0L);
      }
    }
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy