org.apache.datasketches.fdt.PostProcessor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of datasketches-java Show documentation
Core sketch algorithms used alone and by other Java repositories in the DataSketches library.
There is a newer version: 6.1.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.fdt;

import static org.apache.datasketches.common.Util.ceilingIntPowerOf2;
import static org.apache.datasketches.thetacommon.HashOperations.hashSearchOrInsert;
import static org.apache.datasketches.tuple.Util.stringHash;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;

import org.apache.datasketches.tuple.TupleSketchIterator;
import org.apache.datasketches.tuple.strings.ArrayOfStringsSummary;

/**
 * This processes the contents of a FDT sketch to extract the
 * primary keys with the most frequent unique combinations of the non-primary dimensions.
 * The source sketch is not modified.
 *
 * @author Lee Rhodes
 */
public class PostProcessor {
  private final FdtSketch sketch;
  private final char sep;
  private int groupCount;
  @SuppressWarnings("unused")
  private Group group; //uninitialized

  //simple hash-map
  private boolean mapValid;
  private final int mapArrSize;
  private final long[] hashArr;
  private final String[] priKeyArr;
  private final int[] counterArr;

  /**
   * Construct with a populated FdtSketch
   * @param sketch the given sketch to query.
   * @param group the Group
   * @param sep the separator character
   */
  public PostProcessor(final FdtSketch sketch, final Group group, final char sep) {
    Objects.requireNonNull(sketch, "sketch must be non-null");
    Objects.requireNonNull(group, "group must be non-null");
    this.sketch = sketch.copy();
    this.sep = sep;
    final int numEntries = sketch.getRetainedEntries();
    mapArrSize = ceilingIntPowerOf2((int)(numEntries / 0.75));
    hashArr = new long[mapArrSize];
    priKeyArr = new String[mapArrSize];
    counterArr = new int[mapArrSize];
    mapValid = false;
    this.group = group;
  }

  /**
   * Returns the number of groups in the final sketch.
   * @return the number of groups in the final sketch.
   */
  public int getGroupCount() {
    return groupCount;
  }

  /**
   * Return the most frequent Groups associated with Primary Keys based on the size of the groups.
   * @param priKeyIndices the indices of the primary dimensions
   * @param numStdDev the number of standard deviations for the error bounds, this value is an
   * integer and must be one of 1, 2, or 3.
   * See Number of Standard Deviations
   * @param limit the maximum number of rows to return. If ≤ 0, all rows will be returned.
   * @return the most frequent Groups associated with Primary Keys based on the size of the groups.
   */
  public List getGroupList(final int[] priKeyIndices, final int numStdDev,
      final int limit) {
    //allows subsequent queries with different priKeyIndices without rebuilding the map
    if (!mapValid) { populateMap(priKeyIndices); }
    return populateList(numStdDev, limit);
  }

  /**
   * Scan each entry in the sketch. Count the number of duplicate occurrences of each
   * primary key in a hash map.
   * @param priKeyIndices identifies the primary key indices
   */
  private void populateMap(final int[] priKeyIndices) {
    final TupleSketchIterator it = sketch.iterator();
    Arrays.fill(hashArr, 0L);
    Arrays.fill(priKeyArr, null);
    Arrays.fill(counterArr, 0);
    groupCount = 0;
    final int lgMapArrSize = Integer.numberOfTrailingZeros(mapArrSize);

    while (it.next()) {
      //getSummary() is not a copy, but getValue() is
      final String[] arr = it.getSummary().getValue();
      final String priKey = getPrimaryKey(arr, priKeyIndices, sep);
      final long hash = stringHash(priKey);
      final int index = hashSearchOrInsert(hashArr, lgMapArrSize, hash);
      if (index < 0) { //was empty, hash inserted
        final int idx = -(index + 1); //actual index
        counterArr[idx] = 1;
        groupCount++;
        priKeyArr[idx] = priKey;
      } else { //found, duplicate
        counterArr[index]++; //increment
      }
    }
    mapValid = true;
  }

  /**
   * Create the list of groups along with the error statistics
   * @param numStdDev number of standard deviations
   * @param limit the maximum size of the list to return
   * @return the list of groups along with the error statistics
   */
  private List populateList(final int numStdDev, final int limit) {
    final List list = new ArrayList<>();
    for (int i = 0; i < mapArrSize; i++) {
      if (hashArr[i] != 0) {
        final String priKey = priKeyArr[i];
        final int count = counterArr[i];
        final double est = sketch.getEstimate(count);
        final double ub = sketch.getUpperBound(numStdDev, count);
        final double lb = sketch.getLowerBound(numStdDev, count);
        final double thresh = (double) count / sketch.getRetainedEntries();
        final double rse = (sketch.getUpperBound(1, count) / est) - 1.0;
        final Group gp = new Group();
        gp.init(priKey, count, est, ub, lb, thresh, rse);
        list.add(gp);
      }
    }
    list.sort(null); //Comparable implemented in Group
    final int totLen = list.size();

    final List returnList;
    if ((limit > 0) && (limit < totLen)) {
      returnList = list.subList(0, limit);
    } else {
      returnList = list;
    }
    return returnList;
  }

  /**
   * Extract simple string Primary Key defined by the priKeyIndices from the given tuple.
   * @param tuple the given tuple containing the Primary Key
   * @param priKeyIndices the indices indicating the ordering and selection of dimensions defining
   * the Primary Key
   * @param sep the separator character
   * @return a simple string Primary Key defined by the priKeyIndices from the given tuple.
   */
  //also used by test
  private static String getPrimaryKey(final String[] tuple, final int[] priKeyIndices,
      final char sep) {
    assert priKeyIndices.length < tuple.length;
    final StringBuilder sb = new StringBuilder();
    final int keys = priKeyIndices.length;
    for (int i = 0; i < keys; i++) {
      final int idx = priKeyIndices[i];
      sb.append(tuple[idx]);
      if ((i + 1) < keys) { sb.append(sep); }
    }
    return sb.toString();
  }

}