
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.flink.sink.shuffle;

import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.function.Consumer;
import org.apache.datasketches.sampling.ReservoirItemsSketch;
import org.apache.iceberg.SortKey;
import org.apache.iceberg.StructLike;

class SketchUtil {
  static final int COORDINATOR_MIN_RESERVOIR_SIZE = 10_000;
  static final int COORDINATOR_MAX_RESERVOIR_SIZE = 1_000_000;
  static final int COORDINATOR_TARGET_PARTITIONS_MULTIPLIER = 100;
  static final int OPERATOR_OVER_SAMPLE_RATIO = 10;

  // switch the statistics tracking from map to sketch if the cardinality of the sort key is over
  // this threshold. It is hardcoded for now, we can revisit in the future if config is needed.
  static final int OPERATOR_SKETCH_SWITCH_THRESHOLD = 10_000;
  static final int COORDINATOR_SKETCH_SWITCH_THRESHOLD = 100_000;

  private SketchUtil() {}
  /**
   * The larger the reservoir size, the more accurate the range bounds calculation and the more
   * balanced the range distribution.
   *
   * <p>Here are the heuristic rules:
   *
   * <ul>
   *   <li>Target size is numPartitions x 100, to achieve good accuracy and to make the range
   *       bounds easier to calculate
   *   <li>Min is 10K to achieve good accuracy while memory footprint is still relatively small
   *   <li>Max is 1M to cap the memory footprint on coordinator
   * </ul>
   *
   * @param numPartitions number of range partitions, which equals the downstream operator parallelism
   * @return reservoir size
   */
  static int determineCoordinatorReservoirSize(int numPartitions) {
    int reservoirSize = numPartitions * COORDINATOR_TARGET_PARTITIONS_MULTIPLIER;
    if (reservoirSize < COORDINATOR_MIN_RESERVOIR_SIZE) {
      // adjust it up and still make reservoirSize divisible by numPartitions
      int remainder = COORDINATOR_MIN_RESERVOIR_SIZE % numPartitions;
      reservoirSize = COORDINATOR_MIN_RESERVOIR_SIZE + (numPartitions - remainder);
    } else if (reservoirSize > COORDINATOR_MAX_RESERVOIR_SIZE) {
      // adjust it down and still make reservoirSize divisible by numPartitions
      int remainder = COORDINATOR_MAX_RESERVOIR_SIZE % numPartitions;
      reservoirSize = COORDINATOR_MAX_RESERVOIR_SIZE - remainder;
    }

    return reservoirSize;
  }
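
  // Worked example (illustrative note, not from the original source): with numPartitions = 3 the
  // target size is 3 * 100 = 300, which is below the 10K minimum; since 10_000 % 3 == 1, the size
  // is rounded up to 10_002, i.e. 3_334 samples per partition. With numPartitions = 20_000 the
  // target of 2_000_000 exceeds the 1M cap and is rounded down to exactly 1_000_000.
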
  /**
   * Determine the sampling reservoir size where operator subtasks collect data statistics.
   *
   * <p>Here are the heuristic rules:
   *
   * <ul>
   *   <li>Target size is "coordinator reservoir size * over sample ratio (10) / operator
   *       parallelism"
   *   <li>Min is 1K to achieve good accuracy while memory footprint is still relatively small
   *   <li>Max is 100K to cap the memory footprint on coordinator
   * </ul>
   *
   * @param operatorParallelism data statistics operator parallelism
   * @param numPartitions number of range partitions, which equals the downstream operator parallelism
   * @return reservoir size
   */
  static int determineOperatorReservoirSize(int operatorParallelism, int numPartitions) {
    int coordinatorReservoirSize = determineCoordinatorReservoirSize(numPartitions);
    int totalOperatorSamples = coordinatorReservoirSize * OPERATOR_OVER_SAMPLE_RATIO;
    return (int) Math.ceil((double) totalOperatorSamples / operatorParallelism);
  }
  /**
   * To understand how range bounds are used in range partitioning, here is an example for human
   * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be
   *
   * <ul>
   *   <li>age <= 15
   *   <li>age > 15 && age <= 32
   *   <li>age > 32 && age <= 60
   *   <li>age > 60
   * </ul>
   *
   * @param numPartitions number of partitions, which maps to the downstream operator parallelism
   * @param comparator comparator used to sort the sampled sort keys
   * @param sketch aggregated reservoir sampling sketch
   * @return list of range partition bounds. It should be a sorted list (ascending). The number of
   *     items should be {@code numPartitions - 1}. If numPartitions is 1, return an empty list.
   */
  static SortKey[] rangeBounds(
      int numPartitions, Comparator<StructLike> comparator, ReservoirItemsSketch<SortKey> sketch) {
    SortKey[] sortKeys = sketch.getSamples();
    return determineBounds(Math.min(numPartitions, sortKeys.length), comparator, sortKeys);
  }
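
  // Usage sketch (illustrative note, not from the original source; variable names are
  // hypothetical): the coordinator would typically compute the bounds from the globally
  // aggregated sketch, e.g.
  //   SortKey[] bounds = SketchUtil.rangeBounds(downstreamParallelism, comparator, aggregatedSketch);
  // and then distribute `bounds` to the shuffle operators, which can place each record into its
  // target range by searching the sorted bounds with the same comparator.
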
  /**
   * This assumes the sort keys have equal weight, which is usually the case for high-cardinality
   * scenarios (like device_id, user_id, uuid etc.).
   */
  static SortKey[] determineBounds(
      int numPartitions, Comparator<StructLike> comparator, SortKey[] sortKeys) {
    // sort the keys first
    Arrays.sort(sortKeys, comparator);
    int numCandidates = numPartitions - 1;
    SortKey[] candidates = new SortKey[numCandidates];
    int step = (int) Math.ceil((double) sortKeys.length / numPartitions);
    int position = step - 1;
    int numChosen = 0;
    while (position < sortKeys.length && numChosen < numCandidates) {
      SortKey candidate = sortKeys[position];
      // skip duplicate values
      if (numChosen > 0 && candidate.equals(candidates[numChosen - 1])) {
        // linear probe for the next distinct value
        position += 1;
      } else {
        candidates[numChosen] = candidate;
        position += step;
        numChosen += 1;
      }
    }

    return candidates;
  }
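
  // Worked example (illustrative note, not from the original source): with 10 sorted sample keys
  // and numPartitions = 4, step = ceil(10 / 4) = 3, so positions 2, 5 and 8 are inspected and the
  // 3rd, 6th and 9th smallest keys become the 3 range bounds; duplicate values are skipped by the
  // linear probe above.
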
  /** This can be a bit expensive since it is quadratic. */
  static void convertMapToSketch(
      Map<SortKey, Long> taskMapStats, Consumer<SortKey> sketchConsumer) {
    taskMapStats.forEach(
        (sortKey, count) -> {
          for (int i = 0; i < count; ++i) {
            sketchConsumer.accept(sortKey);
          }
        });
  }
}