
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.flink.sink.shuffle;

import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.function.Consumer;
import org.apache.datasketches.sampling.ReservoirItemsSketch;
import org.apache.iceberg.SortKey;
import org.apache.iceberg.StructLike;

class SketchUtil {
  static final int COORDINATOR_MIN_RESERVOIR_SIZE = 10_000;
  static final int COORDINATOR_MAX_RESERVOIR_SIZE = 1_000_000;
  static final int COORDINATOR_TARGET_PARTITIONS_MULTIPLIER = 100;
  static final int OPERATOR_OVER_SAMPLE_RATIO = 10;

  // switch the statistics tracking from map to sketch if the cardinality of the sort key is over
  // this threshold. It is hardcoded for now, we can revisit in the future if config is needed.
  static final int OPERATOR_SKETCH_SWITCH_THRESHOLD = 10_000;
  static final int COORDINATOR_SKETCH_SWITCH_THRESHOLD = 100_000;

  private SketchUtil() {}
  /**
   * The larger the reservoir size, the more accurate the range bounds calculation and the more
   * balanced the range distribution.
   *
   * <p>Here are the heuristic rules:
   *
   * <ul>
   *   <li>Target size is numPartitions x 100, to achieve good accuracy and to make the range
   *       bounds easier to calculate
   *   <li>Min is 10K to achieve good accuracy while memory footprint is still relatively small
   *   <li>Max is 1M to cap the memory footprint on coordinator
   * </ul>
   *
   * @param numPartitions number of range partitions, which equals the downstream operator parallelism
   * @return reservoir size
   */
  static int determineCoordinatorReservoirSize(int numPartitions) {
    int reservoirSize = numPartitions * COORDINATOR_TARGET_PARTITIONS_MULTIPLIER;
    if (reservoirSize < COORDINATOR_MIN_RESERVOIR_SIZE) {
      // adjust it up and still make reservoirSize divisible by numPartitions
      int remainder = COORDINATOR_MIN_RESERVOIR_SIZE % numPartitions;
      reservoirSize = COORDINATOR_MIN_RESERVOIR_SIZE + (numPartitions - remainder);
    } else if (reservoirSize > COORDINATOR_MAX_RESERVOIR_SIZE) {
      // adjust it down and still make reservoirSize divisible by numPartitions
      int remainder = COORDINATOR_MAX_RESERVOIR_SIZE % numPartitions;
      reservoirSize = COORDINATOR_MAX_RESERVOIR_SIZE - remainder;
    }

    return reservoirSize;
  }
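
  // Worked example (illustrative note, not from the original source): with numPartitions = 3 the
  // target size is 3 * 100 = 300, which is below the 10K minimum; since 10_000 % 3 == 1, the size
  // is rounded up to 10_002, i.e. 3_334 samples per partition. With numPartitions = 20_000 the
  // target of 2_000_000 exceeds the 1M cap and is rounded down to exactly 1_000_000.
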
  /**
   * Determine the sampling reservoir size where operator subtasks collect data statistics.
   *
   * <p>Here are the heuristic rules:
   *
   * <ul>
   *   <li>Target size is "coordinator reservoir size * over sample ratio (10) / operator
   *       parallelism"
   *   <li>Min is 1K to achieve good accuracy while memory footprint is still relatively small
   *   <li>Max is 100K to cap the memory footprint on coordinator
   * </ul>
   *
   * @param operatorParallelism data statistics operator parallelism
   * @param numPartitions number of range partitions, which equals the downstream operator parallelism
   * @return reservoir size
   */
  static int determineOperatorReservoirSize(int operatorParallelism, int numPartitions) {
    int coordinatorReservoirSize = determineCoordinatorReservoirSize(numPartitions);
    int totalOperatorSamples = coordinatorReservoirSize * OPERATOR_OVER_SAMPLE_RATIO;
    return (int) Math.ceil((double) totalOperatorSamples / operatorParallelism);
  }
  /**
   * To understand how range bounds are used in range partitioning, here is an example for human
   * ages with 4 partitions: [15, 32, 60]. The 4 ranges would be
   *
   * <ul>
   *   <li>age <= 15
   *   <li>age > 15 && age <= 32
   *   <li>age > 32 && age <= 60
   *   <li>age > 60
   * </ul>
   *
   * @param numPartitions number of partitions, which maps to the downstream operator parallelism
   * @param comparator comparator used to sort the sampled sort keys
   * @param sketch aggregated reservoir sampling sketch
   * @return list of range partition bounds. It should be a sorted list (ascending). The number of
   *     items should be {@code numPartitions - 1}. If numPartitions is 1, return an empty list.
   */
  static SortKey[] rangeBounds(
      int numPartitions, Comparator<StructLike> comparator, ReservoirItemsSketch<SortKey> sketch) {
    SortKey[] sortKeys = sketch.getSamples();
    return determineBounds(Math.min(numPartitions, sortKeys.length), comparator, sortKeys);
  }
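
  // Usage sketch (illustrative note, not from the original source; variable names are
  // hypothetical): the coordinator would typically compute the bounds from the globally
  // aggregated sketch, e.g.
  //   SortKey[] bounds = SketchUtil.rangeBounds(downstreamParallelism, comparator, aggregatedSketch);
  // and then distribute `bounds` to the shuffle operators, which can place each record into its
  // target range by searching the sorted bounds with the same comparator.
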
  /**
   * This assumes the sort keys have equal weight, which is usually the case for high-cardinality
   * scenarios (like device_id, user_id, uuid etc.).
   */
  static SortKey[] determineBounds(
      int numPartitions, Comparator<StructLike> comparator, SortKey[] sortKeys) {
    // sort the keys first
    Arrays.sort(sortKeys, comparator);
    int numCandidates = numPartitions - 1;
    SortKey[] candidates = new SortKey[numCandidates];
    int step = (int) Math.ceil((double) sortKeys.length / numPartitions);
    int position = step - 1;
    int numChosen = 0;
    while (position < sortKeys.length && numChosen < numCandidates) {
      SortKey candidate = sortKeys[position];
      // skip duplicate values
      if (numChosen > 0 && candidate.equals(candidates[numChosen - 1])) {
        // linear probe for the next distinct value
        position += 1;
      } else {
        candidates[numChosen] = candidate;
        position += step;
        numChosen += 1;
      }
    }

    return candidates;
  }
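
  // Worked example (illustrative note, not from the original source): with 10 sorted sample keys
  // and numPartitions = 4, step = ceil(10 / 4) = 3, so positions 2, 5 and 8 are inspected and the
  // 3rd, 6th and 9th smallest keys become the 3 range bounds; duplicate values are skipped by the
  // linear probe above.
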
  /** This can be a bit expensive since it is quadratic. */
  static void convertMapToSketch(
      Map<SortKey, Long> taskMapStats, Consumer<SortKey> sketchConsumer) {
    taskMapStats.forEach(
        (sortKey, count) -> {
          for (int i = 0; i < count; ++i) {
            sketchConsumer.accept(sortKey);
          }
        });
  }
}