![JAR search and dependency download from the Maven repository](/logo.png)
org.dinky.shaded.paimon.table.source.MergeTreeSplitGenerator Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dinky.shaded.paimon.table.source;
import org.dinky.shaded.paimon.data.InternalRow;
import org.dinky.shaded.paimon.io.DataFileMeta;
import org.dinky.shaded.paimon.mergetree.SortedRun;
import org.dinky.shaded.paimon.mergetree.compact.IntervalPartition;
import org.dinky.shaded.paimon.utils.BinPacking;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
/** Merge tree implementation of {@link SplitGenerator}. */
public class MergeTreeSplitGenerator implements SplitGenerator {
private final Comparator keyComparator;
private final long targetSplitSize;
private final long openFileCost;
public MergeTreeSplitGenerator(
Comparator keyComparator, long targetSplitSize, long openFileCost) {
this.keyComparator = keyComparator;
this.targetSplitSize = targetSplitSize;
this.openFileCost = openFileCost;
}
@Override
public List> splitForBatch(List files) {
/*
* The generator aims to parallel the scan execution by slicing the files of each bucket
* into multiple splits. The generation has one constraint: files with intersected key
* ranges (within one section) must go to the same split. Therefore, the files are first to go
* through the interval partition algorithm to generate sections and then through the
* OrderedPack algorithm. Note that the item to be packed here is each section, the capacity
* is denoted as the targetSplitSize, and the final number of the bins is the number of
* splits generated.
*
* For instance, there are files: [1, 2] [3, 4] [5, 180] [5, 190] [200, 600] [210, 700]
* with targetSplitSize 128M. After interval partition, there are four sections:
* - section1: [1, 2]
* - section2: [3, 4]
* - section3: [5, 180], [5, 190]
* - section4: [200, 600], [210, 700]
*
* After OrderedPack, section1 and section2 will be put into one bin (split), so the final result will be:
* - split1: [1, 2] [3, 4]
* - split2: [5, 180] [5,190]
* - split3: [200, 600] [210, 700]
*/
List> sections =
new IntervalPartition(files, keyComparator)
.partition().stream().map(this::flatRun).collect(Collectors.toList());
return packSplits(sections);
}
@Override
public List> splitForStreaming(List files) {
// We don't split streaming scan files
return Collections.singletonList(files);
}
private List> packSplits(List> sections) {
Function, Long> weightFunc =
file -> Math.max(totalSize(file), openFileCost);
return BinPacking.packForOrdered(sections, weightFunc, targetSplitSize).stream()
.map(this::flatFiles)
.collect(Collectors.toList());
}
private long totalSize(List section) {
long size = 0L;
for (DataFileMeta file : section) {
size += file.fileSize();
}
return size;
}
private List flatRun(List section) {
List files = new ArrayList<>();
section.forEach(run -> files.addAll(run.files()));
return files;
}
private List flatFiles(List> section) {
List files = new ArrayList<>();
section.forEach(files::addAll);
return files;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy