/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.paimon.flink.sink.index;

import org.apache.paimon.CoreOptions;
import org.apache.paimon.crosspartition.IndexBootstrap;
import org.apache.paimon.crosspartition.KeyPartOrRow;
import org.apache.paimon.data.InternalRow;
import org.apache.paimon.flink.sink.Committable;
import org.apache.paimon.flink.sink.DynamicBucketRowWriteOperator;
import org.apache.paimon.flink.sink.FlinkWriteSink;
import org.apache.paimon.flink.sink.RowWithBucketChannelComputer;
import org.apache.paimon.flink.sink.StoreSinkWrite;
import org.apache.paimon.flink.utils.InternalRowTypeSerializer;
import org.apache.paimon.flink.utils.InternalTypeInfo;
import org.apache.paimon.schema.TableSchema;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.types.RowType;
import org.apache.paimon.utils.MathUtils;

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;

import javax.annotation.Nullable;

import java.util.List;
import java.util.Map;

import static org.apache.paimon.CoreOptions.createCommitUser;
import static org.apache.paimon.crosspartition.IndexBootstrap.bootstrapType;
import static org.apache.paimon.flink.FlinkConnectorOptions.SINK_CROSS_PARTITION_MANAGED_MEMORY;
import static org.apache.paimon.flink.sink.FlinkStreamPartitioner.partition;
import static org.apache.paimon.flink.utils.ManagedMemoryUtils.declareManagedMemory;
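
// A minimal usage sketch (hypothetical caller code, not part of this class):
// `table` is assumed to be a FileStoreTable configured for cross-partition
// updates, and `input` a DataStream<InternalRow> built elsewhere. A null
// overwrite-partition map means a normal (non-overwrite) write.
//
//     GlobalDynamicBucketSink sink = new GlobalDynamicBucketSink(table, null);
//     DataStreamSink<?> committed = sink.build(input, /* sink parallelism */ 4);
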
/** Sink for a global dynamic bucket table (dynamic bucket mode with cross-partition update). */
public class GlobalDynamicBucketSink extends FlinkWriteSink<Tuple2<KeyPartOrRow, InternalRow>> {

    private static final long serialVersionUID = 1L;

    public GlobalDynamicBucketSink(
            FileStoreTable table, @Nullable Map<String, String> overwritePartition) {
        super(table, overwritePartition);
    }

    @Override
    protected OneInputStreamOperatorFactory<Tuple2<KeyPartOrRow, InternalRow>, Committable>
            createWriteOperatorFactory(StoreSinkWrite.Provider writeProvider, String commitUser) {
        return new DynamicBucketRowWriteOperator.Factory(table, writeProvider, commitUser);
    }

    public DataStreamSink<?> build(DataStream<InternalRow> input, @Nullable Integer parallelism) {
        TableSchema schema = table.schema();
        CoreOptions options = table.coreOptions();
        RowType rowType = schema.logicalRowType();
        List<String> primaryKeys = schema.primaryKeys();

        InternalRowTypeSerializer rowSerializer = new InternalRowTypeSerializer(rowType);
        RowType bootstrapType = bootstrapType(schema);
        InternalRowTypeSerializer bootstrapSerializer =
                new InternalRowTypeSerializer(bootstrapType);

// Topology:
// input -- bootstrap -- shuffle by key hash --> bucket-assigner -- shuffle by bucket -->
// writer --> committer
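        //
        // KeyPartOrRow tags each element as either a bootstrapped index entry
        // (KEY_PART) or an incoming row (ROW). The key-hash shuffle brings all
        // records for one primary key, across partitions, to the same assigner
        // task; the bucket shuffle then gives each writer complete buckets.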

        DataStream<Tuple2<KeyPartOrRow, InternalRow>> bootstrapped =
                input.transform(
                                "INDEX_BOOTSTRAP",
                                new InternalTypeInfo<>(
                                        new KeyWithRowSerializer<>(
                                                bootstrapSerializer, rowSerializer)),
                                new IndexBootstrapOperator.Factory<>(
                                        new IndexBootstrap(table), r -> r))
                        .setParallelism(input.getParallelism());

// 1. shuffle by key hash
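        // MathUtils.max returns null when both options are unset; in that case
        // fall back to the sink parallelism supplied by the caller.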
Integer assignerParallelism =
MathUtils.max(
options.dynamicBucketInitialBuckets(),
options.dynamicBucketAssignerParallelism());
if (assignerParallelism == null) {
assignerParallelism = parallelism;
}

        KeyPartRowChannelComputer channelComputer =
                new KeyPartRowChannelComputer(rowType, bootstrapType, primaryKeys);
        DataStream<Tuple2<KeyPartOrRow, InternalRow>> partitionByKeyHash =
                partition(bootstrapped, channelComputer, assignerParallelism);

// 2. bucket-assigner
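        // The assigner maintains the global key -> (partition, bucket) mapping
        // and emits each row paired with its assigned bucket number.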
        TupleTypeInfo<Tuple2<InternalRow, Integer>> rowWithBucketType =
                new TupleTypeInfo<>(input.getType(), BasicTypeInfo.INT_TYPE_INFO);
        DataStream<Tuple2<InternalRow, Integer>> bucketAssigned =
                partitionByKeyHash
                        .transform(
                                "cross-partition-bucket-assigner",
                                rowWithBucketType,
                                GlobalIndexAssignerOperator.forRowData(table))
                        .setParallelism(partitionByKeyHash.getParallelism());

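        // The assigner's index state lives in RocksDB, which is funded from
        // Flink's managed memory pool rather than the JVM heap.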
        // declare managed memory for RocksDB
        declareManagedMemory(
                bucketAssigned, options.toConfiguration().get(SINK_CROSS_PARTITION_MANAGED_MEMORY));

        // 3. shuffle by bucket
        DataStream<Tuple2<InternalRow, Integer>> partitionByBucket =
                partition(bucketAssigned, new RowWithBucketChannelComputer(schema), parallelism);

        // 4. writer and committer
        return sinkFrom(partitionByBucket, createCommitUser(options.toConfiguration()));
    }
}