// io.delta.flink.sink.internal.DeltaBucketAssigner Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.flink.sink.internal;
import java.util.LinkedHashMap;
import io.delta.flink.sink.RowDataDeltaSinkBuilder;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer;
import org.apache.flink.table.utils.PartitionPathUtils;
/**
* Custom implementation of {@link BucketAssigner} class required to provide behaviour on how
* to map particular events to buckets (aka partitions).
*
* This implementation can be perceived as a utility class for complying to the DeltaLake's
* partitioning style (that follows Apache Hive's partitioning style by providing the partitioning
* column's and its values as FS directories paths, e.g. "/some_path/table_1/date=2020-01-01")
* It's still possible for users to roll out their own version of {@link BucketAssigner}
* and pass it to the {@link DeltaSinkBuilder} during creation of the sink.
*
* This {@link DeltaBucketAssigner} is applicable only to {@link DeltaSinkBuilder} and not to
* {@link RowDataDeltaSinkBuilder}. The former lets you use this
* {@link DeltaBucketAssigner} to provide the required custom bucketing behaviour, while the latter
* doesn't expose a custom bucketing API, and you can provide the partition column keys only.
*
* Thus, this {@link DeltaBucketAssigner} is currently not exposed to the user through any public
* API.
*
* In the future, if you'd like to implement your own custom bucketing...
*
* /////////////////////////////////////////////////////////////////////////////////
* // implements a custom partition computer
* /////////////////////////////////////////////////////////////////////////////////
* static class CustomPartitionColumnComputer implements DeltaPartitionComputer<RowData> {
*
* @Override
* public LinkedHashMap<String, String> generatePartitionValues(
* RowData element, BucketAssigner.Context context) {
* String f1 = element.getString(0).toString();
* int f3 = element.getInt(2);
* LinkedHashMap<String, String> partitionSpec = new LinkedHashMap<>();
* partitionSpec.put("f1", f1);
* partitionSpec.put("f3", Integer.toString(f3));
* return partitionSpec;
* }
* }
* ...
* /////////////////////////////////////////
* // creates partition assigner for a custom partition computer
* /////////////////////////////////////////
* DeltaBucketAssigner<RowData> partitionAssigner =
* new DeltaBucketAssigner<>(new CustomPartitionColumnComputer());
*
* ...
*
* /////////////////////////////////////////////////////////////////////////////////
* // create the builder
* /////////////////////////////////////////////////////////////////////////////////
*
* DeltaSinkBuilder<RowData> foo =
* new DeltaSinkBuilder.DefaultDeltaFormatBuilder<>(
* ...,
* partitionAssigner,
* ...)
*
*
* @param <T> The type of input elements.
*/
public class DeltaBucketAssigner<T> implements BucketAssigner<T, String> {

    private static final long serialVersionUID = -6033643154550226022L;

    /** Produces the partition column names and values for every incoming element. */
    private final DeltaPartitionComputer<T> partitionComputer;

    /**
     * Creates a bucket assigner that delegates the extraction of partition values to the given
     * partition computer.
     *
     * @param partitionComputer computer used to derive the partition values of each element
     */
    public DeltaBucketAssigner(DeltaPartitionComputer<T> partitionComputer) {
        this.partitionComputer = partitionComputer;
    }

    /**
     * Resolves the bucket id of an element by asking the partition computer for the element's
     * partition values and rendering them with
     * {@link PartitionPathUtils#generatePartitionPath(LinkedHashMap)}.
     *
     * @param element element for which the bucket is resolved
     * @param context bucket assigner context, passed through to the partition computer
     * @return partition path generated from the element's partition values
     */
    @Override
    public String getBucketId(T element, BucketAssigner.Context context) {
        // A LinkedHashMap is used because the order of the partition columns determines the
        // order of the path segments.
        LinkedHashMap<String, String> partitionValues =
            this.partitionComputer.generatePartitionValues(element, context);
        return PartitionPathUtils.generatePartitionPath(partitionValues);
    }

    /**
     * Returns the serializer used for the {@code String} bucket ids produced by this assigner.
     *
     * @return the shared {@link SimpleVersionedStringSerializer} instance
     */
    @Override
    public SimpleVersionedSerializer<String> getSerializer() {
        return SimpleVersionedStringSerializer.INSTANCE;
    }

    @Override
    public String toString() {
        return "DeltaBucketAssigner";
    }
}