
io.delta.flink.sink.internal.DeltaSinkBuilder

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.delta.flink.sink.internal;

import java.io.IOException;
import java.io.Serializable;
import java.util.UUID;

import io.delta.flink.internal.options.DeltaConnectorConfiguration;
import io.delta.flink.sink.DeltaSink;
import io.delta.flink.sink.internal.committables.DeltaCommittable;
import io.delta.flink.sink.internal.committables.DeltaCommittableSerializer;
import io.delta.flink.sink.internal.committables.DeltaGlobalCommittable;
import io.delta.flink.sink.internal.committables.DeltaGlobalCommittableSerializer;
import io.delta.flink.sink.internal.committer.DeltaCommitter;
import io.delta.flink.sink.internal.committer.DeltaGlobalCommitter;
import io.delta.flink.sink.internal.writer.DeltaWriter;
import io.delta.flink.sink.internal.writer.DeltaWriterBucketState;
import io.delta.flink.sink.internal.writer.DeltaWriterBucketStateSerializer;
import org.apache.flink.api.connector.sink.Committer;
import org.apache.flink.api.connector.sink.GlobalCommitter;
import org.apache.flink.api.connector.sink.Sink;
import org.apache.flink.api.connector.sink.Sink.InitContext;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.formats.parquet.ParquetWriterFactory;
import org.apache.flink.formats.parquet.utils.SerializableConfiguration;
import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.BucketWriter;
import org.apache.flink.streaming.api.functions.sink.filesystem.DeltaBulkBucketWriter;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.CheckpointRollingPolicy;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.conf.Configuration;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A builder class for {@link DeltaSinkInternal}.
 * <p>
 * For most common use cases use the {@link DeltaSink#forRowData} utility method to instantiate
 * the sink. This builder should be used only if you need to provide a custom writer factory
 * instance or configure some low-level settings for the sink.
 * <p>
 * Example how to use this class for a stream of {@link RowData}:
 * <pre>
 *     RowType rowType = ...;
 *     Configuration conf = new Configuration();
 *     conf.set("parquet.compression", "SNAPPY");
 *     ParquetWriterFactory<RowData> writerFactory =
 *         ParquetRowDataBuilder.createWriterFactory(rowType, conf, true);
 *
 *     DeltaSinkBuilder<RowData> sinkBuilder = new DeltaSinkBuilder(
 *         basePath,
 *         conf,
 *         bucketCheckInterval,
 *         writerFactory,
 *         new BasePathBucketAssigner<>(),
 *         OnCheckpointRollingPolicy.build(),
 *         OutputFileConfig.builder().withPartSuffix(".snappy.parquet").build(),
 *         appId,
 *         rowType,
 *         mergeSchema
 *     );
 *
 *     DeltaSink<RowData> sink = sinkBuilder.build();
 *
 * </pre>
 *
 * @param <IN> The type of input elements.
 */
public class DeltaSinkBuilder<IN> implements Serializable {

    private static final long serialVersionUID = 7493169281026370228L;

    protected static final long DEFAULT_BUCKET_CHECK_INTERVAL = 60L * 1000L;

    private static String generateNewAppId() {
        return UUID.randomUUID().toString();
    }

    ///////////////////////////////////////////////////////////////////////////
    // DeltaLake-specific fields
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Delta table's root path
     */
    private final Path tableBasePath;

    /**
     * Flink's logical type to indicate the structure of the events in the stream
     */
    private final RowType rowType;

    /**
     * Unique identifier of the current Flink app. The value from this builder will be read
     * only during a fresh start of the application. For restarts or failure recovery it will
     * be resolved from the snapshotted state.
     */
    private final String appId;

    /**
     * Indicator whether we should try to update the table's schema with the stream's schema
     * in case they do not match. The update is not guaranteed, as some checks will still be
     * performed on whether the updates to the schema are compatible.
     */
    private boolean mergeSchema;

    /**
     * Configuration options for the Delta sink.
     */
    private final DeltaConnectorConfiguration sinkConfiguration;

    /**
     * Serializable wrapper for the {@link Configuration} object
     */
    private final SerializableConfiguration serializableConfiguration;

    ///////////////////////////////////////////////////////////////////////////
    // FileSink-specific fields
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Interval for triggering {@link Sink.ProcessingTimeService} within the
     * {@code io.delta.flink.sink.internal.writer.DeltaWriter} instance.
     * <p>
     * In some scenarios, the open buckets are required to change based on time. In these
     * cases, the user can specify a bucketCheckInterval and the sink will check periodically
     * and roll the part file if the specified rolling policy says so.
     */
    private final long bucketCheckInterval;

    private final ParquetWriterFactory<IN> writerFactory;

    private BucketAssigner<IN, String> bucketAssigner;

    private final CheckpointRollingPolicy<IN, String> rollingPolicy;

    private final OutputFileConfig outputFileConfig;

    /**
     * Creates an instance of the builder for {@link DeltaSink}.
     *
     * @param basePath      path to a Delta table
     * @param conf          Hadoop's conf object
     * @param writerFactory a factory that is used at runtime to create instances of
     *                      {@link org.apache.flink.api.common.serialization.BulkWriter}
     * @param assigner      {@link BucketAssigner} used with a Delta sink to determine the
     *                      bucket each incoming element should be put into
     * @param policy        instance of {@link CheckpointRollingPolicy} which rolls on every
     *                      checkpoint by default
     * @param rowType       Flink's logical type to indicate the structure of the events in
     *                      the stream
     * @param mergeSchema   indicator whether we should try to update the table's schema with
     *                      the stream's schema in case they do not match. The update is not
     *                      guaranteed, as some checks will still be performed on whether the
     *                      updates to the schema are compatible.
     */
    protected DeltaSinkBuilder(
            Path basePath,
            Configuration conf,
            ParquetWriterFactory<IN> writerFactory,
            BucketAssigner<IN, String> assigner,
            CheckpointRollingPolicy<IN, String> policy,
            RowType rowType,
            boolean mergeSchema,
            DeltaConnectorConfiguration sinkConfiguration) {
        this(
            basePath,
            conf,
            DEFAULT_BUCKET_CHECK_INTERVAL,
            writerFactory,
            assigner,
            policy,
            OutputFileConfig.builder().withPartSuffix(".snappy.parquet").build(),
            generateNewAppId(),
            rowType,
            mergeSchema,
            sinkConfiguration
        );
    }

    /**
     * Creates an instance of the builder for {@link DeltaSink}.
     *
     * @param basePath            path to a Delta table
     * @param conf                Hadoop's conf object
     * @param bucketCheckInterval interval (in milliseconds) for triggering
     *                            {@link Sink.ProcessingTimeService} within the internal
     *                            {@code io.delta.flink.sink.internal.writer.DeltaWriter}
     *                            instance
     * @param writerFactory       a factory that is used at runtime to create instances of
     *                            {@link org.apache.flink.api.common.serialization.BulkWriter}
     * @param assigner            {@link BucketAssigner} used with a Delta sink to determine
     *                            the bucket each incoming element should be put into
     * @param policy              instance of {@link CheckpointRollingPolicy} which rolls on
     *                            every checkpoint by default
     * @param outputFileConfig    part file name configuration. This allows defining a prefix
     *                            and a suffix for the part file name.
     * @param appId               unique identifier of the Flink application that will be used
     *                            as a part of the transactional id in Delta's transactions.
     *                            It is crucial for this value to be unique across all
     *                            applications committing to a given Delta table
     * @param rowType             Flink's logical type to indicate the structure of the events
     *                            in the stream
     * @param mergeSchema         indicator whether we should try to update the table's schema
     *                            with the stream's schema in case they do not match. The
     *                            update is not guaranteed, as some checks will still be
     *                            performed on whether the updates to the schema are
     *                            compatible.
     */
    protected DeltaSinkBuilder(
            Path basePath,
            Configuration conf,
            long bucketCheckInterval,
            ParquetWriterFactory<IN> writerFactory,
            BucketAssigner<IN, String> assigner,
            CheckpointRollingPolicy<IN, String> policy,
            OutputFileConfig outputFileConfig,
            String appId,
            RowType rowType,
            boolean mergeSchema,
            DeltaConnectorConfiguration sinkConfiguration) {
        this.tableBasePath = checkNotNull(basePath);
        this.serializableConfiguration = new SerializableConfiguration(checkNotNull(conf));
        this.bucketCheckInterval = bucketCheckInterval;
        this.writerFactory = writerFactory;
        this.bucketAssigner = checkNotNull(assigner);
        this.rollingPolicy = checkNotNull(policy);
        this.outputFileConfig = checkNotNull(outputFileConfig);
        this.appId = appId;
        this.rowType = rowType;
        this.mergeSchema = mergeSchema;
        this.sinkConfiguration = sinkConfiguration;
    }

    /**
     * Sets the sink's option whether, in case of any differences between the stream's schema
     * and the Delta table's schema, we should try to update the table's schema during the
     * commit to the {@link io.delta.standalone.DeltaLog}. The update is not guaranteed, as
     * some compatibility checks will be performed.
     *
     * @param mergeSchema whether we should try to update the table's schema with the
     *                    stream's schema in case they do not match. See
     *                    {@link DeltaSinkBuilder#mergeSchema} for details.
     * @return builder for {@link DeltaSink}
     */
    public DeltaSinkBuilder<IN> withMergeSchema(final boolean mergeSchema) {
        this.mergeSchema = mergeSchema;
        return this;
    }

    Committer<DeltaCommittable> createCommitter() throws IOException {
        return new DeltaCommitter(createBucketWriter());
    }

    GlobalCommitter<DeltaCommittable, DeltaGlobalCommittable> createGlobalCommitter() {
        return new DeltaGlobalCommitter(
            serializableConfiguration.conf(), tableBasePath, rowType, mergeSchema);
    }

    protected Path getTableBasePath() {
        return tableBasePath;
    }

    protected String getAppId() {
        return appId;
    }

    protected SerializableConfiguration getSerializableConfiguration() {
        return serializableConfiguration;
    }

    ///////////////////////////////////////////////////////////////////////////
    // FileSink-specific methods
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Sets the bucket assigner responsible for mapping events to their partitions.
     *
     * @param assigner bucket assigner instance for this sink
     * @return builder for {@link DeltaSink}
     */
    public DeltaSinkBuilder<IN> withBucketAssigner(BucketAssigner<IN, String> assigner) {
        this.bucketAssigner = checkNotNull(assigner);
        return this;
    }

    /**
     * Creates the actual sink.
     *
     * @return constructed {@link DeltaSink} object
     */
    public DeltaSinkInternal<IN> build() {
        return new DeltaSinkInternal<>(this);
    }

    DeltaWriter<IN> createWriter(
            InitContext context,
            String appId,
            long nextCheckpointId) throws IOException {
        return new DeltaWriter<>(
            tableBasePath,
            bucketAssigner,
            createBucketWriter(),
            rollingPolicy,
            outputFileConfig,
            context.getProcessingTimeService(),
            context.metricGroup(),
            bucketCheckInterval,
            appId,
            nextCheckpointId);
    }

    SimpleVersionedSerializer<DeltaWriterBucketState> getWriterStateSerializer()
            throws IOException {
        return new DeltaWriterBucketStateSerializer();
    }

    SimpleVersionedSerializer<DeltaCommittable> getCommittableSerializer()
            throws IOException {
        BucketWriter<IN, String> bucketWriter = createBucketWriter();
        return new DeltaCommittableSerializer(
            bucketWriter.getProperties().getPendingFileRecoverableSerializer());
    }

    SimpleVersionedSerializer<DeltaGlobalCommittable> getGlobalCommittableSerializer()
            throws IOException {
        BucketWriter<IN, String> bucketWriter = createBucketWriter();
        return new DeltaGlobalCommittableSerializer(
            bucketWriter.getProperties().getPendingFileRecoverableSerializer());
    }

    private DeltaBulkBucketWriter<IN, String> createBucketWriter() throws IOException {
        return new DeltaBulkBucketWriter<>(
            FileSystem.get(tableBasePath.toUri()).createRecoverableWriter(),
            writerFactory);
    }

    /**
     * Default builder for {@link DeltaSink}.
     */
    public static final class DefaultDeltaFormatBuilder<IN> extends DeltaSinkBuilder<IN> {

        private static final long serialVersionUID = 2818087325120827526L;

        public DefaultDeltaFormatBuilder(
                Path basePath,
                final Configuration conf,
                ParquetWriterFactory<IN> writerFactory,
                BucketAssigner<IN, String> assigner,
                CheckpointRollingPolicy<IN, String> policy,
                RowType rowType,
                boolean mergeSchema,
                DeltaConnectorConfiguration sinkConfiguration) {
            super(basePath, conf, writerFactory, assigner, policy, rowType, mergeSchema,
                sinkConfiguration);
        }
    }
}
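
Usage sketch (not part of the listed source). The snippet below shows one way to wire this builder into a streaming job, mirroring the Javadoc example above. It is a minimal sketch, assuming a Flink version that still ships the first-generation unified Sink API used by this class; the example class name, the /tmp/delta-table path, the two-column schema, the sample rows, and the no-arg DeltaConnectorConfiguration constructor are illustrative assumptions. For typical jobs, the DeltaSink#forRowData factory referenced in the class Javadoc is the recommended entry point.

import io.delta.flink.internal.options.DeltaConnectorConfiguration;
import io.delta.flink.sink.internal.DeltaSinkBuilder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.ParquetWriterFactory;
import org.apache.flink.formats.parquet.row.ParquetRowDataBuilder;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.BasePathBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.hadoop.conf.Configuration;

// Hypothetical example class; everything here is a sketch, not the connector's own code.
public class DeltaSinkBuilderExample {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Delta commits are driven by checkpoints, so checkpointing must be enabled.
        env.enableCheckpointing(10_000L);

        // Illustrative two-column schema: (id INT, name VARCHAR).
        RowType rowType = RowType.of(
            new LogicalType[]{new IntType(), new VarCharType(VarCharType.MAX_LENGTH)},
            new String[]{"id", "name"});

        Configuration conf = new Configuration();
        conf.set("parquet.compression", "SNAPPY");
        ParquetWriterFactory<RowData> writerFactory =
            ParquetRowDataBuilder.createWriterFactory(rowType, conf, true);

        // DefaultDeltaFormatBuilder is the public subclass declared at the bottom of the
        // listing; the no-arg DeltaConnectorConfiguration is assumed to supply default
        // sink options.
        DeltaSinkBuilder<RowData> builder =
            new DeltaSinkBuilder.DefaultDeltaFormatBuilder<>(
                new Path("/tmp/delta-table"),      // hypothetical table location
                conf,
                writerFactory,
                new BasePathBucketAssigner<>(),    // unpartitioned table: one bucket at root
                OnCheckpointRollingPolicy.build(), // roll part files on every checkpoint
                rowType,
                false,                             // do not attempt schema merging
                new DeltaConnectorConfiguration());

        // Tiny in-memory source of sample rows, just to make the sketch self-contained.
        DataStream<RowData> source = env.fromElements(
            (RowData) GenericRowData.of(1, StringData.fromString("alice")),
            GenericRowData.of(2, StringData.fromString("bob")));

        // build() returns a DeltaSinkInternal, which implements Flink's unified Sink.
        source.sinkTo(builder.build());
        env.execute("delta-sink-builder-example");
    }
}

For a partitioned table layout you would swap the BasePathBucketAssigner for an assigner that maps each element to its partition path (via withBucketAssigner above), since the bucket id is what becomes the subdirectory under the table root.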




