All downloads are free. The search and download functionality uses the official Maven repository.

io.delta.flink.source.RowDataContinuousDeltaSourceBuilder Maven / Gradle / Ivy

There is a newer version: 3.2.1
Show newest version
package io.delta.flink.source;

import java.util.Arrays;
import java.util.List;

import io.delta.flink.source.internal.builder.ContinuousDeltaSourceBuilder;
import io.delta.flink.source.internal.builder.DeltaBulkFormat;
import io.delta.flink.source.internal.builder.RowDataFormat;
import io.delta.flink.source.internal.enumerator.supplier.ContinuousSnapshotSupplierFactory;
import io.delta.flink.source.internal.utils.SourceSchema;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hadoop.conf.Configuration;
import static io.delta.flink.source.internal.DeltaSourceOptions.LOADED_SCHEMA_SNAPSHOT_VERSION;

/**
 * A builder class for {@link DeltaSource} for a stream of {@link RowData} where the created source
 * instance will operate in Continuous mode.
 * 

* In Continuous mode, the {@link DeltaSource} will, by default, load the full state of the latest * table version, and then start monitoring for changes. If you use either the * {@link RowDataContinuousDeltaSourceBuilder#startingVersion} or * {@link RowDataContinuousDeltaSourceBuilder#startingTimestamp} APIs, then the {@link DeltaSource} * will start monitoring for changes from that historical version. It will not load the full table * state at that historical table version. *

* For most common use cases use {@link DeltaSource#forContinuousRowData} utility method to * instantiate the source. After instantiation of this builder you can either call {@link * RowDataBoundedDeltaSourceBuilder#build()} method to get the instance of a {@link DeltaSource} or * configure additional options using builder's API. */ public class RowDataContinuousDeltaSourceBuilder extends ContinuousDeltaSourceBuilder { RowDataContinuousDeltaSourceBuilder( Path tablePath, Configuration hadoopConfiguration, ContinuousSnapshotSupplierFactory snapshotSupplierFactory) { super(tablePath, hadoopConfiguration, snapshotSupplierFactory); } ////////////////////////////////////////////////////////// /// We have to override methods from base class /// /// to include them in javadoc generated by sbt-unidoc /// ////////////////////////////////////////////////////////// /** * Specifies a {@link List} of column names that should be read from Delta table. If this method * is not used, Source will read all columns from Delta table. *

* If provided List is null or contains null, empty or blank elements it will throw a * {@code DeltaSourceValidationException} by builder after calling {@code build()} method. * * @param columnNames column names that should be read. */ @Override public RowDataContinuousDeltaSourceBuilder columnNames(List columnNames) { return super.columnNames(columnNames); } /** * Specifies an array of column names that should be read from Delta table. If this method * is not used, Source will read all columns from Delta table. *

* If provided List is null or contains null, empty or blank elements it will throw a * {@code DeltaSourceValidationException} by builder after calling {@code build()} method. * * @param columnNames column names that should be read. */ public RowDataContinuousDeltaSourceBuilder columnNames(String... columnNames) { return super.columnNames(Arrays.asList(columnNames)); } /** * Sets value of "startingVersion" option. This option specifies the starting table version from * which we want to start reading changes. * *

* This option is mutually exclusive with {@link #startingTimestamp(String)} option. * * @param startingVersion Delta table version to start reading changes from. The values can be * string numbers like "1", "10" etc. or keyword "latest", where in that * case, changes from the latest Delta table version will be read. */ @Override public RowDataContinuousDeltaSourceBuilder startingVersion(String startingVersion) { return super.startingVersion(startingVersion); } /** * Sets value of "startingVersion" option. This option specifies the starting table version from * which we want to start reading changes. * *

* This option is mutually exclusive with {@link #startingTimestamp(String)} option. * * @param startingVersion Delta table version to start reading changes from. */ @Override public RowDataContinuousDeltaSourceBuilder startingVersion(long startingVersion) { return super.startingVersion(startingVersion); } /** * Sets value of "startingTimestamp" option. This option is used to read only changes starting * from the table version that was generated at or after the given timestamp. * *

* This option is mutually exclusive with {@link #startingVersion(String)} and {@link * #startingVersion(long)} option. * * @param startingTimestamp The timestamp of the table from which we start reading changes. * Supported formats are: *

    *
  • 2022-02-24
  • *
  • 2022-02-24 04:55:00
  • *
  • 2022-02-24 04:55:00.001
  • *
  • 2022-02-24T04:55:00
  • *
  • 2022-02-24T04:55:00.001
  • *
  • 2022-02-24T04:55:00.001Z
  • *
*/ @Override public RowDataContinuousDeltaSourceBuilder startingTimestamp(String startingTimestamp) { return super.startingTimestamp(startingTimestamp); } /** * Sets the value for "updateCheckIntervalMillis" option. This option is used to specify the * check interval (in milliseconds) used for periodic Delta table changes checks. * *

* The default value for this option is 5000 ms. * * @param updateCheckInterval The update check internal in milliseconds. */ @Override public RowDataContinuousDeltaSourceBuilder updateCheckIntervalMillis( long updateCheckInterval) { return super.updateCheckIntervalMillis(updateCheckInterval); } /** * Sets the "ignoreDeletes" option. When set to true, this option allows processing Delta table * versions where data is deleted. *

* The default value for this option is false. */ @Override public RowDataContinuousDeltaSourceBuilder ignoreDeletes(boolean ignoreDeletes) { return super.ignoreDeletes(ignoreDeletes); } /** * Sets the "ignoreChanges" option. When set to true, this option allows processing Delta table * versions where data is changed (i.e. updated) or deleted. *

* Note that setting this option to true can lead to duplicate processing of data, as, in the * case of updates, existing rows may be rewritten in new files, and those new files will be * treated as new data and be fully reprocessed. *

* This option subsumes {@link #ignoreDeletes} option. Therefore, if you set "ignoreChanges" to * true, your stream will not be disrupted by either deletions or updates to the source table. *

* The default value for this option is false. */ @Override public RowDataContinuousDeltaSourceBuilder ignoreChanges(boolean ignoreChanges) { return super.ignoreChanges(ignoreChanges); } /** * Sets a configuration option. * * @param optionName Option name to set. * @param optionValue Option {@link String} value to set. */ @Override public RowDataContinuousDeltaSourceBuilder option(String optionName, String optionValue) { return super.option(optionName, optionValue); } /** * Sets a configuration option. * * @param optionName Option name to set. * @param optionValue Option boolean value to set. */ @Override public RowDataContinuousDeltaSourceBuilder option(String optionName, boolean optionValue) { return super.option(optionName, optionValue); } /** * Sets a configuration option. * * @param optionName Option name to set. * @param optionValue Option int value to set. */ @Override public RowDataContinuousDeltaSourceBuilder option(String optionName, int optionValue) { return super.option(optionName, optionValue); } /** * Sets a configuration option. * * @param optionName Option name to set. * @param optionValue Option long value to set. */ @Override public RowDataContinuousDeltaSourceBuilder option(String optionName, long optionValue) { return super.option(optionName, optionValue); } /** * Creates an instance of {@link DeltaSource} for a stream of {@link RowData}. Created source * will work in Continuous mode, actively monitoring Delta table for new changes. * *

* This method can throw {@code DeltaSourceValidationException} in case of invalid arguments * passed to Delta source builder. * * @return New {@link DeltaSource} instance. */ @Override @SuppressWarnings("unchecked") public DeltaSource build() { validate(); // In this step, the Delta table schema discovery is made. // We load the snapshot corresponding to the latest/startingVersion/startingTimestamp // commit. // We are using this snapshot to extract the metadata and discover table's column names // and data types. SourceSchema sourceSchema = getSourceSchema(); sourceConfiguration.addOption( LOADED_SCHEMA_SNAPSHOT_VERSION, sourceSchema.getSnapshotVersion() ); DeltaBulkFormat format = RowDataFormat.builder( RowType.of(sourceSchema.getColumnTypes(), sourceSchema.getColumnNames()), hadoopConfiguration) .partitionColumns(sourceSchema.getPartitionColumns()) .build(); return new DeltaSource<>( tablePath, format, DEFAULT_CONTINUOUS_SPLIT_ENUMERATOR_PROVIDER, hadoopConfiguration, sourceConfiguration ); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy