package io.delta.flink.source;

import io.delta.flink.internal.options.DeltaConnectorConfiguration;
import io.delta.flink.source.internal.DeltaSourceInternal;
import io.delta.flink.source.internal.enumerator.SplitEnumeratorProvider;
import io.delta.flink.source.internal.enumerator.supplier.BoundedSnapshotSupplierFactory;
import io.delta.flink.source.internal.enumerator.supplier.ContinuousSnapshotSupplierFactory;
import io.delta.flink.source.internal.state.DeltaSourceSplit;
import org.apache.flink.connector.file.src.reader.BulkFormat;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.data.RowData;
import org.apache.hadoop.conf.Configuration;

/**
 * A unified data source that reads Delta tables, both in batch and in streaming mode.
 *
 * <p>This source supports all (distributed) file systems and object stores that can be accessed
 * via Flink's {@link FileSystem} class.
 *
 * <p>To create a new instance of {@link DeltaSource} for a Delta table that will produce
 * {@link RowData} records that contain all table columns:
*
 * <pre>{@code
 *     StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 *     ...
 *     // Bounded mode.
 *     DeltaSource<RowData> deltaSource = DeltaSource.forBoundedRowData(
 *             new Path("s3://some/path"),
 *             new Configuration()
 *         )
 *         .versionAsOf(10)
 *         .build();
 *
 *     env.fromSource(deltaSource, WatermarkStrategy.noWatermarks(), "delta-source");
 *
 *     ..........
 *
 *     // Continuous mode.
 *     DeltaSource<RowData> deltaSource = DeltaSource.forContinuousRowData(
 *             new Path("s3://some/path"),
 *             new Configuration()
 *         )
 *         .updateCheckIntervalMillis(1000)
 *         .startingVersion(10)
 *         .build();
 *
 *     env.fromSource(deltaSource, WatermarkStrategy.noWatermarks(), "delta-source");
 * }</pre>
*
*
 * <p>To create a new instance of {@link DeltaSource} for a Delta table that will produce
 * {@link RowData} records with user-selected columns:
*
 * <pre>{@code
 *     StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 *     ...
 *     // Bounded mode.
 *     DeltaSource<RowData> deltaSource = DeltaSource.forBoundedRowData(
 *             new Path("s3://some/path"),
 *             new Configuration()
 *         )
 *         .columnNames(Arrays.asList("col1", "col2"))
 *         .versionAsOf(10)
 *         .build();
 *
 *     env.fromSource(deltaSource, WatermarkStrategy.noWatermarks(), "delta-source");
 *
 *     ..........
 *
 *     // Continuous mode.
 *     DeltaSource<RowData> deltaSource = DeltaSource.forContinuousRowData(
 *             new Path("s3://some/path"),
 *             new Configuration()
 *         )
 *         .columnNames(Arrays.asList("col1", "col2"))
 *         .updateCheckIntervalMillis(1000)
 *         .startingVersion(10)
 *         .build();
 *
 *     env.fromSource(deltaSource, WatermarkStrategy.noWatermarks(), "delta-source");
 * }</pre>
*
 * <p>When using the {@code columnNames(...)} method, the source will discover the data types of
 * the given columns from the Delta log.
*
 * @param <T> The type of the events/records produced by this source.
 * @implNote <h2>Batch and Streaming</h2>
*
 * <p>This source supports both bounded/batch and continuous/streaming modes. For the
 * bounded/batch case, the Delta Source processes the full state of the Delta table. In the
 * continuous/streaming case, the default Delta Source will also process the full state of the
 * table, and then begin to periodically check the Delta table for any appending changes and read
 * them. Using either of the {@link RowDataContinuousDeltaSourceBuilder#startingVersion} or
 * {@link RowDataContinuousDeltaSourceBuilder#startingTimestamp} APIs will cause the Delta Source,
 * in continuous mode, to stream only the changes from that historical version onward.
*
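 * <p>For example, a minimal sketch of a continuous source that streams only changes committed
 * after a given point in time; the path and timestamp values below are illustrative:
 * <pre>{@code
 *     DeltaSource<RowData> deltaSource = DeltaSource.forContinuousRowData(
 *             new Path("s3://some/path"),
 *             new Configuration()
 *         )
 *         .startingTimestamp("2022-02-24 04:55:00")
 *         .build();
 * }</pre>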
*
 * <h2>Format Types</h2>
 *
 * <p>The reading of each file happens through file readers defined by the file format. These
 * define the parsing logic for the contents of the underlying Parquet files.
 *
 * <p>A {@link BulkFormat} reads batches of records from a file at a time.
 * @implNote <h2>Discovering / Enumerating Files</h2>
 *
 * <p>The way that the source lists the files to be processed is defined by the {@code
 * AddFileEnumerator}. The {@code AddFileEnumerator} is responsible for selecting the relevant
 * {@code AddFile} entries and for optionally splitting files into multiple regions (file source
 * splits) that can be read in parallel.
*/
public class DeltaSource<T> extends DeltaSourceInternal<T> {
    DeltaSource(
            Path tablePath,
            BulkFormat<T, DeltaSourceSplit> readerFormat,
            SplitEnumeratorProvider splitEnumeratorProvider,
            Configuration configuration,
            DeltaConnectorConfiguration sourceConfiguration) {
        super(tablePath, readerFormat, splitEnumeratorProvider, configuration, sourceConfiguration);
    }

    /**
     * Creates an instance of a Delta source builder for Bounded mode, producing {@code RowData}
     * elements.
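     *
     * <p>A minimal usage sketch; the table path is illustrative:
     * <pre>{@code
     *     DeltaSource<RowData> source = DeltaSource.forBoundedRowData(
     *             new Path("s3://some/path"),
     *             new Configuration()
     *         )
     *         .build();
     * }</pre>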
     *
     * @param tablePath Path to the Delta table to read data from.
     * @param hadoopConfiguration Hadoop configuration.
     * @return A builder for a bounded {@link DeltaSource} of {@link RowData} elements.
     */
    public static RowDataBoundedDeltaSourceBuilder forBoundedRowData(
            Path tablePath,
            Configuration hadoopConfiguration) {
        return new RowDataBoundedDeltaSourceBuilder(
            tablePath,
            hadoopConfiguration,
            new BoundedSnapshotSupplierFactory());
    }

    /**
     * Creates an instance of a Delta source builder for Continuous mode, producing
     * {@code RowData} elements.
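     *
     * <p>A minimal usage sketch; the table path and poll interval are illustrative:
     * <pre>{@code
     *     DeltaSource<RowData> source = DeltaSource.forContinuousRowData(
     *             new Path("s3://some/path"),
     *             new Configuration()
     *         )
     *         .updateCheckIntervalMillis(5000)
     *         .build();
     * }</pre>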
     *
     * @param tablePath Path to the Delta table to read data from.
     * @param hadoopConfiguration Hadoop configuration.
     * @return A builder for a continuous {@link DeltaSource} of {@link RowData} elements.
     */
    public static RowDataContinuousDeltaSourceBuilder forContinuousRowData(
            Path tablePath,
            Configuration hadoopConfiguration) {
        return new RowDataContinuousDeltaSourceBuilder(
            tablePath,
            hadoopConfiguration,
            new ContinuousSnapshotSupplierFactory());
    }
}