All downloads are free. Search and download functionality uses the official Maven repository.

io.delta.flink.source.internal.DeltaSourceInternal Maven / Gradle / Ivy

There is a newer version: 3.2.1
Show newest version
package io.delta.flink.source.internal;

import io.delta.flink.internal.options.DeltaConnectorConfiguration;
import io.delta.flink.source.internal.enumerator.SplitEnumeratorProvider;
import io.delta.flink.source.internal.state.DeltaEnumeratorStateCheckpoint;
import io.delta.flink.source.internal.state.DeltaPendingSplitsCheckpointSerializer;
import io.delta.flink.source.internal.state.DeltaSourceSplit;
import io.delta.flink.source.internal.state.DeltaSourceSplitSerializer;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.connector.source.Boundedness;
import org.apache.flink.api.connector.source.Source;
import org.apache.flink.api.connector.source.SourceReader;
import org.apache.flink.api.connector.source.SourceReaderContext;
import org.apache.flink.api.connector.source.SplitEnumerator;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.connector.file.src.impl.FileSourceReader;
import org.apache.flink.connector.file.src.reader.BulkFormat;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.flink.formats.parquet.utils.SerializableConfiguration;
import org.apache.hadoop.conf.Configuration;

import io.delta.standalone.DeltaLog;
import io.delta.standalone.actions.AddFile;

/**
 * This is an internal class to hide {@link Source} API from the end user.
 * 

* A unified data source that reads Delta table - both in batch and in streaming mode. * *

This source supports all (distributed) file systems and object stores that can be accessed * via the Flink's {@link FileSystem} class. * *

* * @param The type of the events/records produced by this source. * @implNote

Batch and Streaming

* *

This source supports both bounded/batch and continuous/streaming modes. For the * bounded/batch case, the Delta Source processes all {@link AddFile} from Delta table Snapshot. In * the continuous/streaming case, the source periodically checks the Delta Table for any appending * changes and reads them. * *

Format Types

* *

The reading of each file happens through file readers defined by file format. These * define the parsing logic for the contents of the underlying Parquet files. * *

A {@link BulkFormat} reads batches of records from a file at a time. *

* @implNote

Discovering / Enumerating Files

*

The way that the source lists the files to be processes is defined by the {@code * AddFileEnumerator}. The {@link io.delta.flink.source.internal.file.AddFileEnumerator} is * responsible to select the relevant {@link AddFile} and to optionally splits files into multiple * regions (file source splits) that can be read in parallel. */ public class DeltaSourceInternal implements Source>, ResultTypeQueryable { // --------------------------------------------------------------------------------------------- // ALL NON TRANSIENT FIELDS HAVE TO BE SERIALIZABLE // --------------------------------------------------------------------------------------------- private static final long serialVersionUID = 1L; /** * Path to Delta Table from which this {@code DeltaSourceInternal} should read. */ private final Path tablePath; /** * A reader format used for this Source. */ private final BulkFormat readerFormat; /** * Factory for {@link SplitEnumerator} */ private final SplitEnumeratorProvider splitEnumeratorProvider; /** * A Flink Serialization Wrapper around Hadoop Configuration needed for {@link DeltaLog} */ private final SerializableConfiguration serializableConf; /** * Source Options used for {@code DeltaSourceInternal} creation. 
*/ private final DeltaConnectorConfiguration sourceConfiguration; // --------------------------------------------------------------------------------------------- protected DeltaSourceInternal(Path tablePath, BulkFormat readerFormat, SplitEnumeratorProvider splitEnumeratorProvider, Configuration configuration, DeltaConnectorConfiguration sourceConfiguration) { this.tablePath = tablePath; this.readerFormat = readerFormat; this.splitEnumeratorProvider = splitEnumeratorProvider; this.serializableConf = new SerializableConfiguration(configuration); this.sourceConfiguration = sourceConfiguration; } @Override public SimpleVersionedSerializer getSplitSerializer() { return DeltaSourceSplitSerializer.INSTANCE; } @Override public SimpleVersionedSerializer> getEnumeratorCheckpointSerializer() { return new DeltaPendingSplitsCheckpointSerializer<>(DeltaSourceSplitSerializer.INSTANCE); } @Override public Boundedness getBoundedness() { return splitEnumeratorProvider.getBoundedness(); } @Override public SourceReader createReader(SourceReaderContext readerContext) throws Exception { return new FileSourceReader<>(readerContext, readerFormat, readerContext.getConfiguration()); } @Override public SplitEnumerator> createEnumerator(SplitEnumeratorContext enumContext) { return splitEnumeratorProvider.createInitialStateEnumerator(tablePath, serializableConf.conf(), enumContext, sourceConfiguration); } @Override public SplitEnumerator> restoreEnumerator(SplitEnumeratorContext enumContext, DeltaEnumeratorStateCheckpoint checkpoint) throws Exception { return splitEnumeratorProvider.createEnumeratorForCheckpoint( checkpoint, serializableConf.conf(), enumContext, sourceConfiguration); } @Override public TypeInformation getProducedType() { return readerFormat.getProducedType(); } @VisibleForTesting public Path getTablePath() { return tablePath; } @VisibleForTesting public DeltaConnectorConfiguration getSourceConfiguration() { return sourceConfiguration; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy