All downloads are free. Search and download functionality uses the official Maven repository.

io.delta.flink.source.internal.enumerator.DeltaSourceSplitEnumerator Maven / Gradle / Ivy

There is a newer version: 3.2.1
Show newest version
package io.delta.flink.source.internal.enumerator;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Optional;

import javax.annotation.Nullable;

import io.delta.flink.source.internal.state.DeltaEnumeratorStateCheckpoint;
import io.delta.flink.source.internal.state.DeltaSourceSplit;
import org.apache.flink.api.connector.source.SplitEnumerator;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.apache.flink.connector.file.src.FileSourceSplit;
import org.apache.flink.connector.file.src.assigners.FileSplitAssigner;
import org.apache.flink.core.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static io.delta.flink.source.internal.enumerator.DeltaSourceSplitEnumerator.AssignSplitStatus.NO_MORE_READERS;
import static io.delta.flink.source.internal.enumerator.DeltaSourceSplitEnumerator.AssignSplitStatus.NO_MORE_SPLITS;

/**
 * A base class for {@link SplitEnumerator} used by {@link io.delta.flink.source.DeltaSource}
 * 

* The implementations that will choose to extend this class will have to implement abstract method * {@link DeltaSourceSplitEnumerator#handleNoMoreSplits(int)} */ public abstract class DeltaSourceSplitEnumerator implements SplitEnumerator> { private static final Logger LOG = LoggerFactory.getLogger(DeltaSourceSplitEnumerator.class); /** * Path to Delta table that should be processed. */ protected final Path deltaTablePath; /** * A {@link FileSplitAssigner} that should be used by this {@code SourceEnumerator}. */ protected final FileSplitAssigner splitAssigner; /** * A {@link SplitEnumeratorContext} assigned to this {@code SourceEnumerator}. */ protected final SplitEnumeratorContext enumContext; /** * Map containing all readers that have requested the split. *

* The key is the subtask id of the source reader who sent the source event. requesterHostname *

* The value is an optional hostname where the requesting task is running. This can be used to * make split assignments locality-aware. * * @implNote The type contract for this map comes from {@link #handleSplitRequest(int, String)} * method. */ protected final LinkedHashMap readersAwaitingSplit; protected DeltaSourceSplitEnumerator( Path deltaTablePath, FileSplitAssigner splitAssigner, SplitEnumeratorContext enumContext) { this.deltaTablePath = deltaTablePath; this.splitAssigner = splitAssigner; this.enumContext = enumContext; this.readersAwaitingSplit = new LinkedHashMap<>(); } @Override public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { if (!enumContext.registeredReaders().containsKey(subtaskId)) { // reader failed between sending the request and now. skip this request. return; } if (LOG.isInfoEnabled()) { String hostInfo = requesterHostname == null ? "(no host locality info)" : "(on host '" + requesterHostname + "')"; LOG.info("Subtask {} {} is requesting a file source split", subtaskId, hostInfo); } readersAwaitingSplit.put(subtaskId, requesterHostname); assignSplits(subtaskId); } @Override public void addSplitsBack(List splits, int subtaskId) { LOG.debug("Bounded Delta Source Enumerator adds splits back: {}", splits); addSplits(splits); } @Override public void addReader(int subtaskId) { // this source is purely lazy-pull-based, nothing to do upon registration } @Override public void close() throws IOException { // no resources to close } /** * The implementation of this method should handle case, where there is no more splits that * could be assigned to Source Readers. *

* This method is called by {@link DeltaSourceSplitEnumerator#handleSplitRequest(int, String)} * method. * * @param subtaskId the subtask id of the source reader who sent the source spit request event. */ protected abstract void handleNoMoreSplits(int subtaskId); @SuppressWarnings("unchecked") protected Collection getRemainingSplits() { // The Flink's SplitAssigner interface uses FileSourceSplit // in its signatures. // This "trick" is also used in Flink source code by bundled Hive connector - // https://github.com/apache/flink/blob/release-1.14/flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/ContinuousHiveSplitEnumerator.java#L137 return (Collection) (Collection) splitAssigner.remainingSplits(); } @SuppressWarnings("unchecked") protected void addSplits(List splits) { // We are doing this double cast trick here because Flink's SplitAssigner interface uses // FileSourceSplit in its signatures instead something like // There is no point for construction our custom Interface and Implementation // for splitAssigner just to have needed type. 
splitAssigner.addSplits((Collection) (Collection) splits); } protected AssignSplitStatus assignSplits() { final Iterator> awaitingReader = readersAwaitingSplit.entrySet().iterator(); while (awaitingReader.hasNext()) { Entry nextAwaiting = awaitingReader.next(); // if the reader that requested another split has failed in the meantime, remove // it from the list of waiting readers - FLINK-20261 if (!enumContext.registeredReaders().containsKey(nextAwaiting.getKey())) { awaitingReader.remove(); continue; } String hostname = nextAwaiting.getValue(); int awaitingSubtask = nextAwaiting.getKey(); Optional nextSplit = splitAssigner.getNext(hostname); if (nextSplit.isPresent()) { FileSourceSplit split = nextSplit.get(); enumContext.assignSplit((DeltaSourceSplit) split, awaitingSubtask); LOG.info("Assigned split to subtask {} : {}", awaitingSubtask, split); awaitingReader.remove(); } else { // TODO for chunking load we will have to modify this to get a new chunk from Delta. return NO_MORE_SPLITS; } } return NO_MORE_READERS; } private void assignSplits(int subtaskId) { AssignSplitStatus assignSplitStatus = assignSplits(); if (NO_MORE_SPLITS.equals(assignSplitStatus)) { LOG.info("No more splits available for subtasks"); handleNoMoreSplits(subtaskId); } } public enum AssignSplitStatus { NO_MORE_SPLITS, NO_MORE_READERS } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy