org.apache.iceberg.flink.source.assigner.SplitAssigner Maven / Gradle / Ivy
Show all versions of iceberg-flink-1.18 Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.flink.source.assigner;
import java.io.Closeable;
import java.util.Collection;
import java.util.concurrent.CompletableFuture;
import javax.annotation.Nullable;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.apache.iceberg.flink.source.ScanContext;
import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
import org.apache.iceberg.flink.source.split.IcebergSourceSplitState;
/**
* SplitAssigner interface is extracted out as a separate component so that we can plug in different
* split assignment strategy for different requirements. E.g.
*
*
* - Simple assigner with no ordering guarantee or locality aware optimization.
*
- Locality aware assigner that prefer splits that are local.
*
- Snapshot aware assigner that assign splits based on the order they are committed.
*
- Event time alignment assigner that assign splits satisfying certain time ordering within a
* single source or across sources.
*
*
* Assigner implementation needs to be thread safe. Enumerator call the assigner APIs mostly from
* the coordinator thread. But enumerator may call the {@link SplitAssigner#pendingSplitCount()}
* from the I/O threads.
*/
public interface SplitAssigner extends Closeable {
/**
* Some assigners may need to start background threads or perform other activity such as
* registering as listeners to updates from other event sources e.g., watermark tracker.
*/
default void start() {}
/**
* Some assigners may need to perform certain actions when their corresponding enumerators are
* closed
*/
@Override
default void close() {}
/**
* Request a new split from the assigner when enumerator trying to assign splits to awaiting
* readers.
*
*
If enumerator wasn't able to assign the split (e.g., reader disconnected), enumerator should
* call {@link SplitAssigner#onUnassignedSplits} to return the split.
*/
GetSplitResult getNext(@Nullable String hostname);
/** Add new splits discovered by enumerator */
void onDiscoveredSplits(Collection splits);
/** Forward addSplitsBack event (for failed reader) to assigner */
void onUnassignedSplits(Collection splits);
/**
* Some assigner (like event time alignment) may rack in-progress splits to advance watermark upon
* completed splits
*/
default void onCompletedSplits(Collection completedSplitIds) {}
/**
* Get assigner state for checkpointing. This is a super-set API that works for all currently
* imagined assigners.
*/
Collection state();
/**
* Enumerator can get a notification via CompletableFuture when the assigner has more splits
* available later. Enumerator should schedule assignment in the thenAccept action of the future.
*
* Assigner will return the same future if this method is called again before the previous
* future is completed.
*
*
The future can be completed from other thread, e.g. the coordinator thread from another
* thread for event time alignment.
*
*
If enumerator need to trigger action upon the future completion, it may want to run it in
* the coordinator thread using {@link SplitEnumeratorContext#runInCoordinatorThread(Runnable)}.
*/
CompletableFuture isAvailable();
/**
* Return the number of pending splits that haven't been assigned yet.
*
* The enumerator can poll this API to publish a metric on the number of pending splits.
*
*
The enumerator can also use this information to throttle split discovery for streaming read.
* If there are already many pending splits tracked by the assigner, it is undesirable to discover
* more splits and track them in the assigner. That will increase the memory footprint and
* enumerator checkpoint size.
*
*
Throttling works better together with {@link ScanContext#maxPlanningSnapshotCount()}.
* Otherwise, the next split discovery after throttling will just discover all non-enumerated
* snapshots and splits, which defeats the purpose of throttling.
*/
int pendingSplitCount();
/**
* Return the number of pending records, which can act as a measure of the source lag. This value
* could be an estimation if the exact number of records cannot be accurately computed.
*/
long pendingRecords();
}