org.apache.iceberg.flink.source.assigner.SplitAssigner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-flink-1.18 Show documentation
A table format for huge analytic datasets
There is a newer version: 1.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.flink.source.assigner;

import java.io.Closeable;
import java.util.Collection;
import java.util.concurrent.CompletableFuture;
import javax.annotation.Nullable;
import org.apache.flink.api.connector.source.SplitEnumeratorContext;
import org.apache.iceberg.flink.source.ScanContext;
import org.apache.iceberg.flink.source.split.IcebergSourceSplit;
import org.apache.iceberg.flink.source.split.IcebergSourceSplitState;

/**
 * SplitAssigner interface is extracted out as a separate component so that different split
 * assignment strategies can be plugged in for different requirements. E.g.
 *
 * <ul>
 *   <li>Simple assigner with no ordering guarantee or locality aware optimization.
 *   <li>Locality aware assigner that prefers splits that are local.
 *   <li>Snapshot aware assigner that assigns splits based on the order they are committed.
 *   <li>Event time alignment assigner that assigns splits satisfying certain time ordering within
 *       a single source or across sources.
 * </ul>
 *
 * <p>Assigner implementations need to be thread safe. The enumerator calls the assigner APIs
 * mostly from the coordinator thread, but it may call {@link SplitAssigner#pendingSplitCount()}
 * from the I/O threads.
 */
public interface SplitAssigner extends Closeable {

  /**
   * Some assigners may need to start background threads or perform other activity such as
   * registering as listeners to updates from other event sources, e.g., a watermark tracker.
   *
   * <p>The default implementation does nothing.
   */
  default void start() {}

  /**
   * Some assigners may need to perform certain actions when their corresponding enumerators are
   * closed.
   *
   * <p>The default implementation does nothing. Unlike {@link Closeable#close()}, this override
   * declares no checked exception.
   */
  @Override
  default void close() {}

  /**
   * Request a new split from the assigner when the enumerator is trying to assign splits to
   * awaiting readers.
   *
   * <p>If the enumerator wasn't able to assign the split (e.g., reader disconnected), the
   * enumerator should call {@link SplitAssigner#onUnassignedSplits} to return the split.
   *
   * @param hostname host name passed along with the request; may be {@code null}. NOTE(review):
   *     presumably the requesting reader's host, for use by locality aware assigners - confirm
   *     against the implementations.
   * @return the outcome of the request
   */
  GetSplitResult getNext(@Nullable String hostname);

  /**
   * Add new splits discovered by the enumerator.
   *
   * @param splits the newly discovered splits
   */
  void onDiscoveredSplits(Collection<IcebergSourceSplit> splits);

  /**
   * Forward the addSplitsBack event (for a failed reader) to the assigner.
   *
   * @param splits the splits being handed back to the assigner
   */
  void onUnassignedSplits(Collection<IcebergSourceSplit> splits);

  /**
   * Some assigners (like event time alignment) may track in-progress splits to advance the
   * watermark upon completed splits.
   *
   * <p>The default implementation does nothing.
   *
   * @param completedSplitIds IDs of the splits that have been completed
   */
  default void onCompletedSplits(Collection<String> completedSplitIds) {}

  /**
   * Get the assigner state for checkpointing. This is a super-set API that works for all
   * currently imagined assigners.
   *
   * @return the split states that make up the assigner state
   */
  Collection<IcebergSourceSplitState> state();

  /**
   * The enumerator can get a notification via the returned CompletableFuture when the assigner
   * has more splits available later. The enumerator should schedule assignment in the thenAccept
   * action of the future.
   *
   * <p>The assigner will return the same future if this method is called again before the
   * previous future is completed.
   *
   * <p>The future can be completed from a different thread, e.g. one involved in event time
   * alignment.
   *
   * <p>If the enumerator needs to trigger an action upon the future's completion, it may want to
   * run it in the coordinator thread using {@link
   * SplitEnumeratorContext#runInCoordinatorThread(Runnable)}.
   *
   * @return a future that signals when more splits are available
   */
  CompletableFuture<Void> isAvailable();

  /**
   * Return the number of pending splits that haven't been assigned yet.
   *
   * <p>The enumerator can poll this API to publish a metric on the number of pending splits.
   *
   * <p>The enumerator can also use this information to throttle split discovery for streaming
   * read. If there are already many pending splits tracked by the assigner, it is undesirable to
   * discover more splits and track them in the assigner. That will increase the memory footprint
   * and enumerator checkpoint size.
   *
   * <p>Throttling works better together with {@link ScanContext#maxPlanningSnapshotCount()}.
   * Otherwise, the next split discovery after throttling will just discover all non-enumerated
   * snapshots and splits, which defeats the purpose of throttling.
   *
   * @return the number of splits not yet assigned
   */
  int pendingSplitCount();

  /**
   * Return the number of pending records, which can act as a measure of the source lag. This value
   * could be an estimation if the exact number of records cannot be accurately computed.
   *
   * @return the (possibly estimated) number of pending records
   */
  long pendingRecords();
}