org.apache.kafka.connect.sink.SinkTask Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of connect-api Show documentation
There is a newer version: 3.8.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.connect.sink;

import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.connector.Task;

import java.util.Collection;
import java.util.Map;

/**
 * SinkTask is a Task that takes records loaded from Kafka and sends them to another system. Each task
 * instance is assigned a set of partitions by the Connect framework and will handle all records received
 * from those partitions. As records are fetched from Kafka, they will be passed to the sink task using the
 * {@link #put(Collection)} API, which should either write them to the downstream system or batch them for
 * later writing. Periodically, Connect will call {@link #flush(Map)} to ensure that batched records are
 * actually pushed to the downstream system..
 *
 * Below we describe the lifecycle of a SinkTask.
 *
 * 
 *     Initialization: SinkTasks are first initialized using {@link #initialize(SinkTaskContext)}
 *     to prepare the task's context and {@link #start(Map)} to accept configuration and start any services
 *     needed for processing.
 *     Partition Assignment: After initialization, Connect will assign the task a set of partitions
 *     using {@link #open(Collection)}. These partitions are owned exclusively by this task until they
 *     have been closed with {@link #close(Collection)}.
 *     Record Processing: Once partitions have been opened for writing, Connect will begin forwarding
 *     records from Kafka using the {@link #put(Collection)} API. Periodically, Connect will ask the task
 *     to flush records using {@link #flush(Map)} as described above.
 *     Partition Rebalancing: Occasionally, Connect will need to change the assignment of this task.
 *     When this happens, the currently assigned partitions will be closed with {@link #close(Collection)} and
 *     the new assignment will be opened using {@link #open(Collection)}.
 *     Shutdown: When the task needs to be shutdown, Connect will close active partitions (if there
 *     are any) and stop the task using {@link #stop()}
  * 
 *
 */
public abstract class SinkTask implements Task {

    /**
     * 
     * The configuration key that provides the list of topics that are inputs for this
     * SinkTask.
     * 
     */
    public static final String TOPICS_CONFIG = "topics";

    protected SinkTaskContext context;

    /**
     * Initialize the context of this task. Note that the partition assignment will be empty until
     * Connect has opened the partitions for writing with {@link #open(Collection)}.
     * @param context The sink task's context
     */
    public void initialize(SinkTaskContext context) {
        this.context = context;
    }

    /**
     * Start the Task. This should handle any configuration parsing and one-time setup of the task.
     * @param props initial configuration
     */
    @Override
    public abstract void start(Map props);

    /**
     * Put the records in the sink. Usually this should send the records to the sink asynchronously
     * and immediately return.
     *
     * If this operation fails, the SinkTask may throw a {@link org.apache.kafka.connect.errors.RetriableException} to
     * indicate that the framework should attempt to retry the same call again. Other exceptions will cause the task to
     * be stopped immediately. {@link SinkTaskContext#timeout(long)} can be used to set the maximum time before the
     * batch will be retried.
     *
     * @param records the set of records to send
     */
    public abstract void put(Collection records);

    /**
     * Flush all records that have been {@link #put(Collection)} for the specified topic-partitions.
     *
     * @param currentOffsets the current offset state as of the last call to {@link #put(Collection)}},
     *                       provided for convenience but could also be determined by tracking all offsets included in the {@link SinkRecord}s
     *                       passed to {@link #put}.
     */
    public void flush(Map currentOffsets) {
    }

    /**
     * Pre-commit hook invoked prior to an offset commit.
     *
     * The default implementation simply invokes {@link #flush(Map)} and is thus able to assume all {@code currentOffsets} are safe to commit.
     *
     * @param currentOffsets the current offset state as of the last call to {@link #put(Collection)}},
     *                       provided for convenience but could also be determined by tracking all offsets included in the {@link SinkRecord}s
     *                       passed to {@link #put}.
     *
     * @return an empty map if Connect-managed offset commit is not desired, otherwise a map of offsets by topic-partition that are safe to commit.
     */
    public Map preCommit(Map currentOffsets) {
        flush(currentOffsets);
        return currentOffsets;
    }

    /**
     * The SinkTask use this method to create writers for newly assigned partitions in case of partition
     * rebalance. This method will be called after partition re-assignment completes and before the SinkTask starts
     * fetching data. Note that any errors raised from this method will cause the task to stop.
     * @param partitions The list of partitions that are now assigned to the task (may include
     *                   partitions previously assigned to the task)
     */
    public void open(Collection partitions) {
        this.onPartitionsAssigned(partitions);
    }

    /**
     * @deprecated Use {@link #open(Collection)} for partition initialization.
     */
    @Deprecated
    public void onPartitionsAssigned(Collection partitions) {
    }

    /**
     * The SinkTask use this method to close writers for partitions that are no
     * longer assigned to the SinkTask. This method will be called before a rebalance operation starts
     * and after the SinkTask stops fetching data. After being closed, Connect will not write
     * any records to the task until a new set of partitions has been opened. Note that any errors raised
     * from this method will cause the task to stop.
     * @param partitions The list of partitions that should be closed
     */
    public void close(Collection partitions) {
        this.onPartitionsRevoked(partitions);
    }

    /**
     * @deprecated Use {@link #close(Collection)} instead for partition cleanup.
     */
    @Deprecated
    public void onPartitionsRevoked(Collection partitions) {
    }

    /**
     * Perform any cleanup to stop this task. In SinkTasks, this method is invoked only once outstanding calls to other
     * methods have completed (e.g., {@link #put(Collection)} has returned) and a final {@link #flush(Map)} and offset
     * commit has completed. Implementations of this method should only need to perform final cleanup operations, such
     * as closing network connections to the sink system.
     */
    public abstract void stop();
}