/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.apache.kafka.connect.util;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.common.errors.WakeupException;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.PartitionInfo;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.utils.Time;
import org.apache.kafka.common.utils.Utils;
import org.apache.kafka.connect.errors.ConnectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.Future;
/**
*
* KafkaBasedLog provides a generic implementation of a shared, compacted log of records stored in Kafka that all
* clients need to consume and, at times, agree on their offset / that they have read to the end of the log.
*
*
* This functionality is useful for storing different types of data that all clients may need to agree on --
* offsets or config for example. This class runs a consumer in a background thread to continuously tail the target
* topic, accepts write requests which it writes to the topic using an internal producer, and provides some helpful
* utilities like checking the current log end offset and waiting until the current end of the log is reached.
*
*
* To support different use cases, this class works with either single- or multi-partition topics.
*
*
* Since this class is generic, it delegates the details of data storage via a callback that is invoked for each
* record that is consumed from the topic. The invocation of callbacks is guaranteed to be serialized -- if the
* calling class keeps track of state based on the log and only writes to it when consume callbacks are invoked
* and only reads it in {@link #readToEnd(Callback)} callbacks then no additional synchronization will be required.
*
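*
* A minimal usage sketch (illustrative only; the topic name, bootstrap server, byte-array serdes, and variable
* names below are assumptions rather than requirements of this class, and imports/error handling are elided):
* <pre>{@code
* Map<String, Object> producerProps = new HashMap<>();
* producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
* producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class.getName());
* producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class.getName());
*
* Map<String, Object> consumerProps = new HashMap<>();
* consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
* consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
* consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
*
* Callback<ConsumerRecord<byte[], byte[]>> consumed = new Callback<ConsumerRecord<byte[], byte[]>>() {
*     public void onCompletion(Throwable error, ConsumerRecord<byte[], byte[]> record) {
*         // apply the consumed record to local, in-memory state
*     }
* };
*
* KafkaBasedLog<byte[], byte[]> offsetLog =
*         new KafkaBasedLog<>("connect-offsets", producerProps, consumerProps, consumed, new SystemTime());
* offsetLog.start();
* offsetLog.send(keyBytes, valueBytes);   // keyBytes/valueBytes: already-serialized key and value
* offsetLog.readToEnd().get();            // block until everything written so far has been consumed
* offsetLog.stop();
* }</pre>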
*/
public class KafkaBasedLog<K, V> {
private static final Logger log = LoggerFactory.getLogger(KafkaBasedLog.class);
private static final long CREATE_TOPIC_TIMEOUT_MS = 30000;
private Time time;
private final String topic;
private final Map<String, Object> producerConfigs;
private final Map<String, Object> consumerConfigs;
private final Callback<ConsumerRecord<K, V>> consumedCallback;
private Consumer<K, V> consumer;
private Producer<K, V> producer;
private Thread thread;
private boolean stopRequested;
private Queue<Callback<Void>> readLogEndOffsetCallbacks;
/**
* Create a new KafkaBasedLog object. This does not start reading the log and writing is not permitted until
* {@link #start()} is invoked.
*
* @param topic the topic to treat as a log
* @param producerConfigs configuration options to use when creating the internal producer. At a minimum this must
* contain compatible serializer settings for the generic types used on this class. Some
* settings, such as the number of acks, will be overridden to ensure correct behavior of this
* class.
* @param consumerConfigs configuration options to use when creating the internal consumer. At a minimum this must
* contain compatible deserializer settings for the generic types used on this class. Some
* settings, such as the auto offset reset policy, will be overridden to ensure correct
* behavior of this class.
* @param consumedCallback callback to invoke for each {@link ConsumerRecord} consumed when tailing the log
* @param time Time interface
*/
public KafkaBasedLog(String topic, Map<String, Object> producerConfigs, Map<String, Object> consumerConfigs,
Callback<ConsumerRecord<K, V>> consumedCallback, Time time) {
this.topic = topic;
this.producerConfigs = producerConfigs;
this.consumerConfigs = consumerConfigs;
this.consumedCallback = consumedCallback;
this.stopRequested = false;
this.readLogEndOffsetCallbacks = new ArrayDeque<>();
this.time = time;
}
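/**
* Start the log. This creates the internal producer and consumer, waits for the topic's partition metadata to
* become available, reads any existing records up to the current end of the log, and then begins tailing the
* topic on a background work thread.
*/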
public void start() {
log.info("Starting KafkaBasedLog with topic " + topic);
producer = createProducer();
consumer = createConsumer();
List<TopicPartition> partitions = new ArrayList<>();
// Until we have admin utilities we can use to check for the existence of this topic and create it if it is missing,
// we rely on topic auto-creation
List<PartitionInfo> partitionInfos = null;
long started = time.milliseconds();
while (partitionInfos == null && time.milliseconds() - started < CREATE_TOPIC_TIMEOUT_MS) {
partitionInfos = consumer.partitionsFor(topic);
Utils.sleep(Math.min(time.milliseconds() - started, 1000));
}
if (partitionInfos == null)
throw new ConnectException("Could not look up partition metadata for offset backing store topic in" +
" allotted period. This could indicate a connectivity issue, unavailable topic partitions, or if" +
" this is your first use of the topic it may have taken too long to create.");
for (PartitionInfo partition : partitionInfos)
partitions.add(new TopicPartition(partition.topic(), partition.partition()));
consumer.assign(partitions);
readToLogEnd();
thread = new WorkThread();
thread.start();
log.info("Finished reading KafakBasedLog for topic " + topic);
log.info("Started KafakBasedLog for topic " + topic);
}
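/**
* Stop the log. This signals the background work thread to shut down, wakes the consumer out of any in-progress
* poll, waits for the thread to exit, and then closes the internal producer and consumer.
*/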
public void stop() {
log.info("Stopping KafkaBasedLog for topic " + topic);
synchronized (this) {
stopRequested = true;
}
consumer.wakeup();
try {
thread.join();
} catch (InterruptedException e) {
throw new ConnectException("Failed to stop KafkaBasedLog. Exiting without cleanly shutting " +
"down it's producer and consumer.", e);
}
try {
producer.close();
} catch (KafkaException e) {
log.error("Failed to stop KafkaBasedLog producer", e);
}
try {
consumer.close();
} catch (KafkaException e) {
log.error("Failed to stop KafkaBasedLog consumer", e);
}
log.info("Stopped KafkaBasedLog for topic " + topic);
}
/**
* Flushes any outstanding writes and then reads to the current end of the log and invokes the specified callback.
* Note that this checks the current offsets, reads to them, and invokes the callback regardless of whether
* additional records have been written to the log. If the caller needs to ensure they have truly reached the end
* of the log, they must ensure there are no other writers during this period.
*
* This waits until the end of all partitions has been reached.
*
* This method is asynchronous. If you need a synchronous version, pass an instance of
* {@link org.apache.kafka.connect.util.FutureCallback} as the {@code callback} parameter and wait on it to block.
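*
* For example, given a KafkaBasedLog instance {@code log} (a sketch; exception handling elided):
* <pre>{@code
* FutureCallback<Void> callback = new FutureCallback<>(null);
* log.readToEnd(callback);
* callback.get();   // blocks until the work thread has read up to the end offsets captured for this request
* }</pre>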
*
* @param callback the callback to invoke once the end of the log has been reached.
*/
public void readToEnd(Callback<Void> callback) {
producer.flush();
synchronized (this) {
readLogEndOffsetCallbacks.add(callback);
}
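// Wake the background work thread out of its long poll so it notices the newly queued callback and performs
// the read to the current log end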
consumer.wakeup();
}
/**
* Same as {@link #readToEnd(Callback)} but provides a {@link Future} instead of using a callback.
* @return the future associated with the operation
*/
public Future<Void> readToEnd() {
FutureCallback<Void> future = new FutureCallback<>(null);
readToEnd(future);
return future;
}
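/**
* Asynchronously write a record to the log's topic using the internal producer. Use
* {@link #send(Object, Object, org.apache.kafka.clients.producer.Callback)} to be notified when the write has
* been acknowledged.
* @param key the record key
* @param value the record value
*/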
public void send(K key, V value) {
send(key, value, null);
}
public void send(K key, V value, org.apache.kafka.clients.producer.Callback callback) {
producer.send(new ProducerRecord<>(topic, key, value), callback);
}
private Producer<K, V> createProducer() {
// Always require producer acks to all to ensure durable writes
producerConfigs.put(ProducerConfig.ACKS_CONFIG, "all");
return new KafkaProducer<>(producerConfigs);
}
private Consumer<K, V> createConsumer() {
// Always force reset to the beginning of the log since this class wants to consume all available log data
consumerConfigs.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
return new KafkaConsumer<>(consumerConfigs);
}
private void poll(long timeoutMs) {
try {
ConsumerRecords<K, V> records = consumer.poll(timeoutMs);
for (ConsumerRecord<K, V> record : records)
consumedCallback.onCompletion(null, record);
} catch (WakeupException e) {
// Expected on get() or stop(). The calling code should handle this
throw e;
} catch (KafkaException e) {
log.error("Error polling: " + e);
}
}
private void readToLogEnd() {
log.trace("Reading to end of offset log");
Set<TopicPartition> assignment = consumer.assignment();
// This approach to getting the current end offset is hacky until we have an API for looking these up directly
Map<TopicPartition, Long> offsets = new HashMap<>();
for (TopicPartition tp : assignment) {
long offset = consumer.position(tp);
offsets.put(tp, offset);
consumer.seekToEnd(tp);
}
Map<TopicPartition, Long> endOffsets = new HashMap<>();
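// A poll is issued after seeking to the end so the consumer resolves its positions for the seeked partitions;
// the finally block then captures those positions as the target end offsets and seeks back to the saved start
// offsets so the log is re-read from where tailing left off.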
try {
poll(0);
} finally {
// If there is an exception, even a possibly expected one like WakeupException, we need to make sure
// the consumer's position is reset or it'll get into an inconsistent state.
for (TopicPartition tp : assignment) {
long startOffset = offsets.get(tp);
long endOffset = consumer.position(tp);
if (endOffset > startOffset) {
endOffsets.put(tp, endOffset);
consumer.seek(tp, startOffset);
}
log.trace("Reading to end of log for {}: starting offset {} to ending offset {}", tp, startOffset, endOffset);
}
}
while (!endOffsets.isEmpty()) {
poll(Integer.MAX_VALUE);
Iterator<Map.Entry<TopicPartition, Long>> it = endOffsets.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<TopicPartition, Long> entry = it.next();
if (consumer.position(entry.getKey()) >= entry.getValue())
it.remove();
else
break;
}
}
}
private class WorkThread extends Thread {
@Override
public void run() {
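// The work thread alternates between two phases: servicing any readToEnd() requests that were queued
// (re-reading to the log end, then completing exactly that many callbacks) and tailing the topic with a long
// poll. A consumer.wakeup() from readToEnd() or stop() raises WakeupException to break out of either phase.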
try {
while (true) {
int numCallbacks;
synchronized (KafkaBasedLog.this) {
if (stopRequested)
break;
numCallbacks = readLogEndOffsetCallbacks.size();
}
if (numCallbacks > 0) {
try {
readToLogEnd();
} catch (WakeupException e) {
// Either received another get() call and need to retry reading to end of log or stop() was
// called. Both are handled by restarting this loop.
continue;
}
}
synchronized (KafkaBasedLog.this) {
// Only invoke exactly the number of callbacks we found before triggering the read to log end
// since it is possible for another write + readToEnd to sneak in in the meantime
for (int i = 0; i < numCallbacks; i++) {
Callback<Void> cb = readLogEndOffsetCallbacks.poll();
cb.onCompletion(null, null);
}
}
try {
poll(Integer.MAX_VALUE);
} catch (WakeupException e) {
// See previous comment, both possible causes of this wakeup are handled by starting this loop again
continue;
}
}
} catch (Throwable t) {
log.error("Unexpected exception in KafkaBasedLog's work thread", t);
}
}
}
}