package org.opensearch.migrations.replay.kafka;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Properties;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import org.opensearch.migrations.replay.datatypes.ITrafficStreamKey;
import org.opensearch.migrations.replay.datatypes.PojoTrafficStreamAndKey;
import org.opensearch.migrations.replay.tracing.ChannelContextManager;
import org.opensearch.migrations.replay.tracing.ITrafficSourceContexts;
import org.opensearch.migrations.replay.tracing.ReplayContexts;
import org.opensearch.migrations.replay.tracing.RootReplayerContext;
import org.opensearch.migrations.replay.traffic.source.ISimpleTrafficCaptureSource;
import org.opensearch.migrations.replay.traffic.source.ITrafficStreamWithKey;
import org.opensearch.migrations.trafficcapture.protos.TrafficStream;

import io.netty.util.concurrent.DefaultThreadFactory;
import lombok.NonNull;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

/**
 * Adapt a Kafka stream into a TrafficCaptureSource.
 *
 * Notice that there's a critical gap between how Kafka accepts commits and how the
 * BlockingTrafficSource throttles calls to Kafka.  The BlockingTrafficSource may
 * block calls to readNextTrafficStreamChunk() until some time window elapses.  This
 * could be a very large window in cases where there were long gaps between recorded
 * requests from the capturing proxy.  For example, if a TrafficStream is read and if
 * that stream is scheduled to be run one hour later, readNextTrafficStreamChunk()
 * may not be called for almost an hour.  By design, we're not calling Kafka to pull
 * any more messages since we know that we don't have work to do for an hour.  Shortly
 * after the hour of waiting begins, Kafka will notice that this application is no
 * longer calling poll and will kick the consumer out of the consumer group.
 *
 * See
 * ...
 *
 * "Basically if you don't call poll at least as frequently as the configured max interval,
 * then the client will proactively leave the group so that another consumer can take
 * over its partitions. When this happens, you may see an offset commit failure (as
 * indicated by a CommitFailedException thrown from a call to commitSync())."
 *
 * Since the Kafka client requires all calls to be made from the same thread, we can't
 * simply run a background job to keep the client warm.  We need the caller to touch
 * this object periodically to keep the connection alive.
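 *
 * <p>A rough sketch of that keep-alive contract from the caller's side.  The
 * {@code source}, {@code waitingForScheduledWork}, {@code safetyMargin}, and
 * {@code backPressureContext} names below are illustrative assumptions, not code
 * taken from the replayer's actual driver:
 * <pre>{@code
 * while (waitingForScheduledWork) {
 *     source.getNextRequiredTouch().ifPresent(deadline -> {
 *         if (Instant.now().isAfter(deadline.minus(safetyMargin))) {
 *             source.touch(backPressureContext); // keeps the consumer in its group
 *         }
 *     });
 *     // ... sleep briefly or wait on the scheduled work ...
 * }
 * }</pre>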
 */
@Slf4j
public class KafkaTrafficCaptureSource implements ISimpleTrafficCaptureSource {
    public static final String MAX_POLL_INTERVAL_KEY = "max.poll.interval.ms";
    // see
    // https://stackoverflow.com/questions/39730126/difference-between-session-timeout-ms-and-max-poll-interval-ms-for-kafka-0-10
    public static final String DEFAULT_POLL_INTERVAL_MS = "60000";

    final TrackingKafkaConsumer trackingKafkaConsumer;
    private final ExecutorService kafkaExecutor;
    private final AtomicLong trafficStreamsRead;
    private final KafkaBehavioralPolicy behavioralPolicy;
    private final ChannelContextManager channelContextManager;
    private final AtomicBoolean isClosed;

    public KafkaTrafficCaptureSource(
        @NonNull RootReplayerContext globalContext,
        Consumer<String, byte[]> kafkaConsumer,
        String topic,
        Duration keepAliveInterval
    ) {
        this(globalContext, kafkaConsumer, topic, keepAliveInterval, Clock.systemUTC(), new KafkaBehavioralPolicy());
    }

    public KafkaTrafficCaptureSource(
        @NonNull RootReplayerContext globalContext,
        Consumer<String, byte[]> kafkaConsumer,
        @NonNull String topic,
        Duration keepAliveInterval,
        Clock clock,
        @NonNull KafkaBehavioralPolicy behavioralPolicy
    ) {
        this.channelContextManager = new ChannelContextManager(globalContext);
        trackingKafkaConsumer = new TrackingKafkaConsumer(
            globalContext,
            kafkaConsumer,
            topic,
            keepAliveInterval,
            clock,
            this::onKeyFinishedCommitting
        );
        trafficStreamsRead = new AtomicLong();
        this.behavioralPolicy = behavioralPolicy;
        kafkaConsumer.subscribe(Collections.singleton(topic), trackingKafkaConsumer);
        kafkaExecutor = Executors.newSingleThreadExecutor(new DefaultThreadFactory("kafkaConsumerThread"));
        isClosed = new AtomicBoolean(false);
    }

    private void onKeyFinishedCommitting(ITrafficStreamKey trafficStreamKey) {
        var looseParentScope = trafficStreamKey.getTrafficStreamsContext().getEnclosingScope();
        if (!(looseParentScope instanceof ReplayContexts.KafkaRecordContext)) {
            throw new IllegalArgumentException(
                "Expected parent context of type "
                    + ReplayContexts.KafkaRecordContext.class
                    + " instead of "
                    + looseParentScope
                    + " (of type="
                    + looseParentScope.getClass()
                    + ")"
            );
        }
        var kafkaCtx = (ReplayContexts.KafkaRecordContext) looseParentScope;
        kafkaCtx.close();
        channelContextManager.releaseContextFor(kafkaCtx.getImmediateEnclosingScope());
    }

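    /**
     * Convenience factory that builds a KafkaConsumer from the given connection settings
     * (see {@link #buildKafkaProperties}) and wraps it in a KafkaTrafficCaptureSource,
     * deriving the keep-alive period from the configured max.poll.interval.ms.
     *
     * <p>A minimal construction sketch; the broker endpoint, topic, group id, and the
     * pre-existing {@code rootReplayerContext} below are illustrative placeholders rather
     * than values defined by this project:
     * <pre>{@code
     * var source = KafkaTrafficCaptureSource.buildKafkaSource(
     *     rootReplayerContext,     // assumed to have been created by the caller
     *     "localhost:9092",        // assumed broker list
     *     "captured-traffic",      // assumed topic name
     *     "replayer-group",        // assumed consumer group id
     *     false,                   // MSK IAM auth disabled
     *     null,                    // no extra properties file
     *     Clock.systemUTC(),
     *     new KafkaBehavioralPolicy());
     * }</pre>
     */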
    public static KafkaTrafficCaptureSource buildKafkaSource(
        @NonNull RootReplayerContext globalContext,
        @NonNull String brokers,
        @NonNull String topic,
        @NonNull String groupId,
        boolean enableMSKAuth,
        String propertyFilePath,
        @NonNull Clock clock,
        @NonNull KafkaBehavioralPolicy behavioralPolicy
    ) throws IOException {
        var kafkaProps = buildKafkaProperties(brokers, groupId, enableMSKAuth, propertyFilePath);
        kafkaProps.putIfAbsent(MAX_POLL_INTERVAL_KEY, DEFAULT_POLL_INTERVAL_MS);
        var pollPeriod = Duration.ofMillis(Long.valueOf((String) kafkaProps.get(MAX_POLL_INTERVAL_KEY)));
        var keepAlivePeriod = getKeepAlivePeriodFromPollPeriod(pollPeriod);
        return new KafkaTrafficCaptureSource(
            globalContext,
            new KafkaConsumer<>(kafkaProps),
            topic,
            keepAlivePeriod,
            clock,
            behavioralPolicy
        );
    }

    /**
     * We'll have to 'maintain' touches more frequently than the poll period, otherwise the
     * consumer will fall out of the group, putting all the commits in-flight at risk.  Notice
     * that this has no direct bearing on heartbeats, which the consumer sends from a
     * background thread and which are governed by the more sensitive
     * 'session.timeout.ms' property.
     */
    private static Duration getKeepAlivePeriodFromPollPeriod(Duration pollPeriod) {
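        // e.g., with the default max.poll.interval.ms of 60,000 ms, a touch is required at
        // least every 30 seconds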
        return pollPeriod.dividedBy(2);
    }

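    /**
     * Assemble the consumer Properties used by {@link #buildKafkaSource}.  The deserializers,
     * auto-commit (disabled), and auto.offset.reset=earliest defaults are set first, so a
     * supplied properties file may override them; bootstrap.servers and group.id are applied
     * last and therefore always reflect the arguments passed in.  When enableMSKAuth is set,
     * the SASL/IAM settings required for AWS MSK are added after the file is loaded.
     */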
    public static Properties buildKafkaProperties(
        @NonNull String brokers,
        @NonNull String groupId,
        boolean enableMSKAuth,
        String propertyFilePath
    ) throws IOException {
        var kafkaProps = new Properties();
        kafkaProps.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaProps.setProperty("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        kafkaProps.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        kafkaProps.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        if (propertyFilePath != null) {
            try (InputStream input = new FileInputStream(propertyFilePath)) {
                kafkaProps.load(input);
            } catch (IOException ex) {
                log.error("Unable to load properties from kafka properties file with path: {}", propertyFilePath);
                throw ex;
            }
        }
        // Required for using SASL auth with MSK public endpoint
        if (enableMSKAuth) {
            kafkaProps.setProperty("security.protocol", "SASL_SSL");
            kafkaProps.setProperty("sasl.mechanism", "AWS_MSK_IAM");
            kafkaProps.setProperty("sasl.jaas.config", "software.amazon.msk.auth.iam.IAMLoginModule required;");
            kafkaProps.setProperty(
                "sasl.client.callback.handler.class",
                "software.amazon.msk.auth.iam.IAMClientCallbackHandler"
            );
        }
        kafkaProps.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
        kafkaProps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId);
        return kafkaProps;
    }

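    /**
     * Poke the underlying consumer so that it stays in its consumer group.  The call is
     * marshalled onto the single-threaded kafkaExecutor because the Kafka client must only
     * be used from one thread.
     */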
    @Override
    @SneakyThrows
    public void touch(ITrafficSourceContexts.IBackPressureBlockContext context) {
        CompletableFuture.runAsync(() -> trackingKafkaConsumer.touch(context), kafkaExecutor).get();
    }

    /**
     * If messages are outstanding, we need to keep the connection alive; otherwise, there's
     * no reason to, and it's OK to fall out of the group and rejoin once we're ready again.
     * @return the deadline by which {@link #touch} should next be called, if outstanding
     *         messages require the connection to be kept alive
     */
    @Override
    public Optional<Instant> getNextRequiredTouch() {
        return trackingKafkaConsumer.getNextRequiredTouch();
    }

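    /**
     * Asynchronously poll Kafka (on the dedicated consumer thread) and parse each record's
     * value as a TrafficStream protobuf.  Records that fail to parse are reported to the
     * KafkaBehavioralPolicy and dropped unless the policy chooses to escalate.
     */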
    @Override
    @SuppressWarnings("unchecked")
    public CompletableFuture<List<ITrafficStreamWithKey>> readNextTrafficStreamChunk(
        Supplier<ITrafficSourceContexts.IReadChunkContext> contextSupplier
    ) {
        log.atTrace().setMessage("readNextTrafficStreamChunk()").log();
        return CompletableFuture.supplyAsync(() -> {
            log.atTrace().setMessage("async...readNextTrafficStreamChunk()").log();
            return readNextTrafficStreamSynchronously(contextSupplier.get());
        }, kafkaExecutor);
    }

    public List<ITrafficStreamWithKey> readNextTrafficStreamSynchronously(
        ITrafficSourceContexts.IReadChunkContext context
    ) {
        log.atTrace().setMessage("readNextTrafficStreamSynchronously()").log();
        try {
            return trackingKafkaConsumer.getNextBatchOfRecords(context, (offsetData, kafkaRecord) -> {
                try {
                    TrafficStream ts = TrafficStream.parseFrom(kafkaRecord.value());
                    var trafficStreamsSoFar = trafficStreamsRead.incrementAndGet();
                    log.atTrace()
                        .setMessage("{}")
                        .addArgument(
                            () -> "Parsed traffic stream #" + trafficStreamsSoFar + ": " + offsetData + " " + ts
                        )
                        .log();
                    var key = new TrafficStreamKeyWithKafkaRecordId(tsk -> {
                        var channelKeyCtx = channelContextManager.retainOrCreateContext(tsk);
                        return channelContextManager.getGlobalContext()
                            .createTrafficStreamContextForKafkaSource(
                                channelKeyCtx,
                                kafkaRecord.key(),
                                kafkaRecord.serializedKeySize() + kafkaRecord.serializedValueSize()
                            );
                    }, ts, offsetData);
                    return (ITrafficStreamWithKey) new PojoTrafficStreamAndKey(ts, key);
                } catch (InvalidProtocolBufferException e) {
                    // Assume the behavioralPolicy instance does any logging that the host may be interested in
                    RuntimeException recordError = behavioralPolicy.onInvalidKafkaRecord(kafkaRecord, e);
                    if (recordError != null) {
                        throw recordError;
                    } else {
                        return null;
                    }
                }
            }).filter(Objects::nonNull).collect(Collectors.toList());
        } catch (Exception e) {
            log.atError().setCause(e).setMessage("Terminating Kafka traffic stream due to exception").log();
            throw e;
        }
    }

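    /**
     * Commit the Kafka offset associated with the given key.  The key must have been produced
     * by this source (i.e. it must carry a Kafka record id); anything else cannot be mapped
     * back to a partition and offset and is rejected.
     */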
    @Override
    public CommitResult commitTrafficStream(ITrafficStreamKey trafficStreamKey) {
        if (!(trafficStreamKey instanceof TrafficStreamKeyWithKafkaRecordId)) {
            throw new IllegalArgumentException(
                "Expected key of type "
                    + TrafficStreamKeyWithKafkaRecordId.class
                    + " but received "
                    + trafficStreamKey
                    + " (of type="
                    + trafficStreamKey.getClass()
                    + ")"
            );
        }
        return trackingKafkaConsumer.commitKafkaKey(
            trafficStreamKey,
            (TrafficStreamKeyWithKafkaRecordId) trafficStreamKey
        );
    }

    @Override
    public void close() throws IOException, InterruptedException, ExecutionException {
        if (isClosed.compareAndSet(false, true)) {
            kafkaExecutor.submit(trackingKafkaConsumer::close).get();
            kafkaExecutor.shutdownNow();
        }
    }
}



