org.apache.apex.malhar.kafka.AbstractKafkaInputOperator Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.apex.malhar.kafka;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.apex.malhar.lib.wal.WindowDataManager;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetCommitCallback;
import org.apache.kafka.common.TopicPartition;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;

import com.datatorrent.api.AutoMetric;
import com.datatorrent.api.Context;
import com.datatorrent.api.InputOperator;
import com.datatorrent.api.Operator;
import com.datatorrent.api.Partitioner;
import com.datatorrent.api.StatsListener;
import com.datatorrent.netlet.util.DTThrowable;

/**
 * The abstract kafka input operator, using the kafka 0.9.0 new consumer API.
 * A scalable, fault-tolerant, at-least-once kafka input operator.
 * Key features include:
 *
 * <ol>
 * <li>Out-of-the-box one-to-one and one-to-many partition strategy support,
 *     plus customizable partition strategies; refer to {@link AbstractKafkaPartitioner}</li>
 * <li>Fault tolerance: when the input operator goes down, it redeploys on another node</li>
 * <li>At-least-once semantics for operator failure (no matter which operator fails)</li>
 * <li>At-least-once semantics for cold restart (no data loss even if you restart the application)</li>
 * <li>Multi-cluster support: one operator can consume data from more than one kafka cluster</li>
 * <li>Multi-topic support: one operator can subscribe to multiple topics</li>
 * <li>Throughput control support: you can throttle the number of tuples emitted per streaming window</li>
 * </ol>
 *
 * @since 3.3.0
 */
@InterfaceStability.Evolving
public abstract class AbstractKafkaInputOperator implements InputOperator,
    Operator.ActivationListener<Context.OperatorContext>, Operator.CheckpointListener,
    Partitioner<AbstractKafkaInputOperator>, StatsListener, OffsetCommitCallback
{
  private static final Logger logger = LoggerFactory.getLogger(AbstractKafkaInputOperator.class);

  public enum InitialOffset
  {
    EARLIEST,                // consume from the beginning of the partition every time the application restarts
    LATEST,                  // consume from the end of the partition every time the application restarts
    APPLICATION_OR_EARLIEST, // consume from the committed position of the last run, or earliest if there is no committed offset
    APPLICATION_OR_LATEST    // consume from the committed position of the last run, or latest if there is no committed offset
  }

  @NotNull
  private String[] clusters;

  @NotNull
  private String[] topics;

  /**
   * offset track for checkpoint
   */
  private final Map<AbstractKafkaPartitioner.PartitionMeta, Long> offsetTrack = new HashMap<>();

  private final transient Map<AbstractKafkaPartitioner.PartitionMeta, Long> windowStartOffset = new HashMap<>();

  private transient int operatorId;

  private int initialPartitionCount = 1;

  private long repartitionInterval = 30000L;

  private long repartitionCheckInterval = 5000L;

  @Min(1)
  private int maxTuplesPerWindow = Integer.MAX_VALUE;

  /**
   * By default the operator starts consuming from the committed offset or the latest one
   */
  private InitialOffset initialOffset = InitialOffset.APPLICATION_OR_LATEST;

  private long metricsRefreshInterval = 5000L;

  private long consumerTimeout = 5000L;

  private int holdingBufferSize = 1024;

  private Properties consumerProps;

  /**
   * Assignment for each operator instance
   */
  private Set<AbstractKafkaPartitioner.PartitionMeta> assignment;

  //=======================All transient fields==========================

  /**
   * Wrapper consumer object.
   * It wraps a KafkaConsumer, maintains the consumer thread and stores messages in a queue
   */
  private final transient KafkaConsumerWrapper consumerWrapper = new KafkaConsumerWrapper();

  /**
   * By default the strategy is one to one
   * @see PartitionStrategy
   */
  private PartitionStrategy strategy = PartitionStrategy.ONE_TO_ONE;

  /**
   * count of the messages emitted in each window,
   * non settable
   */
  private transient int emitCount = 0;

  /**
   * store offsets with window id, only keep offsets for windows that have not been committed
   */
  private final transient List<Pair<Long, Map<AbstractKafkaPartitioner.PartitionMeta, Long>>> offsetHistory = new LinkedList<>();

  /**
   * Application name is used as group.id for the kafka consumer
   */
  private transient String applicationName;

  private transient AbstractKafkaPartitioner partitioner;

  private transient long currentWindowId;

  private transient long lastCheckTime = 0L;

  private transient long lastRepartitionTime = 0L;

  @AutoMetric
  private transient KafkaMetrics metrics;

  private WindowDataManager windowDataManager = new WindowDataManager.NoopWindowDataManager();

  @Override
  public void activate(Context.OperatorContext context)
  {
    consumerWrapper.start(isIdempotent());
  }

  @Override
  public void deactivate()
  {
    consumerWrapper.stop();
  }

  @Override
  public void checkpointed(long l)
  {
  }

  @Override
  public void committed(long windowId)
  {
    if (initialOffset == InitialOffset.LATEST || initialOffset == InitialOffset.EARLIEST) {
      return;
    }
    //ask the kafka consumer wrapper to store the committed offsets
    for (Iterator<Pair<Long, Map<AbstractKafkaPartitioner.PartitionMeta, Long>>> iter = offsetHistory.iterator(); iter.hasNext(); ) {
      Pair<Long, Map<AbstractKafkaPartitioner.PartitionMeta, Long>> item = iter.next();
      if (item.getLeft() <= windowId) {
        if (item.getLeft() == windowId) {
          consumerWrapper.commitOffsets(item.getRight());
        }
        iter.remove();
      }
    }
    if (isIdempotent()) {
      try {
        windowDataManager.deleteUpTo(operatorId, windowId);
      } catch (IOException e) {
        DTThrowable.rethrow(e);
      }
    }
  }

  @Override
  public void emitTuples()
  {
    int count = consumerWrapper.messageSize();
    if (maxTuplesPerWindow > 0) {
      count = Math.min(count, maxTuplesPerWindow - emitCount);
    }
    for (int i = 0; i < count; i++) {
      Pair<String, ConsumerRecord<byte[], byte[]>> tuple = consumerWrapper.pollMessage();
      ConsumerRecord<byte[], byte[]> msg = tuple.getRight();
      emitTuple(tuple.getLeft(), msg);
      AbstractKafkaPartitioner.PartitionMeta pm = new AbstractKafkaPartitioner.PartitionMeta(tuple.getLeft(),
          msg.topic(), msg.partition());
      offsetTrack.put(pm, msg.offset() + 1);
      if (isIdempotent() && !windowStartOffset.containsKey(pm)) {
        windowStartOffset.put(pm, msg.offset());
      }
    }
    emitCount += count;
  }

  protected abstract void emitTuple(String cluster, ConsumerRecord<byte[], byte[]> message);

  @Override
  public void beginWindow(long wid)
  {
    emitCount = 0;
    currentWindowId = wid;
    windowStartOffset.clear();
    if (isIdempotent() && wid <= windowDataManager.getLargestRecoveryWindow()) {
      replay(wid);
    } else {
      consumerWrapper.afterReplay();
    }
  }

  private void replay(long windowId)
  {
    try {
      Map<AbstractKafkaPartitioner.PartitionMeta, Pair<Long, Long>> windowData =
          (Map<AbstractKafkaPartitioner.PartitionMeta, Pair<Long, Long>>)windowDataManager.load(operatorId, windowId);
      consumerWrapper.emitImmediately(windowData);
    } catch (IOException e) {
      DTThrowable.rethrow(e);
    }
  }

  @Override
  public void endWindow()
  {
    // copy the current offset track to the history memory
    Map<AbstractKafkaPartitioner.PartitionMeta, Long> offsetsWithWindow = new HashMap<>(offsetTrack);
    offsetHistory.add(Pair.of(currentWindowId, offsetsWithWindow));

    //update metrics
    metrics.updateMetrics(clusters, consumerWrapper.getAllConsumerMetrics());

    //update the windowDataManager
    if (isIdempotent()) {
      try {
        Map<AbstractKafkaPartitioner.PartitionMeta, MutablePair<Long, Long>> windowData = new HashMap<>();
        for (Map.Entry<AbstractKafkaPartitioner.PartitionMeta, Long> e : windowStartOffset.entrySet()) {
          windowData.put(e.getKey(), new MutablePair<>(e.getValue(), offsetTrack.get(e.getKey()) - e.getValue()));
        }
        windowDataManager.save(windowData, operatorId, currentWindowId);
      } catch (IOException e) {
        DTThrowable.rethrow(e);
      }
    }
  }

  @Override
  public void setup(Context.OperatorContext context)
  {
    applicationName = context.getValue(Context.DAGContext.APPLICATION_NAME);
    consumerWrapper.create(this);
    metrics = new KafkaMetrics(metricsRefreshInterval);
    windowDataManager.setup(context);
    operatorId = context.getId();
  }
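
  // Note on the idempotent-replay flow above, which is active only when a non-noop
  // WindowDataManager is configured (see isIdempotent()): beginWindow() replays the
  // recorded offset ranges for any window at or below the largest recovery window;
  // endWindow() saves the start offset and message count consumed per partition;
  // committed() commits the offsets of the committed window back to Kafka and
  // deletes the saved window data that is no longer needed for recovery.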
  @Override
  public void teardown()
  {
    windowDataManager.teardown();
  }

  private void initPartitioner()
  {
    if (partitioner == null) {
      logger.info("Initialize Partitioner");
      switch (strategy) {
        case ONE_TO_ONE:
          partitioner = new OneToOnePartitioner(clusters, topics, this);
          break;
        case ONE_TO_MANY:
          partitioner = new OneToManyPartitioner(clusters, topics, this);
          break;
        case ONE_TO_MANY_HEURISTIC:
          throw new UnsupportedOperationException("Not implemented yet");
        default:
          throw new RuntimeException("Invalid strategy");
      }
      logger.info("Actual Partitioner is {}", partitioner.getClass());
    }
  }

  @Override
  public Response processStats(BatchedOperatorStats batchedOperatorStats)
  {
    long t = System.currentTimeMillis();
    if (repartitionInterval < 0 || repartitionCheckInterval < 0 ||
        t - lastCheckTime < repartitionCheckInterval || t - lastRepartitionTime < repartitionInterval) {
      // return false if it's within repartitionCheckInterval since the last stats check
      Response response = new Response();
      response.repartitionRequired = false;
      return response;
    }
    try {
      logger.debug("Process stats");
      initPartitioner();
      return partitioner.processStats(batchedOperatorStats);
    } finally {
      lastCheckTime = System.currentTimeMillis();
    }
  }

  @Override
  public Collection<Partitioner.Partition<AbstractKafkaInputOperator>> definePartitions(
      Collection<Partitioner.Partition<AbstractKafkaInputOperator>> collection, PartitioningContext partitioningContext)
  {
    logger.debug("Define partitions");
    initPartitioner();
    return partitioner.definePartitions(collection, partitioningContext);
  }

  @Override
  public void partitioned(Map<Integer, Partitioner.Partition<AbstractKafkaInputOperator>> map)
  {
    // update the last repartition time
    lastRepartitionTime = System.currentTimeMillis();
    initPartitioner();
    partitioner.partitioned(map);
  }

  /**
   * A callback from the consumer after it commits the offsets
   * @param map
   * @param e
   */
  public void onComplete(Map<TopicPartition, OffsetAndMetadata> map, Exception e)
  {
    if (logger.isDebugEnabled()) {
      logger.debug("Commit offsets complete {} ", Joiner.on(';').withKeyValueSeparator("=").join(map));
    }
    if (e != null) {
      logger.warn("Exceptions in committing offsets {} : {} ",
          Joiner.on(';').withKeyValueSeparator("=").join(map), e);
    }
  }

  public void assign(Set<AbstractKafkaPartitioner.PartitionMeta> assignment)
  {
    this.assignment = assignment;
  }

  public Set<AbstractKafkaPartitioner.PartitionMeta> assignment()
  {
    return assignment;
  }

  private boolean isIdempotent()
  {
    return windowDataManager != null && !(windowDataManager instanceof WindowDataManager.NoopWindowDataManager);
  }

  //---------------------------------------------setters and getters----------------------------------------

  public void setInitialPartitionCount(int partitionCount)
  {
    this.initialPartitionCount = partitionCount;
  }

  /**
   * initial partition count,
   * only used with PartitionStrategy.ONE_TO_MANY or a customized strategy
   */
  public int getInitialPartitionCount()
  {
    return initialPartitionCount;
  }

  public void setClusters(String clusters)
  {
    this.clusters = clusters.split(";");
  }

  /**
   * Same setting as the bootstrap.servers property of KafkaConsumer,
   * refer to http://kafka.apache.org/documentation.html#newconsumerconfigs
   * To support multiple clusters, you can have multiple bootstrap.servers lists separated by ";"
   */
  public String getClusters()
  {
    return Joiner.on(';').join(clusters);
  }

  public void setTopics(String topics)
  {
    this.topics = Iterables.toArray(Splitter.on(',').trimResults().omitEmptyStrings().split(topics), String.class);
  }

  /**
   * The topics the operator consumes, separated by ','.
   * Topic names can only contain ASCII alphanumerics, '.', '_' and '-'
   */
  public String getTopics()
  {
    return Joiner.on(", ").join(topics);
  }

  public void setStrategy(String policy)
  {
    this.strategy = PartitionStrategy.valueOf(policy.toUpperCase());
  }

  public String getStrategy()
  {
    return strategy.name();
  }

  public void setInitialOffset(String initialOffset)
  {
    this.initialOffset = InitialOffset.valueOf(initialOffset.toUpperCase());
  }

  /**
   * Initial offset, it should be one of the following:
   * <ul>
   * <li>earliest</li>
   * <li>latest</li>
   * <li>application_or_earliest</li>
   * <li>application_or_latest</li>
   * </ul>
   */
  public String getInitialOffset()
  {
    return initialOffset.name();
  }

  public String getApplicationName()
  {
    return applicationName;
  }

  public void setConsumerProps(Properties consumerProps)
  {
    this.consumerProps = consumerProps;
  }

  /**
   * Extra kafka consumer properties,
   * http://kafka.apache.org/090/documentation.html#newconsumerconfigs
   *
   * Please be aware that the properties below are set by the operator, don't override them:
   * <ul>
   * <li>bootstrap.servers</li>
   * <li>group.id</li>
   * <li>auto.offset.reset</li>
   * <li>enable.auto.commit</li>
   * <li>partition.assignment.strategy</li>
   * <li>key.deserializer</li>
   * <li>value.deserializer</li>
   * </ul>
   */
  public Properties getConsumerProps()
  {
    return consumerProps;
  }

  public void setMaxTuplesPerWindow(int maxTuplesPerWindow)
  {
    this.maxTuplesPerWindow = maxTuplesPerWindow;
  }

  /**
   * maximum number of tuples allowed to be emitted in each window
   */
  public int getMaxTuplesPerWindow()
  {
    return maxTuplesPerWindow;
  }

  /**
   * @see org.apache.kafka.clients.consumer.KafkaConsumer#poll(long)
   */
  public long getConsumerTimeout()
  {
    return consumerTimeout;
  }

  public void setConsumerTimeout(long consumerTimeout)
  {
    this.consumerTimeout = consumerTimeout;
  }

  /**
   * Number of messages kept in memory waiting for emission to the downstream operator
   */
  public int getHoldingBufferSize()
  {
    return holdingBufferSize;
  }

  public void setHoldingBufferSize(int holdingBufferSize)
  {
    this.holdingBufferSize = holdingBufferSize;
  }

  /**
   * metrics refresh interval
   */
  public long getMetricsRefreshInterval()
  {
    return metricsRefreshInterval;
  }

  public void setMetricsRefreshInterval(long metricsRefreshInterval)
  {
    this.metricsRefreshInterval = metricsRefreshInterval;
  }

  public void setRepartitionCheckInterval(long repartitionCheckInterval)
  {
    this.repartitionCheckInterval = repartitionCheckInterval;
  }

  /**
   * Minimal interval between checking the collected stats and deciding whether to repartition.
   * Also the minimal interval between 2 offset updates
   */
  public long getRepartitionCheckInterval()
  {
    return repartitionCheckInterval;
  }

  public void setRepartitionInterval(long repartitionInterval)
  {
    this.repartitionInterval = repartitionInterval;
  }

  /**
   * Minimal interval between 2 (re)partition actions
   */
  public long getRepartitionInterval()
  {
    return repartitionInterval;
  }

  public void setWindowDataManager(WindowDataManager windowDataManager)
  {
    this.windowDataManager = windowDataManager;
  }

  public WindowDataManager getWindowDataManager()
  {
    return windowDataManager;
  }

  /**
   * @omitFromUI
   * @return current checkpointed offsets
   */
  public Map<AbstractKafkaPartitioner.PartitionMeta, Long> getOffsetTrack()
  {
    return offsetTrack;
  }
}
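
For orientation, here is a minimal usage sketch (not part of the artifact above) showing how a concrete subclass of AbstractKafkaInputOperator is typically wired into an Apex application. It assumes the KafkaSinglePortInputOperator subclass from this module and ConsoleOutputOperator from the Malhar library; the broker addresses, topic names, and application class name are illustrative.

import org.apache.hadoop.conf.Configuration;

import org.apache.apex.malhar.kafka.AbstractKafkaInputOperator;
import org.apache.apex.malhar.kafka.KafkaSinglePortInputOperator;

import com.datatorrent.api.DAG;
import com.datatorrent.api.StreamingApplication;
import com.datatorrent.lib.io.ConsoleOutputOperator;

public class KafkaReaderApp implements StreamingApplication
{
  @Override
  public void populateDAG(DAG dag, Configuration conf)
  {
    KafkaSinglePortInputOperator input =
        dag.addOperator("kafkaInput", new KafkaSinglePortInputOperator());
    // Two clusters separated by ';'; each side is a comma-separated
    // bootstrap.servers list (illustrative addresses)
    input.setClusters("broker1:9092,broker2:9092;broker3:9092");
    // Comma-separated topic list (illustrative names)
    input.setTopics("events,clicks");
    // Resume from committed offsets, or read from the earliest offset on the first run
    input.setInitialOffset(AbstractKafkaInputOperator.InitialOffset.APPLICATION_OR_EARLIEST.name());

    ConsoleOutputOperator console = dag.addOperator("console", new ConsoleOutputOperator());
    dag.addStream("kafkaMessages", input.outputPort, console.input);
  }
}

Because windowDataManager defaults to NoopWindowDataManager, this sketch runs without idempotent replay; passing a file-system-backed WindowDataManager implementation to setWindowDataManager() enables the replay path described in the listing.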



