/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source.extractor.extract.kafka;
import gobblin.source.extractor.limiter.LimiterConfigurationKeys;
import gobblin.source.workunit.MultiWorkUnit;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import lombok.Getter;
import lombok.Setter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.kafka.client.GobblinKafkaConsumerClient;
import gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory;
import gobblin.source.extractor.extract.EventBasedSource;
import gobblin.source.extractor.extract.kafka.workunit.packer.KafkaWorkUnitPacker;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import gobblin.util.ClassAliasResolver;
import gobblin.util.ConfigUtils;
import gobblin.util.DatasetFilterUtils;
import gobblin.util.ExecutorsUtils;
import gobblin.util.dataset.DatasetUtils;
/**
* A {@link gobblin.source.Source} implementation for Kafka.
*
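* Topics to pull are selected via the {@link #TOPIC_WHITELIST} and {@link #TOPIC_BLACKLIST} regex
* properties, and one {@link WorkUnit} is created per topic partition based on the offsets committed
* by the previous run. A minimal illustrative job configuration (the key names are the constants defined
* in this class; the whitelist pattern is only a hypothetical example):
* <pre>
* topic.whitelist=MyTopic.*
* bootstrap.with.offset=earliest
* reset.on.offset.out.of.range=nearest
* </pre>
*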
* @author Ziyang Liu
*/
public abstract class KafkaSource<S, D> extends EventBasedSource<S, D> {
private static final Logger LOG = LoggerFactory.getLogger(KafkaSource.class);
public static final String TOPIC_BLACKLIST = "topic.blacklist";
public static final String TOPIC_WHITELIST = "topic.whitelist";
public static final String LATEST_OFFSET = "latest";
public static final String EARLIEST_OFFSET = "earliest";
public static final String NEAREST_OFFSET = "nearest";
public static final String BOOTSTRAP_WITH_OFFSET = "bootstrap.with.offset";
public static final String DEFAULT_BOOTSTRAP_WITH_OFFSET = LATEST_OFFSET;
public static final String TOPICS_MOVE_TO_LATEST_OFFSET = "topics.move.to.latest.offset";
public static final String RESET_ON_OFFSET_OUT_OF_RANGE = "reset.on.offset.out.of.range";
public static final String DEFAULT_RESET_ON_OFFSET_OUT_OF_RANGE = NEAREST_OFFSET;
public static final String TOPIC_NAME = "topic.name";
public static final String PARTITION_ID = "partition.id";
public static final String LEADER_ID = "leader.id";
public static final String LEADER_HOSTANDPORT = "leader.hostandport";
public static final Extract.TableType DEFAULT_TABLE_TYPE = Extract.TableType.APPEND_ONLY;
public static final String DEFAULT_NAMESPACE_NAME = "KAFKA";
public static final String ALL_TOPICS = "all";
public static final String AVG_RECORD_SIZE = "avg.record.size";
public static final String AVG_RECORD_MILLIS = "avg.record.millis";
public static final String GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS = "gobblin.kafka.consumerClient.class";
public static final String GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION =
"gobblin.kafka.extract.allowTableTypeAndNamspaceCustomization";
public static final String DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS =
"gobblin.kafka.client.Kafka08ConsumerClient$Factory";
private final Set<String> moveToLatestTopics = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
private final Map<KafkaPartition, Long> previousOffsets = Maps.newConcurrentMap();
private final Set<KafkaPartition> partitionsToBeProcessed = Sets.newConcurrentHashSet();
private final AtomicInteger failToGetOffsetCount = new AtomicInteger(0);
private final AtomicInteger offsetTooEarlyCount = new AtomicInteger(0);
private final AtomicInteger offsetTooLateCount = new AtomicInteger(0);
private GobblinKafkaConsumerClient kafkaConsumerClient;
private final ClassAliasResolver<GobblinKafkaConsumerClientFactory> kafkaConsumerClientResolver =
new ClassAliasResolver<>(GobblinKafkaConsumerClientFactory.class);
private volatile boolean doneGettingAllPreviousOffsets = false;
private Extract.TableType tableType;
private String extractNameSpace;
private boolean isFullExtract;
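// Work unit property keys (topic name and partition id) that are set as the limiter's report key list
// on each work unit; see setLimiterReportKeyListToWorkUnits below.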
private List<String> getLimiterExtractorReportKeys() {
List<String> keyNames = new ArrayList<>();
keyNames.add(KafkaSource.TOPIC_NAME);
keyNames.add(KafkaSource.PARTITION_ID);
return keyNames;
}
private void setLimiterReportKeyListToWorkUnits(List<WorkUnit> workUnits, List<String> keyNameList) {
if (keyNameList.isEmpty())
return;
String keyList = Joiner.on(',').join(keyNameList.iterator());
for (WorkUnit workUnit: workUnits) {
workUnit.setProp(LimiterConfigurationKeys.LIMITER_REPORT_KEY_LIST, keyList);
}
}
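/**
* Creates {@link WorkUnit}s for all topics that pass the whitelist/blacklist filters: work units for each
* topic's partitions are created in a thread pool, empty work units are added for partitions that have
* previous offsets but are not processed in this run, and the result is packed into at most the configured
* number of mappers by a {@link KafkaWorkUnitPacker}.
*/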
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
String tableTypeStr = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY,
KafkaSource.DEFAULT_TABLE_TYPE.toString());
tableType = Extract.TableType.valueOf(tableTypeStr);
extractNameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
} else {
// For backward compatibility, ignore the table type and namespace configuration keys, as the previous implementation did
tableType = KafkaSource.DEFAULT_TABLE_TYPE;
extractNameSpace = KafkaSource.DEFAULT_NAMESPACE_NAME;
}
isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
try {
this.kafkaConsumerClient =
kafkaConsumerClientResolver
.resolveClass(
state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS,
DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS)).newInstance()
.create(ConfigUtils.propertiesToConfig(state.getProperties()));
List<KafkaTopic> topics = getFilteredTopics(state);
for (KafkaTopic topic : topics) {
LOG.info("Discovered topic " + topic.getName());
}
Map<String, State> topicSpecificStateMap =
DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {
@Override
public String apply(KafkaTopic topic) {
return topic.getName();
}
}), state);
int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS,
ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
ExecutorService threadPool =
Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));
Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();
for (KafkaTopic topic : topics) {
threadPool.submit(new WorkUnitCreator(topic, state,
Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
}
ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(),
createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));
// Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets,
// but aren't processed).
createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);
int numOfMultiWorkunits =
state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
return workUnitList;
} catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
throw new RuntimeException(e);
} finally {
try {
if (this.kafkaConsumerClient != null) {
this.kafkaConsumerClient.close();
}
} catch (IOException e) {
throw new RuntimeException("Exception closing kafkaConsumerClient");
}
}
}
private void addTopicSpecificPropsToWorkUnits(List<WorkUnit> workUnits, Map<String, State> topicSpecificStateMap) {
for (WorkUnit workUnit : workUnits) {
addTopicSpecificPropsToWorkUnit(workUnit, topicSpecificStateMap);
}
}
private void addTopicSpecificPropsToWorkUnit(WorkUnit workUnit, Map<String, State> topicSpecificStateMap) {
if (workUnit instanceof MultiWorkUnit) {
for (WorkUnit wu : ((MultiWorkUnit) workUnit).getWorkUnits()) {
addTopicSpecificPropsToWorkUnit(wu, topicSpecificStateMap);
}
} else if (!workUnit.contains(TOPIC_NAME)) {
return;
} else if (topicSpecificStateMap == null) {
return;
} else if (!topicSpecificStateMap.containsKey(workUnit.getProp(TOPIC_NAME))) {
return;
} else {
workUnit.addAll(topicSpecificStateMap.get(workUnit.getProp(TOPIC_NAME)));
}
}
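/**
* Creates an empty {@link WorkUnit} for every partition that has a previous offset but is not processed in
* this run, so that its previous offset is carried over to the next run.
*/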
private void createEmptyWorkUnitsForSkippedPartitions(Map<String, List<WorkUnit>> workUnits,
Map<String, State> topicSpecificStateMap, SourceState state) {
// Make sure all previous offsets have been loaded, in case they have not been fetched yet.
getAllPreviousOffsets(state);
// For each partition that has a previous offset, create an empty WorkUnit for it if
// it is not in this.partitionsToBeProcessed.
for (Map.Entry<KafkaPartition, Long> entry : this.previousOffsets.entrySet()) {
KafkaPartition partition = entry.getKey();
if (!this.partitionsToBeProcessed.contains(partition)) {
long previousOffset = entry.getValue();
WorkUnit emptyWorkUnit = createEmptyWorkUnit(partition, previousOffset,
Optional.fromNullable(topicSpecificStateMap.get(partition.getTopicName())));
String topicName = partition.getTopicName();
if (workUnits.containsKey(topicName)) {
workUnits.get(topicName).add(emptyWorkUnit);
} else {
workUnits.put(topicName, Lists.newArrayList(emptyWorkUnit));
}
}
}
}
/*
* This method needs to be thread-safe, since it is called concurrently from WorkUnitCreator tasks.
*/
private List<WorkUnit> getWorkUnitsForTopic(KafkaTopic topic, SourceState state, Optional<State> topicSpecificState) {
boolean topicQualified = isTopicQualified(topic);
List<WorkUnit> workUnits = Lists.newArrayList();
for (KafkaPartition partition : topic.getPartitions()) {
WorkUnit workUnit = getWorkUnitForTopicPartition(partition, state, topicSpecificState);
this.partitionsToBeProcessed.add(partition);
if (workUnit != null) {
// For disqualified topics, for each of its workunits set the high watermark to be the same
// as the low watermark, so that it will be skipped.
if (!topicQualified) {
skipWorkUnit(workUnit);
}
workUnits.add(workUnit);
}
}
return workUnits;
}
/**
* Whether a {@link KafkaTopic} is qualified to be pulled.
*
* This method can be overridden by subclasses for verifying topic eligibility, e.g., one may want to
* skip a topic if its schema cannot be found in the schema registry.
*/
protected boolean isTopicQualified(KafkaTopic topic) {
return true;
}
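// Marks a work unit as skipped by setting its high watermark equal to its low watermark, so nothing is pulled.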
@SuppressWarnings("deprecation")
private static void skipWorkUnit(WorkUnit workUnit) {
workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, workUnit.getLowWaterMark());
}
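/**
* Creates a {@link WorkUnit} for a single partition. The start offset is the previously committed offset when
* it is available and in range; otherwise the behavior is governed by {@link #BOOTSTRAP_WITH_OFFSET} (no
* previous offset) or {@link #RESET_ON_OFFSET_OUT_OF_RANGE} (previous offset out of range). Topics listed in
* {@link #TOPICS_MOVE_TO_LATEST_OFFSET} always start at the latest offset. Returns null if the partition is
* skipped and there is no previous offset to persist.
*/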
private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, SourceState state,
Optional<State> topicSpecificState) {
Offsets offsets = new Offsets();
boolean failedToGetKafkaOffsets = false;
try {
offsets.setEarliestOffset(this.kafkaConsumerClient.getEarliestOffset(partition));
offsets.setLatestOffset(this.kafkaConsumerClient.getLatestOffset(partition));
} catch (KafkaOffsetRetrievalFailureException e) {
failedToGetKafkaOffsets = true;
}
long previousOffset = 0;
boolean previousOffsetNotFound = false;
try {
previousOffset = getPreviousOffsetForPartition(partition, state);
} catch (PreviousOffsetNotFoundException e) {
previousOffsetNotFound = true;
}
if (failedToGetKafkaOffsets) {
// Increment counts, which will be reported as job metrics
this.failToGetOffsetCount.incrementAndGet();
// When unable to get earliest/latest offsets from Kafka, skip the partition and create an empty workunit,
// so that previousOffset is persisted.
LOG.warn(String.format(
"Failed to retrieve earliest and/or latest offset for partition %s. This partition will be skipped.",
partition));
return previousOffsetNotFound ? null : createEmptyWorkUnit(partition, previousOffset, topicSpecificState);
}
if (shouldMoveToLatestOffset(partition, state)) {
offsets.startAtLatestOffset();
} else if (previousOffsetNotFound) {
// When previous offset cannot be found, either start at earliest offset or latest offset, or skip the partition
// (no need to create an empty workunit in this case since there's no offset to persist).
String offsetNotFoundMsg = String.format("Previous offset for partition %s does not exist. ", partition);
String offsetOption = state.getProp(BOOTSTRAP_WITH_OFFSET, DEFAULT_BOOTSTRAP_WITH_OFFSET).toLowerCase();
if (offsetOption.equals(LATEST_OFFSET)) {
LOG.warn(offsetNotFoundMsg + "This partition will start from the latest offset: " + offsets.getLatestOffset());
offsets.startAtLatestOffset();
} else if (offsetOption.equals(EARLIEST_OFFSET)) {
LOG.warn(
offsetNotFoundMsg + "This partition will start from the earliest offset: " + offsets.getEarliestOffset());
offsets.startAtEarliestOffset();
} else {
LOG.warn(offsetNotFoundMsg + "This partition will be skipped.");
return null;
}
} else {
try {
offsets.startAt(previousOffset);
} catch (StartOffsetOutOfRangeException e) {
// Increment counts, which will be reported as job metrics
if (offsets.getStartOffset() <= offsets.getLatestOffset()) {
this.offsetTooEarlyCount.incrementAndGet();
} else {
this.offsetTooLateCount.incrementAndGet();
}
// When previous offset is out of range, either start at earliest, latest or nearest offset, or skip the
// partition. If skipping, need to create an empty workunit so that previousOffset is persisted.
String offsetOutOfRangeMsg = String.format(
"Start offset for partition %s is out of range. Start offset = %d, earliest offset = %d, latest offset = %d.",
partition, offsets.getStartOffset(), offsets.getEarliestOffset(), offsets.getLatestOffset());
String offsetOption =
state.getProp(RESET_ON_OFFSET_OUT_OF_RANGE, DEFAULT_RESET_ON_OFFSET_OUT_OF_RANGE).toLowerCase();
if (offsetOption.equals(LATEST_OFFSET)
|| (offsetOption.equals(NEAREST_OFFSET) && offsets.getStartOffset() >= offsets.getLatestOffset())) {
LOG.warn(
offsetOutOfRangeMsg + "This partition will start from the latest offset: " + offsets.getLatestOffset());
offsets.startAtLatestOffset();
} else if (offsetOption.equals(EARLIEST_OFFSET) || offsetOption.equals(NEAREST_OFFSET)) {
LOG.warn(offsetOutOfRangeMsg + "This partition will start from the earliest offset: "
+ offsets.getEarliestOffset());
offsets.startAtEarliestOffset();
} else {
LOG.warn(offsetOutOfRangeMsg + "This partition will be skipped.");
return createEmptyWorkUnit(partition, previousOffset, topicSpecificState);
}
}
}
return getWorkUnitForTopicPartition(partition, offsets, topicSpecificState);
}
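/**
* Returns the offset committed for the given partition by the previous run, lazily loading all previous
* offsets from the prior run's work unit states on first use.
*/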
private long getPreviousOffsetForPartition(KafkaPartition partition, SourceState state)
throws PreviousOffsetNotFoundException {
getAllPreviousOffsets(state);
if (this.previousOffsets.containsKey(partition)) {
return this.previousOffsets.get(partition);
}
throw new PreviousOffsetNotFoundException(String.format("Previous offset for topic %s, partition %s not found.",
partition.getTopicName(), partition.getId()));
}
// needs to be synchronized, since this.previousOffsets must be initialized exactly once
private synchronized void getAllPreviousOffsets(SourceState state) {
if (this.doneGettingAllPreviousOffsets) {
return;
}
this.previousOffsets.clear();
for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
MultiLongWatermark watermark = workUnitState.getActualHighWatermark(MultiLongWatermark.class);
Preconditions.checkArgument(partitions.size() == watermark.size(), String.format(
"Num of partitions doesn't match number of watermarks: partitions=%s, watermarks=%s", partitions, watermark));
for (int i = 0; i < partitions.size(); i++) {
if (watermark.get(i) != ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
this.previousOffsets.put(partitions.get(i), watermark.get(i));
}
}
}
this.doneGettingAllPreviousOffsets = true;
}
/**
* A topic can be configured to move to the latest offset in {@link #TOPICS_MOVE_TO_LATEST_OFFSET}.
*
* Needs to be synchronized, since it is accessed by multiple threads.
*/
private synchronized boolean shouldMoveToLatestOffset(KafkaPartition partition, SourceState state) {
if (!state.contains(TOPICS_MOVE_TO_LATEST_OFFSET)) {
return false;
}
if (this.moveToLatestTopics.isEmpty()) {
this.moveToLatestTopics.addAll(
Splitter.on(',').trimResults().omitEmptyStrings().splitToList(state.getProp(TOPICS_MOVE_TO_LATEST_OFFSET)));
}
return this.moveToLatestTopics.contains(partition.getTopicName()) || this.moveToLatestTopics.contains(ALL_TOPICS);
}
// Thread-safe: creates an empty work unit whose low and high watermarks both equal previousOffset,
// so the previous offset is persisted without pulling any data.
private WorkUnit createEmptyWorkUnit(KafkaPartition partition, long previousOffset,
Optional<State> topicSpecificState) {
Offsets offsets = new Offsets();
offsets.setEarliestOffset(previousOffset);
offsets.setLatestOffset(previousOffset);
offsets.startAtEarliestOffset();
return getWorkUnitForTopicPartition(partition, offsets, topicSpecificState);
}
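/**
* Builds the actual {@link WorkUnit}: sets the topic, partition and leader properties, and uses the start
* offset and latest offset as the work unit's low and high watermarks, respectively.
*/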
private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets,
Optional<State> topicSpecificState) {
Extract extract = this.createExtract(tableType, extractNameSpace, partition.getTopicName());
if (isFullExtract) {
extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
}
WorkUnit workUnit = WorkUnit.create(extract);
if (topicSpecificState.isPresent()) {
workUnit.addAll(topicSpecificState.get());
}
workUnit.setProp(TOPIC_NAME, partition.getTopicName());
workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
workUnit.setProp(PARTITION_ID, partition.getId());
workUnit.setProp(LEADER_ID, partition.getLeader().getId());
workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d", partition,
offsets.getStartOffset(), offsets.getLatestOffset(), offsets.getLatestOffset() - offsets.getStartOffset()));
return workUnit;
}
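// Returns the Kafka topics that pass the configured whitelist/blacklist regex filters.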
private List<KafkaTopic> getFilteredTopics(SourceState state) {
List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, TOPIC_BLACKLIST);
List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, TOPIC_WHITELIST);
return this.kafkaConsumerClient.getFilteredTopics(blacklist, whitelist);
}
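// Persists the offset-related counters into the state so they can be reported as job metrics.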
@Override
public void shutdown(SourceState state) {
state.setProp(ConfigurationKeys.OFFSET_TOO_EARLY_COUNT, this.offsetTooEarlyCount);
state.setProp(ConfigurationKeys.OFFSET_TOO_LATE_COUNT, this.offsetTooLateCount);
state.setProp(ConfigurationKeys.FAIL_TO_GET_OFFSET_COUNT, this.failToGetOffsetCount);
}
/**
* This class contains startOffset, earliestOffset and latestOffset for a Kafka partition.
*/
private static class Offsets {
@Getter
private long startOffset = 0;
@Getter
@Setter
private long earliestOffset = 0;
@Getter
@Setter
private long latestOffset = 0;
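// Sets the start offset, validating that it lies within [earliestOffset, latestOffset].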
private void startAt(long offset) throws StartOffsetOutOfRangeException {
if (offset < this.earliestOffset || offset > this.latestOffset) {
throw new StartOffsetOutOfRangeException(
String.format("start offset = %d, earliest offset = %d, latest offset = %d", offset, this.earliestOffset,
this.latestOffset));
}
this.startOffset = offset;
}
private void startAtEarliestOffset() {
this.startOffset = this.earliestOffset;
}
private void startAtLatestOffset() {
this.startOffset = this.latestOffset;
}
}
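/**
* A {@link Runnable} that creates the work units for a single topic and puts them into the shared map of
* work units, keyed by topic name.
*/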
private class WorkUnitCreator implements Runnable {
private final KafkaTopic topic;
private final SourceState state;
private final Optional<State> topicSpecificState;
private final Map<String, List<WorkUnit>> allTopicWorkUnits;
WorkUnitCreator(KafkaTopic topic, SourceState state, Optional<State> topicSpecificState,
Map<String, List<WorkUnit>> workUnits) {
this.topic = topic;
this.state = state;
this.topicSpecificState = topicSpecificState;
this.allTopicWorkUnits = workUnits;
}
@Override
public void run() {
try {
this.allTopicWorkUnits.put(this.topic.getName(),
KafkaSource.this.getWorkUnitsForTopic(this.topic, this.state, this.topicSpecificState));
} catch (Throwable t) {
LOG.error("Caught error in creating work unit for " + this.topic.getName(), t);
throw t;
}
}
}
}