/*
* Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.utilities.sources;
import com.uber.hoodie.exception.HoodieNotSupportedException;
import com.uber.hoodie.utilities.UtilHelpers;
import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
import com.uber.hoodie.utilities.schema.SchemaProvider;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import kafka.common.TopicAndPartition;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import kafka.serializer.DefaultDecoder;
import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.immutable.Map;
import scala.collection.immutable.Set;
import scala.collection.mutable.ArrayBuffer;
import scala.collection.mutable.StringBuilder;
import scala.util.Either;
/**
 * Source to read data from Kafka, incrementally
 */
public class KafkaSource extends Source {

  private static volatile Logger log = LogManager.getLogger(KafkaSource.class);
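
  /**
   * Utilities for converting between per-partition Kafka offsets and the checkpoint string
   * that is passed into and returned from {@link KafkaSource#fetchNewData}.
   */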
  static class CheckpointUtils {

    /**
     * Reconstruct checkpoint from string.
     *
     * @param checkpointStr checkpoint string, in the format produced by {@link #offsetsToStr(HashMap)}
     * @return map from topic partition to the offset to resume reading from
     */
    public static HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> strToOffsets(String checkpointStr) {
      HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> offsetMap = new HashMap<>();
      String[] splits = checkpointStr.split(",");
      String topic = splits[0];
      for (int i = 1; i < splits.length; i++) {
        String[] subSplits = splits[i].split(":");
        offsetMap.put(new TopicAndPartition(topic, Integer.parseInt(subSplits[0])),
            new KafkaCluster.LeaderOffset("", -1, Long.parseLong(subSplits[1])));
      }
      return offsetMap;
    }

    /**
     * String representation of checkpoint
     *
     * Format:
     * topic1,0:offset0,1:offset1,2:offset2, .....
     *
     * @param offsetMap map from topic partition to offset
     * @return checkpoint string in the above format
     */
    public static String offsetsToStr(HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> offsetMap) {
      StringBuilder sb = new StringBuilder();
      // at least 1 partition will be present.
      sb.append(offsetMap.entrySet().stream().findFirst().get().getKey().topic() + ",");
      sb.append(offsetMap.entrySet().stream()
          .map(e -> String.format("%s:%d", e.getKey().partition(), e.getValue().offset()))
          .collect(Collectors.joining(",")));
      return sb.toString();
    }
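
    /**
     * Compute the offset ranges to read from Kafka, given the offsets at the last checkpoint and
     * the target (latest) offsets. One range is produced per partition in the target offset map;
     * partitions absent from the checkpoint (e.g. newly added ones) get a fromOffset of -1.
     */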
    public static OffsetRange[] computeOffsetRanges(HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> fromOffsetMap,
        HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> toOffsetMap) {
      Comparator<OffsetRange> byPartition = (OffsetRange o1, OffsetRange o2) -> {
        return Integer.valueOf(o1.partition()).compareTo(Integer.valueOf(o2.partition()));
      };
      List<OffsetRange> offsetRanges = toOffsetMap.entrySet().stream().map(e -> {
        TopicAndPartition tp = e.getKey();
        long fromOffset = -1;
        if (fromOffsetMap.containsKey(tp)) {
          fromOffset = fromOffsetMap.get(tp).offset();
        }
        return OffsetRange.create(tp, fromOffset, e.getValue().offset());
      }).sorted(byPartition).collect(Collectors.toList());
      OffsetRange[] ranges = new OffsetRange[offsetRanges.size()];
      return offsetRanges.toArray(ranges);
    }
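
    /**
     * Total number of new messages to be read across all the given offset ranges.
     */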
    public static long totalNewMessages(OffsetRange[] ranges) {
      long totalMsgs = 0;
      for (OffsetRange range : ranges) {
        totalMsgs += Math.max(range.untilOffset() - range.fromOffset(), 0);
      }
      return totalMsgs;
    }
  }

  /**
   * Helpers to deal with tricky scala <=> java conversions. (oh my!)
   */
  static class ScalaHelpers {

    public static <K, V> Map<K, V> toScalaMap(HashMap<K, V> m) {
      return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap(Predef.<Tuple2<K, V>>conforms());
    }

    public static Set<String> toScalaSet(HashSet<String> s) {
      return JavaConverters.asScalaSetConverter(s).asScala().toSet();
    }

    public static <K, V> java.util.Map<K, V> toJavaMap(Map<K, V> m) {
      return JavaConverters.mapAsJavaMapConverter(m).asJava();
    }
  }

  /**
   * Configs to be passed for this source. All standard Kafka consumer configs are also respected.
   */
  static class Config {

    private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
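    // Used only when no checkpoint is available; mirrors the Kafka consumer's
    // "auto.offset.reset" values ("smallest" or "largest")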
    private static final String DEFAULT_AUTO_RESET_OFFSET = "largest";
  }

  private HashMap<String, String> kafkaParams;

  private final String topicName;
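
  /**
   * Builds the source. Every property in the supplied config is passed through as a Kafka
   * consumer param, and {@code hoodie.deltastreamer.source.kafka.topic} must be set.
   */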
  public KafkaSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat,
      SchemaProvider schemaProvider) {
    super(config, sparkContext, dataFormat, schemaProvider);
    kafkaParams = new HashMap<>();
    Stream<String> keys = StreamSupport.stream(
        Spliterators.spliteratorUnknownSize(config.getKeys(), Spliterator.NONNULL), false);
    keys.forEach(k -> kafkaParams.put(k, config.getString(k)));
    UtilHelpers.checkRequiredProperties(config, Arrays.asList(Config.KAFKA_TOPIC_NAME));
    topicName = config.getString(Config.KAFKA_TOPIC_NAME);
  }
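
  /**
   * Fetches any new messages for the configured topic: reads from the offsets in the last
   * checkpoint (or from the position implied by auto.offset.reset when there is none) up to the
   * latest available offsets, and returns the records along with the new checkpoint string.
   */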
  @Override
  public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(Optional<String> lastCheckpointStr, long maxInputBytes) {
    // Obtain current metadata for the topic
    KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams));
    Either<ArrayBuffer<Throwable>, Set<TopicAndPartition>> either =
        cluster.getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Arrays.asList(topicName))));
    if (either.isLeft()) {
      // log errors and bail out.
      throw new HoodieDeltaStreamerException("Error obtaining partition metadata", either.left().get().head());
    }
    Set<TopicAndPartition> topicPartitions = either.right().get();
    // Determine the offset ranges to read from
    HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> fromOffsets;
    if (lastCheckpointStr.isPresent()) {
      fromOffsets = CheckpointUtils.strToOffsets(lastCheckpointStr.get());
    } else {
      String autoResetValue = config.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET);
      if (autoResetValue.equals("smallest")) {
        fromOffsets = new HashMap(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get()));
      } else if (autoResetValue.equals("largest")) {
        fromOffsets = new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
      } else {
        throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest'");
      }
    }
    // Always read until the latest offset
    HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> toOffsets =
        new HashMap(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
    // Come up with final set of OffsetRanges to read (account for new partitions)
    // TODO(vc): Respect maxInputBytes, by estimating number of messages to read each batch from partition size
    OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets);
    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
    if (totalNewMsgs <= 0) {
      return new ImmutablePair<>(Optional.empty(),
          lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : CheckpointUtils.offsetsToStr(toOffsets));
    } else {
      log.info("About to read " + totalNewMsgs + " new messages from Kafka for topic: " + topicName);
    }
    // Perform the actual read from Kafka
    JavaRDD<byte[]> kafkaRDD = KafkaUtils.createRDD(
        sparkContext,
        byte[].class,
        byte[].class,
        DefaultDecoder.class,
        DefaultDecoder.class,
        kafkaParams,
        offsetRanges).values();
    // Produce a RDD[GenericRecord]
    final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString());
    JavaRDD<GenericRecord> newDataRDD;
    if (dataFormat == SourceDataFormat.AVRO) {
      newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromAvroBinary(bytes));
    } else if (dataFormat == SourceDataFormat.JSON) {
      newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromJson(new String(bytes, Charset.forName("utf-8"))));
    } else {
      throw new HoodieNotSupportedException("Unsupported data format: " + dataFormat);
    }
    return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(toOffsets));
  }
}