/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.sources.helpers;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.stream.Collectors;
import kafka.common.TopicAndPartition;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.TypedProperties;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.utilities.exception.HoodieDeltaStreamerException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
import org.apache.spark.streaming.kafka.OffsetRange;
import scala.Predef;
import scala.collection.JavaConverters;
import scala.collection.immutable.Map;
import scala.collection.immutable.Set;
import scala.collection.mutable.ArrayBuffer;
import scala.util.Either;
/**
 * Generates Kafka offset ranges for reading data from a Kafka topic, incrementally.
 */
public class KafkaOffsetGen {
private static volatile Logger log = LogManager.getLogger(KafkaOffsetGen.class);
  private static final long DEFAULT_MAX_EVENTS_TO_READ = 1000000; // 1M events max
public static class CheckpointUtils {
/**
* Reconstruct checkpoint from string.
*/
    public static HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> strToOffsets(String checkpointStr) {
      HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> offsetMap = new HashMap<>();
if (checkpointStr.length() == 0) {
return offsetMap;
}
String[] splits = checkpointStr.split(",");
String topic = splits[0];
for (int i = 1; i < splits.length; i++) {
String[] subSplits = splits[i].split(":");
offsetMap.put(new TopicAndPartition(topic, Integer.parseInt(subSplits[0])),
new KafkaCluster.LeaderOffset("", -1, Long.parseLong(subSplits[1])));
}
return offsetMap;
}
/**
* String representation of checkpoint
*
* Format: topic1,0:offset0,1:offset1,2:offset2, .....
*/
public static String offsetsToStr(OffsetRange[] ranges) {
StringBuilder sb = new StringBuilder();
// at least 1 partition will be present.
sb.append(ranges[0].topic() + ",");
sb.append(Arrays.stream(ranges).map(r -> String.format("%s:%d", r.partition(), r.untilOffset()))
.collect(Collectors.joining(",")));
return sb.toString();
}
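
    // Illustrative round trip of the checkpoint format above (topic name and offsets are
    // hypothetical): for ranges over topic "impressions" ending at offset 100 on partition 0 and
    // offset 250 on partition 1, offsetsToStr(ranges) yields "impressions,0:100,1:250", and
    // strToOffsets("impressions,0:100,1:250") rebuilds the map
    // {impressions-0 -> LeaderOffset("", -1, 100), impressions-1 -> LeaderOffset("", -1, 250)},
    // so the untilOffsets of one run become the fromOffsets of the next.
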
/**
* Compute the offset ranges to read from Kafka, while handling newly added partitions, skews, event limits.
*
* @param fromOffsetMap offsets where we left off last time
     * @param toOffsetMap offsets of where each partition currently is
* @param numEvents maximum number of events to read.
*/
    public static OffsetRange[] computeOffsetRanges(HashMap<TopicAndPartition, LeaderOffset> fromOffsetMap,
        HashMap<TopicAndPartition, LeaderOffset> toOffsetMap, long numEvents) {
      Comparator<OffsetRange> byPartition = Comparator.comparing(OffsetRange::partition);
// Create initial offset ranges for each 'to' partition, with from = to offsets.
OffsetRange[] ranges = new OffsetRange[toOffsetMap.size()];
toOffsetMap.entrySet().stream().map(e -> {
TopicAndPartition tp = e.getKey();
long fromOffset = fromOffsetMap.getOrDefault(tp, new LeaderOffset("", -1, 0)).offset();
return OffsetRange.create(tp, fromOffset, fromOffset);
}).sorted(byPartition).collect(Collectors.toList()).toArray(ranges);
long allocedEvents = 0;
      java.util.Set<Integer> exhaustedPartitions = new HashSet<>();
// keep going until we have events to allocate and partitions still not exhausted.
while (allocedEvents < numEvents && exhaustedPartitions.size() < toOffsetMap.size()) {
long remainingEvents = numEvents - allocedEvents;
long eventsPerPartition =
(long) Math.ceil((1.0 * remainingEvents) / (toOffsetMap.size() - exhaustedPartitions.size()));
// Allocate the remaining events to non-exhausted partitions, in round robin fashion
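        // Worked example of this allocation (all numbers hypothetical): with numEvents = 100 and
        // two partitions holding 30 and 500 new events respectively, the first pass hands out
        // ceil(100 / 2) = 50 events per partition; partition 0 is exhausted at 30 (80 allocated),
        // and a second pass gives the remaining ceil(20 / 1) = 20 events to partition 1, ending
        // with ranges of 30 and 70 events (100 in total).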
for (int i = 0; i < ranges.length; i++) {
OffsetRange range = ranges[i];
if (!exhaustedPartitions.contains(range.partition())) {
long toOffsetMax = toOffsetMap.get(range.topicAndPartition()).offset();
long toOffset = Math.min(toOffsetMax, range.untilOffset() + eventsPerPartition);
if (toOffset == toOffsetMax) {
exhaustedPartitions.add(range.partition());
}
allocedEvents += toOffset - range.untilOffset();
            // We need to recompute toOffset if allocedEvents is larger than numEvents.
if (allocedEvents > numEvents) {
long offsetsToAdd = Math.min(eventsPerPartition, (numEvents - allocedEvents));
toOffset = Math.min(toOffsetMax, toOffset + offsetsToAdd);
}
ranges[i] = OffsetRange.create(range.topicAndPartition(), range.fromOffset(), toOffset);
}
}
}
return ranges;
}
public static long totalNewMessages(OffsetRange[] ranges) {
return Arrays.stream(ranges).mapToLong(OffsetRange::count).sum();
}
}
/**
* Helpers to deal with tricky scala <=> java conversions. (oh my!)
*/
static class ScalaHelpers {
    public static <K, V> Map<K, V> toScalaMap(HashMap<K, V> m) {
      return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap(Predef.conforms());
    }

    public static Set<String> toScalaSet(HashSet<String> s) {
      return JavaConverters.asScalaSetConverter(s).asScala().toSet();
    }

    public static <K, V> java.util.Map<K, V> toJavaMap(Map<K, V> m) {
      return JavaConverters.mapAsJavaMapConverter(m).asJava();
}
}
/**
* Kafka reset offset strategies
*/
enum KafkaResetOffsetStrategies {
LARGEST, SMALLEST
}
/**
* Configs to be passed for this source. All standard Kafka consumer configs are also respected
*/
static class Config {
private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LARGEST;
}
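
  // A minimal property set this class expects (values are illustrative only; any additional
  // standard Kafka consumer configs in the properties are passed through to KafkaCluster as-is):
  //
  //   hoodie.deltastreamer.source.kafka.topic=impressions
  //   metadata.broker.list=broker1:9092,broker2:9092
  //   auto.offset.reset=smallest            (optional, defaults to LARGEST)
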
  private final HashMap<String, String> kafkaParams;
private final TypedProperties props;
protected final String topicName;
public KafkaOffsetGen(TypedProperties props) {
this.props = props;
    kafkaParams = new HashMap<>();
for (Object prop : props.keySet()) {
kafkaParams.put(prop.toString(), props.getString(prop.toString()));
}
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.KAFKA_TOPIC_NAME));
topicName = props.getString(Config.KAFKA_TOPIC_NAME);
}
  public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit) {
// Obtain current metadata for the topic
KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams));
    Either<ArrayBuffer<Throwable>, Set<TopicAndPartition>> either =
cluster.getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Collections.singletonList(topicName))));
if (either.isLeft()) {
// log errors. and bail out.
throw new HoodieDeltaStreamerException("Error obtaining partition metadata", either.left().get().head());
}
    Set<TopicAndPartition> topicPartitions = either.right().get();
// Determine the offset ranges to read from
    HashMap<TopicAndPartition, LeaderOffset> fromOffsets;
if (lastCheckpointStr.isPresent()) {
fromOffsets = checkupValidOffsets(cluster, lastCheckpointStr, topicPartitions);
} else {
KafkaResetOffsetStrategies autoResetValue = KafkaResetOffsetStrategies
.valueOf(props.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET.toString()).toUpperCase());
switch (autoResetValue) {
case SMALLEST:
fromOffsets =
              new HashMap<>(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get()));
break;
case LARGEST:
fromOffsets =
              new HashMap<>(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
break;
default:
          throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest'");
}
}
// Obtain the latest offsets.
    HashMap<TopicAndPartition, LeaderOffset> toOffsets =
        new HashMap<>(ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
// Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
long numEvents = Math.min(DEFAULT_MAX_EVENTS_TO_READ, sourceLimit);
OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
return offsetRanges;
}
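
  // Sketch of a typical calling sequence (illustrative only; property values and the downstream
  // read via KafkaUtils are assumptions, not prescribed by this class):
  //
  //   TypedProperties props = new TypedProperties();
  //   props.setProperty("hoodie.deltastreamer.source.kafka.topic", "impressions");
  //   props.setProperty("metadata.broker.list", "broker1:9092");
  //   KafkaOffsetGen offsetGen = new KafkaOffsetGen(props);
  //   OffsetRange[] ranges = offsetGen.getNextOffsetRanges(Option.empty(), 5000000L);
  //   // ... read the ranges (e.g. with KafkaUtils.createRDD), then persist the checkpoint:
  //   String nextCheckpoint = KafkaOffsetGen.CheckpointUtils.offsetsToStr(ranges);
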
  /**
   * Check whether the checkpointed offsets are still valid, i.e. not older than the earliest
   * offsets currently retained by Kafka (e.g. after log retention has purged data). If they are
   * valid, return the checkpointed offsets; otherwise fall back to the earliest offsets.
   */
  private HashMap<TopicAndPartition, LeaderOffset> checkupValidOffsets(KafkaCluster cluster,
      Option<String> lastCheckpointStr, Set<TopicAndPartition> topicPartitions) {
    HashMap<TopicAndPartition, LeaderOffset> checkpointOffsets =
        CheckpointUtils.strToOffsets(lastCheckpointStr.get());
    HashMap<TopicAndPartition, LeaderOffset> earliestOffsets =
        new HashMap<>(ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get()));
    boolean resetToEarliest = checkpointOffsets.entrySet().stream()
        .anyMatch(offset -> offset.getValue().offset() < earliestOffsets.get(offset.getKey()).offset());
    return resetToEarliest ? earliestOffsets : checkpointOffsets;
}
public String getTopicName() {
return topicName;
}
  public HashMap<String, String> getKafkaParams() {
return kafkaParams;
}
}