/*
 * Copyright 2023 Responsive Computing, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package dev.responsive.kafka.internal.db.partitioning;

import static java.util.Collections.emptyList;

import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A helper class for segment-based partitioners where kafka partitions are mapped to a subset
 * of remote partitions divided up into "segments" that correspond to a range of timestamps.
 * By deleting a table partition we can "drop" a full segment at a time, which can be done if
 * (and only if) the stream-time advances such that the highest timestamp in the range is now
 * outside the retention period. This is intended for segmented WindowStores, but could be
 * extended in the future to other timeseries and/or ttl types of state stores.
 *
 * Each segment covers a specific time range in milliseconds. If t_n is the start time of the
 * nth segment, then that segment covers all timestamps from t_n to t_n + segmentInterval.
 * For each segment id there are N physical table partitions, where N is the number of kafka
 * partitions. Each physical state store instance should only be interacting with a single kafka
 * partition, but will generally have multiple active segments for each kafka partition.
 *
 * For a given kafka partition, the total number of segments/table partitions is determined by
 * the store-level config for num_segments, which controls the width of each segment according
 * to the relationship num_segments * segmentInterval = retentionPeriod.
 * Note: the actual number of physical segments at any given time is likely to be num_segments + 1,
 * for two reasons. First, if the configured numSegments does not evenly divide the retentionPeriod,
 * an additional partial segment will always be required to cover any remainder of the full
 * retention period left. Second, at any given point in time it's likely that the time range
 * covered by the oldest segment is partially expired, while the upper end of the newest segment
 * may not yet be active/filled.
 *
 * When stream-time for a given kafka partition advances, a tombstone is sent to delete the entire
 * oldest physical segment (or segments, if it has advanced by more than the segmentInterval).
 * At the same time, a new physical segment (or segments) will be created and the corresponding
 * remote partition initialized.
 *
 * Let's look at an example case, at a particular moment in time, with the following configuration:
 * N = 4
 * retentionPeriod = 100ms
 * numSegments = 3
 * segmentInterval = 33
 *
 * kafkaPartition | stream-time | minValidTs | active segment time bounds       | segmentTimestamp
 * 0              | 16          | 0          | 0-32                             | 0
 * 1              | 88          | 0          | 0-32, 33-65, 66-98               | 0, 33, 66
 * 2              | 101         | 2          | 0-32, 33-65, 66-98, 99-131       | 0, 33, 66, 99
 * 3              | 169         | 70         | 66-98, 99-131, 132-164, 165-197  | 66, 99, 132, 165
 *
 * NOTE: because we are already dividing up the partition space into segments, we don't further
 * split things into true sub-partitions based on key. Each kafka partition still maps to multiple
 * table partitions, which should help with the parallelism, but unlike the subpartitioner, the
 * segment-partitioning scheme is not static, and is temporal rather than key-based. If data is
 * skewed in time, this might result in uneven partitions and a need to further subdivide the
 * partition space.
 *
 * For the time being, we simply recommend that users configure the number of segments
 * similarly to how they would configure the number of sub-partitions for a key-value store.
 */
public class Segmenter {

  private static final Logger LOG = LoggerFactory.getLogger(Segmenter.class);

  public static final long UNINITIALIZED_STREAM_TIME = -1L;

  private final long retentionPeriodMs;
  private final long segmentIntervalMs;

  public static class SegmentPartition {
    public final int tablePartition;
    public final long segmentStartTimestamp;

    public SegmentPartition(final int tablePartition, final long segmentStartTimestamp) {
      this.tablePartition = tablePartition;
      this.segmentStartTimestamp = segmentStartTimestamp;
    }

    @Override
    public boolean equals(final Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }

      final SegmentPartition that = (SegmentPartition) o;

      if (tablePartition != that.tablePartition) {
        return false;
      }
      return segmentStartTimestamp == that.segmentStartTimestamp;
    }

    @Override
    public int hashCode() {
      int result = tablePartition;
      result = 31 * result + (int) (segmentStartTimestamp ^ (segmentStartTimestamp >>> 32));
      return result;
    }

    @Override
    public String toString() {
      return "SegmentPartition{"
          + "tablePartition=" + tablePartition
          + ", segmentStartTimestamp=" + segmentStartTimestamp
          + '}';
    }
  }

  public Segmenter(
      final long retentionPeriodMs,
      final long segmentIntervalMs
  ) {
    this.retentionPeriodMs = retentionPeriodMs;
    this.segmentIntervalMs = segmentIntervalMs;

    if (retentionPeriodMs <= 0L || segmentIntervalMs <= 0L) {
      LOG.error("Segment values should all be positive, got retentionPeriod={}ms, "
              + "segmentInterval={}ms",
          retentionPeriodMs, segmentIntervalMs
      );
      throw new IllegalStateException("Segment partitioner received a negative or zero value");
    }

    LOG.info(
        "Created segment partitioner with retentionPeriod={}ms, segmentInterval={}ms",
        retentionPeriodMs, segmentIntervalMs
    );
  }

  public long segmentIntervalMs() {
    return segmentIntervalMs;
  }

  /**
   * Return all active segments for the given stream-time and retention period
   *
   * @param kafkaPartition the original partition in kafka
   * @param streamTime     the current stream-time for this kafka partition
   * @return all remote partitions for active segments of this kafka partition
   */
  public List<SegmentPartition> activeSegments(
      final int kafkaPartition,
      final long streamTime
  ) {
    if (streamTime == UNINITIALIZED_STREAM_TIME) {
      return emptyList();
    } else {
      return range(kafkaPartition, minValidTs(streamTime), streamTime);
    }
  }

  /**
   * Return all active segments that could contain data with a timestamp in the specified range.
   * The {@code timeFrom} parameter should already account for the retention
   *
   * @param kafkaPartition the original partition in kafka
   * @param timeFrom       the lowest timestamp in the fetched range (inclusive)
   * @param timeTo         the highest timestamp in the fetched range (inclusive)
   * @return all remote partitions for segments in this range for this kafka partition
   */
  public List<SegmentPartition> range(
      final int kafkaPartition,
      final long timeFrom,
      final long timeTo
  ) {
    return LongStream.range(segmentId(timeFrom), segmentId(timeTo) + 1)
        .mapToObj(segmentId -> new SegmentPartition(
            kafkaPartition,
            segmentId * segmentIntervalMs
        ))
        .collect(Collectors.toList());
  }

  /**
   * Return all active segments that could contain data with a timestamp in the specified range,
   * in reverse order.
   * The {@code timeFrom} parameter should already account for the retention
   *
   * @param kafkaPartition the original partition in kafka
   * @param timeFrom       the lowest timestamp in the fetched range
   * @param timeTo         the highest timestamp in the fetched range
   * @return all remote partitions for segments in this range for this kafka partition
   */
  public List<SegmentPartition> reverseRange(
      final int kafkaPartition,
      final long timeFrom,
      final long timeTo
  ) {
    return LongStream.range(segmentId(timeFrom), segmentId(timeTo) + 1)
        .boxed()
        .sorted(Collections.reverseOrder())
        .map(segmentId -> new SegmentPartition(
            kafkaPartition,
            segmentId * segmentIntervalMs
        ))
        .collect(Collectors.toList());
  }

  public SegmentRoll rolledSegments(
      final String tableName,
      final int kafkaPartition,
      final long oldStreamTime,
      final long newStreamTime
  ) {
    final long oldMaxActiveSegment = segmentId(oldStreamTime);
    final long newMaxActiveSegment = segmentId(newStreamTime);
    final long oldMinActiveSegment = segmentId(minValidTs(oldStreamTime));
    final long newMinActiveSegment = segmentId(minValidTs(newStreamTime));

    // Special case where this is the first record we've received
    if (oldStreamTime == UNINITIALIZED_STREAM_TIME) {
      final LongStream segmentsToExpire = LongStream.empty();
      final LongStream segmentsToCreate = LongStream.range(
          newMinActiveSegment,
          newMaxActiveSegment + 1 // add 1 since the upper bound is exclusive
      ).map(segmentId -> segmentId * segmentIntervalMs);

      LOG.info("Initializing stream-time for table {} to {}ms and creating segments: [{}-{}]",
          tableName, newStreamTime, newMinActiveSegment, newMaxActiveSegment
      );

      return new SegmentRoll(
          segmentsToExpire,
          segmentsToCreate
      );
    } else {
      final LongStream segmentsToExpire = LongStream.range(
          oldMinActiveSegment,
          newMinActiveSegment
      ).map(segmentId -> segmentId * segmentIntervalMs);

      final LongStream segmentsToCreate = LongStream.range(
          oldMaxActiveSegment + 1, // inclusive: add 1 b/c the old max segment should already exist
          newMaxActiveSegment + 1  // exclusive: add 1 to create segment for highest valid timestamp
      ).map(segmentId -> segmentId * segmentIntervalMs);

      if (newMinActiveSegment > oldMinActiveSegment) {
        LOG.info("{}[{}] Advancing stream-time from {}ms to {}ms and rolling segments with "
                + "expiredSegments: [{}-{}] and newSegments: [{}-{}]",
            tableName, kafkaPartition, oldStreamTime, newStreamTime,
            oldMinActiveSegment, newMinActiveSegment,
            oldMaxActiveSegment + 1, newMaxActiveSegment
        );
      }

      return new SegmentRoll(
          segmentsToExpire,
          segmentsToCreate
      );
    }
  }

  public long segmentStartTimestamp(final long windowTimestamp) {
    return segmentId(windowTimestamp) * segmentIntervalMs;
  }

  private long segmentId(final long windowTimestamp) {
    return Long.max(0, windowTimestamp / segmentIntervalMs);
  }

  private long minValidTs(final long streamTime) {
    return streamTime - retentionPeriodMs + 1;
  }

  public static class SegmentRoll {
    private final List<Long> segmentsToExpire;
    private final List<Long> segmentsToCreate;

    public SegmentRoll(final LongStream segmentsToExpire, final LongStream segmentsToCreate) {
      this.segmentsToExpire = segmentsToExpire.boxed().collect(Collectors.toUnmodifiableList());
      this.segmentsToCreate = segmentsToCreate.boxed().collect(Collectors.toUnmodifiableList());
    }

    public List<Long> segmentsToExpire() {
      return segmentsToExpire;
    }

    public List<Long> segmentsToCreate() {
      return segmentsToCreate;
    }

    @Override
    public String toString() {
      final int numExpired = segmentsToExpire.size();
      final String expired = numExpired == 0
          ? "[]"
          : String.format("[%d-%d]", segmentsToExpire.get(0), segmentsToExpire.get(numExpired - 1));

      final int numCreated = segmentsToCreate.size();
      final String created = numCreated == 0
          ? "[]"
          : String.format("[%d-%d]", segmentsToCreate.get(0), segmentsToCreate.get(numCreated - 1));

      return String.format("SegmentRoll: expired segment(s)=%s, new segments(s)=%s",
          expired, created
      );
    }

    @Override
    public boolean equals(final Object obj) {
      if (obj == this) {
        return true;
      } else if (obj == null) {
        return false;
      } else if (this.getClass() != obj.getClass()) {
        return false;
      }
      final SegmentRoll other = (SegmentRoll) obj;
      return this.segmentsToCreate.equals(other.segmentsToCreate)
          && this.segmentsToExpire.equals(other.segmentsToExpire);
    }

    @Override
    public int hashCode() {
      int result = this.segmentsToCreate.hashCode();
      result = 31 * result + this.segmentsToExpire.hashCode();
      return result;
    }
  }
}




