/*
 * Copyright 2023 Responsive Computing, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package dev.responsive.kafka.internal.db.partitioning;

import static java.util.Collections.emptyList;

import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A helper class for segment-based partitioners where kafka partitions are mapped to a subset
 * of remote partitions divided up into "segments" that correspond to a range of timestamps.
 * By deleting a table partition we can "drop" a full segment at a time, which can be done if
 * (and only if) the stream-time advances such that the highest timestamp in the range is now
 * outside the retention period. This is intended for segmented WindowStores, but could be
 * extended in the future to other timeseries and/or ttl types of state stores.
 *
 * <p>Each segment covers a specific time range in milliseconds. If t_n is the start time of the
 * nth segment, then that segment covers all timestamps from t_n to t_n + segmentInterval.
 * For each segment id there are N physical table partitions, where N is the number of kafka
 * partitions. Each physical state store instance should only be interacting with a single kafka
 * partition, but will generally have multiple active segments for each kafka partition.
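 *
 * <p>As a rough sketch of that mapping (illustrative values only; the arithmetic mirrors
 * {@link #segmentStartTimestamp(long)} below, assuming plain integer division):
 * <pre>{@code
 * long segmentIntervalMs = 33L;
 * long timestampMs = 75L;
 * long segmentId = timestampMs / segmentIntervalMs;            // 2
 * long segmentStartTimestamp = segmentId * segmentIntervalMs;  // 66
 * // the segment starting at 66 covers timestamps from 66 up to 66 + segmentIntervalMs
 * }</pre>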
 *
 * <p>For a given kafka partition, the total number of segments/table partitions is determined by
 * the store-level config for num_segments, which controls the width of each segment according
 * to the relationship num_segments * segmentInterval = retentionPeriod.
 * Note: the actual number of physical segments at any given time is likely to be num_segments + 1,
 * for two reasons. First, if the configured numSegments does not evenly divide the retentionPeriod,
 * an additional partial segment will always be required to cover any remainder of the full
 * retention period left. Second, at any given point in time it's likely that the time range
 * covered by the oldest segment is partially expired, while the upper end of the newest segment
 * may not yet be active/filled.
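 *
 * <p>A small worked example of that relationship (hypothetical numbers, and assuming the
 * interval is derived by integer division somewhere upstream of this class):
 * <pre>{@code
 * long retentionPeriodMs = 100L;
 * long numSegments = 3L;
 * long segmentIntervalMs = retentionPeriodMs / numSegments; // 33ms
 * // 3 * 33ms = 99ms < 100ms, so a fourth, partial segment covers the remaining 1ms
 * }</pre>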
 *
 * <p>When stream-time for a given kafka partition advances, a tombstone is sent to delete the
 * entire oldest physical segment (or segments, if it has advanced by more than the
 * segmentInterval). At the same time, a new physical segment (or segments) will be created and
 * the corresponding remote partition initialized.
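 *
 * <p>For instance, a hypothetical roll using the configuration from the example below
 * (retentionPeriod = 100ms, segmentInterval = 33ms):
 * <pre>{@code
 * Segmenter segmenter = new Segmenter(100L, 33L);
 * // stream-time advances from 98ms (active segments start at 0, 33, 66)
 * // to 135ms (minValidTs = 36ms, active segments start at 33, 66, 99, 132)
 * Segmenter.SegmentRoll roll = segmenter.rolledSegments("example-table", 0, 98L, 135L);
 * roll.segmentsToExpire(); // [0]
 * roll.segmentsToCreate(); // [99, 132]
 * }</pre>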
 *
 * <p>Let's look at an example case, at a particular moment in time, with the following
 * configuration:
 * N = 4
 * retentionPeriod = 100ms
 * numSegments = 3
 * segmentInterval = 33
 *
 * kafkaPartition | stream-time | minValidTs | active segment time bounds      | segmentTimestamp
 * 0              | 16          | 0          | 0-32                            | 0
 * 1              | 88          | 0          | 0-32, 33-65, 66-98              | 0, 33, 66
 * 2              | 101         | 2          | 0-32, 33-65, 66-98, 99-131      | 0, 33, 66, 99
 * 3              | 169         | 70         | 66-98, 99-131, 132-164, 165-197 | 66, 99, 132, 165
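 *
 * <p>The last row above can be reproduced with the partitioner itself (a hypothetical snippet,
 * not something this class does internally):
 * <pre>{@code
 * Segmenter segmenter = new Segmenter(100L, 33L);
 * // kafka partition 3 at stream-time 169ms: minValidTs = 169 - 100 + 1 = 70ms
 * segmenter.activeSegments(3, 169L);
 * // -> SegmentPartitions for kafka partition 3 with segment start timestamps 66, 99, 132, 165
 * }</pre>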
 *
 * <p>NOTE: because we are already dividing up the partition space into segments, we don't further
 * split things into true sub-partitions based on key. Each kafka partition still maps to multiple
 * table partitions, which should help with parallelism, but unlike the subpartitioner, the
 * segment-partitioning scheme is not static, and is temporal rather than key-based. If data is
 * skewed in time, this might result in uneven partitions and a need to further subdivide the
 * partition space.
 *
 * <p>For the time being, we simply recommend that users configure the number of segments
 * similarly to how they would configure the number of sub-partitions for a key-value store.
 */
public class Segmenter {

  private static final Logger LOG = LoggerFactory.getLogger(Segmenter.class);

  public static final long UNINITIALIZED_STREAM_TIME = -1L;

  private final long retentionPeriodMs;
  private final long segmentIntervalMs;

  public static class SegmentPartition {
    public final int tablePartition;
    public final long segmentStartTimestamp;

    public SegmentPartition(final int tablePartition, final long segmentStartTimestamp) {
      this.tablePartition = tablePartition;
      this.segmentStartTimestamp = segmentStartTimestamp;
    }

    @Override
    public boolean equals(final Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }

      final SegmentPartition that = (SegmentPartition) o;
      if (tablePartition != that.tablePartition) {
        return false;
      }
      return segmentStartTimestamp == that.segmentStartTimestamp;
    }

    @Override
    public int hashCode() {
      int result = tablePartition;
      result = 31 * result + (int) (segmentStartTimestamp ^ (segmentStartTimestamp >>> 32));
      return result;
    }

    @Override
    public String toString() {
      return "SegmentPartition{"
          + "tablePartition=" + tablePartition
          + ", segmentStartTimestamp=" + segmentStartTimestamp
          + '}';
    }
  }

  public Segmenter(
      final long retentionPeriodMs,
      final long segmentIntervalMs
  ) {
    this.retentionPeriodMs = retentionPeriodMs;
    this.segmentIntervalMs = segmentIntervalMs;

    if (retentionPeriodMs <= 0L || segmentIntervalMs <= 0L) {
      LOG.error("Segment values should all be positive, got retentionPeriod={}ms, "
              + "segmentInterval={}ms",
          retentionPeriodMs, segmentIntervalMs
      );
      throw new IllegalStateException("Segment partitioner received a negative or zero value");
    }

    LOG.info(
        "Created segment partitioner with retentionPeriod={}ms, segmentInterval={}ms",
        retentionPeriodMs, segmentIntervalMs
    );
  }

  public long segmentIntervalMs() {
    return segmentIntervalMs;
  }

  /**
   * Return all active segments for the given stream-time and retention period
   *
   * @param kafkaPartition the original partition in kafka
   * @param streamTime     the current stream-time for this kafka partition
   * @return all remote partitions for active segments of this kafka partition
   */
  public List<SegmentPartition> activeSegments(
      final int kafkaPartition,
      final long streamTime
  ) {
    if (streamTime == UNINITIALIZED_STREAM_TIME) {
      return emptyList();
    } else {
      return range(kafkaPartition, minValidTs(streamTime), streamTime);
    }
  }

  /**
   * Return all active segments that could contain data with a timestamp in the specified range.
   * The {@code timeFrom} parameter should already account for the retention
   *
   * @param kafkaPartition the original partition in kafka
   * @param timeFrom       the lowest timestamp in the fetched range (inclusive)
   * @param timeTo         the highest timestamp in the fetched range (inclusive)
   * @return all remote partitions for segments in this range for this kafka partition
   */
  public List<SegmentPartition> range(
      final int kafkaPartition,
      final long timeFrom,
      final long timeTo
  ) {
    return LongStream.range(segmentId(timeFrom), segmentId(timeTo) + 1)
        .mapToObj(segmentId -> new SegmentPartition(
            kafkaPartition,
            segmentId * segmentIntervalMs
        ))
        .collect(Collectors.toList());
  }

  /**
   * Return all active segments that could contain data with a timestamp in the specified range,
   * in reverse order.
   * The {@code timeFrom} parameter should already account for the retention
   *
   * @param kafkaPartition the original partition in kafka
   * @param timeFrom       the lowest timestamp in the fetched range
   * @param timeTo         the highest timestamp in the fetched range
   * @return all remote partitions for segments in this range for this kafka partition
   */
  public List<SegmentPartition> reverseRange(
      final int kafkaPartition,
      final long timeFrom,
      final long timeTo
  ) {
    return LongStream.range(segmentId(timeFrom), segmentId(timeTo) + 1)
        .boxed()
        .sorted(Collections.reverseOrder())
        .map(segmentId -> new SegmentPartition(
            kafkaPartition,
            segmentId * segmentIntervalMs
        ))
        .collect(Collectors.toList());
  }

  public SegmentRoll rolledSegments(
      final String tableName,
      final int kafkaPartition,
      final long oldStreamTime,
      final long newStreamTime
  ) {
    final long oldMaxActiveSegment = segmentId(oldStreamTime);
    final long newMaxActiveSegment = segmentId(newStreamTime);

    final long oldMinActiveSegment = segmentId(minValidTs(oldStreamTime));
    final long newMinActiveSegment = segmentId(minValidTs(newStreamTime));

    // Special case where this is the first record we've received
    if (oldStreamTime == UNINITIALIZED_STREAM_TIME) {
      final LongStream segmentsToExpire = LongStream.empty();
      final LongStream segmentsToCreate = LongStream.range(
          newMinActiveSegment,
          newMaxActiveSegment + 1 // add 1 since the upper bound is exclusive
      ).map(segmentId -> segmentId * segmentIntervalMs);

      LOG.info("Initializing stream-time for table {} to {}ms and creating segments: [{}-{}]",
          tableName, newStreamTime, newMinActiveSegment, newMaxActiveSegment
      );

      return new SegmentRoll(segmentsToExpire, segmentsToCreate);
    } else {
      final LongStream segmentsToExpire = LongStream.range(
          oldMinActiveSegment,
          newMinActiveSegment
      ).map(segmentId -> segmentId * segmentIntervalMs);

      final LongStream segmentsToCreate = LongStream.range(
          oldMaxActiveSegment + 1, // inclusive: add 1 b/c the old max segment should already exist
          newMaxActiveSegment + 1  // exclusive: add 1 to create segment for highest valid timestamp
      ).map(segmentId -> segmentId * segmentIntervalMs);

      if (newMinActiveSegment > oldMinActiveSegment) {
        LOG.info("{}[{}] Advancing stream-time from {}ms to {}ms and rolling segments with "
                + "expiredSegments: [{}-{}] and newSegments: [{}-{}]",
            tableName, kafkaPartition, oldStreamTime, newStreamTime,
            oldMinActiveSegment, newMinActiveSegment,
            oldMaxActiveSegment + 1, newMaxActiveSegment
        );
      }

      return new SegmentRoll(segmentsToExpire, segmentsToCreate);
    }
  }

  public long segmentStartTimestamp(final long windowTimestamp) {
    return segmentId(windowTimestamp) * segmentIntervalMs;
  }

  private long segmentId(final long windowTimestamp) {
    return Long.max(0, windowTimestamp / segmentIntervalMs);
  }

  private long minValidTs(final long streamTime) {
    return streamTime - retentionPeriodMs + 1;
  }

  public static class SegmentRoll {
    private final List<Long> segmentsToExpire;
    private final List<Long> segmentsToCreate;

    public SegmentRoll(final LongStream segmentsToExpire, final LongStream segmentsToCreate) {
      this.segmentsToExpire = segmentsToExpire.boxed().collect(Collectors.toUnmodifiableList());
      this.segmentsToCreate = segmentsToCreate.boxed().collect(Collectors.toUnmodifiableList());
    }

    public List<Long> segmentsToExpire() {
      return segmentsToExpire;
    }

    public List<Long> segmentsToCreate() {
      return segmentsToCreate;
    }

    @Override
    public String toString() {
      final int numExpired = segmentsToExpire.size();
      final String expired = numExpired == 0
          ? "[]"
          : String.format("[%d-%d]", segmentsToExpire.get(0), segmentsToExpire.get(numExpired - 1));

      final int numCreated = segmentsToCreate.size();
      final String created = numCreated == 0
          ? "[]"
          : String.format("[%d-%d]", segmentsToCreate.get(0), segmentsToCreate.get(numCreated - 1));

      return String.format("SegmentRoll: expired segment(s)=%s, new segment(s)=%s", expired, created);
    }

    @Override
    public boolean equals(final Object obj) {
      if (obj == this) {
        return true;
      } else if (obj == null) {
        return false;
      } else if (this.getClass() != obj.getClass()) {
        return false;
      }
      final SegmentRoll other = (SegmentRoll) obj;
      return this.segmentsToCreate.equals(other.segmentsToCreate)
          && this.segmentsToExpire.equals(other.segmentsToExpire);
    }

    @Override
    public int hashCode() {
      int result = this.segmentsToCreate.hashCode();
      result = 31 * result + this.segmentsToExpire.hashCode();
      return result;
    }
  }
}