All Downloads are FREE. Search and download functionalities are using the official Maven repository.

dev.responsive.kafka.api.stores.ResponsiveWindowParams Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2023 Responsive Computing, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package dev.responsive.kafka.api.stores;

import static dev.responsive.kafka.internal.utils.StoreUtil.durationToMillis;

import dev.responsive.kafka.internal.db.CassandraKeyValueTable;
import dev.responsive.kafka.internal.stores.SchemaTypes.WindowSchema;
import dev.responsive.kafka.internal.utils.TableName;
import java.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class ResponsiveWindowParams {

  private static final Logger LOG = LoggerFactory.getLogger(ResponsiveWindowParams.class);

  // TODO: should we enforce a real upper bound on the number of segments/partitions that are
  //  created? Is there a maximum recommended for Scylla, or for this compaction strategy, etc?
  private static final long MINIMUM_ALLOWED_NUM_SEGMENTS = 2L;
  private static final long MAXIMUM_ALLOWED_NUM_SEGMENTS = Long.MAX_VALUE;
  private static final long MAXIMUM_DEFAULT_NUM_SEGMENTS = 100L;
  private static final long DEFAULT_NUM_SEGMENTS = 10L; // space amplification = 1.1x

  private final TableName name;
  private final WindowSchema schemaType;
  private final long windowSizeMs;
  private final boolean retainDuplicates;
  private final long retentionPeriodMs;

  private long numSegments;

  private ResponsiveWindowParams(
      final String name,
      final WindowSchema schemaType,
      final Duration windowSize,
      final Duration retentionPeriod,
      final boolean retainDuplicates
  ) {
    this.name = new TableName(name);
    this.schemaType = schemaType;
    this.windowSizeMs = durationToMillis(windowSize, "windowSize");
    this.retentionPeriodMs = durationToMillis(retentionPeriod, "retentionPeriod");
    this.retainDuplicates = retainDuplicates;

    this.numSegments = computeDefaultNumSegments(windowSizeMs, retentionPeriodMs);
  }

  public static ResponsiveWindowParams window(
      final String name,
      final Duration windowSize,
      final Duration retentionPeriod
  ) {
    return new ResponsiveWindowParams(
        name, WindowSchema.WINDOW, windowSize, retentionPeriod, false
    );
  }

  public static ResponsiveWindowParams streamStreamJoin(
      final String name,
      final Duration windowSize
  ) {
    return new ResponsiveWindowParams(
            name, WindowSchema.WINDOW, windowSize, windowSize, true
    );
  }

  public ResponsiveWindowParams withNumSegments(final long numSegments) {
    final long maximumSegmentCount = maximumAllowedNumSegments(retentionPeriodMs);
    if (numSegments > maximumSegmentCount) {
      LOG.warn("Attempted to divide window store {} into more segments ({}) than is possible. "
                   + "Will use the maximum possible number of segments for this store, as"
                   + "determined by the retention period ({})",
               name, numSegments, maximumSegmentCount);
      this.numSegments = maximumSegmentCount;
    } else if (numSegments < MINIMUM_ALLOWED_NUM_SEGMENTS) {
      LOG.warn("Attempted to divide window store {} into fewer segments ({}) than is allowed. "
                   + "Will use the minimum allowable number of segments for this store ({})",
               name, numSegments, MINIMUM_ALLOWED_NUM_SEGMENTS);
    } else {
      this.numSegments = numSegments;
    }
    return this;
  }

  public WindowSchema schemaType() {
    return schemaType;
  }

  public TableName name() {
    return name;
  }

  public long windowSize() {
    return windowSizeMs;
  }

  public long retentionPeriod() {
    return retentionPeriodMs;
  }

  public boolean retainDuplicates() {
    return retainDuplicates;
  }

  public long numSegments() {
    return numSegments;
  }

  private static long maximumAllowedNumSegments(final long retentionPeriodMs) {
    return Math.min(MAXIMUM_ALLOWED_NUM_SEGMENTS, retentionPeriodMs);
  }

  /**
   * The general approach here is specific to C*, in which each segment is a separate table
   * partition similar to a subpartition in the {@link CassandraKeyValueTable}.
   * 

* We probably don't need a huge number of table partitions for parallelism reasons, but * should have sufficient segments to make sure we are able to delete expired records * at a good enough pace to keep the additional storage cost low. *

* The absolute minimum number of segments should be 2, which gives a space amplification * of 1.5x the total size of the working data set (fairly large as-is, we may want a larger * minimum). We probably also want to cap off the maximum at some point but for now, we just * enforce that the num_segments is no larger than the retention period. *

* Given these limits, we select a general default for most cases based on a target * storage amplification factor. In addition, we consider the special case in which * the grace period is much larger than the window size. For this case, to optimize for * good caching we want to try and have the entire active window fit into the latest segment. */ // TODO: experiment to determine a good default, and consider how to adapt this computation // for a different storage engine or table implementation private static long computeDefaultNumSegments( final long windowSizeMs, final long retentionPeriodMs ) { // If we're able to fit the full window size in a single segment by using more than the // DEFAULT_NUM_SEGMENTS, do so by setting the segment interval equal to the window size // (where segmentInterval = retentionPeriod / numSegments if (windowSizeMs < retentionPeriodMs / DEFAULT_NUM_SEGMENTS) { return Math.min(MAXIMUM_DEFAULT_NUM_SEGMENTS, retentionPeriodMs / windowSizeMs); } else { return DEFAULT_NUM_SEGMENTS; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy