org.apache.druid.timeline.SegmentId Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of druid-processing Show documentation
Show all versions of druid-processing Show documentation
A module that is everything required to understand Druid Segments
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.timeline;
import com.fasterxml.jackson.annotation.JsonValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Interner;
import com.google.common.collect.Interners;
import com.google.common.collect.Iterables;
import com.google.common.primitives.Ints;
import org.apache.druid.guice.annotations.PublicApi;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.query.SegmentDescriptor;
import org.apache.druid.timeline.partition.ShardSpec;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.stream.IntStream;
/**
* Identifier of {@link DataSegment}.
*/
@PublicApi
public final class SegmentId implements Comparable
{
/*
* Implementation note: this class must be optimized for resident memory footprint, because segment data consumes
* a lot of heap memory on Druid Broker and Coordinator nodes.
*
* This class is separate from org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec
* because in a lot of places segment ids are transmitted as "segment id strings" that don't contain enough
* information to deconstruct the ShardSpec. Also, even a single extra field is important for SegmentIds, because it
* adds to the memory footprint considerably.
*
* The difference between this class and {@link org.apache.druid.query.SegmentDescriptor} is that the latter is
* a "light" version of SegmentId, that only contains the interval, version, and partition number. It's used where the
* data source, another essential part of SegmentId is determined by the context (e. g. in {@link
* org.apache.druid.client.CachingClusteredClient}, where SegmentDescriptor is used when Brokers tell data servers
* which segments to include for a particular query) and where having lean JSON representations is important, because
* it's actively transferred detween Druid nodes. It's also for this reason that the JSON field names of
* SegmentDescriptor are abbreviated.
*
* API design note: "SegmentId" is chosen as the name for this class instead of more verbose "SegmentIdentifier" or
* "DataSegmentIdentifier" because it's used very frequently and a long class name adds noticeable clutter. Variables
* of SegmentId type are recommended to be named "segmentId" rather than "identifier" or "segmentIdentifier".
*/
/**
* {@link #dataSource} field values are stored as canonical strings to decrease memory required for large numbers of
* segment identifiers.
*/
private static final Interner STRING_INTERNER = Interners.newWeakInterner();
/**
* Store Intervals since creating them each time before returning is an expensive operation
* To decrease the memory required for storing intervals, intern them, since the number of distinct values is "low"
*/
private static final Interner INTERVAL_INTERNER = Interners.newWeakInterner();
private static final char DELIMITER = '_';
private static final Splitter DELIMITER_SPLITTER = Splitter.on(DELIMITER);
private static final Joiner DELIMITER_JOINER = Joiner.on(DELIMITER);
private static final int DATE_TIME_SIZE_UPPER_LIMIT = "yyyy-MM-ddTHH:mm:ss.SSS+00:00".length();
public static SegmentId of(String dataSource, Interval interval, String version, int partitionNum)
{
return new SegmentId(dataSource, interval, version, partitionNum);
}
public static SegmentId of(String dataSource, Interval interval, String version, @Nullable ShardSpec shardSpec)
{
return of(dataSource, interval, version, shardSpec != null ? shardSpec.getPartitionNum() : 0);
}
/**
* Tries to parse a segment id from the given String representation, or returns null on failure. If returns a non-null
* {@code SegmentId} object, calling {@link #toString()} on the latter is guaranteed to return a string equal to the
* argument string of the {@code tryParse()} call.
*
* It is possible that this method may incorrectly parse a segment id, for example if the dataSource name in the
* segment id contains a DateTime parseable string such as 'datasource_2000-01-01T00:00:00.000Z' and dataSource was
* provided as 'datasource'. The desired behavior in this case would be to return null since the identifier does not
* actually belong to the provided dataSource but a non-null result would be returned. This is an edge case that would
* currently only affect paged select queries with a union dataSource of two similarly-named dataSources as in the
* given example.
*
* Another source of ambiguity is the end of a segment id like '_123' - it could always be interpreted either as the
* partitionNum of the segment id, or as the end of the version, with the implicit partitionNum of 0. This method
* prefers the first iterpretation. To iterate all possible parsings of a segment id, use {@link
* #iteratePossibleParsingsWithDataSource}.
*
* @param dataSource the dataSource corresponding to this segment id
* @param segmentId segment id
* @return a {@link SegmentId} object if the segment id could be parsed, null otherwise
*/
@Nullable
public static SegmentId tryParse(String dataSource, String segmentId)
{
List possibleParsings = iteratePossibleParsingsWithDataSource(dataSource, segmentId);
return possibleParsings.isEmpty() ? null : possibleParsings.get(0);
}
/**
* Returns a (potentially empty) lazy iteration of all possible valid parsings of the given segment id string into
* {@code SegmentId} objects.
*
* Warning: most of the parsing work is repeated each time {@link Iterable#iterator()} of this iterable is consumed,
* so it should be consumed only once if possible.
*/
public static Iterable iterateAllPossibleParsings(String segmentId)
{
List splits = DELIMITER_SPLITTER.splitToList(segmentId);
String probableDataSource = tryExtractMostProbableDataSource(segmentId);
// Iterate parsings with the most probably data source first to allow the users of iterateAllPossibleParsings() to
// break from the iteration earlier with higher probability.
if (probableDataSource != null) {
List probableParsings = iteratePossibleParsingsWithDataSource(probableDataSource, segmentId);
Iterable otherPossibleParsings = () -> IntStream
.range(1, splits.size() - 3)
.mapToObj(dataSourceDelimiterOrder -> DELIMITER_JOINER.join(splits.subList(0, dataSourceDelimiterOrder)))
.filter(dataSource -> dataSource.length() != probableDataSource.length())
.flatMap(dataSource -> iteratePossibleParsingsWithDataSource(dataSource, segmentId).stream())
.iterator();
return Iterables.concat(probableParsings, otherPossibleParsings);
} else {
return () -> IntStream
.range(1, splits.size() - 3)
.mapToObj(dataSourceDelimiterOrder -> {
String dataSource = DELIMITER_JOINER.join(splits.subList(0, dataSourceDelimiterOrder));
return iteratePossibleParsingsWithDataSource(dataSource, segmentId);
})
.flatMap(List::stream)
.iterator();
}
}
/**
* Returns a list of either 0, 1 or 2 elements containing possible parsings if the given segment id String
* representation with the given data source name. Returns an empty list when parsing into a valid {@code SegmentId}
* object is impossible. Returns a list of a single element when the given segment id doesn't end with
* '_[any positive number]', that means that the implicit partitionNum is 0. Otherwise the end of the segment id
* is interpreted in two ways: with the explicit partitionNum (the first element in the returned list), and with the
* implicit partitionNum of 0 and the version that ends with '_[any positive number]' (the second element in the
* returned list).
*/
public static List iteratePossibleParsingsWithDataSource(String dataSource, String segmentId)
{
if (!segmentId.startsWith(dataSource) || segmentId.charAt(dataSource.length()) != DELIMITER) {
return Collections.emptyList();
}
String remaining = segmentId.substring(dataSource.length() + 1);
List splits = DELIMITER_SPLITTER.splitToList(remaining);
if (splits.size() < 3) {
return Collections.emptyList();
}
DateTime start;
DateTime end;
try {
start = DateTimes.ISO_DATE_TIME.parse(splits.get(0));
end = DateTimes.ISO_DATE_TIME.parse(splits.get(1));
}
catch (IllegalArgumentException e) {
return Collections.emptyList();
}
if (start.compareTo(end) >= 0) {
return Collections.emptyList();
}
List possibleParsings = new ArrayList<>(2);
String version = DELIMITER_JOINER.join(splits.subList(2, Math.max(splits.size() - 1, 3)));
String trail = splits.size() > 3 ? splits.get(splits.size() - 1) : null;
if (trail != null) {
Integer possiblePartitionNum = Ints.tryParse(trail);
if (possiblePartitionNum != null && possiblePartitionNum > 0) {
possibleParsings.add(of(dataSource, new Interval(start, end), version, possiblePartitionNum));
}
version = version + '_' + trail;
}
possibleParsings.add(of(dataSource, new Interval(start, end), version, 0));
return possibleParsings;
}
/**
* Heuristically tries to extract the most probable data source from a String segment id representation, or returns
* null on failure.
*
* This method is not guaranteed to return a non-null data source given a valid String segment id representation.
*/
@VisibleForTesting
@Nullable
static String tryExtractMostProbableDataSource(String segmentId)
{
Matcher dateTimeMatcher = DateTimes.COMMON_DATE_TIME_PATTERN.matcher(segmentId);
while (true) {
if (!dateTimeMatcher.find()) {
return null;
}
int dataSourceEnd = dateTimeMatcher.start() - 1;
if (segmentId.charAt(dataSourceEnd) != DELIMITER) {
continue;
}
return segmentId.substring(0, dataSourceEnd);
}
}
/**
* Creates a merged SegmentId for the given data source, interval and partition number. Used when segments are
* merged.
*/
public static SegmentId merged(String dataSource, Interval interval, int partitionNum)
{
return of(dataSource, interval, "merged", partitionNum);
}
/**
* Creates a dummy SegmentId with the given data source. This method is useful in benchmark and test code.
*/
public static SegmentId dummy(String dataSource)
{
return of(dataSource, Intervals.ETERNITY, "dummy_version", 0);
}
/**
* Creates a dummy SegmentId with the given data source and partition number.
* This method is useful in benchmark and test code.
*/
public static SegmentId dummy(String dataSource, int partitionNum)
{
return of(dataSource, Intervals.ETERNITY, "dummy_version", partitionNum);
}
private final String dataSource;
private final Interval interval;
private final String version;
private final int partitionNum;
/**
* Cache the hash code eagerly, because SegmentId is almost always expected to be used as a map key or
* for map lookup.
*/
private final int hashCode;
private SegmentId(String dataSource, Interval interval, String version, int partitionNum)
{
this.dataSource = STRING_INTERNER.intern(Objects.requireNonNull(dataSource));
this.interval = INTERVAL_INTERNER.intern(Objects.requireNonNull(interval));
// Versions are timestamp-based Strings, interning of them doesn't make sense. If this is not the case, interning
// could be conditionally allowed via a system property.
this.version = Objects.requireNonNull(version);
this.partitionNum = partitionNum;
this.hashCode = computeHashCode();
}
private int computeHashCode()
{
// Start with partitionNum and version hash codes, because they are often little sequential numbers. If they are
// added in the end of the chain, resulting hashCode of SegmentId could have worse distribution.
int hashCode = partitionNum;
// 1000003 is a constant used in Google AutoValue, provides a little better distribution than 31
hashCode = hashCode * 1000003 + version.hashCode();
hashCode = hashCode * 1000003 + dataSource.hashCode();
hashCode = hashCode * 1000003 + interval.hashCode();
return hashCode;
}
public String getDataSource()
{
return dataSource;
}
public DateTime getIntervalStart()
{
return new DateTime(interval.getStartMillis(), interval.getChronology());
}
public DateTime getIntervalEnd()
{
return new DateTime(interval.getEndMillis(), interval.getChronology());
}
public Interval getInterval()
{
return interval;
}
public String getVersion()
{
return version;
}
public int getPartitionNum()
{
return partitionNum;
}
public SegmentId withInterval(Interval newInterval)
{
return of(dataSource, newInterval, version, partitionNum);
}
public SegmentDescriptor toDescriptor()
{
return new SegmentDescriptor(Intervals.utc(interval.getStartMillis(), interval.getEndMillis()), version, partitionNum);
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (!(o instanceof SegmentId)) {
return false;
}
SegmentId that = (SegmentId) o;
// Compare hashCode instead of partitionNum: break the chain quicker if the objects are not equal. If the hashCodes
// are equal as well as all other fields used to compute them, the partitionNums are also guaranteed to be equal.
return hashCode == that.hashCode &&
dataSource.equals(that.dataSource) &&
interval.equals(that.interval) &&
version.equals(that.version);
}
@Override
public int hashCode()
{
return hashCode;
}
@Override
public int compareTo(SegmentId o)
{
int result = dataSource.compareTo(o.dataSource);
if (result != 0) {
return result;
}
result = Long.compare(interval.getStartMillis(), o.interval.getStartMillis());
if (result != 0) {
return result;
}
result = Long.compare(interval.getEndMillis(), o.interval.getEndMillis());
if (result != 0) {
return result;
}
result = version.compareTo(o.version);
if (result != 0) {
return result;
}
return Integer.compare(partitionNum, o.partitionNum);
}
@JsonValue
@Override
public String toString()
{
StringBuilder sb = new StringBuilder(safeUpperLimitOfStringSize());
sb.append(dataSource).append(DELIMITER)
.append(getIntervalStart()).append(DELIMITER)
.append(getIntervalEnd()).append(DELIMITER)
.append(version);
if (partitionNum != 0) {
sb.append(DELIMITER).append(partitionNum);
}
return sb.toString();
}
public int safeUpperLimitOfStringSize()
{
int delimiters = 4;
int partitionNumSizeUpperLimit = 3; // less than 1000 partitions
return dataSource.length() +
version.length() +
(DATE_TIME_SIZE_UPPER_LIMIT * 2) + // interval start and end
delimiters +
partitionNumSizeUpperLimit;
}
}