/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.timeline.partition;

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.BoundType;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import org.apache.druid.java.util.common.ISE;

import javax.annotation.Nullable;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

public class HashBasedNumberedShardSpec extends NumberedShardSpec
{
  public static final List<String> DEFAULT_PARTITION_DIMENSIONS = ImmutableList.of();

  private final int bucketId;

  /**
   * Number of hash buckets
   */
  private final int numBuckets;
  private final ObjectMapper jsonMapper;
  private final List<String> partitionDimensions;

  /**
   * A hash function to use for both hash partitioning at ingestion time and pruning segments at query time.
   *
   * During ingestion, the partition function defaults to {@link HashPartitionFunction#MURMUR3_32_ABS} if this
   * variable is null. See {@link HashPartitioner} for details.
   *
   * At query time, this function will be null unless it was explicitly specified in
   * {@link org.apache.druid.indexer.partitions.HashedPartitionsSpec} at ingestion time. This is because the default
   * hash function used to create segments can change over time, and we don't guarantee that a changed
   * hash function is backwards-compatible. The query will process all segments if this function is null.
   */
  @Nullable
  private final HashPartitionFunction partitionFunction;

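  // Illustrative JSON (field values hypothetical) of the shape this creator binds. For segments written by
  // older Druid versions, "bucketId", "numBuckets", and "partitionFunction" may be absent, in which case the
  // constructor falls back to partitionNum, partitions, and null, respectively:
  //
  //   {
  //     "type": "hashed",
  //     "partitionNum": 1,
  //     "partitions": 4,
  //     "bucketId": 1,
  //     "numBuckets": 4,
  //     "partitionDimensions": ["dim1"],
  //     "partitionFunction": "murmur3_32_abs"
  //   }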
  @JsonCreator
  public HashBasedNumberedShardSpec(
      @JsonProperty("partitionNum") int partitionNum,    // partitionId
      @JsonProperty("partitions") int partitions,        // core partition set size
      @JsonProperty("bucketId") @Nullable Integer bucketId, // nullable for backward compatibility
      @JsonProperty("numBuckets") @Nullable Integer numBuckets, // nullable for backward compatibility
      @JsonProperty("partitionDimensions") @Nullable List partitionDimensions,
      @JsonProperty("partitionFunction") @Nullable HashPartitionFunction partitionFunction, // nullable for backward compatibility
      @JacksonInject ObjectMapper jsonMapper
  )
  {
    super(partitionNum, partitions);
    // Use partitionId as bucketId if it's missing.
    this.bucketId = bucketId == null ? partitionNum : bucketId;
    // If numBuckets is missing, assume that any hash bucket is not empty.
    // Use the core partition set size as the number of buckets.
    this.numBuckets = numBuckets == null ? partitions : numBuckets;
    this.partitionDimensions = partitionDimensions == null ? DEFAULT_PARTITION_DIMENSIONS : partitionDimensions;
    this.partitionFunction = partitionFunction;
    this.jsonMapper = jsonMapper;
  }

  @JsonProperty
  public int getBucketId()
  {
    return bucketId;
  }

  @JsonProperty
  public int getNumBuckets()
  {
    return numBuckets;
  }

  @JsonProperty("partitionDimensions")
  public List<String> getPartitionDimensions()
  {
    return partitionDimensions;
  }

  @JsonProperty
  public @Nullable HashPartitionFunction getPartitionFunction()
  {
    return partitionFunction;
  }

  @Override
  public List<String> getDomainDimensions()
  {
    return partitionDimensions;
  }

  @Override
  public ShardSpecLookup getLookup(final List<? extends ShardSpec> shardSpecs)
  {
    // partitionFunction can be null when you read a shardSpec of a segment created in an old version of Druid.
    // The current version of Druid will always specify a partitionFunction on newly created segments.
    if (partitionFunction == null) {
      throw new ISE("Cannot create a hashPartitioner since partitionFunction is null");
    }
    return new HashPartitioner(
        jsonMapper,
        partitionFunction,
        partitionDimensions,
        numBuckets
    ).createHashLookup(shardSpecs);
  }
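
  // A minimal usage sketch (assuming ShardSpecLookup exposes a single method
  // getShardSpec(long timestamp, InputRow row), as declared elsewhere in Druid): at ingestion time,
  // the lookup routes each row to the spec of the bucket the row hashes into.
  //
  //   ShardSpecLookup lookup = spec.getLookup(allSpecsForInterval);
  //   ShardSpec target = lookup.getShardSpec(row.getTimestampFromEpoch(), row);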

  @Override
  public boolean possibleInDomain(Map<String, RangeSet<String>> domain)
  {
    // partitionFunction should be used instead of HashPartitioner at query time.
    // We should process all segments if partitionFunction is null because we don't know what hash function
    // was used to create segments at ingestion time.
    if (partitionFunction == null) {
      return true;
    }

    // If no partitionDimensions were specified at ingestion time, the hash was computed over all dimensions plus the
    // input timestamp truncated according to QueryGranularity, instead of over partitionDimensions alone. Since the
    // shard spec doesn't store the truncated timestamps of the events that fell into this shard, there is no way to
    // recompute that hash at query time, so bypass this case.
    if (partitionDimensions.isEmpty()) {
      return true;
    }

    // A possible optimization: move the range-set-to-point-set conversion into the function signature and cache the
    // result in the caller when the same domain is queried repeatedly.
    Map<String, Set<String>> domainSet = new HashMap<>();
    for (String p : partitionDimensions) {
      RangeSet<String> domainRangeSet = domain.get(p);
      if (domainRangeSet == null || domainRangeSet.isEmpty()) {
        return true;
      }

      for (Range<String> v : domainRangeSet.asRanges()) {
        // Bypass unless the range is a single closed point ([x, x]), because we can't hash a range of values
        if (v.isEmpty() || !v.hasLowerBound() || !v.hasUpperBound() ||
            v.lowerBoundType() != BoundType.CLOSED || v.upperBoundType() != BoundType.CLOSED ||
            !v.lowerEndpoint().equals(v.upperEndpoint())) {
          return true;
        }
        domainSet.computeIfAbsent(p, k -> new HashSet<>()).add(v.lowerEndpoint());
      }
    }

    return !domainSet.isEmpty() && chunkPossibleInDomain(partitionFunction, domainSet, new HashMap<>());
  }
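
  // Worked example (dimension names and values hypothetical): with partitionDimensions = ["dim1", "dim2"]
  // and a domain of dim1 in {[a, a], [b, b]} and dim2 in {[x, x]}, every range is a single closed point,
  // so domainSet becomes {dim1: {a, b}, dim2: {x}} and chunkPossibleInDomain tests the combinations
  // (a, x) and (b, x) against this spec's bucket. If dim1's domain instead contained a true range such as
  // (a, b), the method would return true without pruning.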

  /**
   * Recursively enumerate all possible combinations of values for dimensions in {@link #partitionDimensions} based on
   * {@code domainSet}, test if any combination matches the current segment
   *
   * @param hashPartitionFunction     hash function used to create segments at ingestion time
   * @param domainSet                 The sets from which values of the dimensions in
   *                                  {@link #partitionDimensions} are drawn
   * @param partitionDimensionsValues A map from dimensions in {@link #partitionDimensions} to their values drawn from
   *                                  {@code domainSet}
   * @return Whether the current segment possibly holds records for the provided domain. Return false if and only if
   * none of the combinations matches this segment
   */
  private boolean chunkPossibleInDomain(
      HashPartitionFunction hashPartitionFunction,
      Map<String, Set<String>> domainSet,
      Map<String, String> partitionDimensionsValues
  )
  {
    int curIndex = partitionDimensionsValues.size();
    if (curIndex == partitionDimensions.size()) {
      return isInChunk(hashPartitionFunction, partitionDimensionsValues);
    }

    String dimension = partitionDimensions.get(curIndex);
    for (String e : domainSet.get(dimension)) {
      partitionDimensionsValues.put(dimension, e);
      if (chunkPossibleInDomain(hashPartitionFunction, domainSet, partitionDimensionsValues)) {
        return true;
      }
      partitionDimensionsValues.remove(dimension);
    }

    return false;
  }
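
  // Trace sketch for the example above: the recursion first fixes dim1 = a, then dim2 = x, and calls
  // isInChunk({dim1: a, dim2: x}); if that combination hashes to a different bucket, it backtracks and
  // tries dim1 = b with dim2 = x before giving up and returning false.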

  /**
   * Check if the current segment can possibly hold records whose values for the dimensions in
   * {@link #partitionDimensions} are given by {@code partitionDimensionsValues}
   *
   * @param hashPartitionFunction     hash function used to create segments at ingestion time
   * @param partitionDimensionsValues A concrete assignment of values for the dimensions in {@link #partitionDimensions}
   *
   * @return Whether the current segment possibly holds records for the given values of partition dimensions
   */
  private boolean isInChunk(HashPartitionFunction hashPartitionFunction, Map<String, String> partitionDimensionsValues)
  {
    assert !partitionDimensions.isEmpty();
    List<Object> groupKey = Lists.transform(
        partitionDimensions,
        o -> Collections.singletonList(partitionDimensionsValues.get(o))
    );
    return hashPartitionFunction.hash(serializeGroupKey(jsonMapper, groupKey), numBuckets) == bucketId;
  }
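
  // For instance (values hypothetical): given partitionDimensions = ["dim1", "dim2"] and values
  // {dim1: "a", dim2: "x"}, groupKey is [["a"], ["x"]]. For MURMUR3_32_ABS, the bucket is, per that
  // function's definition, Math.abs(murmur3_32(serializedKey) % numBuckets), and this chunk matches only
  // when that bucket equals bucketId.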

  /**
   * Serializes a group key into a byte array. The serialization algorithm affects the hash values of partition keys,
   * since {@link HashPartitionFunction#hash} takes the result of this method as its input. This means the returned
   * byte array must remain backwards-compatible whenever this method is modified.
   */
  public static byte[] serializeGroupKey(ObjectMapper jsonMapper, List<Object> partitionKeys)
  {
    try {
      return jsonMapper.writeValueAsBytes(partitionKeys);
    }
    catch (JsonProcessingException e) {
      throw new RuntimeException(e);
    }
  }
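
  // Example: serializeGroupKey(mapper, ImmutableList.of(Collections.singletonList("a"))) returns the
  // UTF-8 bytes of [["a"]]. Because these bytes are the direct input to HashPartitionFunction#hash,
  // any change to the serialized form (e.g., a different JSON layout) would silently reassign rows to
  // different buckets, breaking compatibility with existing segments.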

  @Override
  public String getType()
  {
    return Type.HASHED;
  }

  @Override
  public boolean equals(Object o)
  {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    if (!super.equals(o)) {
      return false;
    }
    HashBasedNumberedShardSpec that = (HashBasedNumberedShardSpec) o;
    return bucketId == that.bucketId &&
           numBuckets == that.numBuckets &&
           Objects.equals(partitionDimensions, that.partitionDimensions) &&
           partitionFunction == that.partitionFunction;
  }

  @Override
  public int hashCode()
  {
    return Objects.hash(super.hashCode(), bucketId, numBuckets, partitionDimensions, partitionFunction);
  }

  @Override
  public String toString()
  {
    return "HashBasedNumberedShardSpec{" +
           "bucketId=" + bucketId +
           ", numBuckets=" + numBuckets +
           ", partitionDimensions=" + partitionDimensions +
           ", partitionFunction=" + partitionFunction +
           '}';
  }
}