All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.druid.segment.index.IndexedUtf8ValueIndexes Maven / Gradle / Ivy

There is a newer version: 30.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment.index;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import org.apache.druid.annotations.SuppressFBWarnings;
import org.apache.druid.collections.bitmap.BitmapFactory;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.ByteBufferUtils;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.math.expr.ExprEval;
import org.apache.druid.math.expr.ExpressionType;
import org.apache.druid.query.BitmapResultFactory;
import org.apache.druid.segment.column.TypeSignature;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.data.Indexed;
import org.apache.druid.segment.index.semantic.StringValueSetIndexes;
import org.apache.druid.segment.index.semantic.Utf8ValueSetIndexes;
import org.apache.druid.segment.index.semantic.ValueIndexes;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;

public final class IndexedUtf8ValueIndexes>
    implements StringValueSetIndexes, Utf8ValueSetIndexes, ValueIndexes
{
  // This determines the cut-off point to switch the merging algorithm from doing binary-search per element in the value
  // set to doing a sorted merge algorithm between value set and dictionary. The ratio here represents the ratio b/w
  // the number of elements in value set and the number of elements in the dictionary. The number has been derived
  // using benchmark in https://github.com/apache/druid/pull/13133. If the ratio is higher than the threshold, we use
  // sorted merge instead of binary-search based algorithm.
  private static final double SORTED_MERGE_RATIO_THRESHOLD = 0.12D;
  private static final int SIZE_WORTH_CHECKING_MIN = 8;
  private static final Comparator COMPARATOR = ByteBufferUtils.utf8Comparator();

  private final BitmapFactory bitmapFactory;
  private final TDictionary dictionary;
  private final Indexed bitmaps;

  public IndexedUtf8ValueIndexes(
      BitmapFactory bitmapFactory,
      TDictionary dictionary,
      Indexed bitmaps
  )
  {
    Preconditions.checkArgument(dictionary.isSorted(), "Dictionary must be sorted");
    this.bitmapFactory = bitmapFactory;
    this.dictionary = dictionary;
    this.bitmaps = bitmaps;
  }

  @Override
  public BitmapColumnIndex forValue(@Nullable String value)
  {
    final ByteBuffer utf8 = StringUtils.toUtf8ByteBuffer(value);
    return new SimpleBitmapColumnIndex()
    {
      @Override
      public double estimateSelectivity(int totalRows)
      {
        return Math.min(1, (double) getBitmapForValue().size() / totalRows);
      }

      @Override
      public  T computeBitmapResult(BitmapResultFactory bitmapResultFactory, boolean includeUnknown)
      {
        if (includeUnknown && NullHandling.isNullOrEquivalent(dictionary.get(0))) {
          return bitmapResultFactory.unionDimensionValueBitmaps(
              ImmutableList.of(getBitmapForValue(), getBitmap(0))
          );
        }
        return bitmapResultFactory.wrapDimensionValue(getBitmapForValue());
      }

      private ImmutableBitmap getBitmapForValue()
      {
        final int idx = dictionary.indexOf(utf8);
        return getBitmap(idx);
      }
    };
  }

  @Nullable
  @Override
  public BitmapColumnIndex forValue(@Nonnull Object value, TypeSignature valueType)
  {
    if (valueType.isPrimitive()) {
      return forValue(
          ExprEval.ofType(ExpressionType.fromColumnTypeStrict(valueType), value)
                  .castTo(ExpressionType.STRING)
                  .asString()
      );
    }
    return null;
  }

  @SuppressFBWarnings("NP_NONNULL_PARAM_VIOLATION")
  @Override
  public BitmapColumnIndex forSortedValues(SortedSet values)
  {
    return getBitmapColumnIndexForSortedIterableUtf8(
        Iterables.transform(
            values,
            StringUtils::toUtf8ByteBuffer
        ),
        values.size(),
        values.contains(null)
    );
  }

  @SuppressFBWarnings("NP_NONNULL_PARAM_VIOLATION")
  @Override
  public BitmapColumnIndex forSortedValuesUtf8(SortedSet valuesUtf8)
  {
    final SortedSet tailSet;

    if (valuesUtf8.size() >= SIZE_WORTH_CHECKING_MIN) {
      final ByteBuffer minValueInColumn = dictionary.get(0);
      tailSet = valuesUtf8.tailSet(minValueInColumn);
    } else {
      tailSet = valuesUtf8;
    }

    return getBitmapColumnIndexForSortedIterableUtf8(tailSet, tailSet.size(), valuesUtf8.contains(null));
  }

  private ImmutableBitmap getBitmap(int idx)
  {
    if (idx < 0) {
      return bitmapFactory.makeEmptyImmutableBitmap();
    }

    final ImmutableBitmap bitmap = bitmaps.get(idx);
    return bitmap == null ? bitmapFactory.makeEmptyImmutableBitmap() : bitmap;
  }

  /**
   * Helper used by {@link #forSortedValues} and {@link #forSortedValuesUtf8}.
   */
  private BitmapColumnIndex getBitmapColumnIndexForSortedIterableUtf8(Iterable valuesUtf8, int size, boolean valuesContainsNull)
  {
    // for large number of in-filter values in comparison to the dictionary size, use the sorted merge algorithm.
    if (size > SORTED_MERGE_RATIO_THRESHOLD * dictionary.size()) {
      return new SimpleImmutableBitmapIterableIndex()
      {
        @Override
        public Iterable getBitmapIterable()
        {
          return () -> new Iterator()
          {
            final PeekingIterator valuesIterator = Iterators.peekingIterator(valuesUtf8.iterator());
            final PeekingIterator dictionaryIterator = Iterators.peekingIterator(dictionary.iterator());
            int next = -1;
            int idx = 0;

            @Override
            public boolean hasNext()
            {
              if (next < 0) {
                findNext();
              }
              return next >= 0;
            }

            @Override
            public ImmutableBitmap next()
            {
              if (next < 0) {
                findNext();
                if (next < 0) {
                  throw new NoSuchElementException();
                }
              }
              final int swap = next;
              next = -1;
              return getBitmap(swap);
            }

            private void findNext()
            {
              while (next < 0 && valuesIterator.hasNext() && dictionaryIterator.hasNext()) {
                final ByteBuffer nextValue = valuesIterator.peek();
                final ByteBuffer nextDictionaryKey = dictionaryIterator.peek();
                final int comparison = COMPARATOR.compare(nextValue, nextDictionaryKey);
                if (comparison == 0) {
                  next = idx;
                  valuesIterator.next();
                  break;
                } else if (comparison < 0) {
                  valuesIterator.next();
                } else {
                  dictionaryIterator.next();
                  idx++;
                }
              }
            }
          };
        }

        @Nullable
        @Override
        protected ImmutableBitmap getUnknownsBitmap()
        {
          if (!valuesContainsNull && NullHandling.isNullOrEquivalent(dictionary.get(0))) {
            return bitmaps.get(0);
          }
          return null;
        }
      };
    }

    // if the size of in-filter values is less than the threshold percentage of dictionary size, then use binary search
    // based lookup per value. The algorithm works well for smaller number of values.
    return getSimpleImmutableBitmapIterableIndexFromIterator(valuesUtf8, valuesContainsNull);
  }

  /**
   * Iterates over the value set, using binary search to look up each element. The algorithm works well for smaller
   * number of values, and must be used if the values are not sorted in the same manner as {@link #dictionary}
   */
  private SimpleImmutableBitmapIterableIndex getSimpleImmutableBitmapIterableIndexFromIterator(Iterable valuesUtf8, boolean valuesContainsNull)
  {
    return new SimpleImmutableBitmapIterableIndex()
    {
      @Override
      public Iterable getBitmapIterable()
      {
        return () -> new Iterator()
        {
          final int dictionarySize = dictionary.size();
          final Iterator iterator = valuesUtf8.iterator();
          int next = -1;

          @Override
          public boolean hasNext()
          {
            if (next < 0) {
              findNext();
            }
            return next >= 0;
          }

          @Override
          public ImmutableBitmap next()
          {
            if (next < 0) {
              findNext();
              if (next < 0) {
                throw new NoSuchElementException();
              }
            }
            final int swap = next;
            next = -1;
            return getBitmap(swap);
          }

          private void findNext()
          {
            while (next < 0 && iterator.hasNext()) {
              ByteBuffer nextValue = iterator.next();
              next = dictionary.indexOf(nextValue);

              if (next == -dictionarySize - 1) {
                // nextValue is past the end of the dictionary so we can break early
                // Note: we can rely on indexOf returning (-(insertion point) - 1), because of the earlier check
                // for Indexed.isSorted(), which guarantees this behavior
                break;
              }
            }
          }
        };
      }

      @Nullable
      @Override
      protected ImmutableBitmap getUnknownsBitmap()
      {
        if (!valuesContainsNull && NullHandling.isNullOrEquivalent(dictionary.get(0))) {
          return bitmaps.get(0);
        }
        return null;
      }
    };
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy