org.apache.druid.segment.index.semantic.ValueSetIndexes Maven / Gradle / Ivy
Show all versions of druid-processing Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.segment.index.semantic;
import com.google.common.base.Supplier;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import org.apache.druid.collections.bitmap.BitmapFactory;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.segment.column.TypeSignature;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.data.Indexed;
import org.apache.druid.segment.index.BitmapColumnIndex;
import org.apache.druid.segment.index.SimpleImmutableBitmapDelegatingIterableIndex;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Construct a {@link BitmapColumnIndex} for a set of values which might be present in the column.
*/
public interface ValueSetIndexes
{
/**
 * Ratio of (number of sorted match values) to (dictionary size) used to choose between the two sorted helpers:
 * per their Javadocs, above (dictionary size * this threshold) prefer
 * {@link #buildBitmapColumnIndexFromSortedIteratorScan}, below it prefer
 * {@link #buildBitmapColumnIndexFromSortedIteratorBinarySearch}.
 *
 * NOTE(review): no code visible in this interface reads this constant; presumably callers apply it - confirm.
 */
double SORTED_SCAN_RATIO_THRESHOLD = 0.12D;
/**
 * Minimum number of sorted match values for which it is considered worthwhile to trim the initial values from the
 * match value iterator (seeking to the start of the value dictionary) before using
 * {@link #buildBitmapColumnIndexFromSortedIteratorScan} or
 * {@link #buildBitmapColumnIndexFromSortedIteratorBinarySearch}.
 *
 * NOTE(review): the trimming itself is not performed by any code visible in this interface; presumably it is done
 * by callers - confirm against the implementations.
 */
int SIZE_WORTH_CHECKING_MIN = 8;
/**
* Get the wrapped {@link ImmutableBitmap} corresponding to the specified set of values (if they are contained in the
* underlying column). The set must be sorted using the comparator of the supplied matchValueType.
*
* @param sortedValues values to match, sorted in matchValueType order
* @param matchValueType type of the value to match, used to assist conversion from the match value type to the column
* value type
* @return {@link ImmutableBitmap} corresponding to the rows which match the values, or null if an index
* connot be computed for the supplied value type
*/
@Nullable
BitmapColumnIndex forSortedValues(@Nonnull List> sortedValues, TypeSignature matchValueType);
/**
 * Helper method for implementing {@link #forSortedValues} for a value set that is sorted the same as the column
 * dictionary.
 *
 * Builds a {@link BitmapColumnIndex} from an {@link Iterable} that is sorted the same as the column's
 * {@link Indexed} value dictionary. Uses a strategy that does zipping similar to the merge step of a sort-merge,
 * where we step forward on both the match values and the dictionary to find matches to build an
 * {@code Iterable<ImmutableBitmap>}.
 *
 * If sorted match value iterator size is greater than (dictionary size * {@link #SORTED_SCAN_RATIO_THRESHOLD}),
 * consider using this method instead of {@link #buildBitmapColumnIndexFromSortedIteratorBinarySearch}.
 *
 * If the values in the iterator are NOT sorted the same as the dictionary, do NOT use this method, use
 * {@link #buildBitmapColumnIndexFromIteratorBinarySearch} instead.
 *
 * @param bitmapFactory  factory used to create empty bitmaps for missing entries
 * @param comparator     ordering shared by {@code values} and {@code dictionary}
 * @param values         values to match, sorted by {@code comparator}
 * @param dictionary     value dictionary of the column, sorted by {@code comparator}
 * @param bitmaps        bitmaps addressed by dictionary id
 * @param unknownsBitmap supplier of the bitmap returned from {@code getUnknownsBitmap()}
 * @param <T>            type of the match values and dictionary entries
 * @return index whose bitmap iterable yields one bitmap per match value found in the dictionary
 */
static <T> BitmapColumnIndex buildBitmapColumnIndexFromSortedIteratorScan(
    BitmapFactory bitmapFactory,
    Comparator<T> comparator,
    Iterable<T> values,
    Indexed<T> dictionary,
    Indexed<ImmutableBitmap> bitmaps,
    Supplier<ImmutableBitmap> unknownsBitmap
)
{
  return new BaseValueSetIndexesFromIterable(bitmapFactory, bitmaps, unknownsBitmap)
  {
    @Override
    public Iterable<ImmutableBitmap> getBitmapIterable()
    {
      // every call to iterator() starts a fresh pass over both the match values and the dictionary
      return () -> new Iterator<ImmutableBitmap>()
      {
        final PeekingIterator<T> valuesIterator = Iterators.peekingIterator(values.iterator());
        final PeekingIterator<T> dictionaryIterator = Iterators.peekingIterator(dictionary.iterator());
        // dictionary id of the buffered match that has not been handed out yet, or -1 if there is none
        int next = -1;
        // dictionary id of the entry currently at the head of dictionaryIterator
        int idx = 0;

        @Override
        public boolean hasNext()
        {
          if (next < 0) {
            findNext();
          }
          return next >= 0;
        }

        @Override
        public ImmutableBitmap next()
        {
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          final int swap = next;
          next = -1;
          return getBitmap(swap);
        }

        /**
         * Advances whichever side is behind until the heads compare equal or either side is exhausted.
         */
        private void findNext()
        {
          while (next < 0 && valuesIterator.hasNext() && dictionaryIterator.hasNext()) {
            final T nextValue = valuesIterator.peek();
            final T nextDictionaryKey = dictionaryIterator.peek();
            final int comparison = comparator.compare(nextValue, nextDictionaryKey);
            if (comparison == 0) {
              // match: buffer the dictionary id (which ends the loop) and consume only the match value; the
              // dictionary iterator is deliberately not advanced here
              next = idx;
              valuesIterator.next();
            } else if (comparison < 0) {
              // match value sorts before the current dictionary entry, so it is not in the dictionary
              valuesIterator.next();
            } else {
              // dictionary entry sorts before the match value, move on to the next dictionary id
              dictionaryIterator.next();
              idx++;
            }
          }
        }
      };
    }
  };
}
/**
 * Helper method for implementing {@link #forSortedValues} for a value set that is sorted the same as the column
 * dictionary.
 *
 * Builds a {@link BitmapColumnIndex} from an {@link Iterable} that is sorted the same as the column's
 * {@link Indexed} value dictionary. This algorithm iterates the values to match and does a binary search for
 * matching values using {@link Indexed#indexOf(Object)} to build an {@code Iterable<ImmutableBitmap>},
 * short-circuiting the iteration if we reach the end of the {@link Indexed} before the values to match are
 * exhausted.
 *
 * If sorted match value iterator size is less than (dictionary size * {@link #SORTED_SCAN_RATIO_THRESHOLD}),
 * consider using this method instead of {@link #buildBitmapColumnIndexFromSortedIteratorScan}.
 *
 * If the values in the iterator are not sorted the same as the dictionary, do not use this method, use
 * {@link #buildBitmapColumnIndexFromIteratorBinarySearch} instead.
 *
 * @param bitmapFactory    factory used to create empty bitmaps for missing entries
 * @param values           values to match, sorted the same as {@code dictionary}
 * @param dictionary       sorted value dictionary of the column
 * @param bitmaps          bitmaps addressed by dictionary id
 * @param getUnknownsIndex supplier of the bitmap returned from {@code getUnknownsBitmap()}
 * @param <T>              type of the match values and dictionary entries
 * @return index whose bitmap iterable yields one bitmap per match value found in the dictionary
 */
static <T> BitmapColumnIndex buildBitmapColumnIndexFromSortedIteratorBinarySearch(
    BitmapFactory bitmapFactory,
    Iterable<T> values,
    Indexed<T> dictionary,
    Indexed<ImmutableBitmap> bitmaps,
    Supplier<ImmutableBitmap> getUnknownsIndex
)
{
  return new BaseValueSetIndexesFromIterable(bitmapFactory, bitmaps, getUnknownsIndex)
  {
    @Override
    public Iterable<ImmutableBitmap> getBitmapIterable()
    {
      return () -> new Iterator<ImmutableBitmap>()
      {
        final int dictionarySize = dictionary.size();
        final Iterator<T> iterator = values.iterator();
        // dictionary id of the buffered match that has not been handed out yet, negative if there is none
        int next = -1;

        @Override
        public boolean hasNext()
        {
          if (next < 0) {
            findNext();
          }
          return next >= 0;
        }

        @Override
        public ImmutableBitmap next()
        {
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          final int swap = next;
          next = -1;
          return getBitmap(swap);
        }

        /**
         * Looks up match values one at a time until one is found in the dictionary, the match values run out, or
         * a match value falls past the end of the dictionary.
         */
        private void findNext()
        {
          while (next < 0 && iterator.hasNext()) {
            final T nextValue = iterator.next();
            next = dictionary.indexOf(nextValue);
            if (next == -dictionarySize - 1) {
              // nextValue is past the end of the dictionary so we can break early
              // Note: we can rely on indexOf returning (-(insertion point) - 1), because the Indexed
              // is sorted, which guarantees this behavior
              break;
            }
          }
        }
      };
    }
  };
}
/**
 * Helper method for implementing {@link #forSortedValues} for a value set that is NOT sorted the same as the column
 * dictionary.
 *
 * Builds a {@link BitmapColumnIndex} from an {@link Iterable} that is NOT sorted the same as the column's
 * {@link Indexed} value dictionary. This algorithm iterates the values to match and does a binary search for
 * matching values using {@link Indexed#indexOf(Object)} to build an {@code Iterable<ImmutableBitmap>} until the
 * match values iterator is exhausted.
 *
 * If values of the iterator are sorted the same as the dictionary, use
 * {@link #buildBitmapColumnIndexFromSortedIteratorScan} or
 * {@link #buildBitmapColumnIndexFromSortedIteratorBinarySearch} instead.
 *
 * @param bitmapFactory    factory used to create empty bitmaps for missing entries
 * @param values           values to match, in any order
 * @param dictionary       value dictionary of the column
 * @param bitmaps          bitmaps addressed by dictionary id
 * @param getUnknownsIndex supplier of the bitmap returned from {@code getUnknownsBitmap()}
 * @param <T>              type of the match values and dictionary entries
 * @return index whose bitmap iterable yields one bitmap per match value found in the dictionary
 */
static <T> BitmapColumnIndex buildBitmapColumnIndexFromIteratorBinarySearch(
    BitmapFactory bitmapFactory,
    Iterable<T> values,
    Indexed<T> dictionary,
    Indexed<ImmutableBitmap> bitmaps,
    Supplier<ImmutableBitmap> getUnknownsIndex
)
{
  return new BaseValueSetIndexesFromIterable(bitmapFactory, bitmaps, getUnknownsIndex)
  {
    @Override
    public Iterable<ImmutableBitmap> getBitmapIterable()
    {
      return () -> new Iterator<ImmutableBitmap>()
      {
        final Iterator<T> iterator = values.iterator();
        // dictionary id of the buffered match that has not been handed out yet, negative if there is none
        int next = -1;

        @Override
        public boolean hasNext()
        {
          if (next < 0) {
            findNext();
          }
          return next >= 0;
        }

        @Override
        public ImmutableBitmap next()
        {
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          final int swap = next;
          next = -1;
          return getBitmap(swap);
        }

        /**
         * Looks up match values one at a time until one is found in the dictionary or the match values run out.
         * There is no early exit because the match values are not assumed to be in dictionary order.
         */
        private void findNext()
        {
          while (next < 0 && iterator.hasNext()) {
            final T nextValue = iterator.next();
            next = dictionary.indexOf(nextValue);
          }
        }
      };
    }
  };
}
/**
 * Shared base for the indexes created by the static helper methods of this interface. Holds the per-dictionary-id
 * bitmaps, the {@link BitmapFactory} used to produce empty bitmaps, and the supplier of the unknowns bitmap;
 * subclasses decide which dictionary ids to look up via {@link #getBitmap(int)}.
 */
abstract class BaseValueSetIndexesFromIterable extends SimpleImmutableBitmapDelegatingIterableIndex
{
  private final Indexed<ImmutableBitmap> bitmaps;
  private final BitmapFactory bitmapFactory;
  private final Supplier<ImmutableBitmap> unknownsBitmap;

  /**
   * @param bitmapFactory  factory used to create an empty bitmap when a dictionary id has no bitmap
   * @param bitmaps        bitmaps addressed by dictionary id
   * @param unknownsBitmap supplier evaluated on every call to {@link #getUnknownsBitmap()}
   */
  public BaseValueSetIndexesFromIterable(
      BitmapFactory bitmapFactory,
      Indexed<ImmutableBitmap> bitmaps,
      Supplier<ImmutableBitmap> unknownsBitmap
  )
  {
    this.bitmaps = bitmaps;
    this.bitmapFactory = bitmapFactory;
    this.unknownsBitmap = unknownsBitmap;
  }

  /**
   * @return whatever the supplier passed to the constructor provides
   */
  @Nullable
  @Override
  protected ImmutableBitmap getUnknownsBitmap()
  {
    return unknownsBitmap.get();
  }

  /**
   * Looks up the bitmap stored for a dictionary id.
   *
   * @param idx dictionary id; a negative value means the value was not found in the dictionary
   * @return the stored bitmap, or an empty bitmap if {@code idx} is negative or no bitmap is stored for it
   */
  protected ImmutableBitmap getBitmap(int idx)
  {
    if (idx < 0) {
      return bitmapFactory.makeEmptyImmutableBitmap();
    }
    final ImmutableBitmap bitmap = bitmaps.get(idx);
    return bitmap == null ? bitmapFactory.makeEmptyImmutableBitmap() : bitmap;
  }
}
}