All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.druid.segment.ColumnProcessors Maven / Gradle / Ivy

There is a newer version: 30.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.segment;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.math.expr.Expr;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.extraction.ExtractionFn;
import org.apache.druid.segment.column.ColumnCapabilities;
import org.apache.druid.segment.column.ColumnCapabilitiesImpl;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.TypeSignature;
import org.apache.druid.segment.column.ValueType;
import org.apache.druid.segment.vector.MultiValueDimensionVectorSelector;
import org.apache.druid.segment.vector.NilVectorSelector;
import org.apache.druid.segment.vector.SingleValueDimensionVectorSelector;
import org.apache.druid.segment.vector.VectorColumnSelectorFactory;
import org.apache.druid.segment.vector.VectorObjectSelector;
import org.apache.druid.segment.vector.VectorValueSelector;
import org.apache.druid.segment.virtual.ExpressionSelectors;

import javax.annotation.Nullable;

/**
 * Creates "column processors", which are objects that wrap a single input column and provide some functionality on
 * top of it.
 *
 * @see DimensionHandlerUtils#createColumnSelectorPlus which this may eventually replace
 */
public class ColumnProcessors
{
  /**
   * Capabilities that are used when we return a nil selector for a nonexistent column.
   *
   * They describe a single-valued STRING column that is dictionary-encoded, whose dictionary values are unique and
   * sorted, and that has no bitmap indexes.
   */
  public static final ColumnCapabilities NIL_COLUMN_CAPABILITIES =
      new ColumnCapabilitiesImpl().setType(ColumnType.STRING)
                                  .setDictionaryEncoded(true)
                                  .setDictionaryValuesUnique(true)
                                  .setDictionaryValuesSorted(true)
                                  .setHasBitmapIndexes(false)
                                  .setHasMultipleValues(false);

  /**
   * Make a processor for a particular named column.
   *
   * Capabilities are looked up by column name. If the column ends up being handled as a string, it is read through
   * a dimension selector built from {@link DefaultDimensionSpec#of(String)}; otherwise it is read through a column
   * value selector.
   *
   * @param column           the column
   * @param processorFactory the processor factory
   * @param selectorFactory  the column selector factory
   * @param <T>              processor type
   *
   * @return the processor built by {@code processorFactory} for the column
   */
  public static <T> T makeProcessor(
      final String column,
      final ColumnProcessorFactory<T> processorFactory,
      final ColumnSelectorFactory selectorFactory
  )
  {
    return makeProcessorInternal(
        factory -> factory.getColumnCapabilities(column),
        factory -> factory.makeDimensionSelector(DefaultDimensionSpec.of(column)),
        factory -> factory.makeColumnValueSelector(column),
        processorFactory,
        selectorFactory
    );
  }

  /**
   * Make a processor for a particular {@link DimensionSpec}.
   *
   * The capabilities used to pick the processor kind are those of the spec's underlying column
   * ({@link DimensionSpec#getDimension()}), adjusted by {@code computeDimensionSpecCapabilities} to account for any
   * decoration or extraction function the spec applies.
   *
   * @param dimensionSpec    the dimension spec
   * @param processorFactory the processor factory
   * @param selectorFactory  the column selector factory
   * @param <T>              processor type
   *
   * @return the processor built by {@code processorFactory} for the dimension spec
   */
  public static <T> T makeProcessor(
      final DimensionSpec dimensionSpec,
      final ColumnProcessorFactory<T> processorFactory,
      final ColumnSelectorFactory selectorFactory
  )
  {
    return makeProcessorInternal(
        factory -> computeDimensionSpecCapabilities(
            dimensionSpec,
            factory.getColumnCapabilities(dimensionSpec.getDimension())
        ),
        factory -> factory.makeDimensionSelector(dimensionSpec),
        factory -> factory.makeColumnValueSelector(dimensionSpec.getDimension()),
        processorFactory,
        selectorFactory
    );
  }

  /**
   * Make a processor for a particular expression. If the expression is a simple identifier, this behaves identically
   * to {@link #makeProcessor(String, ColumnProcessorFactory, ColumnSelectorFactory)} and accesses the column directly.
   * Otherwise, it uses an expression selector of type {@code exprTypeHint}.
   *
   * For expressions that are not simple identifiers, the assumed capabilities are: type {@code exprTypeHint},
   * possibly multi-valued, with dictionary values neither unique nor sorted.
   *
   * @param expr             the parsed expression
   * @param exprTypeHint     expression selector type to use for exprs that are not simple identifiers
   * @param processorFactory the processor factory
   * @param selectorFactory  the column selector factory
   * @param <T>              processor type
   *
   * @return the processor built by {@code processorFactory} for the expression
   *
   * @throws NullPointerException if {@code exprTypeHint} is null
   */
  public static <T> T makeProcessor(
      final Expr expr,
      final ColumnType exprTypeHint,
      final ColumnProcessorFactory<T> processorFactory,
      final ColumnSelectorFactory selectorFactory
  )
  {
    Preconditions.checkNotNull(exprTypeHint, "'exprTypeHint' must be nonnull");

    // Look the binding up once; it is non-null only when the expression is a bare identifier.
    final String identifier = expr.getBindingIfIdentifier();

    if (identifier != null) {
      // If expr is an identifier, treat this the same way as a direct column reference.
      return makeProcessor(identifier, processorFactory, selectorFactory);
    } else {
      return makeProcessorInternal(
          factory -> new ColumnCapabilitiesImpl().setType(exprTypeHint)
                                                 .setHasMultipleValues(true)
                                                 .setDictionaryValuesUnique(false)
                                                 .setDictionaryValuesSorted(false),
          factory -> ExpressionSelectors.makeDimensionSelector(factory, expr, null),
          factory -> ExpressionSelectors.makeColumnValueSelector(factory, expr),
          processorFactory,
          selectorFactory
      );
    }
  }

  /**
   * Make a vector processor for a particular named column.
   *
   * Capabilities are looked up by column name. Dimension selectors, when needed, are built from
   * {@link DefaultDimensionSpec#of(String)}; value and object selectors are built directly from the column name.
   *
   * @param column           the column
   * @param processorFactory the vector processor factory
   * @param selectorFactory  the vector column selector factory
   * @param <T>              processor type
   *
   * @return the processor built by {@code processorFactory} for the column
   */
  public static <T> T makeVectorProcessor(
      final String column,
      final VectorColumnProcessorFactory<T> processorFactory,
      final VectorColumnSelectorFactory selectorFactory
  )
  {
    return makeVectorProcessorInternal(
        factory -> factory.getColumnCapabilities(column),
        factory -> factory.makeSingleValueDimensionSelector(DefaultDimensionSpec.of(column)),
        factory -> factory.makeMultiValueDimensionSelector(DefaultDimensionSpec.of(column)),
        factory -> factory.makeValueSelector(column),
        factory -> factory.makeObjectSelector(column),
        processorFactory,
        selectorFactory
    );
  }

  /**
   * Make a vector processor for a particular {@link DimensionSpec}.
   *
   * The capabilities used to pick the processor kind are those of the spec's underlying column
   * ({@link DimensionSpec#getDimension()}), adjusted by {@code computeDimensionSpecCapabilities}. Dimension selectors
   * are built from the spec itself; value and object selectors are built from the underlying column name.
   *
   * @param dimensionSpec    the dimension spec
   * @param processorFactory the vector processor factory
   * @param selectorFactory  the vector column selector factory
   * @param <T>              processor type
   *
   * @return the processor built by {@code processorFactory} for the dimension spec
   */
  public static <T> T makeVectorProcessor(
      final DimensionSpec dimensionSpec,
      final VectorColumnProcessorFactory<T> processorFactory,
      final VectorColumnSelectorFactory selectorFactory
  )
  {
    return makeVectorProcessorInternal(
        factory -> computeDimensionSpecCapabilities(
            dimensionSpec,
            factory.getColumnCapabilities(dimensionSpec.getDimension())
        ),
        factory -> factory.makeSingleValueDimensionSelector(dimensionSpec),
        factory -> factory.makeMultiValueDimensionSelector(dimensionSpec),
        factory -> factory.makeValueSelector(dimensionSpec.getDimension()),
        factory -> factory.makeObjectSelector(dimensionSpec.getDimension()),
        processorFactory,
        selectorFactory
    );
  }

  /**
   * Derives the capabilities of selectors produced for a particular {@link DimensionSpec}.
   *
   * Non-STRING types are only ever reported when the DimensionSpec passes its input through unchanged (i.e., it's a
   * {@link DefaultDimensionSpec}, or something that behaves like one). In that case the underlying column
   * capabilities are returned as-is, which may be null.
   *
   * @param dimensionSpec      The dimensionSpec.
   * @param columnCapabilities Capabilities of the column that the dimensionSpec is reading, i.e.
   *                           {@link DimensionSpec#getDimension()}. May be null.
   *
   * @return capabilities describing the selector output, or null when the spec applies no transformation and
   * {@code columnCapabilities} is null
   */
  @Nullable
  private static ColumnCapabilities computeDimensionSpecCapabilities(
      final DimensionSpec dimensionSpec,
      @Nullable final ColumnCapabilities columnCapabilities
  )
  {
    if (dimensionSpec.mustDecorate()) {
      // A decorating spec may rewrite values arbitrarily, so the type is the only thing we can vouch for.
      return new ColumnCapabilitiesImpl().setType(ColumnType.STRING);
    }

    final ExtractionFn extractionFn = dimensionSpec.getExtractionFn();
    if (extractionFn == null) {
      // Nothing is transformed: the underlying column's capabilities apply directly.
      return columnCapabilities;
    }

    // An extractionFn without decoration: what the fn declares about itself tells us which dictionary
    // properties of the input survive. With no input capabilities, nothing can be assumed.
    final boolean inputKnown = columnCapabilities != null;
    final boolean inputDictionaryEncoded = inputKnown && columnCapabilities.isDictionaryEncoded().isTrue();
    final boolean inputUnique = inputKnown && columnCapabilities.areDictionaryValuesUnique().isTrue();
    final boolean inputSorted = inputKnown && columnCapabilities.areDictionaryValuesSorted().isTrue();

    // Sortedness survives only through order-preserving functions; uniqueness only through one-to-one functions.
    final boolean outputSorted = inputSorted && extractionFn.preservesOrdering();
    final boolean outputUnique =
        inputUnique && extractionFn.getExtractionType() == ExtractionFn.ExtractionType.ONE_TO_ONE;

    return new ColumnCapabilitiesImpl()
        .setType(ColumnType.STRING)
        .setDictionaryEncoded(inputDictionaryEncoded)
        .setDictionaryValuesSorted(outputSorted)
        .setDictionaryValuesUnique(outputUnique)
        .setHasMultipleValues(mayBeMultiValue(columnCapabilities));
  }

  /**
   * Creates "column processors", which are objects that wrap a single input column and provide some
   * functionality on top of it.
   *
   * @param inputCapabilitiesFn   function that returns capabilities of the column being processed. The type provided
   *                              by these capabilities will be used to determine what kind of selector to create. If
   *                              this function returns null, then processorFactory.defaultType() will be
   *                              used to construct a set of assumed capabilities.
   * @param dimensionSelectorFn   function that creates a DimensionSelector for the column being processed. Will be
   *                              called if the column type is string.
   * @param valueSelectorFunction function that creates a ColumnValueSelector for the column being processed. Will be
   *                              called if the column type is long, float, double, or complex.
   * @param processorFactory      object that encapsulates the knowledge about how to create processors
   * @param selectorFactory       column selector factory used for creating the vector processor
   */
  private static  T makeProcessorInternal(
      final Function inputCapabilitiesFn,
      final Function dimensionSelectorFn,
      final Function> valueSelectorFunction,
      final ColumnProcessorFactory processorFactory,
      final ColumnSelectorFactory selectorFactory
  )
  {
    final ColumnCapabilities capabilities = inputCapabilitiesFn.apply(selectorFactory);
    final TypeSignature effectiveType = capabilities != null ? capabilities : processorFactory.defaultType();

    switch (effectiveType.getType()) {
      case STRING:
        return processorFactory.makeDimensionProcessor(
            dimensionSelectorFn.apply(selectorFactory),
            mayBeMultiValue(capabilities)
        );
      case LONG:
        return processorFactory.makeLongProcessor(valueSelectorFunction.apply(selectorFactory));
      case FLOAT:
        return processorFactory.makeFloatProcessor(valueSelectorFunction.apply(selectorFactory));
      case DOUBLE:
        return processorFactory.makeDoubleProcessor(valueSelectorFunction.apply(selectorFactory));
      case ARRAY:
        return processorFactory.makeArrayProcessor(
            valueSelectorFunction.apply(selectorFactory),
            capabilities
        );
      case COMPLEX:
        return processorFactory.makeComplexProcessor(valueSelectorFunction.apply(selectorFactory));
      default:
        throw new ISE("Unsupported type[%s]", effectiveType.asTypeString());
    }
  }

  /**
   * Creates "column processors", which are objects that wrap a single input column and provide some
   * functionality on top of it.
   *
   * @param inputCapabilitiesFn            function that returns capabilities of the column being processed. The type provided
   *                                       by these capabilities will be used to determine what kind of selector to create. If
   *                                       this function returns null, then it is assumed that the column does not exist.
   *                                       Note: this is different behavior from the non-vectorized version.
   * @param singleValueDimensionSelectorFn function that creates a singly-valued dimension selector for the column being
   *                                       processed. Will be called if the column is singly-valued string and the
   *                                       processor factory wants a dictionary encoded selector.
   * @param multiValueDimensionSelectorFn  function that creates a multi-valued dimension selector for the column being
   *                                       processed. Will be called if the column is (or may be) multi-valued string
   *                                       and the processor factory wants a dictionary encoded selector.
   * @param valueSelectorFn                function that creates a value selector for the column being processed. Will be
   *                                       called if the column type is long, float, or double.
   * @param objectSelectorFn               function that creates an object selector for the column being processed. Will
   *                                       be called if the column type is complex or array, or if it is a string
   *                                       column for which the processor factory declines a dictionary encoded
   *                                       selector.
   * @param processorFactory               object that encapsulates the knowledge about how to create processors
   * @param selectorFactory                column selector factory used for creating the vector processor
   * @param <T>                            processor type
   *
   * @return the processor built by {@code processorFactory} for the column
   *
   * @throws ISE if the column type is not one of string, long, float, double, array, or complex
   */
  private static <T> T makeVectorProcessorInternal(
      final Function<VectorColumnSelectorFactory, ColumnCapabilities> inputCapabilitiesFn,
      final Function<VectorColumnSelectorFactory, SingleValueDimensionVectorSelector> singleValueDimensionSelectorFn,
      final Function<VectorColumnSelectorFactory, MultiValueDimensionVectorSelector> multiValueDimensionSelectorFn,
      final Function<VectorColumnSelectorFactory, VectorValueSelector> valueSelectorFn,
      final Function<VectorColumnSelectorFactory, VectorObjectSelector> objectSelectorFn,
      final VectorColumnProcessorFactory<T> processorFactory,
      final VectorColumnSelectorFactory selectorFactory
  )
  {
    final ColumnCapabilities capabilities = inputCapabilitiesFn.apply(selectorFactory);

    if (capabilities == null) {
      // Column does not exist.
      return processorFactory.makeSingleValueDimensionProcessor(
          NIL_COLUMN_CAPABILITIES,
          NilVectorSelector.create(selectorFactory.getReadableVectorInspector())
      );
    }

    switch (capabilities.getType()) {
      case STRING:
        // let the processor factory decide if it prefers to use an object selector or dictionary encoded selector
        if (!processorFactory.useDictionaryEncodedSelector(capabilities)) {
          return processorFactory.makeObjectProcessor(
              capabilities,
              objectSelectorFn.apply(selectorFactory)
          );
        }

        // Unknown multi-valuedness is treated as multi-valued.
        if (capabilities.hasMultipleValues().isMaybeTrue()) {
          return processorFactory.makeMultiValueDimensionProcessor(
              capabilities,
              multiValueDimensionSelectorFn.apply(selectorFactory)
          );
        } else {
          return processorFactory.makeSingleValueDimensionProcessor(
              capabilities,
              singleValueDimensionSelectorFn.apply(selectorFactory)
          );
        }
      case LONG:
        return processorFactory.makeLongProcessor(capabilities, valueSelectorFn.apply(selectorFactory));
      case FLOAT:
        return processorFactory.makeFloatProcessor(capabilities, valueSelectorFn.apply(selectorFactory));
      case DOUBLE:
        return processorFactory.makeDoubleProcessor(capabilities, valueSelectorFn.apply(selectorFactory));
      case ARRAY:
        return processorFactory.makeArrayProcessor(capabilities, objectSelectorFn.apply(selectorFactory));
      case COMPLEX:
        return processorFactory.makeObjectProcessor(capabilities, objectSelectorFn.apply(selectorFactory));
      default:
        throw new ISE("Unsupported type[%s]", capabilities.getType());
    }
  }

  /**
   * Decides whether a processor should treat a column as multi-value, given its {@link ColumnCapabilities}.
   *
   * The answer is deliberately conservative: null capabilities, or an unknown
   * {@link ColumnCapabilities#hasMultipleValues()}, both yield true. A false result therefore means the column is
   * _definitely not_ multi-value.
   *
   * Note, this method is not suitable for use with vector engines because null capabilities are indicative of a column
   * that does not exist, rather than unknown capabilities.
   *
   * @param capabilities capabilities of the column, or null if they are unknown
   *
   * @return false only if the column is known to be single-valued
   */
  private static boolean mayBeMultiValue(@Nullable final ColumnCapabilities capabilities)
  {
    if (capabilities == null) {
      // Nothing is known about the column, so assume the more general case.
      return true;
    }

    return capabilities.hasMultipleValues().isMaybeTrue();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy