org.apache.druid.query.QueryToolChest Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of druid-processing Show documentation
A module that contains everything required to understand Druid Segments
There is a newer version: 30.0.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.query;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.type.TypeFactory;
import com.google.common.base.Function;
import org.apache.druid.error.DruidException;
import org.apache.druid.frame.allocation.MemoryAllocatorFactory;
import org.apache.druid.guice.annotations.ExtensionPoint;
import org.apache.druid.java.util.common.UOE;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.query.aggregation.MetricManipulationFn;
import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.timeline.LogicalSegment;

import javax.annotation.Nullable;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.function.BinaryOperator;

/**
 * The broker-side (also used by server in some cases) API for a specific Query type.
 */
@ExtensionPoint
public abstract class QueryToolChest>
{
  private final JavaType baseResultType;
  private final JavaType bySegmentResultType;

  protected QueryToolChest()
  {
    final TypeFactory typeFactory = TypeFactory.defaultInstance();
    TypeReference resultTypeReference = getResultTypeReference();
    // resultTypeReference is null in MaterializedViewQueryQueryToolChest.
    // See https://github.com/apache/druid/issues/6977
    if (resultTypeReference != null) {
      baseResultType = typeFactory.constructType(resultTypeReference);
      bySegmentResultType = typeFactory.constructParametrizedType(
          Result.class,
          Result.class,
          typeFactory.constructParametrizedType(
              BySegmentResultValueClass.class,
              BySegmentResultValueClass.class,
              baseResultType
          )
      );
    } else {
      baseResultType = null;
      bySegmentResultType = null;
    }
  }

  public final JavaType getBaseResultType()
  {
    return baseResultType;
  }

  public final JavaType getBySegmentResultType()
  {
    return bySegmentResultType;
  }

  /**
   * Perform any per-query decoration of an {@link ObjectMapper} that enables it to read and write objects of the
   * query's {@link ResultType}. It is used by QueryResource on the write side, and DirectDruidClient on the read side.
   * 
   * For most queries, this is a no-op, but it can be useful for query types that support more than one result
   * serialization format. Queries that implement this method must not modify the provided ObjectMapper, but instead
   * must return a copy.
   */
  public ObjectMapper decorateObjectMapper(final ObjectMapper objectMapper, final QueryType query)
  {
    return objectMapper;
  }

  /**
   * This method wraps a QueryRunner.  The input QueryRunner, by contract, will provide a series of
   * ResultType objects in time order (ascending or descending).  This method should return a new QueryRunner that
   * merges the stream of ordered ResultType objects.
   * 

   * A default implementation constructs a {@link ResultMergeQueryRunner} which creates a
   * {@link org.apache.druid.common.guava.CombiningSequence} using the supplied {@link QueryRunner} with
   * {@link QueryToolChest#createResultComparator(Query)} and {@link QueryToolChest#createMergeFn(Query)}} supplied
   * by this toolchest.
   * 

   * Generally speaking, the logic that exists in makePostComputeManipulatorFn should actually exist in this method.
   * Additionally, if a query supports PostAggregations, this method should take steps to ensure that it computes
   * PostAggregations a minimum number of times.  This is most commonly achieved by computing the PostAgg results
   * during merge and also rewriting the query such that it has the minimum number of PostAggs (most
   * often zero).
   *
   * @param runner A QueryRunner that provides a series of ResultType objects in time order (ascending or descending)
   * @return a QueryRunner that merges the stream of ordered ResultType objects
   */
  public QueryRunner mergeResults(QueryRunner runner)
  {
    return new ResultMergeQueryRunner<>(runner, this::createResultComparator, this::createMergeFn);
  }

  /**
   * Creates a merge function that is used to merge intermediate aggregates from historicals in broker. This merge
   * function is used in the default {@link ResultMergeQueryRunner} provided by
   * {@link QueryToolChest#mergeResults(QueryRunner)} and also used in
   * {@link org.apache.druid.java.util.common.guava.ParallelMergeCombiningSequence} by 'CachingClusteredClient' if it
   * does not return null.
   * 

   * Returning null from this function means that a query does not support result merging, at
   * least via the mechanisms that utilize this function.
   */
  @Nullable
  public BinaryOperator createMergeFn(Query query)
  {
    return null;
  }

  /**
   * Creates an ordering comparator that is used to order results. This comparator is used in the default
   * {@link ResultMergeQueryRunner} provided by {@link QueryToolChest#mergeResults(QueryRunner)}
   */
  public Comparator createResultComparator(Query query)
  {
    throw DruidException.defensive("%s doesn't provide a result comparator", query.getClass().getName());
  }

  /**
   * Creates a {@link QueryMetrics} object that is used to generate metrics for this specific query type.  This exists
   * to allow for query-specific dimensions and metrics.  That is, the ToolChest is expected to set some
   * meaningful dimensions for metrics given this query type.  Examples might be the topN threshold for
   * a TopN query or the number of dimensions included for a groupBy query.
   *
   * 
QueryToolChests for query types in core (druid-processing) and public extensions (belonging to the Druid source
   * tree) should use delegate this method to {@link GenericQueryMetricsFactory#makeMetrics(Query)} on an injected
   * instance of {@link GenericQueryMetricsFactory}, as long as they don't need to emit custom dimensions and/or
   * metrics.
   *
   * 
If some custom dimensions and/or metrics should be emitted for a query type, a plan described in
   * "Making subinterfaces of QueryMetrics" section in {@link QueryMetrics}'s class-level Javadocs should be followed.
   *
   * 
One way or another, this method should ensure that {@link QueryMetrics#query(Query)} is called with the given
   * query passed on the created QueryMetrics object before returning.
   *
   * @param query The query that is being processed
   * @return A QueryMetrics that can be used to make metrics for the provided query
   */
  public abstract QueryMetrics makeMetrics(QueryType query);

  /**
   * Creates a Function that can take in a ResultType and return a new ResultType having applied
   * the MetricManipulatorFn to each of the metrics.
   * 

   * This function's primary purpose is to help work around some challenges that exist around deserializing
   * results across the wire.  Specifically, different aggregators will generate different object types in a
   * result set, if we wanted jackson to be able to deserialize these directly, we'd need to generate a response
   * class for each query that jackson could use to deserialize things.  That is not what we do.  Instead, we have
   * jackson deserialize Object instances and then use a MetricManipulatorFn to convert from those object instances
   * to the actual object that the aggregator expects.  As such, this would be more effectively named
   * "makeObjectDeserializingFn".
   * 

   * It is safe and acceptable for implementations of this method to first validate that the MetricManipulationFn
   * is {@link org.apache.druid.query.aggregation.MetricManipulatorFns#DESERIALIZING_INSTANCE} and throw an exception
   * if it is not.  If such an exception is ever thrown, it is indicative of a bug in the caller which should be fixed
   * by not calling this method with anything other than the deserializing manipulator function.
   * 

   * There are some implementations where this was also tasked with computing PostAggregators, but this is actually
   * not a good place to compute those as this function can be called in a number of cases when PostAggs are not
   * really meaningful to compute.  Instead, PostAggs should be computed in the mergeResults call and the
   * mergeResults implementation should take care to ensure that PostAggs are only computed the minimum number of
   * times necessary.
   * 

   * This function is called very early in the processing pipeline on the Broker.
   *
   * @param query The Query that is currently being processed
   * @param fn    The function that should be applied to all metrics in the results
   * @return A function that will apply the provided fn to all metrics in the input ResultType object
   */
  public abstract Function makePreComputeManipulatorFn(
      QueryType query,
      MetricManipulationFn fn
  );

  /**
   * This manipulator functions primary purpose is to conduct finalization of aggregator values.  It would be better
   * named "makeFinalizingManipulatorFn", even that should really be done as part of {@link #mergeResults} instead
   * of with this separate method.
   * 

   * It is safe and acceptable for implementations of this method to first validate that the MetricManipulationFn
   * is either {@link org.apache.druid.query.aggregation.MetricManipulatorFns#FINALIZING_INSTANCE} or
   * {@link org.apache.druid.query.aggregation.MetricManipulatorFns#IDENTITY_INSTANCE} and throw an exception
   * if it is not.  If such an exception is ever thrown, it is indicative of a bug in the caller which should be fixed
   * by not calling this method with unsupported manipulator functions.
   *
   * @param query The Query that is currently being processed
   * @param fn    The function that should be applied to all metrics in the results
   * @return A function that will apply the provided fn to all metrics in the input ResultType object
   */
  public Function makePostComputeManipulatorFn(QueryType query, MetricManipulationFn fn)
  {
    return makePreComputeManipulatorFn(query, fn);
  }

  /**
   * Returns a TypeReference object that is just passed through to Jackson in order to deserialize
   * the results of this type of query.
   *
   * @return A TypeReference to indicate to Jackson what type of data will exist for this query
   */
  public abstract TypeReference getResultTypeReference();

  /**
   * Returns a CacheStrategy to be used to load data into the cache and remove it from the cache.
   * 

   * This is optional.  If it returns null, caching is effectively disabled for the query.
   *
   * @param query The query whose results might be cached
   * @param    The type of object that will be stored in the cache
   * @return A CacheStrategy that can be used to populate and read from the Cache
   */
  @Nullable
  public  CacheStrategy getCacheStrategy(QueryType query)
  {
    return null;
  }

  /**
   * Wraps a QueryRunner.  The input QueryRunner is the QueryRunner as it exists *before* being passed to
   * mergeResults().
   * 

   * In fact, the return value of this method is always passed to mergeResults, so it is equivalent to
   * just implement this functionality as extra decoration on the QueryRunner during mergeResults().
   * 

   * In the interests of potentially simplifying these interfaces, the recommendation is to actually not
   * override this method and instead apply anything that might be needed here in the mergeResults() call.
   *
   * @param runner The runner to be wrapped
   * @return The wrapped runner
   */
  public QueryRunner preMergeQueryDecoration(QueryRunner runner)
  {
    return runner;
  }

  /**
   * Wraps a QueryRunner.  The input QueryRunner is the QueryRunner as it exists coming out of mergeResults()
   * 

   * In fact, the input value of this method is always the return value from mergeResults, so it is equivalent
   * to just implement this functionality as extra decoration on the QueryRunner during mergeResults().
   * 

   * In the interests of potentially simplifying these interfaces, the recommendation is to actually not
   * override this method and instead apply anything that might be needed here in the mergeResults() call.
   *
   * @param runner The runner to be wrapped
   * @return The wrapped runner
   */
  public QueryRunner postMergeQueryDecoration(QueryRunner runner)
  {
    return runner;
  }

  /**
   * This method is called to allow the query to prune segments that it does not believe need to actually
   * be queried.  It can use whatever criteria it wants in order to do the pruning, it just needs to
   * return the list of Segments it actually wants to see queried.
   *
   * @param query    The query being processed
   * @param segments The list of candidate segments to be queried
   * @param       A Generic parameter because Java is cool
   * @return The list of segments to actually query
   */
  public  List filterSegments(QueryType query, List segments)
  {
    return segments;
  }

  /**
   * Returns whether this toolchest is able to handle the provided subquery.
   * 

   * When this method returns true, the core query stack will pass subquery datasources over to the toolchest and will
   * assume they are properly handled.
   * 

   * When this method returns false, the core query stack will throw an error if subqueries are present. In the future,
   * instead of throwing an error, the core query stack will handle the subqueries on its own.
   */
  public boolean canPerformSubquery(final Query subquery)
  {
    return false;
  }

  /**
   * Returns a {@link RowSignature} for the arrays returned by {@link #resultsAsArrays}. The returned signature will
   * be the same length as each array returned by {@link #resultsAsArrays}.
   *
   * @param query same query passed to {@link #resultsAsArrays}
   * @return row signature
   * @throws UnsupportedOperationException if this query type does not support returning results as arrays
   */
  public RowSignature resultArraySignature(QueryType query)
  {
    throw new UOE("Query type '%s' does not support returning results as arrays", query.getType());
  }

  /**
   * Converts a sequence of this query's ResultType into arrays. The array signature is given by
   * {@link #resultArraySignature}. This functionality is useful because it allows higher-level processors to operate on
   * the results of any query in a consistent way. This is useful for the SQL layer and for any algorithm that might
   * operate on the results of an inner query.
   * 

   * Not all query types support this method. They will throw {@link UnsupportedOperationException}, and they cannot
   * be used by the SQL layer or by generic higher-level algorithms.
   * 

   * Some query types return less information after translating their results into arrays, especially in situations
   * where there is no clear way to translate fully rich results into flat arrays. For example, the scan query does not
   * include the segmentId in its array-based results, because it could potentially conflict with a 'segmentId' field
   * in the actual datasource being scanned.
   * 

   * It is possible that there will be multiple arrays returned for a single result object. For example, in the topN
   * query, each {@link org.apache.druid.query.topn.TopNResultValue} will generate a separate array for each of its
   * {@code values}.
   * 

   * By convention, the array form should include the __time column, if present, as a long (milliseconds since epoch).
   *
   * @param resultSequence results of the form returned by {@link #mergeResults}
   * @return results in array form
   * @throws UnsupportedOperationException if this query type does not support returning results as arrays
   */
  public Sequence resultsAsArrays(QueryType query, Sequence resultSequence)
  {
    throw new UOE("Query type '%s' does not support returning results as arrays", query.getType());
  }

  /**
   * Converts a sequence of this query's ResultType into a sequence of {@link FrameSignaturePair}. The array signature
   * is the one give by {@link #resultArraySignature(Query)}. If the toolchest doesn't support this method, then it can
   * return an empty optional. It is the duty of the callees to throw an appropriate exception in that case or use an
   * alternative fallback approach
   * 

   * Check documentation of {@link #resultsAsArrays(Query, Sequence)} as the behaviour of the rows represented by the
   * frame sequence is identical.
   * 
   * Each Frame has a separate {@link RowSignature} because for some query types like the Scan query, every
   * column in the final result might not be present in the individual ResultType (and subsequently Frame). Therefore,
   * this is done to preserve the space by not populating the column in that particular Frame and omitting it from its
   * signature
   *
   * @param query                    Query being executed by the toolchest. Used to determine the rowSignature of the Frames
   * @param resultSequence           results of the form returned by {@link #mergeResults(QueryRunner)}
   * @param memoryAllocatorFactory
   * @param useNestedForUnknownTypes true if the unknown types in the results can be serded using complex types
   */
  public Optional> resultsAsFrames(
      QueryType query,
      Sequence resultSequence,
      MemoryAllocatorFactory memoryAllocatorFactory,
      boolean useNestedForUnknownTypes
  )
  {
    return Optional.empty();
  }
}