/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.query.groupby;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.inject.Inject;
import org.apache.druid.collections.BlockingPool;
import org.apache.druid.collections.NonBlockingPool;
import org.apache.druid.collections.ReferenceCountingResourceHolder;
import org.apache.druid.guice.annotations.Global;
import org.apache.druid.guice.annotations.Json;
import org.apache.druid.guice.annotations.Merging;
import org.apache.druid.guice.annotations.Smile;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.collect.Utils;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.LazySequence;
import org.apache.druid.java.util.common.guava.Sequence;
import org.apache.druid.java.util.common.guava.Sequences;
import org.apache.druid.query.DruidProcessingConfig;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryCapacityExceededException;
import org.apache.druid.query.QueryContext;
import org.apache.druid.query.QueryContexts;
import org.apache.druid.query.QueryPlus;
import org.apache.druid.query.QueryProcessingPool;
import org.apache.druid.query.QueryRunner;
import org.apache.druid.query.QueryWatcher;
import org.apache.druid.query.ResourceLimitExceededException;
import org.apache.druid.query.ResultMergeQueryRunner;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.PostAggregator;
import org.apache.druid.query.context.ResponseContext;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.groupby.epinephelinae.GroupByBinaryFnV2;
import org.apache.druid.query.groupby.epinephelinae.GroupByMergingQueryRunnerV2;
import org.apache.druid.query.groupby.epinephelinae.GroupByQueryEngineV2;
import org.apache.druid.query.groupby.epinephelinae.GroupByRowProcessor;
import org.apache.druid.query.groupby.orderby.DefaultLimitSpec;
import org.apache.druid.query.groupby.orderby.LimitSpec;
import org.apache.druid.query.groupby.orderby.NoopLimitSpec;
import org.apache.druid.query.spec.MultipleIntervalSegmentSpec;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.join.filter.AllNullColumnSelectorFactory;
import org.apache.druid.utils.CloseableUtils;

import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BinaryOperator;
import java.util.stream.Collectors;

public class GroupingEngine
{
  public static final String CTX_KEY_FUDGE_TIMESTAMP = "fudgeTimestamp";
  public static final String CTX_KEY_OUTERMOST = "groupByOutermost";

  private final DruidProcessingConfig processingConfig;
  private final Supplier<GroupByQueryConfig> configSupplier;
  private final NonBlockingPool<ByteBuffer> bufferPool;
  private final BlockingPool<ByteBuffer> mergeBufferPool;
  private final ObjectMapper jsonMapper;
  private final ObjectMapper spillMapper;
  private final QueryWatcher queryWatcher;

  @Inject
  public GroupingEngine(
      DruidProcessingConfig processingConfig,
      Supplier<GroupByQueryConfig> configSupplier,
      @Global NonBlockingPool<ByteBuffer> bufferPool,
      @Merging BlockingPool<ByteBuffer> mergeBufferPool,
      @Json ObjectMapper jsonMapper,
      @Smile ObjectMapper spillMapper,
      QueryWatcher queryWatcher
  )
  {
    this.processingConfig = processingConfig;
    this.configSupplier = configSupplier;
    this.bufferPool = bufferPool;
    this.mergeBufferPool = mergeBufferPool;
    this.jsonMapper = jsonMapper;
    this.spillMapper = spillMapper;
    this.queryWatcher = queryWatcher;
  }

  /**
   * Initializes resources required to run {@link GroupByQueryQueryToolChest#mergeResults(QueryRunner)} for a
   * particular query. That method is also the primary caller of this method.
   *
   * @param query a groupBy query to be processed
   *
   * @return broker-side resources (merge buffer holders) for the query
   */
  public GroupByQueryResources prepareResource(GroupByQuery query)
  {
    final int requiredMergeBufferNum = GroupByQueryResources.countRequiredMergeBufferNum(query);

    if (requiredMergeBufferNum > mergeBufferPool.maxSize()) {
      throw new ResourceLimitExceededException(
          "Query needs " + requiredMergeBufferNum + " merge buffers, but only "
          + mergeBufferPool.maxSize() + " merge buffers were configured"
      );
    } else if (requiredMergeBufferNum == 0) {
      return new GroupByQueryResources();
    } else {
      final List<ReferenceCountingResourceHolder<ByteBuffer>> mergeBufferHolders;
      final QueryContext context = query.context();
      if (context.hasTimeout()) {
        mergeBufferHolders = mergeBufferPool.takeBatch(requiredMergeBufferNum, context.getTimeout());
      } else {
        mergeBufferHolders = mergeBufferPool.takeBatch(requiredMergeBufferNum);
      }
      if (mergeBufferHolders.isEmpty()) {
        throw QueryCapacityExceededException.withErrorMessageAndResolvedHost(
            StringUtils.format(
                "Cannot acquire %s merge buffers. Try again after current running queries are finished.",
                requiredMergeBufferNum
            )
        );
      } else {
        return new GroupByQueryResources(mergeBufferHolders);
      }
    }
  }
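
  // Usage sketch (illustrative only; "groupingEngine" and "groupByQuery" are hypothetical caller-side variables).
  // The resources acquired here hold merge buffers, so the caller is expected to close them once the merged result
  // sequence has been fully consumed, e.g.:
  //
  //   GroupByQueryResources resources = groupingEngine.prepareResource(groupByQuery);
  //   try {
  //     // run mergeResults(...) and consume the returned sequence while the merge buffers are held
  //   }
  //   finally {
  //     resources.close();
  //   }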

  /**
   * See {@link org.apache.druid.query.QueryToolChest#createResultComparator(Query)}; allows
   * {@link GroupByQueryQueryToolChest} to delegate its implementation to this engine.
   */
  public Comparator<ResultRow> createResultComparator(Query queryParam)
  {
    return ((GroupByQuery) queryParam).getRowOrdering(true);
  }

  /**
   * See {@link org.apache.druid.query.QueryToolChest#createMergeFn(Query)} for details, allows
   * {@link GroupByQueryQueryToolChest} to delegate implementation to the strategy
   */
  public BinaryOperator<ResultRow> createMergeFn(Query queryParam)
  {
    return new GroupByBinaryFnV2((GroupByQuery) queryParam);
  }
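
  // Sketch of how the two hooks above are used (illustrative; r1/r2 are hypothetical rows with equal grouping keys):
  // ResultMergeQueryRunner orders the combined stream using createResultComparator(query) and folds adjacent rows
  // that compare as equal using createMergeFn(query), roughly:
  //
  //   ResultRow merged = new GroupByBinaryFnV2(query).apply(r1, r2);  // key columns kept, aggregators combined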

  public GroupByQuery prepareGroupByQuery(GroupByQuery query)
  {
    // Set up downstream context.
    final ImmutableMap.Builder<String, Object> context = ImmutableMap.builder();
    context.put(QueryContexts.FINALIZE_KEY, false);
    context.put(CTX_KEY_OUTERMOST, false);

    Granularity granularity = query.getGranularity();
    List<DimensionSpec> dimensionSpecs = query.getDimensions();
    // the CTX_TIMESTAMP_RESULT_FIELD is set in DruidQuery.java
    final QueryContext queryContext = query.context();
    final String timestampResultField = queryContext.getString(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD);
    final boolean hasTimestampResultField = (timestampResultField != null && !timestampResultField.isEmpty())
                                            && queryContext.getBoolean(CTX_KEY_OUTERMOST, true)
                                            && !query.isApplyLimitPushDown();
    if (hasTimestampResultField) {
      // For SQL like "GROUP BY city_id, TIME_FLOOR(__time TO DAY)",
      // the originally translated query has granularity=all and dimensions [d0, d1].
      // A better plan uses granularity=day and dimensions [d0],
      // but then the ResultRow structure changes from [d0, d1] to [__time, d0],
      // and it must be restored to [d0, d1] (actually [d0, __time]) before postAggs are computed.
      //
      // That is the general idea of this optimization.
      // From a coding perspective, however, granularity=all and the "d0" dimension are referenced in many places
      // (e.g. subtotals, having, grouping sets, post aggs), so removing "d0" from query.dimensions, or changing
      // the granularity, would require fixing a large number of call sites.
      // So, to keep the change tractable, the optimization is implemented as an engine-level change internal to
      // groupBy processing. Most of the code lives in GroupByStrategyV2 and concerns the processing handoff between
      // the broker and the compute nodes. Basic logic such as nested queries and subtotals is unchanged; those code
      // paths still see granularity=all and the "d0" dimension.
      //
      // The tradeoff is that GroupByStrategyV2 behaves differently depending on the query context set in DruidQuery.
      // In other words, the query generated by "EXPLAIN PLAN FOR SELECT ..." does not exactly match the native query
      // that is actually executed: the granularity and dimensions differ slightly.
      // Part of the query planning logic is now handled in GroupByStrategyV2, not only in DruidQuery.toGroupByQuery().
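      //
      // Illustrative example (hypothetical schema): for
      //   SELECT city_id, TIME_FLOOR(__time TO DAY), COUNT(*) FROM t GROUP BY 1, 2
      // the broker-side query keeps granularity=all and dimensions [d0 (city_id), d1 (day)], while the query sent
      // downstream below uses granularity=day and dimensions [d0], carrying the day value in each row's __time.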
      final Granularity timestampResultFieldGranularity
          = queryContext.getGranularity(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_GRANULARITY, jsonMapper);
      dimensionSpecs =
          query.getDimensions()
               .stream()
               .filter(dimensionSpec -> !dimensionSpec.getOutputName().equals(timestampResultField))
               .collect(Collectors.toList());
      granularity = timestampResultFieldGranularity;
      // When timestampResultField is the last dimension, we should set sortByDimsFirst=true; otherwise the
      // downstream results are sorted by each row's timestamp first, which breaks the expected final ordering.
      int timestampResultFieldIndex = queryContext.getInt(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_INDEX, 0);
      if (!query.getContextSortByDimsFirst() && timestampResultFieldIndex == query.getDimensions().size() - 1) {
        context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, true);
      }
      // When timestampResultField is the first dimension and sortByDimsFirst=true,
      // it is effectively equivalent to sortByDimsFirst=false.
      if (query.getContextSortByDimsFirst() && timestampResultFieldIndex == 0) {
        context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, false);
      }
      // When hasTimestampResultField=true and timestampResultField is neither the first nor the last dimension,
      // the DefaultLimitSpec will always do the reordering.
    }
    if (query.getUniversalTimestamp() != null && !hasTimestampResultField) {
      // universalTimestamp works only when granularity is all
      // hasTimestampResultField works only when granularity is all
      // fudgeTimestamp should not be used when hasTimestampResultField=true, because the row's actual timestamp is used
      context.put(CTX_KEY_FUDGE_TIMESTAMP, String.valueOf(query.getUniversalTimestamp().getMillis()));
    }

    // The having spec shouldn't be passed down, so we need to convey the existing limit push down status
    context.put(GroupByQueryConfig.CTX_KEY_APPLY_LIMIT_PUSH_DOWN, query.isApplyLimitPushDown());

    // Always request array result rows when passing the query downstream.
    context.put(GroupByQueryConfig.CTX_KEY_ARRAY_RESULT_ROWS, true);

    return new GroupByQuery(
        query.getDataSource(),
        query.getQuerySegmentSpec(),
        query.getVirtualColumns(),
        query.getDimFilter(),
        granularity,
        dimensionSpecs,
        query.getAggregatorSpecs(),
        // Don't apply postaggregators on compute nodes
        ImmutableList.of(),
        // Don't do "having" clause until the end of this method.
        null,
        // Potentially pass limit down the stack (i.e. limit pushdown). Notes:
        //   (1) Limit pushdown is only supported for DefaultLimitSpec.
        //   (2) When pushing down a limit, it must be extended to include the offset (the offset will be applied
        //       higher-up).
        query.isApplyLimitPushDown() ? ((DefaultLimitSpec) query.getLimitSpec()).withOffsetToLimit() : null,
        query.getSubtotalsSpec(),
        query.getContext()
    ).withOverriddenContext(
        context.build()
    );
  }
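
  // For orientation, a sketch of the handoff produced above (variable names are illustrative, not part of this class):
  //
  //   GroupByQuery downstreamQuery = groupingEngine.prepareGroupByQuery(outerQuery);
  //   // downstreamQuery carries finalize=false, groupByOutermost=false and arrayResultRows=true in its context,
  //   // has no post aggregators or having spec, and keeps a limit only when limit pushdown applies (offset folded in).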

  /**
   * Runs a provided {@link QueryRunner} on a provided {@link GroupByQuery}, which is assumed to return rows that are
   * properly sorted (by timestamp and dimensions) but not necessarily fully merged (that is, there may be adjacent
   * rows with the same timestamp and dimensions) and without PostAggregators computed. This method will fully merge
   * the rows, apply PostAggregators, and return the resulting {@link Sequence}.
   *
   * The query will be modified using {@link #prepareGroupByQuery(GroupByQuery)} before passing it down to the base
   * runner. For example, "having" clauses will be removed and various context parameters will be adjusted.
   *
   * Despite the similar name, this method is much reduced in scope compared to
   * {@link GroupByQueryQueryToolChest#mergeResults(QueryRunner)}. That method delegates to this one at certain points,
   * but it has many other responsibilities, including computing outer query results (if there are subqueries),
   * computing subtotals (like GROUPING SETS), and applying the havingSpec and limitSpec.
   *
   * @param baseRunner      base query runner
   * @param query           the groupBy query to run inside the base query runner
   * @param responseContext the response context to pass to the base query runner
   *
   * @return merged result sequence
   */
  public Sequence<ResultRow> mergeResults(
      final QueryRunner<ResultRow> baseRunner,
      final GroupByQuery query,
      final ResponseContext responseContext
  )
  {
    // Merge streams using ResultMergeQueryRunner, then apply postaggregators, then apply limit (which may
    // involve materialization)
    final ResultMergeQueryRunner<ResultRow> mergingQueryRunner = new ResultMergeQueryRunner<>(
        baseRunner,
        this::createResultComparator,
        this::createMergeFn
    );

    final QueryContext queryContext = query.context();
    final String timestampResultField = queryContext.getString(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD);
    final boolean hasTimestampResultField = (timestampResultField != null && !timestampResultField.isEmpty())
                                            && queryContext.getBoolean(CTX_KEY_OUTERMOST, true)
                                            && !query.isApplyLimitPushDown();
    final int timestampResultFieldIndexInOriginalDimensions =
        hasTimestampResultField ? queryContext.getInt(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_INDEX) : 0;
    final GroupByQuery newQuery = prepareGroupByQuery(query);

    final Sequence<ResultRow> mergedResults = mergingQueryRunner.run(QueryPlus.wrap(newQuery), responseContext);

    // Apply postaggregators if this is the outermost mergeResults (CTX_KEY_OUTERMOST) and we are not executing a
    // pushed-down subquery (CTX_KEY_EXECUTING_NESTED_QUERY).

    if (!queryContext.getBoolean(CTX_KEY_OUTERMOST, true)
        || queryContext.getBoolean(GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, false)) {
      return mergedResults;
    } else if (query.getPostAggregatorSpecs().isEmpty()) {
      if (!hasTimestampResultField) {
        return mergedResults;
      }
      return Sequences.map(
          mergedResults,
          row -> {
            final ResultRow resultRow = ResultRow.create(query.getResultRowSizeWithoutPostAggregators());
            moveOrReplicateTimestampInRow(
                query,
                timestampResultFieldIndexInOriginalDimensions,
                row,
                resultRow
            );

            return resultRow;
          }
      );
    } else {
      return Sequences.map(
          mergedResults,
          row -> {
            // This function's purpose is to apply PostAggregators.

            final ResultRow rowWithPostAggregations = ResultRow.create(query.getResultRowSizeWithPostAggregators());

            // Copy everything that comes before the postaggregations.
            if (hasTimestampResultField) {
              moveOrReplicateTimestampInRow(
                  query,
                  timestampResultFieldIndexInOriginalDimensions,
                  row,
                  rowWithPostAggregations
              );
            } else {
              for (int i = 0; i < query.getResultRowPostAggregatorStart(); i++) {
                rowWithPostAggregations.set(i, row.get(i));
              }
            }

            // Compute postaggregations. We need to do this with a result-row map because PostAggregator.compute
            // expects a map. Some further design adjustment may eliminate the need for it, and speed up this function.
            final Map<String, Object> mapForPostAggregationComputation = rowWithPostAggregations.toMap(query);

            for (int i = 0; i < query.getPostAggregatorSpecs().size(); i++) {
              final PostAggregator postAggregator = query.getPostAggregatorSpecs().get(i);
              final Object value = postAggregator.compute(mapForPostAggregationComputation);

              rowWithPostAggregations.set(query.getResultRowPostAggregatorStart() + i, value);
              mapForPostAggregationComputation.put(postAggregator.getName(), value);
            }

            return rowWithPostAggregations;
          }
      );
    }
  }
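
  // For orientation, the positional ResultRow layout assumed above is (sketch):
  //   [ __time (only if the row has a timestamp) | dimensions... | aggregators... | post-aggregators... ]
  // so query.getResultRowPostAggregatorStart() + i addresses the slot of the i-th post-aggregator.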

  /**
   * Merge a variety of single-segment query runners into a combined runner. Used by
   * {@link GroupByQueryRunnerFactory#mergeRunners(QueryProcessingPool, Iterable)}. In
   * that sense, it is intended to go along with {@link #process(GroupByQuery, StorageAdapter, GroupByQueryMetrics)} (the runners created
   * by that method will be fed into this method).
   *
   * This method is only called on data servers, like Historicals (not the Broker).
   *
   * @param queryProcessingPool {@link QueryProcessingPool} service used for parallel execution of the query runners
   * @param queryRunners        collection of query runners to merge
   *
   * @return merged query runner
   */
  public QueryRunner<ResultRow> mergeRunners(
      final QueryProcessingPool queryProcessingPool,
      final Iterable<QueryRunner<ResultRow>> queryRunners
  )
  {
    return new GroupByMergingQueryRunnerV2(
        configSupplier.get(),
        processingConfig,
        queryProcessingPool,
        queryWatcher,
        queryRunners,
        processingConfig.getNumThreads(),
        mergeBufferPool,
        processingConfig.intermediateComputeSizeBytes(),
        spillMapper,
        processingConfig.getTmpDir()
    );
  }

  /**
   * Process a groupBy query on a single {@link StorageAdapter}. This is used by
   * {@link GroupByQueryRunnerFactory#createRunner} to create per-segment QueryRunners.
   *
   * This method is only called on data servers, like Historicals (not the Broker).
   *
   * @param query          the groupBy query
   * @param storageAdapter storage adapter for the segment in question
   *
   * @return result sequence for the storage adapter
   */
  public Sequence<ResultRow> process(
      GroupByQuery query,
      StorageAdapter storageAdapter,
      @Nullable GroupByQueryMetrics groupByQueryMetrics
  )
  {
    return GroupByQueryEngineV2.process(
        query,
        storageAdapter,
        bufferPool,
        configSupplier.get().withOverrides(query),
        processingConfig,
        groupByQueryMetrics
    );
  }

  /**
   * Apply the {@link GroupByQuery} "postProcessingFn", which is responsible for HavingSpec and LimitSpec.
   *
   * @param results sequence of results
   * @param query   the groupBy query
   *
   * @return post-processed results, with HavingSpec and LimitSpec applied
   */
  public Sequence<ResultRow> applyPostProcessing(Sequence<ResultRow> results, GroupByQuery query)
  {
    results = wrapSummaryRowIfNeeded(query, results);

    // Don't apply the limit here for inner results; that will be pushed down to the BufferHashGrouper
    if (query.context().getBoolean(CTX_KEY_OUTERMOST, true)) {
      return query.postProcess(results);
    } else {
      return results;
    }
  }

  /**
   * Called by {@link GroupByQueryQueryToolChest#mergeResults(QueryRunner)} when it needs to process a subquery.
   *
   * @param subquery           inner query
   * @param query              outer query
   * @param resource           resources returned by {@link #prepareResource(GroupByQuery)}
   * @param subqueryResult     result rows from the subquery
   * @param wasQueryPushedDown true if the outer query was pushed down (so we only need to merge the outer query's
   *                           results, not run it from scratch like a normal outer query)
   *
   * @return results of the outer query
   */
  public Sequence<ResultRow> processSubqueryResult(
      GroupByQuery subquery,
      GroupByQuery query,
      GroupByQueryResources resource,
      Sequence<ResultRow> subqueryResult,
      boolean wasQueryPushedDown
  )
  {
    // Keep a reference to resultSupplier outside the "try" so we can close it if something goes wrong
    // while creating the sequence.
    GroupByRowProcessor.ResultSupplier resultSupplier = null;

    try {
      final GroupByQuery queryToRun;

      if (wasQueryPushedDown) {
        // If the query was pushed down, filters would have been applied downstream, so skip them here.
        queryToRun = query.withDimFilter(null)
                          .withQuerySegmentSpec(new MultipleIntervalSegmentSpec(Intervals.ONLY_ETERNITY));
      } else {
        queryToRun = query;
      }

      resultSupplier = GroupByRowProcessor.process(
          queryToRun,
          wasQueryPushedDown ? queryToRun : subquery,
          subqueryResult,
          configSupplier.get(),
          processingConfig,
          resource,
          spillMapper,
          processingConfig.getTmpDir(),
          processingConfig.intermediateComputeSizeBytes()
      );

      final GroupByRowProcessor.ResultSupplier finalResultSupplier = resultSupplier;
      return Sequences.withBaggage(
          mergeResults(
              (queryPlus, responseContext) -> finalResultSupplier.results(null),
              query,
              ResponseContext.createEmpty()
          ),
          finalResultSupplier
      );
    }
    catch (Throwable e) {
      throw CloseableUtils.closeAndWrapInCatch(e, resultSupplier);
    }
  }

  /**
   * Called by {@link GroupByQueryQueryToolChest#mergeResults(QueryRunner)} when it needs to generate subtotals.
   *
   * @param query       query that has a "subtotalsSpec"
   * @param resource    resources returned by {@link #prepareResource(GroupByQuery)}
   * @param queryResult result rows from the main query
   *
   * @return results for each list of subtotals in the query, concatenated together
   */
  public Sequence<ResultRow> processSubtotalsSpec(
      GroupByQuery query,
      GroupByQueryResources resource,
      Sequence<ResultRow> queryResult
  )
  {
    // How it works:
    // First we accumulate the result of the top-level base query (the queryResult arg) inside a resultSupplierOne
    // object.
    // Next, for each subtotalSpec:
    // If the subtotalSpec is a prefix of the top-level dims, we iterate over the rows in resultSupplierOne (which are
    // still sorted by subtotalSpec), stream merge them, and return.
    //
    // If the subtotalSpec is not a prefix of the top-level dims, we create a resultSupplierTwo object filled with rows
    // from resultSupplierOne containing only the dims from subtotalSpec. We then iterate over the rows in
    // resultSupplierTwo (which are, of course, sorted by subtotalSpec), stream merge them, and return.

    // Keep a reference to resultSupplier outside the "try" so we can close it if something goes wrong
    // while creating the sequence.
    GroupByRowProcessor.ResultSupplier resultSupplierOne = null;

    try {
      // baseSubtotalQuery is the original query with dimensions and aggregators rewritten to apply to the *results*
      // rather than *inputs* of that query. It has its virtual columns and dim filter removed, because those only
      // make sense when applied to inputs. Finally, it has subtotalsSpec removed, since we'll be computing them
      // one-by-one soon enough.
      GroupByQuery baseSubtotalQuery = query
          .withDimensionSpecs(
              query.getDimensions().stream().map(
                  dimSpec -> new DefaultDimensionSpec(
                      dimSpec.getOutputName(),
                      dimSpec.getOutputName(),
                      dimSpec.getOutputType()
                  )
              ).collect(Collectors.toList())
          )
          .withAggregatorSpecs(
              query.getAggregatorSpecs()
                   .stream()
                   .map(AggregatorFactory::getCombiningFactory)
                   .collect(Collectors.toList())
          )
          .withVirtualColumns(VirtualColumns.EMPTY)
          .withDimFilter(null)
          .withSubtotalsSpec(null)
          // timestampResult optimization is not for the subtotal scenario, so disable it
          .withOverriddenContext(ImmutableMap.of(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD, ""));

      resultSupplierOne = GroupByRowProcessor.process(
          baseSubtotalQuery,
          baseSubtotalQuery,
          queryResult,
          configSupplier.get(),
          processingConfig,
          resource,
          spillMapper,
          processingConfig.getTmpDir(),
          processingConfig.intermediateComputeSizeBytes()
      );

      List<String> queryDimNames = baseSubtotalQuery.getDimensions().stream().map(DimensionSpec::getOutputName)
                                                    .collect(Collectors.toList());

      // Only needed for the LimitSpec.filterColumns(..) call later, in case the base query has a non-default LimitSpec.
      Set<String> aggsAndPostAggs = null;
      if (!(baseSubtotalQuery.getLimitSpec() instanceof NoopLimitSpec)) {
        aggsAndPostAggs = getAggregatorAndPostAggregatorNames(baseSubtotalQuery);
      }

      List<List<String>> subtotals = query.getSubtotalsSpec();
      List<Sequence<ResultRow>> subtotalsResults = new ArrayList<>(subtotals.size());

      // Iterate through each subtotalSpec, build results for it, and add them to subtotalsResults
      for (List<String> subtotalSpec : subtotals) {
        final ImmutableSet<String> dimsInSubtotalSpec = ImmutableSet.copyOf(subtotalSpec);
        // Dimension spec including dimension name and output name
        final List<DimensionSpec> subTotalDimensionSpec = new ArrayList<>(dimsInSubtotalSpec.size());
        final List<DimensionSpec> dimensions = query.getDimensions();

        for (DimensionSpec dimensionSpec : dimensions) {
          if (dimsInSubtotalSpec.contains(dimensionSpec.getOutputName())) {
            subTotalDimensionSpec.add(dimensionSpec);
          }
        }

        // Create an appropriate LimitSpec for the subtotal query
        LimitSpec subtotalQueryLimitSpec = NoopLimitSpec.instance();

        if (!(baseSubtotalQuery.getLimitSpec() instanceof NoopLimitSpec)) {
          Set<String> columns = new HashSet<>(aggsAndPostAggs);
          columns.addAll(subtotalSpec);

          subtotalQueryLimitSpec = baseSubtotalQuery.getLimitSpec().filterColumns(columns);
        }

        GroupByQuery subtotalQuery = baseSubtotalQuery
            .withLimitSpec(subtotalQueryLimitSpec);

        final GroupByRowProcessor.ResultSupplier resultSupplierOneFinal = resultSupplierOne;
        if (Utils.isPrefix(subtotalSpec, queryDimNames)) {
          // Since subtotalSpec is a prefix of the base query dimensions, results from the base query are already
          // sorted by subtotalSpec, as needed for stream merging.
          subtotalsResults.add(
              processSubtotalsResultAndOptionallyClose(() -> resultSupplierOneFinal, subTotalDimensionSpec, subtotalQuery, false)
          );
        } else {
          // Since subtotalSpec is not a prefix of the base query dimensions, results from the base query are not
          // sorted by subtotalSpec. So we first feed the base query results into another resultSupplier, which sorts
          // them by subtotalSpec, and then stream merge them.
          // Also note: we can't create the ResultSupplier eagerly here, because we don't want to eagerly allocate
          // merge buffers for processing subtotals.
          Supplier<GroupByRowProcessor.ResultSupplier> resultSupplierTwo = () -> GroupByRowProcessor.process(
              baseSubtotalQuery,
              subtotalQuery,
              resultSupplierOneFinal.results(subTotalDimensionSpec),
              configSupplier.get(),
              processingConfig,
              resource,
              spillMapper,
              processingConfig.getTmpDir(),
              processingConfig.intermediateComputeSizeBytes()
          );

          subtotalsResults.add(
              processSubtotalsResultAndOptionallyClose(resultSupplierTwo, subTotalDimensionSpec, subtotalQuery, true)
          );
        }
      }

      return Sequences.withBaggage(
          query.postProcess(Sequences.concat(subtotalsResults)),
          resultSupplierOne // this will close resources allocated by resultSupplierOne after the sequence is read
      );
    }
    catch (Throwable e) {
      throw CloseableUtils.closeAndWrapInCatch(e, resultSupplierOne);
    }
  }

  private Sequence<ResultRow> processSubtotalsResultAndOptionallyClose(
      Supplier<GroupByRowProcessor.ResultSupplier> baseResultsSupplier,
      List<DimensionSpec> dimsToInclude,
      GroupByQuery subtotalQuery,
      boolean closeOnSequenceRead
  )
  {
    // This closes the ResultSupplier in case of any exception here, or arranges for it to be closed
    // on sequence read if closeOnSequenceRead is true.
    try {
      Supplier<GroupByRowProcessor.ResultSupplier> memoizedSupplier = Suppliers.memoize(baseResultsSupplier);
      return mergeResults(
          (queryPlus, responseContext) ->
              new LazySequence<>(
                  () -> Sequences.withBaggage(
                      memoizedSupplier.get().results(dimsToInclude),
                      closeOnSequenceRead
                      ? () -> CloseableUtils.closeAndWrapExceptions(memoizedSupplier.get())
                      : () -> {}
                  )
              ),
          subtotalQuery,
          ResponseContext.createEmpty()
      );
    }
    catch (Throwable e) {
      throw CloseableUtils.closeAndWrapInCatch(e, baseResultsSupplier.get());
    }
  }

  private void moveOrReplicateTimestampInRow(
      GroupByQuery query,
      int timestampResultFieldIndexInOriginalDimensions,
      ResultRow before,
      ResultRow after
  )
  {
    // d1 is the __time dimension.
    // When query.granularity=all:  convert [__time, d0] to [d0, d1] (actually, [d0, __time]).
    // When query.granularity!=all: convert [__time, d0] to [__time, d0, d1] (actually, [__time, d0, __time]).
    // Overall: insert the removed d1 back at the position it was removed from, and drop the leading __time if
    // granularity=all.
    Object theTimestamp = before.get(0);
    int expectedDimensionStartInAfterRow = 0;
    if (query.getResultRowHasTimestamp()) {
      expectedDimensionStartInAfterRow = 1;
      after.set(0, theTimestamp);
    }
    int timestampResultFieldIndexInAfterRow = timestampResultFieldIndexInOriginalDimensions + expectedDimensionStartInAfterRow;
    for (int i = expectedDimensionStartInAfterRow; i < timestampResultFieldIndexInAfterRow; i++) {
      // Index 0 in the before row is the timestamp, so plus 1 is the start of the dimensions in the before row
      after.set(i, before.get(i + 1));
    }
    after.set(timestampResultFieldIndexInAfterRow, theTimestamp);
    for (int i = timestampResultFieldIndexInAfterRow + 1; i < before.length() + expectedDimensionStartInAfterRow; i++) {
      after.set(i, before.get(i - expectedDimensionStartInAfterRow));
    }
  }

  private Set<String> getAggregatorAndPostAggregatorNames(GroupByQuery query)
  {
    Set<String> aggsAndPostAggs = new HashSet<>();
    if (query.getAggregatorSpecs() != null) {
      for (AggregatorFactory af : query.getAggregatorSpecs()) {
        aggsAndPostAggs.add(af.getName());
      }
    }

    if (query.getPostAggregatorSpecs() != null) {
      for (PostAggregator pa : query.getPostAggregatorSpecs()) {
        aggsAndPostAggs.add(pa.getName());
      }
    }

    return aggsAndPostAggs;
  }

  /**
   * Wraps the sequence so that a summary row can be emitted for this query if the input turns out to be empty.
   */
  public static Sequence<ResultRow> wrapSummaryRowIfNeeded(GroupByQuery query, Sequence<ResultRow> process)
  {
    if (!summaryRowPreconditions(query)) {
      return process;
    }

    final AtomicBoolean t = new AtomicBoolean();

    return Sequences.concat(
        Sequences.map(process, ent -> {
          t.set(true);
          return ent;
        }),
        Sequences.simple(() -> {
          if (t.get()) {
            return Collections.emptyIterator();
          }
          return summaryRowIterator(query);
        })
    );
  }

  private static boolean summaryRowPreconditions(GroupByQuery query)
  {
    LimitSpec limit = query.getLimitSpec();
    if (limit instanceof DefaultLimitSpec) {
      DefaultLimitSpec limitSpec = (DefaultLimitSpec) limit;
      if (limitSpec.getLimit() == 0 || limitSpec.getOffset() > 0) {
        return false;
      }
    }
    if (!query.getDimensions().isEmpty()) {
      return false;
    }
    if (query.getGranularity().isFinerThan(Granularities.ALL)) {
      return false;
    }
    return true;
  }

  private static Iterator<ResultRow> summaryRowIterator(GroupByQuery q)
  {
    List<AggregatorFactory> aggSpec = q.getAggregatorSpecs();
    ResultRow resultRow = ResultRow.create(q.getResultRowSizeWithPostAggregators());
    for (int i = 0; i < aggSpec.size(); i++) {
      resultRow.set(i, aggSpec.get(i).factorize(new AllNullColumnSelectorFactory()).get());
    }

    Map<String, Object> map = resultRow.toMap(q);
    for (int i = 0; i < q.getPostAggregatorSpecs().size(); i++) {
      final PostAggregator postAggregator = q.getPostAggregatorSpecs().get(i);
      final Object value = postAggregator.compute(map);

      resultRow.set(q.getResultRowPostAggregatorStart() + i, value);
      map.put(postAggregator.getName(), value);
    }

    return Collections.singleton(resultRow).iterator();
  }
}




