All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.druid.query.groupby.epinephelinae.ParallelCombiner Maven / Gradle / Ivy

There is a newer version: 31.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.query.groupby.epinephelinae;

import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.collect.Iterables;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import it.unimi.dsi.fastutil.objects.Object2IntArrayMap;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import org.apache.druid.collections.ReferenceCountingResourceHolder;
import org.apache.druid.java.util.common.CloseableIterators;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.io.Closer;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.query.AbstractPrioritizedCallable;
import org.apache.druid.query.QueryInterruptedException;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.groupby.epinephelinae.Grouper.Entry;
import org.apache.druid.query.groupby.epinephelinae.Grouper.KeySerdeFactory;
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import org.apache.druid.segment.ColumnSelectorFactory;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.ObjectBasedColumnSelector;
import org.apache.druid.segment.column.ColumnCapabilities;

import javax.annotation.Nullable;
import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

/**
 * ParallelCombiner builds a combining tree which asynchronously aggregates input entries.  Each node of the combining
 * tree is a combining task executed in parallel which aggregates inputs from the child nodes.
 */
public class ParallelCombiner
{
  // The combining tree created by this class can have two different degrees for intermediate nodes.
  // The "leaf combine degree (LCD)" is the number of leaf nodes combined together, while the "intermediate combine
  // degree (ICD)" is the number of non-leaf nodes combined together. The below picture shows an example where LCD = 2
  // and ICD = 4.
  //
  //        o         <- non-leaf node
  //     / / \ \      <- ICD = 4
  //  o   o   o   o   <- non-leaf nodes
  // / \ / \ / \ / \  <- LCD = 2
  // o o o o o o o o  <- leaf nodes
  //
  // The reason why we need two different degrees is to optimize the number of non-leaf nodes which are run by
  // different threads at the same time. Note that the leaf nodes are sorted iterators of SpillingGroupers which
  // generally returns multiple rows of the same grouping key which in turn should be combined, while the non-leaf nodes
  // are iterators of StreamingMergeSortedGroupers and always returns a single row per grouping key. Generally, the
  // performance will get better as LCD becomes low while ICD is some value larger than LCD because the amount of work
  // each thread has to do can be properly tuned. The optimal values for LCD and ICD may vary with query and data. Here,
  // we use a simple heuristic to avoid complex optimization. That is, ICD is fixed as a user-configurable value and the
  // minimum LCD satisfying the memory restriction is searched. See findLeafCombineDegreeAndNumBuffers() for more
  // details.
  private static final int MINIMUM_LEAF_COMBINE_DEGREE = 2;

  private final ReferenceCountingResourceHolder combineBufferHolder;
  private final AggregatorFactory[] combiningFactories;
  private final KeySerdeFactory combineKeySerdeFactory;
  private final ListeningExecutorService executor;
  private final Comparator> keyObjComparator;
  private final int concurrencyHint;
  private final int priority;
  private final long queryTimeoutAt;

  // The default value is 8 which comes from an experiment. A non-leaf node will combine up to intermediateCombineDegree
  // rows for the same grouping key.
  private final int intermediateCombineDegree;

  public ParallelCombiner(
      ReferenceCountingResourceHolder combineBufferHolder,
      AggregatorFactory[] combiningFactories,
      KeySerdeFactory combineKeySerdeFactory,
      ListeningExecutorService executor,
      boolean sortHasNonGroupingFields,
      int concurrencyHint,
      int priority,
      long queryTimeoutAt,
      int intermediateCombineDegree
  )
  {
    this.combineBufferHolder = combineBufferHolder;
    this.combiningFactories = combiningFactories;
    this.combineKeySerdeFactory = combineKeySerdeFactory;
    this.executor = executor;
    this.keyObjComparator = combineKeySerdeFactory.objectComparator(sortHasNonGroupingFields);
    this.concurrencyHint = concurrencyHint;
    this.priority = priority;
    this.intermediateCombineDegree = intermediateCombineDegree;

    this.queryTimeoutAt = queryTimeoutAt;
  }

  /**
   * Build a combining tree for the input iterators which combine input entries asynchronously.  Each node in the tree
   * is a combining task which iterates through child iterators, aggregates the inputs from those iterators, and returns
   * an iterator for the result of aggregation.
   * 

* This method is called when data is spilled and thus streaming combine is preferred to avoid too many disk accesses. * * @return an iterator of the root grouper of the combining tree */ public CloseableIterator> combine( List>> sortedIterators, List mergedDictionary ) { // CombineBuffer is initialized when this method is called and closed after the result iterator is done final Closer closer = Closer.create(); try { final ByteBuffer combineBuffer = combineBufferHolder.get(); final int minimumRequiredBufferCapacity = StreamingMergeSortedGrouper.requiredBufferCapacity( combineKeySerdeFactory.factorizeWithDictionary(mergedDictionary), combiningFactories ); // We want to maximize the parallelism while the size of buffer slice is greater than the minimum buffer size // required by StreamingMergeSortedGrouper. Here, we find the leafCombineDegree of the cominbing tree and the // required number of buffers maximizing the parallelism. final Pair degreeAndNumBuffers = findLeafCombineDegreeAndNumBuffers( combineBuffer, minimumRequiredBufferCapacity, concurrencyHint, sortedIterators.size() ); final int leafCombineDegree = degreeAndNumBuffers.lhs; final int numBuffers = degreeAndNumBuffers.rhs; final int sliceSize = combineBuffer.capacity() / numBuffers; final Supplier bufferSupplier = createCombineBufferSupplier(combineBuffer, numBuffers, sliceSize); final Pair>>, List> combineIteratorAndFutures = buildCombineTree( sortedIterators, bufferSupplier, combiningFactories, leafCombineDegree, mergedDictionary ); final CloseableIterator> combineIterator = Iterables.getOnlyElement(combineIteratorAndFutures.lhs); final List combineFutures = combineIteratorAndFutures.rhs; closer.register(() -> checkCombineFutures(combineFutures)); return CloseableIterators.wrap(combineIterator, closer); } catch (Throwable t) { try { closer.close(); } catch (Throwable t2) { t.addSuppressed(t2); } throw t; } } private static void checkCombineFutures(List combineFutures) { for (Future future : combineFutures) { try { if (!future.isDone()) { // Cancel futures if close() for the iterator is called early due to some reason (e.g., test failure) future.cancel(true); } else { future.get(); } } catch (InterruptedException | CancellationException e) { throw new QueryInterruptedException(e); } catch (ExecutionException e) { throw new RuntimeException(e); } } } private static Supplier createCombineBufferSupplier( ByteBuffer combineBuffer, int numBuffers, int sliceSize ) { return new Supplier() { private int i = 0; @Override public ByteBuffer get() { if (i < numBuffers) { return Groupers.getSlice(combineBuffer, sliceSize, i++); } else { throw new ISE("Requested number[%d] of buffer slices exceeds the planned one[%d]", i++, numBuffers); } } }; } /** * Find a minimum size of the buffer slice and corresponding leafCombineDegree and number of slices. Note that each * node in the combining tree is executed by different threads. This method assumes that combining the leaf nodes * requires threads as many as possible, while combining intermediate nodes is not. See the comment on * {@link #MINIMUM_LEAF_COMBINE_DEGREE} for more details. * * @param combineBuffer entire buffer used for combining tree * @param requiredMinimumBufferCapacity minimum buffer capacity for {@link StreamingMergeSortedGrouper} * @param numAvailableThreads number of available threads * @param numLeafNodes number of leaf nodes of combining tree * * @return a pair of leafCombineDegree and number of buffers if found. */ private Pair findLeafCombineDegreeAndNumBuffers( ByteBuffer combineBuffer, int requiredMinimumBufferCapacity, int numAvailableThreads, int numLeafNodes ) { for (int leafCombineDegree = MINIMUM_LEAF_COMBINE_DEGREE; leafCombineDegree <= numLeafNodes; leafCombineDegree++) { final int requiredBufferNum = computeRequiredBufferNum(numLeafNodes, leafCombineDegree); if (requiredBufferNum <= numAvailableThreads) { final int expectedSliceSize = combineBuffer.capacity() / requiredBufferNum; if (expectedSliceSize >= requiredMinimumBufferCapacity) { return Pair.of(leafCombineDegree, requiredBufferNum); } } } throw new ISE( "Cannot find a proper leaf combine degree for the combining tree. " + "Each node of the combining tree requires a buffer of [%d] bytes. " + "Try increasing druid.processing.buffer.sizeBytes (currently [%d] bytes) for larger buffer or " + "druid.query.groupBy.intermediateCombineDegree for a smaller tree", requiredMinimumBufferCapacity, combineBuffer.capacity() ); } /** * Recursively compute the number of required buffers for a combining tree in a bottom-up manner. Since each node of * the combining tree represents a combining task and each combining task requires one buffer, the number of required * buffers is the number of nodes of the combining tree. * * @param numChildNodes number of child nodes * @param combineDegree combine degree for the current level * * @return minimum number of buffers required for combining tree * * @see #buildCombineTree */ private int computeRequiredBufferNum(int numChildNodes, int combineDegree) { // numChildrenForLastNode used to determine that the last node is needed for the current level. // Please see buildCombineTree() for more details. final int numChildrenForLastNode = numChildNodes % combineDegree; final int numCurLevelNodes = numChildNodes / combineDegree + (numChildrenForLastNode > 1 ? 1 : 0); final int numChildOfParentNodes = numCurLevelNodes + (numChildrenForLastNode == 1 ? 1 : 0); if (numChildOfParentNodes == 1) { return numCurLevelNodes; } else { return numCurLevelNodes + computeRequiredBufferNum(numChildOfParentNodes, intermediateCombineDegree); } } /** * Recursively build a combining tree in a bottom-up manner. Each node of the tree is a task that combines input * iterators asynchronously. * * @param childIterators all iterators of the child level * @param bufferSupplier combining buffer supplier * @param combiningFactories array of combining aggregator factories * @param combineDegree combining degree for the current level * @param dictionary merged dictionary * * @return a pair of a list of iterators of the current level in the combining tree and a list of futures of all * executed combining tasks */ private Pair>>, List> buildCombineTree( List>> childIterators, Supplier bufferSupplier, AggregatorFactory[] combiningFactories, int combineDegree, List dictionary ) { final int numChildLevelIterators = childIterators.size(); final List>> childIteratorsOfNextLevel = new ArrayList<>(); final List combineFutures = new ArrayList<>(); // The below algorithm creates the combining nodes of the current level. It first checks that the number of children // to be combined together is 1. If it is, the intermediate combining node for that child is not needed. Instead, it // can be directly connected to a node of the parent level. Here is an example of generated tree when // numLeafNodes = 6 and leafCombineDegree = intermediateCombineDegree = 2. See the description of // MINIMUM_LEAF_COMBINE_DEGREE for more details about leafCombineDegree and intermediateCombineDegree. // // o // / \ // o \ // / \ \ // o o o // / \ / \ / \ // o o o o o o // // We can expect that the aggregates can be combined as early as possible because the tree is built in a bottom-up // manner. for (int i = 0; i < numChildLevelIterators; i += combineDegree) { if (i < numChildLevelIterators - 1) { final List>> subIterators = childIterators.subList( i, Math.min(i + combineDegree, numChildLevelIterators) ); final Pair>, Future> iteratorAndFuture = runCombiner( subIterators, bufferSupplier.get(), combiningFactories, dictionary ); childIteratorsOfNextLevel.add(iteratorAndFuture.lhs); combineFutures.add(iteratorAndFuture.rhs); } else { // If there remains one child, it can be directly connected to a node of the parent level. childIteratorsOfNextLevel.add(childIterators.get(i)); } } if (childIteratorsOfNextLevel.size() == 1) { // This is the root return Pair.of(childIteratorsOfNextLevel, combineFutures); } else { // Build the parent level iterators final Pair>>, List> parentIteratorsAndFutures = buildCombineTree( childIteratorsOfNextLevel, bufferSupplier, combiningFactories, intermediateCombineDegree, dictionary ); combineFutures.addAll(parentIteratorsAndFutures.rhs); return Pair.of(parentIteratorsAndFutures.lhs, combineFutures); } } private Pair>, Future> runCombiner( List>> iterators, ByteBuffer combineBuffer, AggregatorFactory[] combiningFactories, List dictionary ) { final SettableColumnSelectorFactory settableColumnSelectorFactory = new SettableColumnSelectorFactory(combiningFactories); final StreamingMergeSortedGrouper grouper = new StreamingMergeSortedGrouper<>( Suppliers.ofInstance(combineBuffer), combineKeySerdeFactory.factorizeWithDictionary(dictionary), settableColumnSelectorFactory, combiningFactories, queryTimeoutAt ); grouper.init(); // init() must be called before iterator(), so cannot be called inside the below callable. final ListenableFuture future = executor.submit( new AbstractPrioritizedCallable(priority) { @Override public Void call() { try ( CloseableIterator> mergedIterator = CloseableIterators.mergeSorted( iterators, keyObjComparator ); // This variable is used to close releaser automatically. @SuppressWarnings("unused") final Closeable releaser = combineBufferHolder.increment() ) { while (mergedIterator.hasNext()) { final Entry next = mergedIterator.next(); settableColumnSelectorFactory.set(next.getValues()); grouper.aggregate(next.getKey()); // grouper always returns ok or throws an exception settableColumnSelectorFactory.set(null); } } catch (IOException e) { throw new RuntimeException(e); } grouper.finish(); return null; } } ); return new Pair<>(grouper.iterator(), future); } private static class SettableColumnSelectorFactory implements ColumnSelectorFactory { private static final int UNKNOWN_COLUMN_INDEX = -1; private final Object2IntMap columnIndexMap; @Nullable private Object[] values; SettableColumnSelectorFactory(AggregatorFactory[] aggregatorFactories) { columnIndexMap = new Object2IntArrayMap<>(aggregatorFactories.length); columnIndexMap.defaultReturnValue(UNKNOWN_COLUMN_INDEX); for (int i = 0; i < aggregatorFactories.length; i++) { columnIndexMap.put(aggregatorFactories[i].getName(), i); } } public void set(@Nullable Object[] values) { this.values = values; } private int checkAndGetColumnIndex(String columnName) { final int columnIndex = columnIndexMap.getInt(columnName); Preconditions.checkState( columnIndex != UNKNOWN_COLUMN_INDEX, "Cannot find a proper column index for column[%s]", columnName ); return columnIndex; } @Override public DimensionSelector makeDimensionSelector(DimensionSpec dimensionSpec) { throw new UnsupportedOperationException(); } @Override public ColumnValueSelector makeColumnValueSelector(String columnName) { return new ObjectBasedColumnSelector() { @Override public void inspectRuntimeShape(RuntimeShapeInspector inspector) { // do nothing } @Override public Class classOfObject() { return Object.class; } @Override public Object getObject() { return values[checkAndGetColumnIndex(columnName)]; } }; } @Override public ColumnCapabilities getColumnCapabilities(String column) { return null; } } }