// org.apache.drill.exec.planner.fragment.QueueQueryParallelizer Maven / Gradle / Ivy
//
// Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.planner.fragment;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.drill.common.util.function.CheckedConsumer;
import org.apache.drill.exec.ops.QueryContext;
import org.apache.drill.exec.physical.PhysicalOperatorSetupException;
import org.apache.drill.exec.physical.base.PhysicalOperator;
import org.apache.drill.exec.planner.cost.NodeResource;
import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;

import java.util.Map;
import java.util.HashMap;
import java.util.Collection;
import java.util.Set;
import java.util.List;
import java.util.ArrayList;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

/**
 * Parallelizer specialized for managing resources for a query based on Queues. This parallelizer
 * does not deal with increase/decrease of the parallelization of a query plan based on the current
 * cluster state. However, the memory assignment for each operator, minor fragment and major
 * fragment is based on the cluster state and provided queue configuration.
 */
public class QueueQueryParallelizer extends SimpleParallelizer {
  private final boolean planHasMemory;
  private final QueryContext queryContext;
  private final Map> operators;

  public QueueQueryParallelizer(boolean memoryPlanning, QueryContext queryContext) {
    super(queryContext);
    this.planHasMemory = memoryPlanning;
    this.queryContext = queryContext;
    this.operators = new HashMap<>();
  }

  // return the memory computed for a physical operator on a drillbitendpoint.
  public BiFunction getMemory() {
    return (endpoint, operator) -> {
      if (!planHasMemory) {
        return operators.get(endpoint).get(operator);
      }
      else {
        return operator.getMaxAllocation();
      }
    };
  }

  /**
   * Function called by the SimpleParallelizer to adjust the memory post parallelization.
   * The overall logic is to traverse the fragment tree and call the MemoryCalculator on
   * each major fragment. Once the memory is computed, resource requirement are accumulated
   * per drillbit.
   *
   * The total resource requirements are used to select a queue. If the selected queue's
   * resource limit is more/less than the query's requirement than the memory will be re-adjusted.
   *
   * @param planningSet context of the fragments.
   * @param roots root fragments.
   * @param activeEndpoints currently active endpoints.
   * @throws PhysicalOperatorSetupException
   */
  public void adjustMemory(PlanningSet planningSet, Set roots,
                           Collection activeEndpoints) throws PhysicalOperatorSetupException {

    if (planHasMemory) {
      return;
    }
    // total node resources for the query plan maintained per drillbit.
    final Map totalNodeResources =
            activeEndpoints.stream().collect(Collectors.toMap(x ->x,
                                                              x -> NodeResource.create()));

    // list of the physical operators and their memory requirements per drillbit.
    final Map>> operators =
            activeEndpoints.stream().collect(Collectors.toMap(x -> x,
                                                              x -> new ArrayList<>()));

    for (Wrapper wrapper : roots) {
      traverse(wrapper, CheckedConsumer.throwingConsumerWrapper((Wrapper fragment) -> {
        MemoryCalculator calculator = new MemoryCalculator(planningSet, queryContext);
        fragment.getNode().getRoot().accept(calculator, fragment);
        NodeResource.merge(totalNodeResources, fragment.getResourceMap());
        operators.entrySet()
                  .stream()
                  .forEach((entry) -> entry.getValue()
                                           .addAll(calculator.getBufferedOperators(entry.getKey())));
      }));
    }
    //queryrm.selectQueue( pass the max node Resource) returns queue configuration.
    Map>> memoryAdjustedOperators = ensureOperatorMemoryWithinLimits(operators, totalNodeResources, 10);
    memoryAdjustedOperators.entrySet().stream().forEach((x) -> {
      Map memoryPerOperator = x.getValue().stream()
                                                                  .collect(Collectors.toMap(operatorLongPair -> operatorLongPair.getLeft(),
                                                                                            operatorLongPair -> operatorLongPair.getRight(),
                                                                                            (mem_1, mem_2) -> (mem_1 + mem_2)));
      this.operators.put(x.getKey(), memoryPerOperator);
    });
  }


  /**
   * Helper method to adjust the memory for the buffered operators.
   * @param memoryPerOperator list of physical operators per drillbit
   * @param nodeResourceMap resources per drillbit.
   * @param nodeLimit permissible node limit.
   * @return list of operators which contain adjusted memory limits.
   */
  private Map>>
          ensureOperatorMemoryWithinLimits(Map>> memoryPerOperator,
                                           Map nodeResourceMap, int nodeLimit) {
    // Get the physical operators which are above the node memory limit.
    Map>> onlyMemoryAboveLimitOperators = new HashMap<>();
    memoryPerOperator.entrySet().stream().forEach((entry) -> {
      onlyMemoryAboveLimitOperators.putIfAbsent(entry.getKey(), new ArrayList<>());
      if (nodeResourceMap.get(entry.getKey()).getMemory() > nodeLimit) {
        onlyMemoryAboveLimitOperators.get(entry.getKey()).addAll(entry.getValue());
      }
    });


    // Compute the total memory required by the physical operators on the drillbits which are above node limit.
    // Then use the total memory to adjust the memory requirement based on the permissible node limit.
    Map>> memoryAdjustedDrillbits = new HashMap<>();
    onlyMemoryAboveLimitOperators.entrySet().stream().forEach(
      entry -> {
        Long totalMemory = entry.getValue().stream().mapToLong(Pair::getValue).sum();
        List> adjustedMemory = entry.getValue().stream().map(operatorMemory -> {
          // formula to adjust the memory is (optimalMemory / totalMemory(this is for all the operators)) * permissible_node_limit.
          return Pair.of(operatorMemory.getKey(), (long) Math.ceil(operatorMemory.getValue()/totalMemory * nodeLimit));
        }).collect(Collectors.toList());
        memoryAdjustedDrillbits.put(entry.getKey(), adjustedMemory);
      }
    );

    // Get all the operations on drillbits which were adjusted for memory and merge them with operators which are not
    // adjusted for memory.
    Map>> allDrillbits = new HashMap<>();
    memoryPerOperator.entrySet().stream().filter((entry) -> !memoryAdjustedDrillbits.containsKey(entry.getKey())).forEach(
      operatorMemory -> {
        allDrillbits.put(operatorMemory.getKey(), operatorMemory.getValue());
      }
    );

    memoryAdjustedDrillbits.entrySet().stream().forEach(
      operatorMemory -> {
        allDrillbits.put(operatorMemory.getKey(), operatorMemory.getValue());
      }
    );

    // At this point allDrillbits contains the operators on all drillbits. The memory also is adjusted based on the nodeLimit and
    // the ratio of their requirements.
    return allDrillbits;
  }
}