org.apache.hadoop.hive.ql.exec.TopNHash Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.util.Arrays;
import java.util.Comparator;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.llap.LlapDaemonInfo;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableComparator;
import com.facebook.presto.hive.$internal.org.slf4j.Logger;
import com.facebook.presto.hive.$internal.org.slf4j.LoggerFactory;

import com.facebook.presto.hive.$internal.com.google.common.collect.MinMaxPriorityQueue;

/**
 * Stores binary key/value in sorted manner to get top-n key/value
 * TODO: rename to TopNHeap?
 */
public class TopNHash {
  private static final Logger LOG = LoggerFactory.getLogger(TopNHash.class);

  /**
   * For interaction between operator and top-n hash.
   * Currently only used to forward key/values stored in hash.
   */
  public static interface BinaryCollector {
    public void collect(byte[] key, byte[] value, int hash) throws IOException;
  }

  public static final int FORWARD = -1; // Forward the row to reducer as is.
  public static final int EXCLUDE = -2; // Discard the row.
  private static final int MAY_FORWARD = -3; // Vectorized - may forward the row, not sure yet.

  protected BinaryCollector collector;
  protected int topN;

  protected long threshold;   // max heap size
  protected long usage;

  // binary keys, values and hashCodes of rows, lined up by index
  private byte[][] keys;
  private byte[][] values;
  private int[] hashes;
  private int[] distKeyLengths;
  private IndexStore indexes; // The heap over the keys, storing indexes in the array.

  private int evicted; // recently evicted index (used for next key/value)
  private int excluded; // count of excluded rows from previous flush

  // temporary single-batch context used for vectorization
  private int batchNumForwards = 0; // whether current batch has any forwarded keys
  private int[] indexToBatchIndex; // mapping of index (lined up w/keys) to index in the batch
  protected int[] batchIndexToResult; // mapping of index in the batch (linear) to hash result
  protected int batchSize; // Size of the current batch.

  protected boolean isEnabled = false;

  private final Comparator C = new Comparator() {
    public int compare(Integer o1, Integer o2) {
      byte[] key1 = keys[o1];
      byte[] key2 = keys[o2];
      int length1 = distKeyLengths[o1];
      int length2 = distKeyLengths[o2];
      return WritableComparator.compareBytes(key1, 0, length1, key2, 0, length2);
    }
  };

  public void initialize(
    int topN, float memUsage, boolean isMapGroupBy, BinaryCollector collector, final OperatorDesc conf,
    final Configuration hconf) {
    assert topN >= 0 && memUsage > 0;
    assert !this.isEnabled;
    this.isEnabled = false;
    this.topN = topN;
    this.collector = collector;
    if (topN == 0) {
      isEnabled = true;
      return; // topN == 0 will cause a short-circuit, don't need any initialization
    }

    final boolean isTez = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez");
    final boolean isLlap = LlapDaemonInfo.INSTANCE.isLlap();
    final int numExecutors = isLlap ? LlapDaemonInfo.INSTANCE.getNumExecutors() : 1;

    // Used Memory = totalMemory() - freeMemory();
    // Total Free Memory = maxMemory() - Used Memory;
    long totalFreeMemory = Runtime.getRuntime().maxMemory() -
      Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory();

    if (isTez) {
      MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
      // TODO: For LLAP, assumption is off-heap cache.
      final long memoryUsedPerExecutor = (memoryMXBean.getHeapMemoryUsage().getUsed() / numExecutors);
      // this is total free memory available per executor in case of LLAP
      totalFreeMemory = conf.getMaxMemoryAvailable() - memoryUsedPerExecutor;
    }

    // limit * 64 : compensation of arrays for key/value/hashcodes
    this.threshold = (long) (memUsage * totalFreeMemory) - topN * 64L;
    if (threshold < 0) {
      return;
    }
    this.indexes = isMapGroupBy ? new HashForGroup() : new HashForRow();
    this.keys = new byte[topN + 1][];
    this.values = new byte[topN + 1][];
    this.hashes = new int[topN + 1];
    this.distKeyLengths = new int[topN + 1];
    this.evicted = topN;
    this.isEnabled = true;
  }

  /**
   * Try store the non-vectorized key.
   * @param key Serialized key.
   * @return TopNHash.FORWARD if the row should be forwarded;
   *         TopNHash.EXCLUDED if the row should be discarded;
   *         any other number if the row is to be stored; the index should be passed to storeValue.
   */
  public int tryStoreKey(HiveKey key, boolean partColsIsNull) throws HiveException, IOException {
    if (!isEnabled) {
      return FORWARD; // short-circuit quickly - forward all rows
    }
    if (topN == 0) {
      return EXCLUDE; // short-circuit quickly - eat all rows
    }
    int index = insertKeyIntoHeap(key);
    if (index >= 0) {
      usage += key.getLength();
      return index;
    }
    // IndexStore is trying to tell us something.
    switch (index) {
      case FORWARD:  return FORWARD;
      case EXCLUDE: return EXCLUDE; // skip the row.
      default: {
        assert false;
        throw new HiveException("Invalid result trying to store the key: " + index);
      }
    }
  }


  /**
   * Perform basic checks and initialize TopNHash for the new vectorized row batch.
   * @param size batch size
   * @return TopNHash.FORWARD if all rows should be forwarded w/o trying to call TopN;
   *         TopNHash.EXCLUDED if all rows should be discarded w/o trying to call TopN;
   *         any other result means the batch has been started.
   */
  public int startVectorizedBatch(int size) throws IOException, HiveException {
    if (!isEnabled) {
      return FORWARD; // short-circuit quickly - forward all rows
    } else if (topN == 0) {
      return EXCLUDE; // short-circuit quickly - eat all rows
    }
    // Flush here if the memory usage is too high. After that, we have the entire
    // batch already in memory anyway so we will bypass the memory checks.
    if (usage > threshold) {
      int excluded = this.excluded;
      LOG.info("Top-N hash is flushing rows");
      flushInternal();
      if (excluded == 0) {
        LOG.info("Top-N hash has been disabled");
        isEnabled = false;
        return FORWARD; // Hash is ineffective, disable.
      }
    }
    // Started ok; initialize context for new batch.
    batchSize = size;
    if (batchIndexToResult == null || batchIndexToResult.length < batchSize) {
      batchIndexToResult = new int[Math.max(batchSize, VectorizedRowBatch.DEFAULT_SIZE)];
    }
    if (indexToBatchIndex == null) {
      indexToBatchIndex = new int[topN + 1];
    }
    Arrays.fill(indexToBatchIndex, -1);
    batchNumForwards = 0;
    return 0;
  }

  /**
   * Try to put the key from the current vectorized batch into the heap.
   * @param key the key.
   * @param batchIndex The index of the key in the vectorized batch (sequential, not .selected).
   */
  public void tryStoreVectorizedKey(HiveKey key, boolean partColsIsNull, int batchIndex)
      throws HiveException, IOException {
    // Assumption - batchIndex is increasing; startVectorizedBatch was called
    int size = indexes.size();
    int index = size < topN ? size : evicted;
    keys[index] = Arrays.copyOf(key.getBytes(), key.getLength());
    distKeyLengths[index] = key.getDistKeyLength();
    hashes[index] = key.hashCode();
    Integer collisionIndex = indexes.store(index);
    if (null != collisionIndex) {
      /*
       * since there is a collision index will be used for the next value 
       * so have the map point back to original index.
       */
      if ( indexes instanceof HashForGroup ) {
        indexes.store(collisionIndex);
      }
      // forward conditional on the survival of the corresponding key currently in indexes.
      ++batchNumForwards;
      batchIndexToResult[batchIndex] = MAY_FORWARD - collisionIndex;
      return;
    }
    indexToBatchIndex[index] = batchIndex;
    batchIndexToResult[batchIndex] = index;
    if (size != topN) return;
    evicted = indexes.removeBiggest();  // remove the biggest key
    if (index == evicted) {
      excluded++;
      batchIndexToResult[batchIndex] = EXCLUDE;
      indexToBatchIndex[index] = -1;
      return; // input key is bigger than any of keys in hash
    }
    removed(evicted);
    int evictedBatchIndex = indexToBatchIndex[evicted];
    if (evictedBatchIndex >= 0) {
      // reset the result for the evicted index
      batchIndexToResult[evictedBatchIndex] = EXCLUDE;
      indexToBatchIndex[evicted] = -1;
    }
    // Evict all results grouped with this index; it cannot be any key further in the batch.
    // If we evict a key from this batch, the keys grouped with it cannot be earlier that that key.
    // If we evict a key that is not from this batch, initial i = (-1) + 1 = 0, as intended.
    int evictedForward = (MAY_FORWARD - evicted);
    for (int i = evictedBatchIndex + 1; i < batchIndex && (batchNumForwards > 0); ++i) {
      if (batchIndexToResult[i] == evictedForward) {
        batchIndexToResult[i] = EXCLUDE;
        --batchNumForwards;
      }
    }
  }

  /**
   * Get vectorized batch result for particular index.
   * @param batchIndex index of the key in the batch.
   * @return the result, same as from {@link #tryStoreKey(HiveKey)}
   */
  public int getVectorizedBatchResult(int batchIndex) {
    int result = batchIndexToResult[batchIndex];
    return (result <= MAY_FORWARD) ? FORWARD : result;
  }

  /**
   * After vectorized batch is processed, can return the key that caused a particular row
   * to be forwarded. Because the row could only be marked to forward because it has
   * the same key with some row already in the heap (for GBY), we can use that key from the
   * heap to emit the forwarded row.
   * @param batchIndex index of the key in the batch.
   * @return The key corresponding to the index.
   */
  public HiveKey getVectorizedKeyToForward(int batchIndex) {
    int index = MAY_FORWARD - batchIndexToResult[batchIndex];
    HiveKey hk = new HiveKey();
    hk.set(keys[index], 0, keys[index].length);
    hk.setHashCode(hashes[index]);
    hk.setDistKeyLength(distKeyLengths[index]);
    return hk;
  }

  /**
   * After vectorized batch is processed, can return distribution keys length of a key.
   * @param batchIndex index of the key in the batch.
   * @return The distribution length corresponding to the key.
   */
  public int getVectorizedKeyDistLength(int batchIndex) {
    return distKeyLengths[batchIndexToResult[batchIndex]];
  }

  /**
   * After vectorized batch is processed, can return hashCode of a key.
   * @param batchIndex index of the key in the batch.
   * @return The hashCode corresponding to the key.
   */
  public int getVectorizedKeyHashCode(int batchIndex) {
    return hashes[batchIndexToResult[batchIndex]];
  }
  
  /**
   * Stores the value for the key in the heap.
   * @param index The index, either from tryStoreKey or from tryStoreVectorizedKey result.
   * @param hasCode hashCode of key, used by ptfTopNHash.
   * @param value The value to store.
   * @param keyHash The key hash to store.
   * @param vectorized Whether the result is coming from a vectorized batch.
   */
  public void storeValue(int index, int hashCode, BytesWritable value, boolean vectorized) {
    values[index] = Arrays.copyOf(value.getBytes(), value.getLength());
    // Vectorized doesn't adjust usage for the keys while processing the batch
    usage += values[index].length + (vectorized ? keys[index].length : 0);
  }

  /**
   * Flushes all the rows cached in the heap.
   */
  public void flush() throws HiveException {
    if (!isEnabled || (topN == 0)) return;
    try {
      flushInternal();
    } catch (IOException ex) {
      throw new HiveException(ex);
    }
  }

  /**
   * returns index for key/value/hashcode if it's acceptable.
   * -1, -2, -3, -4 can be returned for other actions.
   * 
   * -1 for FORWARD   : should be forwarded to output collector (for GBY)
   * -2 for EXCLUDED  : not in top-k. ignore it
   */
  private int insertKeyIntoHeap(HiveKey key) throws IOException, HiveException {
    if (usage > threshold) {
      flushInternal();
      if (excluded == 0) {
        LOG.info("Top-N hash is disabled");
        isEnabled = false;
      }
      // we can now retry adding key/value into hash, which is flushed.
      // but for simplicity, just forward them
      return FORWARD;
    }
    int size = indexes.size();
    int index = size < topN ? size : evicted;
    keys[index] = Arrays.copyOf(key.getBytes(), key.getLength());
    distKeyLengths[index] = key.getDistKeyLength();
    hashes[index] = key.hashCode();
    if (null != indexes.store(index)) {
      // it's only for GBY which should forward all values associated with the key in the range
      // of limit. new value should be attatched with the key but in current implementation,
      // only one values is allowed. with map-aggreagtion which is true by default,
      // this is not common case, so just forward new key/value and forget that (todo)
      return FORWARD;
    }
    if (size == topN) {
      evicted = indexes.removeBiggest();  // remove the biggest key
      if (index == evicted) {
        excluded++;
        return EXCLUDE;          // input key is bigger than any of keys in hash
      }
      removed(evicted);
    }
    return index;
  }

  // key/value of the index is removed. retrieve memory usage
  private void removed(int index) {
    usage -= keys[index].length;
    keys[index] = null;
    if (values[index] != null) {
      usage -= values[index].length;
      values[index] = null;
    }
    hashes[index] = -1;
    distKeyLengths[index] = -1;
  }

  private void flushInternal() throws IOException, HiveException {
    for (int index : indexes.indexes()) {
      if (index != evicted && values[index] != null) {
        collector.collect(keys[index], values[index], hashes[index]);
        usage -= values[index].length;
        values[index] = null;
        hashes[index] = -1;
      }
    }
    excluded = 0;
  }
  
  private interface IndexStore {
    int size();
    /**
     * @return the index which caused the item to be rejected; or null if accepted
     */
    Integer store(int index);
    int removeBiggest();
    Iterable indexes();
  }

  /**
   * for order by, same keys are counted (For 1-2-2-3-4, limit 3 is 1-2-2)
   * MinMaxPriorityQueue is used because it alows duplication and fast access to biggest one
   */
  private class HashForRow implements IndexStore {
    private final MinMaxPriorityQueue indexes = MinMaxPriorityQueue.orderedBy(C).create();

    public int size() {
      return indexes.size();
    }

    // returns null always
    public Integer store(int index) {
      boolean result = indexes.add(index);
      assert result;
      return null;
    }

    public int removeBiggest() {
      return indexes.removeLast();
    }

    public Iterable indexes() {
      Integer[] array = indexes.toArray(new Integer[indexes.size()]);
      Arrays.sort(array, 0, array.length, C);
      return Arrays.asList(array);
    }
  }

  /**
   * for group by, same keys are not counted (For 1-2-2-3-4, limit 3 is 1-2-(2)-3)
   * simple TreeMap is used because group by does not need keep duplicated keys
   */
  private class HashForGroup implements IndexStore {
    // TreeSet anyway uses TreeMap; so use plain TreeMap to be able to get value in collisions.
    private final TreeMap indexes = new TreeMap(C);

    public int size() {
      return indexes.size();
    }

    // returns false if index already exists in map
    public Integer store(int index) {
      return indexes.put(index, index);
    }

    public int removeBiggest() {
      Integer last = indexes.lastKey();
      indexes.remove(last);
      return last;
    }

    public Iterable indexes() {
      return indexes.keySet();
    }
  }
}