org.apache.hadoop.hive.ql.plan.MapredWork

Hive is a data warehouse infrastructure built on top of Hadoop. See http://wiki.apache.org/hadoop/Hive

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.plan;

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol;
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SplitSample;

/**
 * MapredWork.
 *
 */
@Explain(displayName = "Map Reduce")
public class MapredWork extends AbstractOperatorDesc {
  private static final long serialVersionUID = 1L;
  private String command;
  // map side work
  // use LinkedHashMap to make sure the iteration order is
  // deterministic, to ease testing
  private LinkedHashMap<String, ArrayList<String>> pathToAliases;

  private LinkedHashMap<String, PartitionDesc> pathToPartitionInfo;

  private LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork;

  private LinkedHashMap<String, PartitionDesc> aliasToPartnInfo;

  private HashMap<String, SplitSample> nameToSplitSample;

  // map<->reduce interface
  // schema of the map-reduce 'key' object - this is homogeneous
  private TableDesc keyDesc;

  // schema of the map-reduce 'val' object - this is heterogeneous
  private List<TableDesc> tagToValueDesc;

  private Operator<?> reducer;

  private Integer numReduceTasks;
  private Integer numMapTasks;
  private Long maxSplitSize;
  private Long minSplitSize;
  private Long minSplitSizePerNode;
  private Long minSplitSizePerRack;

  private boolean needsTagging;
  private boolean hadoopSupportsSplittable;

  private MapredLocalWork mapLocalWork;
  private String inputformat;
  private String indexIntermediateFile;
  private boolean gatheringStats;

  private String tmpHDFSFileURI;

  private LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap;

  private QBJoinTree joinTree;

  private boolean mapperCannotSpanPartns;

  // used to indicate the input is sorted, and so a BinarySearchRecordReader should be used
  private boolean inputFormatSorted = false;

  private transient boolean useBucketizedHiveInputFormat;

  // if true, this is the map-reduce task that writes the final data,
  // ignoring the optional merge task
  private boolean finalMapRed = false;

  // If this map reduce task has a FileSinkOperator, and bucketing/sorting metadata can be
  // inferred about the data being written by that operator, these are mappings from the directory
  // that operator writes into to the bucket/sort columns for that data.
  private final Map<String, List<BucketCol>> bucketedColsByDirectory =
      new HashMap<String, List<BucketCol>>();
  private final Map<String, List<SortCol>> sortedColsByDirectory =
      new HashMap<String, List<SortCol>>();

  public MapredWork() {
    aliasToPartnInfo = new LinkedHashMap<String, PartitionDesc>();
  }

  public MapredWork(
      final String command,
      final LinkedHashMap<String, ArrayList<String>> pathToAliases,
      final LinkedHashMap<String, PartitionDesc> pathToPartitionInfo,
      final LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork,
      final TableDesc keyDesc, List<TableDesc> tagToValueDesc,
      final Operator<?> reducer, final Integer numReduceTasks,
      final MapredLocalWork mapLocalWork,
      final boolean hadoopSupportsSplittable) {
    this.command = command;
    this.pathToAliases = pathToAliases;
    this.pathToPartitionInfo = pathToPartitionInfo;
    this.aliasToWork = aliasToWork;
    this.keyDesc = keyDesc;
    this.tagToValueDesc = tagToValueDesc;
    this.reducer = reducer;
    this.numReduceTasks = numReduceTasks;
    this.mapLocalWork = mapLocalWork;
    aliasToPartnInfo = new LinkedHashMap<String, PartitionDesc>();
    this.hadoopSupportsSplittable = hadoopSupportsSplittable;
    maxSplitSize = null;
    minSplitSize = null;
    minSplitSizePerNode = null;
    minSplitSizePerRack = null;
  }

  public String getCommand() {
    return command;
  }

  public void setCommand(final String command) {
    this.command = command;
  }

  @Explain(displayName = "Path -> Alias", normalExplain = false)
  public LinkedHashMap<String, ArrayList<String>> getPathToAliases() {
    return pathToAliases;
  }

  public void setPathToAliases(
      final LinkedHashMap<String, ArrayList<String>> pathToAliases) {
    this.pathToAliases = pathToAliases;
  }

  @Explain(displayName = "Truncated Path -> Alias", normalExplain = false)
  /**
   * This is used to display and verify the output of "Path -> Alias" in the test framework.
   *
   * {@link QTestUtil} masks "Path -> Alias" and makes verification impossible.
   * "Path -> Alias" is therefore kept intact, and this additional display name,
   * whose paths have the warehouse prefix removed, is not masked by
   * {@link QTestUtil} and can still be verified.
   *
   * Note: intermediate directories are still masked.
   *
   * @return a map from the truncated path to its aliases
   */
  public Map<String, ArrayList<String>> getTruncatedPathToAliases() {
    Map<String, ArrayList<String>> trunPathToAliases = new LinkedHashMap<String, ArrayList<String>>();
    Iterator<Entry<String, ArrayList<String>>> itr = this.pathToAliases.entrySet().iterator();
    while (itr.hasNext()) {
      final Entry<String, ArrayList<String>> entry = itr.next();
      String origiKey = entry.getKey();
      String newKey = PlanUtils.removePrefixFromWarehouseConfig(origiKey);
      ArrayList<String> value = entry.getValue();
      trunPathToAliases.put(newKey, value);
    }
    return trunPathToAliases;
  }



  @Explain(displayName = "Path -> Partition", normalExplain = false)
  public LinkedHashMap<String, PartitionDesc> getPathToPartitionInfo() {
    return pathToPartitionInfo;
  }

  public void setPathToPartitionInfo(
      final LinkedHashMap<String, PartitionDesc> pathToPartitionInfo) {
    this.pathToPartitionInfo = pathToPartitionInfo;
  }

  /**
   * @return the aliasToPartnInfo
   */
  public LinkedHashMap<String, PartitionDesc> getAliasToPartnInfo() {
    return aliasToPartnInfo;
  }

  /**
   * @param aliasToPartnInfo
   *          the aliasToPartnInfo to set
   */
  public void setAliasToPartnInfo(
      LinkedHashMap<String, PartitionDesc> aliasToPartnInfo) {
    this.aliasToPartnInfo = aliasToPartnInfo;
  }

  @Explain(displayName = "Alias -> Map Operator Tree")
  public LinkedHashMap<String, Operator<? extends OperatorDesc>> getAliasToWork() {
    return aliasToWork;
  }

  public void setAliasToWork(
      final LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork) {
    this.aliasToWork = aliasToWork;
  }

  /**
   * @return the mapredLocalWork
   */
  @Explain(displayName = "Local Work")
  public MapredLocalWork getMapLocalWork() {
    return mapLocalWork;
  }

  /**
   * @param mapLocalWork
   *          the mapredLocalWork to set
   */
  public void setMapLocalWork(final MapredLocalWork mapLocalWork) {
    this.mapLocalWork = mapLocalWork;
  }

  public TableDesc getKeyDesc() {
    return keyDesc;
  }

  /**
   * If the plan has a reducer and correspondingly a reduce-sink, then store the TableDesc pointing
   * to the keySerializeInfo of the ReduceSink.
   *
   * @param keyDesc the key schema (the ReduceSink's keySerializeInfo)
   */
  public void setKeyDesc(final TableDesc keyDesc) {
    this.keyDesc = keyDesc;
  }

  public List<TableDesc> getTagToValueDesc() {
    return tagToValueDesc;
  }

  public void setTagToValueDesc(final List<TableDesc> tagToValueDesc) {
    this.tagToValueDesc = tagToValueDesc;
  }

  @Explain(displayName = "Reduce Operator Tree")
  public Operator<?> getReducer() {
    return reducer;
  }

  @Explain(displayName = "Split Sample")
  public HashMap<String, SplitSample> getNameToSplitSample() {
    return nameToSplitSample;
  }

  public void setNameToSplitSample(HashMap<String, SplitSample> nameToSplitSample) {
    this.nameToSplitSample = nameToSplitSample;
  }

  public void setReducer(final Operator<?> reducer) {
    this.reducer = reducer;
  }

  public Integer getNumMapTasks() {
    return numMapTasks;
  }

  public void setNumMapTasks(Integer numMapTasks) {
    this.numMapTasks = numMapTasks;
  }

  /**
   * If the number of reducers is -1, the runtime will automatically figure it
   * out from the input data size.
   *
   * The number of reducers will be a positive number only if the target
   * table is bucketed into N buckets (through CREATE TABLE). This feature is
   * not supported yet, so the number of reducers will always be -1 for now.
   */
  public Integer getNumReduceTasks() {
    return numReduceTasks;
  }

  public void setNumReduceTasks(final Integer numReduceTasks) {
    this.numReduceTasks = numReduceTasks;
  }
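
  // Illustrative sketch (not part of the original source): the reducer-count semantics
  // described above, shown as calls on a hypothetical "work" instance.
  //
  //   work.setNumReduceTasks(-1); // let the runtime derive the count from the input data size
  //   work.setNumReduceTasks(0);  // map-only plan; isInvalid() reports an error if a reducer is also set
  //   work.setNumReduceTasks(8);  // fixed count for an 8-bucket target table (not yet supported)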

  @Explain(displayName = "Path -> Bucketed Columns", normalExplain = false)
  public Map<String, List<BucketCol>> getBucketedColsByDirectory() {
    return bucketedColsByDirectory;
  }

  @Explain(displayName = "Path -> Sorted Columns", normalExplain = false)
  public Map<String, List<SortCol>> getSortedColsByDirectory() {
    return sortedColsByDirectory;
  }

  @SuppressWarnings("nls")
  public void addMapWork(String path, String alias, Operator<? extends OperatorDesc> work,
      PartitionDesc pd) {
    ArrayList<String> curAliases = pathToAliases.get(path);
    if (curAliases == null) {
      assert (pathToPartitionInfo.get(path) == null);
      curAliases = new ArrayList<String>();
      pathToAliases.put(path, curAliases);
      pathToPartitionInfo.put(path, pd);
    } else {
      assert (pathToPartitionInfo.get(path) != null);
    }

    for (String oneAlias : curAliases) {
      if (oneAlias.equals(alias)) {
        throw new RuntimeException("Multiple aliases named: " + alias
            + " for path: " + path);
      }
    }
    curAliases.add(alias);

    if (aliasToWork.get(alias) != null) {
      throw new RuntimeException("Existing work for alias: " + alias);
    }
    aliasToWork.put(alias, work);
  }
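
  // Illustrative sketch (not part of the original source): how map-side work might be
  // registered on a freshly constructed MapredWork. The no-arg constructor only initializes
  // aliasToPartnInfo, so the other maps are set up first; "scanOp" (e.g. a TableScanOperator
  // rooting the map-side operator tree) and "partDesc" are hypothetical placeholders.
  //
  //   MapredWork work = new MapredWork();
  //   work.setPathToAliases(new LinkedHashMap<String, ArrayList<String>>());
  //   work.setPathToPartitionInfo(new LinkedHashMap<String, PartitionDesc>());
  //   work.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>());
  //   work.addMapWork("hdfs://nn:8020/user/hive/warehouse/src", "src", scanOp, partDesc);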

  @SuppressWarnings("nls")
  public String isInvalid() {
    if ((getNumReduceTasks() >= 1) && (getReducer() == null)) {
      return "Reducers > 0 but no reduce operator";
    }

    if ((getNumReduceTasks() == 0) && (getReducer() != null)) {
      return "Reducers == 0 but reduce operator specified";
    }

    return null;
  }
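
  // Illustrative sketch (not part of the original source): a caller could use isInvalid()
  // as a consistency check before submitting the plan. The exception type here is only an
  // example, not necessarily what Hive itself throws.
  //
  //   String err = work.isInvalid();
  //   if (err != null) {
  //     throw new IllegalStateException("Invalid MapredWork: " + err);
  //   }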

  public String toXML() {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Utilities.serializeMapRedWork(this, baos);
    return (baos.toString());
  }

  // non bean

  /**
   * For each map-side operator, stores the alias the operator is working on
   * behalf of in the operator runtime state. This is used by the ReduceSink
   * operator, but could be useful for debugging as well.
   */
  private void setAliases() {
    if(aliasToWork == null) {
      return;
    }
    for (String oneAlias : aliasToWork.keySet()) {
      aliasToWork.get(oneAlias).setAlias(oneAlias);
    }
  }

  /**
   * Derive additional attributes to be rendered by EXPLAIN.
   */
  public void deriveExplainAttributes() {
    if (pathToPartitionInfo != null) {
      for (Map.Entry<String, PartitionDesc> entry : pathToPartitionInfo
          .entrySet()) {
        entry.getValue().deriveBaseFileName(entry.getKey());
      }
    }
    if (mapLocalWork != null) {
      mapLocalWork.deriveExplainAttributes();
    }
  }

  public void initialize() {
    setAliases();
  }

  @Explain(displayName = "Needs Tagging", normalExplain = false)
  public boolean getNeedsTagging() {
    return needsTagging;
  }

  public void setNeedsTagging(boolean needsTagging) {
    this.needsTagging = needsTagging;
  }

  public boolean getHadoopSupportsSplittable() {
    return hadoopSupportsSplittable;
  }

  public void setHadoopSupportsSplittable(boolean hadoopSupportsSplittable) {
    this.hadoopSupportsSplittable = hadoopSupportsSplittable;
  }

  public Long getMaxSplitSize() {
    return maxSplitSize;
  }

  public void setMaxSplitSize(Long maxSplitSize) {
    this.maxSplitSize = maxSplitSize;
  }

  public Long getMinSplitSize() {
    return minSplitSize;
  }

  public void setMinSplitSize(Long minSplitSize) {
    this.minSplitSize = minSplitSize;
  }

  public Long getMinSplitSizePerNode() {
    return minSplitSizePerNode;
  }

  public void setMinSplitSizePerNode(Long minSplitSizePerNode) {
    this.minSplitSizePerNode = minSplitSizePerNode;
  }

  public Long getMinSplitSizePerRack() {
    return minSplitSizePerRack;
  }

  public void setMinSplitSizePerRack(Long minSplitSizePerRack) {
    this.minSplitSizePerRack = minSplitSizePerRack;
  }

  public String getInputformat() {
    return inputformat;
  }

  public void setInputformat(String inputformat) {
    this.inputformat = inputformat;
  }

  public String getIndexIntermediateFile() {
    return indexIntermediateFile;
  }

  public void addIndexIntermediateFile(String fileName) {
    if (this.indexIntermediateFile == null) {
      this.indexIntermediateFile = fileName;
    } else {
      this.indexIntermediateFile += "," + fileName;
    }
  }

  public void setGatheringStats(boolean gatherStats) {
    this.gatheringStats = gatherStats;
  }

  public boolean isGatheringStats() {
    return this.gatheringStats;
  }

  public void setMapperCannotSpanPartns(boolean mapperCannotSpanPartns) {
    this.mapperCannotSpanPartns = mapperCannotSpanPartns;
  }

  public boolean isMapperCannotSpanPartns() {
    return this.mapperCannotSpanPartns;
  }

  public String getTmpHDFSFileURI() {
    return tmpHDFSFileURI;
  }

  public void setTmpHDFSFileURI(String tmpHDFSFileURI) {
    this.tmpHDFSFileURI = tmpHDFSFileURI;
  }


  public QBJoinTree getJoinTree() {
    return joinTree;
  }

  public void setJoinTree(QBJoinTree joinTree) {
    this.joinTree = joinTree;
  }

  public
    LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> getOpParseCtxMap() {
    return opParseCtxMap;
  }

  public void setOpParseCtxMap(
    LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtxMap) {
    this.opParseCtxMap = opParseCtxMap;
  }

  public boolean isInputFormatSorted() {
    return inputFormatSorted;
  }

  public void setInputFormatSorted(boolean inputFormatSorted) {
    this.inputFormatSorted = inputFormatSorted;
  }

  public void resolveDynamicPartitionStoredAsSubDirsMerge(HiveConf conf, Path path,
      TableDesc tblDesc, ArrayList<String> aliases, PartitionDesc partDesc) {
    pathToAliases.put(path.toString(), aliases);
    pathToPartitionInfo.put(path.toString(), partDesc);
  }

  public List<Operator<?>> getAllOperators() {
    ArrayList<Operator<?>> opList = new ArrayList<Operator<?>>();
    ArrayList<Operator<?>> returnList = new ArrayList<Operator<?>>();

    if (getReducer() != null) {
      opList.add(getReducer());
    }

    Map<String, ArrayList<String>> pa = getPathToAliases();
    if (pa != null) {
      for (List<String> ls : pa.values()) {
        for (String a : ls) {
          Operator<?> op = getAliasToWork().get(a);
          if (op != null) {
            opList.add(op);
          }
        }
      }
    }

    // add all descendants (children, grandchildren, ...) iteratively
    while (!opList.isEmpty()) {
      Operator<?> op = opList.remove(0);
      if (op.getChildOperators() != null) {
        opList.addAll(op.getChildOperators());
      }
      returnList.add(op);
    }

    return returnList;
  }
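
  // Illustrative sketch (not part of the original source): getAllOperators() returns the
  // reducer plus every map-side operator and all of their descendants, so a physical
  // optimizer pass could scan the whole plan along these lines:
  //
  //   for (Operator<?> op : work.getAllOperators()) {
  //     if (op instanceof FileSinkOperator) {
  //       // inspect or rewrite the sink, e.g. to infer bucket/sort metadata
  //     }
  //   }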

  public boolean isUseBucketizedHiveInputFormat() {
    return useBucketizedHiveInputFormat;
  }

  public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) {
    this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
  }

  public boolean isFinalMapRed() {
    return finalMapRed;
  }

  public void setFinalMapRed(boolean finalMapRed) {
    this.finalMapRed = finalMapRed;
  }
}



