/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.facebook.presto.hive.$internal.org.apache.commons.logging.Log;
import com.facebook.presto.hive.$internal.org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.mapred.JobConf;
import com.facebook.presto.hive.$internal.com.google.common.collect.Interner;
/**
* MapWork represents all the information used to run a map task on the cluster.
* It is first used when the query planner breaks the logical plan into tasks and
* used throughout physical optimization to track map-side operator plans, input
* paths, aliases, etc.
*
* ExecDriver will serialize the contents of this class and make sure it is
* distributed on the cluster. The ExecMapper will ultimately deserialize this
* class on the data nodes and set up its operator pipeline accordingly.
*
* This class is also used in the explain command: any property with the
* appropriate annotation will be displayed in the explain output.
*/
@SuppressWarnings({"serial", "deprecation"})
public class MapWork extends BaseWork {
private static final Log LOG = LogFactory.getLog(MapWork.class);
private boolean hadoopSupportsSplittable;
// use LinkedHashMap to make sure the iteration order is
// deterministic, to ease testing
private LinkedHashMap<String, ArrayList<String>> pathToAliases = new LinkedHashMap<String, ArrayList<String>>();
private LinkedHashMap<String, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<String, PartitionDesc>();
private LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
private LinkedHashMap<String, PartitionDesc> aliasToPartnInfo = new LinkedHashMap<String, PartitionDesc>();
private HashMap<String, SplitSample> nameToSplitSample = new LinkedHashMap<String, SplitSample>();
// If this map task has a FileSinkOperator, and bucketing/sorting metadata can be
// inferred about the data being written by that operator, these are mappings from the directory
// that operator writes into to the bucket/sort columns for that data.
private final Map<String, List<BucketCol>> bucketedColsByDirectory =
new HashMap<String, List<BucketCol>>();
private final Map<String, List<SortCol>> sortedColsByDirectory =
new HashMap<String, List<SortCol>>();
private Path tmpHDFSPath;
private String inputformat;
private String indexIntermediateFile;
private Integer numMapTasks;
private Long maxSplitSize;
private Long minSplitSize;
private Long minSplitSizePerNode;
private Long minSplitSizePerRack;
//use sampled partitioning
private int samplingType;
public static final int SAMPLING_ON_PREV_MR = 1; // todo HIVE-3841
public static final int SAMPLING_ON_START = 2; // sampling on task running
// the following two are used for join processing
private boolean leftInputJoin;
private String[] baseSrc;
private List<String> mapAliases;
private boolean mapperCannotSpanPartns;
// used to indicate the input is sorted, and so a BinarySearchRecordReader should be used
private boolean inputFormatSorted = false;
private boolean useBucketizedHiveInputFormat;
private boolean useOneNullRowInputFormat;
private boolean dummyTableScan = false;
// used for dynamic partitioning
private Map<String, List<TableDesc>> eventSourceTableDescMap =
new LinkedHashMap<String, List<TableDesc>>();
private Map<String, List<String>> eventSourceColumnNameMap =
new LinkedHashMap<String, List<String>>();
private Map<String, List<ExprNodeDesc>> eventSourcePartKeyExprMap =
new LinkedHashMap<String, List<ExprNodeDesc>>();
private boolean doSplitsGrouping = true;
public MapWork() {}
public MapWork(String name) {
super(name);
}
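// A minimal sketch of the lifecycle described in the class javadoc, with
// hypothetical operator/descriptor names: the planner populates a MapWork
// via addMapWork(...), ExecDriver serializes it and ships it with the job,
// and ExecMapper deserializes it on the task side and calls initialize()
// before driving the operator pipeline.
//
//   MapWork mw = new MapWork("Map 1");
//   mw.addMapWork("/warehouse/src", "src", tableScanOp, partDesc);
//   // ... serialize, distribute, deserialize in ExecMapper ...
//   mw.initialize(); // pushes aliases into operator runtime state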
@Explain(displayName = "Path -> Alias", explainLevels = { Level.EXTENDED })
public LinkedHashMap<String, ArrayList<String>> getPathToAliases() {
return pathToAliases;
}
public void setPathToAliases(
final LinkedHashMap<String, ArrayList<String>> pathToAliases) {
this.pathToAliases = pathToAliases;
}
/**
* Used to display and verify the output of "Path -> Alias" in the test framework.
*
* QTestUtil masks "Path -> Alias", which makes verification impossible. This
* method keeps "Path -> Alias" intact and adds a new display name that QTestUtil
* does not mask, by removing the warehouse prefix from each path.
*
* Note: intermediate directories are still masked.
*
* @return a copy of pathToAliases with the warehouse prefix stripped from each path
*/
@Explain(displayName = "Truncated Path -> Alias", explainLevels = { Level.EXTENDED })
public Map<String, ArrayList<String>> getTruncatedPathToAliases() {
Map<String, ArrayList<String>> trunPathToAliases = new LinkedHashMap<String, ArrayList<String>>();
Iterator<Entry<String, ArrayList<String>>> itr = this.pathToAliases.entrySet().iterator();
while (itr.hasNext()) {
final Entry<String, ArrayList<String>> entry = itr.next();
String origKey = entry.getKey();
String newKey = PlanUtils.removePrefixFromWarehouseConfig(origKey);
ArrayList<String> value = entry.getValue();
trunPathToAliases.put(newKey, value);
}
return trunPathToAliases;
}
@Explain(displayName = "Path -> Partition", explainLevels = { Level.EXTENDED })
public LinkedHashMap<String, PartitionDesc> getPathToPartitionInfo() {
return pathToPartitionInfo;
}
public void setPathToPartitionInfo(
final LinkedHashMap<String, PartitionDesc> pathToPartitionInfo) {
this.pathToPartitionInfo = pathToPartitionInfo;
}
/**
* Derive additional attributes to be rendered by EXPLAIN.
* TODO: this method is relied upon by custom input formats to set jobconf properties.
* This is madness? - This is Hive Storage Handlers!
*/
public void deriveExplainAttributes() {
if (pathToPartitionInfo != null) {
for (Map.Entry<String, PartitionDesc> entry : pathToPartitionInfo
.entrySet()) {
entry.getValue().deriveBaseFileName(entry.getKey());
}
}
MapredLocalWork mapLocalWork = getMapRedLocalWork();
if (mapLocalWork != null) {
mapLocalWork.deriveExplainAttributes();
}
}
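/**
* Interns the partition descriptors referenced by this work through the
* supplied interner, so identical descriptors can share a single instance
* and reduce the plan's memory footprint.
*/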
public void internTable(Interner<TableDesc> interner) {
if (aliasToPartnInfo != null) {
for (PartitionDesc part : aliasToPartnInfo.values()) {
if (part == null) {
continue;
}
part.intern(interner);
}
}
if (pathToPartitionInfo != null) {
for (PartitionDesc part : pathToPartitionInfo.values()) {
part.intern(interner);
}
}
}
/**
* @return the aliasToPartnInfo
*/
public LinkedHashMap<String, PartitionDesc> getAliasToPartnInfo() {
return aliasToPartnInfo;
}
/**
* @param aliasToPartnInfo
* the aliasToPartnInfo to set
*/
public void setAliasToPartnInfo(
LinkedHashMap<String, PartitionDesc> aliasToPartnInfo) {
this.aliasToPartnInfo = aliasToPartnInfo;
}
public LinkedHashMap<String, Operator<? extends OperatorDesc>> getAliasToWork() {
return aliasToWork;
}
public void setAliasToWork(
final LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork) {
this.aliasToWork = aliasToWork;
}
@Explain(displayName = "Split Sample", explainLevels = { Level.EXTENDED })
public HashMap<String, SplitSample> getNameToSplitSample() {
return nameToSplitSample;
}
public void setNameToSplitSample(HashMap<String, SplitSample> nameToSplitSample) {
this.nameToSplitSample = nameToSplitSample;
}
public Integer getNumMapTasks() {
return numMapTasks;
}
public void setNumMapTasks(Integer numMapTasks) {
this.numMapTasks = numMapTasks;
}
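/**
* Registers an (alias, operator) pair for the given input path. A path may
* serve several aliases, but each alias may be registered only once.
*
* A minimal usage sketch, with hypothetical operator and descriptor names:
* <pre>
*   MapWork mw = new MapWork("Map 1");
*   mw.addMapWork("/warehouse/t", "t", tableScanOp, partDesc);
*   mw.addMapWork("/warehouse/t", "t2", otherScanOp, partDesc); // same path, new alias
*   // registering "t" for the same path again would throw a RuntimeException
* </pre>
*/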
@SuppressWarnings("nls")
public void addMapWork(String path, String alias, Operator<?> work,
PartitionDesc pd) {
ArrayList<String> curAliases = pathToAliases.get(path);
if (curAliases == null) {
assert (pathToPartitionInfo.get(path) == null);
curAliases = new ArrayList<String>();
pathToAliases.put(path, curAliases);
pathToPartitionInfo.put(path, pd);
} else {
assert (pathToPartitionInfo.get(path) != null);
}
for (String oneAlias : curAliases) {
if (oneAlias.equals(alias)) {
throw new RuntimeException("Multiple aliases named: " + alias
+ " for path: " + path);
}
}
curAliases.add(alias);
if (aliasToWork.get(alias) != null) {
throw new RuntimeException("Existing work for alias: " + alias);
}
aliasToWork.put(alias, work);
}
public boolean isInputFormatSorted() {
return inputFormatSorted;
}
public void setInputFormatSorted(boolean inputFormatSorted) {
this.inputFormatSorted = inputFormatSorted;
}
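/**
* Registers the merge path produced for dynamically partitioned output
* stored as sub-directories, mapping it to the given aliases and partition
* descriptor. Note that the conf and tblDesc arguments are unused here.
*/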
public void resolveDynamicPartitionStoredAsSubDirsMerge(HiveConf conf, Path path,
TableDesc tblDesc, ArrayList<String> aliases, PartitionDesc partDesc) {
pathToAliases.put(path.toString(), aliases);
pathToPartitionInfo.put(path.toString(), partDesc);
}
/**
* For each map side operator - stores the alias the operator is working on
* behalf of in the operator runtime state. This is used by reduce sink
* operator - but could be useful for debugging as well.
*/
private void setAliases() {
if(aliasToWork == null) {
return;
}
for (String oneAlias : aliasToWork.keySet()) {
aliasToWork.get(oneAlias).setAlias(oneAlias);
}
}
@Explain(displayName = "Execution mode")
public String getVectorModeOn() {
return vectorMode ? "vectorized" : null;
}
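/**
* Replaces each alias's root operator with its counterpart from
* replacementMap, preserving alias order.
*/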
@Override
public void replaceRoots(Map<Operator<?>, Operator<?>> replacementMap) {
LinkedHashMap<String, Operator<? extends OperatorDesc>> newAliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
for (Map.Entry<String, Operator<? extends OperatorDesc>> entry: aliasToWork.entrySet()) {
newAliasToWork.put(entry.getKey(), replacementMap.get(entry.getValue()));
}
setAliasToWork(newAliasToWork);
}
@Override
@Explain(displayName = "Map Operator Tree", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public Set<Operator<?>> getAllRootOperators() {
Set<Operator<?>> opSet = new LinkedHashSet<Operator<?>>();
for (Operator<?> op : getAliasToWork().values()) {
opSet.add(op);
}
return opSet;
}
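/**
* Adds the alias to the set of aliases served by pathDir, registering the
* path with the given partition info if it has not been seen before.
*/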
public void mergeAliasedInput(String alias, String pathDir, PartitionDesc partitionInfo) {
ArrayList<String> aliases = pathToAliases.get(pathDir);
if (aliases == null) {
aliases = new ArrayList<String>(Arrays.asList(alias));
pathToAliases.put(pathDir, aliases);
pathToPartitionInfo.put(pathDir, partitionInfo);
} else {
aliases.add(alias);
}
}
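/**
* Prepares this work for execution; currently this only propagates each
* alias into its root operator's runtime state via setAliases().
*/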
public void initialize() {
setAliases();
}
public Long getMaxSplitSize() {
return maxSplitSize;
}
public void setMaxSplitSize(Long maxSplitSize) {
this.maxSplitSize = maxSplitSize;
}
public Long getMinSplitSize() {
return minSplitSize;
}
public void setMinSplitSize(Long minSplitSize) {
this.minSplitSize = minSplitSize;
}
public Long getMinSplitSizePerNode() {
return minSplitSizePerNode;
}
public void setMinSplitSizePerNode(Long minSplitSizePerNode) {
this.minSplitSizePerNode = minSplitSizePerNode;
}
public Long getMinSplitSizePerRack() {
return minSplitSizePerRack;
}
public void setMinSplitSizePerRack(Long minSplitSizePerRack) {
this.minSplitSizePerRack = minSplitSizePerRack;
}
public String getInputformat() {
return inputformat;
}
public void setInputformat(String inputformat) {
this.inputformat = inputformat;
}
public boolean isUseBucketizedHiveInputFormat() {
return useBucketizedHiveInputFormat;
}
public void setUseBucketizedHiveInputFormat(boolean useBucketizedHiveInputFormat) {
this.useBucketizedHiveInputFormat = useBucketizedHiveInputFormat;
}
public void setUseOneNullRowInputFormat(boolean useOneNullRowInputFormat) {
this.useOneNullRowInputFormat = useOneNullRowInputFormat;
}
public boolean isUseOneNullRowInputFormat() {
return useOneNullRowInputFormat;
}
public void setMapperCannotSpanPartns(boolean mapperCannotSpanPartns) {
this.mapperCannotSpanPartns = mapperCannotSpanPartns;
}
public boolean isMapperCannotSpanPartns() {
return this.mapperCannotSpanPartns;
}
public boolean getHadoopSupportsSplittable() {
return hadoopSupportsSplittable;
}
public void setHadoopSupportsSplittable(boolean hadoopSupportsSplittable) {
this.hadoopSupportsSplittable = hadoopSupportsSplittable;
}
public String getIndexIntermediateFile() {
return indexIntermediateFile;
}
public ArrayList<String> getAliases() {
return new ArrayList<String>(aliasToWork.keySet());
}
public ArrayList<Operator<? extends OperatorDesc>> getWorks() {
return new ArrayList<Operator<? extends OperatorDesc>>(aliasToWork.values());
}
public ArrayList<String> getPaths() {
return new ArrayList<String>(pathToAliases.keySet());
}
public ArrayList<PartitionDesc> getPartitionDescs() {
return new ArrayList<PartitionDesc>(aliasToPartnInfo.values());
}
public Path getTmpHDFSPath() {
return tmpHDFSPath;
}
public void setTmpHDFSPath(Path tmpHDFSPath) {
this.tmpHDFSPath = tmpHDFSPath;
}
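/**
* Propagates settings to the map work this one is being merged into.
*/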
public void mergingInto(MapWork mapWork) {
// currently, this is the sole field affecting the mergee task
mapWork.useBucketizedHiveInputFormat |= useBucketizedHiveInputFormat;
}
@Explain(displayName = "Path -> Bucketed Columns", explainLevels = { Level.EXTENDED })
public Map<String, List<BucketCol>> getBucketedColsByDirectory() {
return bucketedColsByDirectory;
}
@Explain(displayName = "Path -> Sorted Columns", explainLevels = { Level.EXTENDED })
public Map<String, List<SortCol>> getSortedColsByDirectory() {
return sortedColsByDirectory;
}
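/**
* Appends fileName to the comma-separated list of index intermediate files;
* e.g. adding "f1" and then "f2" yields "f1,f2".
*/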
public void addIndexIntermediateFile(String fileName) {
if (this.indexIntermediateFile == null) {
this.indexIntermediateFile = fileName;
} else {
this.indexIntermediateFile += "," + fileName;
}
}
public int getSamplingType() {
return samplingType;
}
public void setSamplingType(int samplingType) {
this.samplingType = samplingType;
}
@Explain(displayName = "Sampling", explainLevels = { Level.EXTENDED })
public String getSamplingTypeString() {
return samplingType == SAMPLING_ON_PREV_MR ? "SAMPLING_ON_PREV_MR" :
samplingType == SAMPLING_ON_START ? "SAMPLING_ON_START" : null;
}
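/**
* Pushes table-level configuration into the JobConf: once for each input
* partition's table, and once for the output table of every FileSinkOperator
* reachable from the map-side operator trees.
*/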
@Override
public void configureJobConf(JobConf job) {
for (PartitionDesc partition : aliasToPartnInfo.values()) {
PlanUtils.configureJobConf(partition.getTableDesc(), job);
}
Collection<Operator<? extends OperatorDesc>> mappers = aliasToWork.values();
for (FileSinkOperator fs : OperatorUtils.findOperators(mappers, FileSinkOperator.class)) {
PlanUtils.configureJobConf(fs.getConf().getTableInfo(), job);
}
}
public void logPathToAliases() {
if (LOG.isDebugEnabled()) {
LOG.debug("LOGGING PATH TO ALIASES");
for (Map.Entry<String, ArrayList<String>> entry: pathToAliases.entrySet()) {
for (String a: entry.getValue()) {
LOG.debug("Path: " + entry.getKey() + ", Alias: " + a);
}
}
}
}
public void setDummyTableScan(boolean dummyTableScan) {
this.dummyTableScan = dummyTableScan;
}
public boolean getDummyTableScan() {
return dummyTableScan;
}
public void setEventSourceTableDescMap(Map<String, List<TableDesc>> map) {
this.eventSourceTableDescMap = map;
}
public Map<String, List<TableDesc>> getEventSourceTableDescMap() {
return eventSourceTableDescMap;
}
public void setEventSourceColumnNameMap(Map<String, List<String>> map) {
this.eventSourceColumnNameMap = map;
}
public Map<String, List<String>> getEventSourceColumnNameMap() {
return eventSourceColumnNameMap;
}
public Map<String, List<ExprNodeDesc>> getEventSourcePartKeyExprMap() {
return eventSourcePartKeyExprMap;
}
public void setEventSourcePartKeyExprMap(Map<String, List<ExprNodeDesc>> map) {
this.eventSourcePartKeyExprMap = map;
}
public void setDoSplitsGrouping(boolean doSplitsGrouping) {
this.doSplitsGrouping = doSplitsGrouping;
}
public boolean getDoSplitsGrouping() {
return this.doSplitsGrouping;
}
public boolean isLeftInputJoin() {
return leftInputJoin;
}
public void setLeftInputJoin(boolean leftInputJoin) {
this.leftInputJoin = leftInputJoin;
}
public String[] getBaseSrc() {
return baseSrc;
}
public void setBaseSrc(String[] baseSrc) {
this.baseSrc = baseSrc;
}
public List<String> getMapAliases() {
return mapAliases;
}
public void setMapAliases(List<String> mapAliases) {
this.mapAliases = mapAliases;
}
}