/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.parse;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.UnionOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.hive.ql.plan.UnionWork;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType;
/**
* GenTezUtils is a collection of shared helper methods to produce
* TezWork
*/
public class GenTezUtils {
static final private Log LOG = LogFactory.getLog(GenTezUtils.class.getName());
// sequence number is used to name vertices (e.g.: Map 1, Reduce 14, ...)
private int sequenceNumber = 0;
// singleton
private static GenTezUtils utils;
public static GenTezUtils getUtils() {
if (utils == null) {
utils = new GenTezUtils();
}
return utils;
}
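// A minimal usage sketch for the singleton accessor above (assumed caller: the
// Tez planning code, e.g. TezCompiler/GenTezWork, which obtains the shared
// instance rather than constructing one):
//
//   GenTezUtils utils = GenTezUtils.getUtils();
//   utils.resetSequenceNumber(); // restart vertex numbering at 1 for a new plan
//
// Note that the lazy initialization is not synchronized, so it relies on the
// first call not being raced by multiple threads.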
protected GenTezUtils() {
}
public void resetSequenceNumber() {
sequenceNumber = 0;
}
public UnionWork createUnionWork(GenTezProcContext context, Operator<?> operator, TezWork tezWork) {
UnionWork unionWork = new UnionWork("Union "+ (++sequenceNumber));
context.unionWorkMap.put(operator, unionWork);
tezWork.add(unionWork);
return unionWork;
}
public ReduceWork createReduceWork(GenTezProcContext context, Operator<?> root, TezWork tezWork) {
assert !root.getParentOperators().isEmpty();
ReduceWork reduceWork = new ReduceWork("Reducer "+ (++sequenceNumber));
LOG.debug("Adding reduce work (" + reduceWork.getName() + ") for " + root);
reduceWork.setReducer(root);
reduceWork.setNeedsTagging(GenMapRedUtils.needsTagging(reduceWork));
// All parents should be reduce sinks. We pick the one we just walked
// to choose the number of reducers. In the join/union case they will
// all be -1. In the sort/order case, where the number matters, there
// will be only one parent.
assert context.parentOfRoot instanceof ReduceSinkOperator;
ReduceSinkOperator reduceSink = (ReduceSinkOperator) context.parentOfRoot;
reduceWork.setNumReduceTasks(reduceSink.getConf().getNumReducers());
setupReduceSink(context, reduceWork, reduceSink);
tezWork.add(reduceWork);
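// connect the new reduce work to the work that feeds it; a SIMPLE_EDGE is a
// plain shuffle (scatter-gather) edge between the two vertices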
TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.SIMPLE_EDGE);
tezWork.connect(
context.preceedingWork,
reduceWork, edgeProp);
context.connectedReduceSinks.add(reduceSink);
return reduceWork;
}
protected void setupReduceSink(GenTezProcContext context, ReduceWork reduceWork,
ReduceSinkOperator reduceSink) {
LOG.debug("Setting up reduce sink: " + reduceSink
+ " with following reduce work: " + reduceWork.getName());
// need to fill in information about the key and value in the reducer
GenMapRedUtils.setKeyAndValueDesc(reduceWork, reduceSink);
// remember which parent belongs to which tag
reduceWork.getTagToInput().put(reduceSink.getConf().getTag(),
context.preceedingWork.getName());
// remember the output name of the reduce sink
reduceSink.getConf().setOutputName(reduceWork.getName());
}
public MapWork createMapWork(GenTezProcContext context, Operator<?> root,
TezWork tezWork, PrunedPartitionList partitions) throws SemanticException {
assert root.getParentOperators().isEmpty();
MapWork mapWork = new MapWork("Map "+ (++sequenceNumber));
LOG.debug("Adding map work (" + mapWork.getName() + ") for " + root);
// map work starts with table scan operators
assert root instanceof TableScanOperator;
String alias = ((TableScanOperator)root).getConf().getAlias();
setupMapWork(mapWork, context, partitions, root, alias);
// add new item to the tez work
tezWork.add(mapWork);
return mapWork;
}
// this method's main use is to make this class easier to unit test
protected void setupMapWork(MapWork mapWork, GenTezProcContext context,
PrunedPartitionList partitions, Operator<? extends OperatorDesc> root,
String alias) throws SemanticException {
// All the setup is done in GenMapRedUtils
GenMapRedUtils.setMapWork(mapWork, context.parseContext,
context.inputs, partitions, root, alias, context.conf, false);
}
// removes any union operator and clones the plan
public void removeUnionOperators(Configuration conf, GenTezProcContext context,
BaseWork work)
throws SemanticException {
Set<Operator<?>> roots = work.getAllRootOperators();
if (work.getDummyOps() != null) {
roots.addAll(work.getDummyOps());
}
// need to clone the plan.
Set<Operator<?>> newRoots = Utilities.cloneOperatorTree(conf, roots);
// we're cloning the operator plan but we're retaining the original work. That means
// that root operators have to be replaced with the cloned ops. The replacement map
// tells you what that mapping is.
Map<Operator<?>, Operator<?>> replacementMap = new HashMap<Operator<?>, Operator<?>>();
// there's some special handling for dummyOps required. Mapjoins won't be properly
// initialized if their dummy parents aren't initialized. Since we cloned the plan
// we need to replace the dummy operators in the work with the cloned ones.
List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
Iterator<Operator<?>> it = newRoots.iterator();
for (Operator<?> orig: roots) {
Operator<?> newRoot = it.next();
if (newRoot instanceof HashTableDummyOperator) {
dummyOps.add((HashTableDummyOperator)newRoot);
it.remove();
} else {
replacementMap.put(orig,newRoot);
}
}
// now we remove all the unions. we throw away any branch that's not reachable from
// the current set of roots. The reason is that those branches will be handled in
// different tasks.
Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
operators.addAll(newRoots);
Set<Operator<?>> seen = new HashSet<Operator<?>>();
while(!operators.isEmpty()) {
Operator<?> current = operators.pop();
seen.add(current);
if (current instanceof FileSinkOperator) {
FileSinkOperator fileSink = (FileSinkOperator)current;
// remember it for additional processing later
context.fileSinkSet.add(fileSink);
FileSinkDesc desc = fileSink.getConf();
Path path = desc.getDirName();
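// file sinks that write to the same directory (e.g. the branches of a union
// inserting into one table) are linked together; each one is redirected below
// to its own numbered subdirectory so their outputs don't collide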
List<FileSinkDesc> linked;
if (!context.linkedFileSinks.containsKey(path)) {
linked = new ArrayList<FileSinkDesc>();
context.linkedFileSinks.put(path, linked);
}
linked = context.linkedFileSinks.get(path);
linked.add(desc);
desc.setDirName(new Path(path, ""+linked.size()));
desc.setLinkedFileSinkDesc(linked);
}
if (current instanceof UnionOperator) {
Operator<?> parent = null;
int count = 0;
for (Operator<?> op: current.getParentOperators()) {
if (seen.contains(op)) {
++count;
parent = op;
}
}
// we should have been able to reach the union from only one side.
assert count <= 1;
if (parent == null) {
// root operator is union (can happen in reducers)
replacementMap.put(current, current.getChildOperators().get(0));
} else {
parent.removeChildAndAdoptItsChildren(current);
}
}
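// file sinks and reduce sinks mark the boundary of this work; anything below
// them belongs to a different vertex, so we cut the child links here instead
// of traversing further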
if (current instanceof FileSinkOperator
|| current instanceof ReduceSinkOperator) {
current.setChildOperators(null);
} else {
operators.addAll(current.getChildOperators());
}
}
work.setDummyOps(dummyOps);
work.replaceRoots(replacementMap);
}
public void processFileSink(GenTezProcContext context, FileSinkOperator fileSink)
throws SemanticException {
ParseContext parseContext = context.parseContext;
boolean isInsertTable = // is INSERT OVERWRITE TABLE
GenMapRedUtils.isInsertInto(parseContext, fileSink);
HiveConf hconf = parseContext.getConf();
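// decide whether the output files will need a follow-up merge, then create the
// move task(s) that publish the file sink's output to its final location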
boolean chDir = GenMapRedUtils.isMergeRequired(context.moveTask,
hconf, fileSink, context.currentTask, isInsertTable);
Path finalName = GenMapRedUtils.createMoveTask(context.currentTask,
chDir, fileSink, parseContext, context.moveTask, hconf, context.dependencyTask);
if (chDir) {
// Merge the files in the destination table/partitions by creating a map-only merge job.
// If the underlying data is RCFile, an RCFileBlockMerge task is created.
LOG.info("using CombineHiveInputformat for the merge job");
GenMapRedUtils.createMRWorkForMergingFiles(fileSink, finalName,
context.dependencyTask, context.moveTask,
hconf, context.currentTask);
}
}
}