
org.apache.hadoop.hive.ql.exec.tez.CustomPartitionVertex (hive-apache)
Shaded version of Apache Hive for Presto
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.tez;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.facebook.presto.hive.$internal.com.google.common.collect.LinkedListMultimap;
import com.facebook.presto.hive.$internal.org.apache.commons.logging.Log;
import com.facebook.presto.hive.$internal.org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.TezWork.VertexType;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.DataInputByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.split.TezGroupedSplit;
import org.apache.hadoop.mapreduce.split.TezMapReduceSplitsGrouper;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.EdgeProperty;
import org.apache.tez.dag.api.EdgeProperty.DataMovementType;
import org.apache.tez.dag.api.EdgeManagerPluginDescriptor;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.VertexLocationHint;
import org.apache.tez.dag.api.VertexManagerPlugin;
import org.apache.tez.dag.api.VertexManagerPluginContext;
import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.InputSpecUpdate;
import org.apache.tez.runtime.api.events.InputConfigureVertexTasksEvent;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;
import org.apache.tez.runtime.api.events.InputUpdatePayloadEvent;
import org.apache.tez.runtime.api.events.VertexManagerEvent;
import com.facebook.presto.hive.$internal.com.google.common.base.Preconditions;
import com.facebook.presto.hive.$internal.com.google.common.collect.ArrayListMultimap;
import com.facebook.presto.hive.$internal.com.google.common.collect.HashMultimap;
import com.facebook.presto.hive.$internal.com.google.common.collect.Lists;
import com.facebook.presto.hive.$internal.com.google.common.collect.Maps;
import com.facebook.presto.hive.$internal.com.google.common.collect.Multimap;
import com.facebook.presto.hive.protobuf.ByteString;
/*
* This is the central piece for Bucket Map Join and SMB join. It has the following
* responsibilities:
* 1. Group incoming splits based on bucketing.
* 2. Generate new serialized events for the grouped splits.
* 3. Create a routing table for the bucket map join and send a serialized version as payload
* for the EdgeManager.
* 4. For SMB join, generate a grouping according to bucketing for the "small" table side.
*/
public class CustomPartitionVertex extends VertexManagerPlugin {
public class PathComparatorForSplit implements Comparator<InputSplit> {
@Override
public int compare(InputSplit inp1, InputSplit inp2) {
FileSplit fs1 = (FileSplit) inp1;
FileSplit fs2 = (FileSplit) inp2;
int retval = fs1.getPath().compareTo(fs2.getPath());
if (retval != 0) {
return retval;
}
// Compare start offsets; Long.compare avoids the overflow of casting a long difference to int.
return Long.compare(fs1.getStart(), fs2.getStart());
}
}
private static final Log LOG = LogFactory.getLog(CustomPartitionVertex.class.getName());
VertexManagerPluginContext context;
private InputConfigureVertexTasksEvent configureVertexTaskEvent;
private int numBuckets = -1;
private Configuration conf = null;
private final SplitGrouper grouper = new SplitGrouper();
private int taskCount = 0;
private VertexType vertexType;
private String mainWorkName;
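// Routing table built from the main (big-table) input: bucket number -> indices of the tasks
// that handle that bucket. It is also serialized into the CustomPartitionEdge payload.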
private final Multimap<Integer, Integer> bucketToTaskMap = HashMultimap.<Integer, Integer> create();
private final Map<String, Multimap<Integer, InputSplit>> inputToGroupedSplitMap =
new HashMap<String, Multimap<Integer, InputSplit>>();
private int numInputsAffectingRootInputSpecUpdate = 1;
private int numInputsSeenSoFar = 0;
private final Map<String, EdgeManagerPluginDescriptor> emMap = Maps.newHashMap();
private final List<InputSplit> finalSplits = Lists.newLinkedList();
private final Map<String, InputSpecUpdate> inputNameInputSpecMap =
new HashMap<String, InputSpecUpdate>();
public CustomPartitionVertex(VertexManagerPluginContext context) {
super(context);
}
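// The user payload attached to this vertex manager carries a serialized
// CustomVertexConfiguration with the bucket count, main input (work) name, vertex type and
// the number of inputs that affect the root input spec update.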
@Override
public void initialize() {
this.context = getContext();
ByteBuffer payload = context.getUserPayload().getPayload();
CustomVertexConfiguration vertexConf = new CustomVertexConfiguration();
DataInputByteBuffer dibb = new DataInputByteBuffer();
dibb.reset(payload);
try {
vertexConf.readFields(dibb);
} catch (IOException e) {
throw new RuntimeException(e);
}
this.numBuckets = vertexConf.getNumBuckets();
this.mainWorkName = vertexConf.getInputName();
this.vertexType = vertexConf.getVertexType();
this.numInputsAffectingRootInputSpecUpdate = vertexConf.getNumInputs();
}
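// Schedule all of the vertex's tasks immediately, with no location hints; hints are supplied
// later when the parallelism is set from the grouped splits.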
@Override
public void onVertexStarted(Map<String, List<Integer>> completions) {
int numTasks = context.getVertexNumTasks(context.getVertexName());
List<VertexManagerPluginContext.TaskWithLocationHint> scheduledTasks =
new ArrayList<VertexManagerPluginContext.TaskWithLocationHint>(numTasks);
for (int i = 0; i < numTasks; ++i) {
scheduledTasks.add(new VertexManagerPluginContext.TaskWithLocationHint(new Integer(i), null));
}
context.scheduleVertexTasks(scheduledTasks);
}
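// Source-task completions and vertex manager events are not used by this plugin; all of the
// routing work happens in onRootVertexInitialized below.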
@Override
public void onSourceTaskCompleted(String srcVertexName, Integer attemptId) {
}
@Override
public void onVertexManagerEventReceived(VertexManagerEvent vmEvent) {
}
// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor,
List<Event> events) {
numInputsSeenSoFar++;
LOG.info("On root vertex initialized " + inputName);
try {
// This uses the payload from the RootVertexInitializer corresponding to inputName. Ideally
// it should use its own configuration class, but that would mean serializing another
// instance.
MRInputUserPayloadProto protoPayload =
MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
/*
* Currently in tez, the flow of events is thus:
* "Generate Splits -> Initialize Vertex" (with parallelism info obtained
* from the generate splits phase). The generate splits phase groups
* splits using the TezGroupedSplitsInputFormat. However, for bucket map
* joins the grouping done by this input format results in incorrect
* results as the grouper has no knowledge of buckets. So, we initially
* set the input format to be HiveInputFormat (in DagUtils) for the case
* of bucket map joins so as to obtain un-grouped splits. We then group
* the splits corresponding to buckets using the tez grouper which returns
* TezGroupedSplits.
*/
// This assumes that Grouping will always be used.
// Enabling grouping on the payload.
MRInputUserPayloadProto updatedPayload =
MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}
boolean dataInformationEventSeen = false;
Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
for (Event event : events) {
if (event instanceof InputConfigureVertexTasksEvent) {
// No tasks should have been started yet. Checked by initial state
// check.
LOG.info("Got a input configure vertex event for input: " + inputName);
Preconditions.checkState(dataInformationEventSeen == false);
InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
// The vertex cannot be configured until all DataEvents are seen - to
// build the routing table.
configureVertexTaskEvent = cEvent;
LOG.info("Configure task for input name: " + inputName + " num tasks: "
+ configureVertexTaskEvent.getNumTasks());
}
if (event instanceof InputUpdatePayloadEvent) {
// this event can never occur. If it does, fail.
Preconditions.checkState(false);
} else if (event instanceof InputDataInformationEvent) {
dataInformationEventSeen = true;
InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
FileSplit fileSplit;
try {
fileSplit = getFileSplitFromEvent(diEvent);
} catch (IOException e) {
throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
}
Set<FileSplit> fsList =
pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath()
.getName()));
if (fsList == null) {
fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
pathFileSplitsMap.put(
Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
}
fsList.add(fileSplit);
}
}
LOG.info("Path file splits map for input name: " + inputName + " is " + pathFileSplitsMap);
Multimap<Integer, InputSplit> bucketToInitialSplitMap =
getBucketSplitMapForPath(pathFileSplitsMap);
try {
int totalResource = context.getTotalAvailableResource().getMemory();
int taskResource = context.getVertexTaskResource().getMemory();
float waves =
conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES,
TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
int availableSlots = totalResource / taskResource;
LOG.info("Grouping splits. " + availableSlots + " available slots, " + waves
+ " waves. Bucket initial splits map: " + bucketToInitialSplitMap);
JobConf jobConf = new JobConf(conf);
ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
Multimap<Integer, InputSplit> bucketToGroupedSplitMap =
HashMultimap.<Integer, InputSplit> create();
boolean secondLevelGroupingDone = false;
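// Main (big-table) input, or the only input: group the raw splits within each bucket. For
// SMB joins a second level of grouping is applied using the SMB wave setting
// (TEZ_SMB_NUMBER_WAVES).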
if ((mainWorkName.isEmpty()) || (inputName.compareTo(mainWorkName) == 0)) {
for (Integer key : bucketToInitialSplitMap.keySet()) {
InputSplit[] inputSplitArray =
(bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
Multimap<Integer, InputSplit> groupedSplit =
grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves,
availableSlots, inputName, mainWorkName.isEmpty());
if (mainWorkName.isEmpty() == false) {
Multimap<Integer, InputSplit> singleBucketToGroupedSplit =
HashMultimap.<Integer, InputSplit> create();
singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
groupedSplit =
grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots,
HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES));
secondLevelGroupingDone = true;
}
bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
}
processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
} else {
// do not group across files in case of side work because there is only 1 KV reader per
// grouped split. This would affect SMB joins where we want to find the smallest key in
// all the bucket files.
for (Integer key : bucketToInitialSplitMap.keySet()) {
InputSplit[] inputSplitArray =
(bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
Multimap<Integer, InputSplit> groupedSplit =
grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves,
availableSlots, inputName, false);
bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
}
/*
* this is the small table side. In case of SMB join, we need to send each split to the
* corresponding bucket-based task on the other side. In case a split needs to go to
* multiple downstream tasks, we need to clone the event and send it to the right
* destination.
*/
LOG.info("This is the side work - multi-mr work.");
processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private void processAllSideEventsSetParallelism(String inputName,
Multimap<Integer, InputSplit> bucketToGroupedSplitMap) throws IOException {
// The bucket-to-task map should have been set up by the big-table side.
LOG.info("Processing events for input " + inputName);
if (bucketToTaskMap.isEmpty()) {
LOG.info("We don't have a routing table yet. Will need to wait for the main input"
+ " initialization");
inputToGroupedSplitMap.put(inputName, bucketToGroupedSplitMap);
return;
}
processAllSideEvents(inputName, bucketToGroupedSplitMap);
setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
}
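// Sends each serialized small-table split to every task that owns the corresponding bucket
// (per the routing table) and records a per-task input spec update for this input.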
private void processAllSideEvents(String inputName,
Multimap<Integer, InputSplit> bucketToGroupedSplitMap) throws IOException {
List<InputDataInformationEvent> taskEvents = new ArrayList<InputDataInformationEvent>();
LOG.info("We have a routing table and we are going to set the destination tasks for the"
+ " multi mr inputs. " + bucketToTaskMap);
Integer[] numSplitsForTask = new Integer[taskCount];
Multimap<Integer, ByteBuffer> bucketToSerializedSplitMap = LinkedListMultimap.create();
// Create the list of serialized splits for each bucket.
for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
for (InputSplit split : entry.getValue()) {
MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(split);
ByteBuffer bs = serializedSplit.toByteString().asReadOnlyByteBuffer();
bucketToSerializedSplitMap.put(entry.getKey(), bs);
}
}
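// Fan the serialized splits of each bucket out to every destination task for that bucket,
// creating one InputDataInformationEvent per (split, task) pair.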
for (Entry<Integer, Collection<ByteBuffer>> entry : bucketToSerializedSplitMap.asMap().entrySet()) {
Collection<Integer> destTasks = bucketToTaskMap.get(entry.getKey());
for (Integer task : destTasks) {
int count = 0;
for (ByteBuffer buf : entry.getValue()) {
count++;
InputDataInformationEvent diEvent =
InputDataInformationEvent.createWithSerializedPayload(count, buf);
diEvent.setTargetIndex(task);
taskEvents.add(diEvent);
}
numSplitsForTask[task] = count;
}
}
inputNameInputSpecMap.put(inputName,
InputSpecUpdate.createPerTaskInputSpecUpdate(Arrays.asList(numSplitsForTask)));
LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
context.addRootInputEvents(inputName, taskEvents);
}
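// Handles the grouped splits of the main input: assigns one task per grouped split, builds
// the bucket-to-task routing table, installs the CustomPartitionEdge payload on custom edges,
// and emits the re-serialized split events addressed to their tasks.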
private void processAllEvents(String inputName,
Multimap<Integer, InputSplit> bucketToGroupedSplitMap, boolean secondLevelGroupingDone)
throws IOException {
int totalInputsCount = 0;
List<Integer> numSplitsForTask = new ArrayList<Integer>();
for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
int bucketNum = entry.getKey();
Collection<InputSplit> initialSplits = entry.getValue();
finalSplits.addAll(initialSplits);
for (InputSplit inputSplit : initialSplits) {
bucketToTaskMap.put(bucketNum, taskCount);
if (secondLevelGroupingDone) {
TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
totalInputsCount += groupedSplit.getGroupedSplits().size();
} else {
numSplitsForTask.add(1);
totalInputsCount += 1;
}
taskCount++;
}
}
inputNameInputSpecMap.put(inputName,
InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));
// Construct the EdgeManager descriptor to be used by all edges which need
// the routing table.
EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES)
|| (vertexType == VertexType.INITIALIZED_EDGES)) {
hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
UserPayload payload = getBytePayload(bucketToTaskMap);
hiveEdgeManagerDesc.setUserPayload(payload);
}
// Replace the edge manager for all vertices which have routing type custom.
for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM
&& edgeEntry.getValue().getEdgeManagerDescriptor().getClassName()
.equals(CustomPartitionEdge.class.getName())) {
emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
}
}
LOG.info("Task count is " + taskCount + " for input name: " + inputName);
List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
// Re-serialize the splits after grouping.
int count = 0;
for (InputSplit inputSplit : finalSplits) {
if (secondLevelGroupingDone) {
TezGroupedSplit tezGroupedSplit = (TezGroupedSplit)inputSplit;
for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
if ((subSplit instanceof TezGroupedSplit) == false) {
throw new IOException("Unexpected split type found: "
+ subSplit.getClass().getCanonicalName());
}
MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
InputDataInformationEvent diEvent =
InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit
.toByteString().asReadOnlyByteBuffer());
diEvent.setTargetIndex(count);
taskEvents.add(diEvent);
}
} else {
MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
InputDataInformationEvent diEvent =
InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit
.toByteString().asReadOnlyByteBuffer());
diEvent.setTargetIndex(count);
taskEvents.add(diEvent);
}
count++;
}
// Set the actual events for the tasks.
LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
context.addRootInputEvents(inputName, taskEvents);
if (inputToGroupedSplitMap.isEmpty() == false) {
for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
processAllSideEvents(entry.getKey(), entry.getValue());
}
setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
inputToGroupedSplitMap.clear();
}
// Only done for a plain bucket map join, i.e. when there is no SMB side input.
if (numInputsAffectingRootInputSpecUpdate == 1) {
setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
}
}
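// Once every expected input has been seen, fix the vertex parallelism, task location hints,
// edge managers and per-input specs in a single call.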
private void
setVertexParallelismAndRootInputSpec(Map<String, InputSpecUpdate> rootInputSpecUpdate)
throws IOException {
if (numInputsAffectingRootInputSpecUpdate != numInputsSeenSoFar) {
return;
}
LOG.info("Setting vertex parallelism since we have seen all inputs.");
context.setVertexParallelism(taskCount, VertexLocationHint.create(grouper
.createTaskLocationHints(finalSplits.toArray(new InputSplit[finalSplits.size()]))), emMap,
rootInputSpecUpdate);
finalSplits.clear();
}
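// Serializes the bucket-to-task routing table into the UserPayload consumed by
// CustomPartitionEdge.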
UserPayload getBytePayload(Multimap<Integer, Integer> routingTable) throws IOException {
CustomEdgeConfiguration edgeConf =
new CustomEdgeConfiguration(routingTable.keySet().size(), routingTable);
DataOutputBuffer dob = new DataOutputBuffer();
edgeConf.write(dob);
byte[] serialized = dob.getData();
return UserPayload.create(ByteBuffer.wrap(serialized));
}
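// Recovers the FileSplit carried by an InputDataInformationEvent, either directly from the
// deserialized payload or by parsing the serialized MRSplitProto.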
private FileSplit getFileSplitFromEvent(InputDataInformationEvent event) throws IOException {
InputSplit inputSplit = null;
if (event.getDeserializedUserPayload() != null) {
inputSplit = (InputSplit) event.getDeserializedUserPayload();
} else {
MRSplitProto splitProto = MRSplitProto.parseFrom(ByteString.copyFrom(event.getUserPayload()));
SerializationFactory serializationFactory = new SerializationFactory(new Configuration());
inputSplit = MRInputHelpers.createOldFormatSplitFromUserPayload(splitProto, serializationFactory);
}
if (!(inputSplit instanceof FileSplit)) {
throw new UnsupportedOperationException(
"Cannot handle splits other than FileSplit for the moment. Current input split type: "
+ inputSplit.getClass().getSimpleName());
}
return (FileSplit) inputSplit;
}
/*
* This method generates the map of bucket to file splits.
*/
private Multimap<Integer, InputSplit> getBucketSplitMapForPath(
Map<String, Set<FileSplit>> pathFileSplitsMap) {
int bucketNum = 0;
Multimap<Integer, InputSplit> bucketToInitialSplitMap =
ArrayListMultimap.<Integer, InputSplit> create();
for (Map.Entry<String, Set<FileSplit>> entry : pathFileSplitsMap.entrySet()) {
int bucketId = bucketNum % numBuckets;
for (FileSplit fsplit : entry.getValue()) {
bucketToInitialSplitMap.put(bucketId, fsplit);
}
bucketNum++;
}
// This is only for the SMB join use-case. numBuckets equals the bucket count of the big
// table, and the small table may have fewer buckets. In that case we want to send the data
// from the right buckets to the big-table side: e.g. if the big table has 8 buckets and the
// small table has 4, bucket 0 of the small table needs to be sent to bucket 4 of the big
// table as well.
if (bucketNum < numBuckets) {
int loopedBucketId = 0;
for (; bucketNum < numBuckets; bucketNum++) {
for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) {
bucketToInitialSplitMap.put(bucketNum, fsplit);
}
loopedBucketId++;
}
}
return bucketToInitialSplitMap;
}
}