org.apache.tez.dag.api.DAG Maven / Gradle / Ivy
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.tez.dag.api;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.Objects;
import org.apache.commons.collections4.BidiMap;
import org.apache.commons.collections4.bidimap.DualLinkedHashBidiMap;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.tez.client.CallerContext;
import org.apache.tez.common.JavaOptsChecker;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.Vertex.VertexExecutionContext;
import org.apache.tez.dag.api.records.DAGProtos;
import org.apache.tez.serviceplugins.api.ServicePluginsDescriptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.client.TezClientUtils;
import org.apache.tez.common.security.DAGAccessControls;
import org.apache.tez.common.TezCommonUtils;
import org.apache.tez.common.TezYARNUtils;
import org.apache.tez.dag.api.EdgeProperty.DataMovementType;
import org.apache.tez.dag.api.EdgeProperty.DataSourceType;
import org.apache.tez.dag.api.EdgeProperty.SchedulingType;
import org.apache.tez.dag.api.VertexGroup.GroupInfo;
import org.apache.tez.dag.api.records.DAGProtos.ConfigurationProto;
import org.apache.tez.dag.api.records.DAGProtos.DAGPlan;
import org.apache.tez.dag.api.records.DAGProtos.EdgePlan;
import org.apache.tez.dag.api.records.DAGProtos.PlanGroupInputEdgeInfo;
import org.apache.tez.dag.api.records.DAGProtos.PlanKeyValuePair;
import org.apache.tez.dag.api.records.DAGProtos.PlanTaskConfiguration;
import org.apache.tez.dag.api.records.DAGProtos.PlanTaskLocationHint;
import org.apache.tez.dag.api.records.DAGProtos.PlanVertexGroupInfo;
import org.apache.tez.dag.api.records.DAGProtos.PlanVertexType;
import org.apache.tez.dag.api.records.DAGProtos.VertexPlan;
import com.google.common.annotations.VisibleForTesting;
import org.apache.tez.common.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
* Top level entity that defines the DAG (Directed Acyclic Graph) representing
* the data flow graph. Consists of a set of Vertices and Edges connecting the
* vertices. Vertices represent transformations of data and edges represent
* movement of data between vertices.
public class DAG {
private static final Logger LOG = LoggerFactory.getLogger(DAG.class);

// Vertices of this DAG keyed by vertex name. A BidiMap is used so that lookups by
// value (containsValue in addEdge) are supported alongside lookups by name.
final BidiMap<String, Vertex> vertices =
    new DualLinkedHashBidiMap<String, Vertex>();
// Edges registered through addEdge.
final Set<Edge> edges = Sets.newHashSet();
final String name;
// URIs for which credentials are required (see addURIsForCredentials).
final Collection<URI> urisForCredentials = new HashSet<URI>();
Credentials credentials = new Credentials();
Set<VertexGroup> vertexGroups = Sets.newHashSet();
Set<GroupInputEdge> groupInputEdges = Sets.newHashSet();
private DAGAccessControls dagAccessControls;
// Local resources common to the tasks of this DAG, keyed by resource name.
Map<String, LocalResource> commonTaskLocalFiles = Maps.newHashMap();
String dagInfo;
CallerContext callerContext;
// DAG-scoped configuration overrides recorded by setConf.
private Map<String, String> dagConf = new HashMap<String, String>();
private VertexExecutionContext defaultExecutionContext;
/**
 * Private constructor; callers obtain instances through {@code create(String)}.
 *
 * @param name the name of the DAG
 */
private DAG(String name) {
  this.name = name;
}
* Create a DAG with the specified name.
* @param name the name of the DAG
* @return a new {@link DAG} with the given name
public static DAG create(String name) {
return new DAG(name);
* Set the files etc that must be provided to the tasks of this DAG
* @param localFiles
* files that must be available locally for each task. These files
* may be regular files, archives etc. as specified by the value
* elements of the map.
* @return {@link DAG}
public synchronized DAG addTaskLocalFiles(Map localFiles) {
TezCommonUtils.addAdditionalLocalResources(localFiles, commonTaskLocalFiles, "DAG " + getName());
return this;
public synchronized DAG addVertex(Vertex vertex) {
if (vertices.containsKey(vertex.getName())) {
throw new IllegalStateException(
"Vertex " + vertex.getName() + " already defined!");
vertices.put(vertex.getName(), vertex);
return this;
public synchronized Vertex getVertex(String vertexName) {
return vertices.get(vertexName);
* One of the methods that can be used to provide information about required
* Credentials when running on a secure cluster. A combination of this and
* addURIsForCredentials should be used to specify information about all
* credentials required by a DAG. AM specific credentials are not used when
* executing a DAG.
* Set credentials which will be required to run this dag. This method can be
* used if the client has already obtained some or all of the required
* credentials.
* @param credentials Credentials for the DAG
* @return {@link DAG}
public synchronized DAG setCredentials(Credentials credentials) {
this.credentials = credentials;
return this;
* Set description info for this DAG that can be used for visualization purposes.
* @param dagInfo JSON blob as a serialized string.
* Recognized keys by the UI are:
* "context" - The application context in which this DAG is being used.
* For example, this could be set to "Hive" or "Pig" if
* this is being run as part of a Hive or Pig script.
* "description" - General description on what this DAG is going to do.
* In the case of Hive, this could be the SQL query text.
* @return {@link DAG}
public synchronized DAG setDAGInfo(String dagInfo) {
this.dagInfo = dagInfo;
return this;
* Set the Context in which Tez is being called.
* @param callerContext Caller Context
* @return {@link DAG}
public synchronized DAG setCallerContext(CallerContext callerContext) {
this.callerContext = callerContext;
return this;
* Create a group of vertices that share a common output. This can be used to implement
* unions efficiently.
* @param name Name of the group.
* @param members {@link Vertex} members of the group
* @return the newly created {@link VertexGroup}
public synchronized VertexGroup createVertexGroup(String name, Vertex... members) {
// vertex group name should be unique.
VertexGroup uv = new VertexGroup(name, members);
if (!vertexGroups.add(uv)) {
throw new IllegalStateException(
"VertexGroup " + name + " already defined!");
return uv;
public synchronized Credentials getCredentials() {
return this.credentials;
* Set access controls for the DAG, i.e. which users/groups can view the DAG progress/history
* and who can modify the DAG (for example, kill it).
* The owner of the Tez Session and the user submitting the DAG are super-users and have access
* to all operations on the DAG.
* @param accessControls Access Controls
* @return {@link DAG}
public synchronized DAG setAccessControls(DAGAccessControls accessControls) {
this.dagAccessControls = accessControls;
return this;
public synchronized DAGAccessControls getDagAccessControls() {
return dagAccessControls;
* One of the methods that can be used to provide information about required
* Credentials when running on a secure cluster. A combination of this and
* setCredentials should be used to specify information about all credentials
* required by a DAG. AM specific credentials are not used when executing a
* DAG.
* This method can be used to specify a list of URIs for which Credentials
* need to be obtained so that the job can run. An incremental list of URIs
* can be provided by making multiple calls to the method.
* Currently, {@link Credentials} can only be fetched for HDFS and other
* {@link org.apache.hadoop.fs.FileSystem} implementations that support
* credentials.
* @param uris
* a list of {@link URI}s
* @return {@link DAG}
public synchronized DAG addURIsForCredentials(Collection uris) {
Objects.requireNonNull(uris, "URIs cannot be null");
return this;
* @return an unmodifiable list representing the URIs for which credentials
* are required.
public synchronized Collection getURIsForCredentials() {
return Collections.unmodifiableCollection(urisForCredentials);
public synchronized Set getVertices() {
return Collections.unmodifiableSet(this.vertices.values());
* Add an {@link Edge} connecting vertices in the DAG
* @param edge The edge to be added
* @return {@link DAG}
public synchronized DAG addEdge(Edge edge) {
// Sanity checks
if (!vertices.containsValue(edge.getInputVertex())) {
throw new IllegalArgumentException(
"Input vertex " + edge.getInputVertex() + " doesn't exist!");
if (!vertices.containsValue(edge.getOutputVertex())) {
throw new IllegalArgumentException(
"Output vertex " + edge.getOutputVertex() + " doesn't exist!");
if (edges.contains(edge)) {
throw new IllegalArgumentException(
"Edge " + edge + " already defined!");
// inform the vertices
edge.getInputVertex().addOutputVertex(edge.getOutputVertex(), edge);
edge.getOutputVertex().addInputVertex(edge.getInputVertex(), edge);
return this;
* Add a {@link GroupInputEdge} to the DAG.
* @param edge {@link GroupInputEdge}
* @return {@link DAG}
public synchronized DAG addEdge(GroupInputEdge edge) {
// Sanity checks
if (!vertexGroups.contains(edge.getInputVertexGroup())) {
throw new IllegalArgumentException(
"Input vertex " + edge.getInputVertexGroup() + " doesn't exist!");
if (!vertices.containsValue(edge.getOutputVertex())) {
throw new IllegalArgumentException(
"Output vertex " + edge.getOutputVertex() + " doesn't exist!");
if (groupInputEdges.contains(edge)) {
throw new IllegalArgumentException(
"GroupInputEdge " + edge + " already defined!");
VertexGroup av = edge.getInputVertexGroup();
av.addOutputVertex(edge.getOutputVertex(), edge);
// add new edge between members of VertexGroup and destVertex of the GroupInputEdge
List newEdges = Lists.newLinkedList();
Vertex dstVertex = edge.getOutputVertex();
VertexGroup uv = edge.getInputVertexGroup();
for (Vertex member : uv.getMembers()) {
newEdges.add(Edge.create(member, dstVertex, edge.getEdgeProperty()));
dstVertex.addGroupInput(uv.getGroupName(), uv.getGroupInfo());
for (Edge e : newEdges) {
return this;
* Get the DAG name
* @return DAG name
public String getName() {
return this.name;
* This is currently used to setup additional configuration parameters which will be available
* in the DAG configuration used in the AppMaster. This API would be used for properties which
* are used by the Tez framework while executing the DAG. As an example, the number of attempts
* for a task.
* A DAG inherits its base properties from the ApplicationMaster within which it's running. This
* method allows for these properties to be overridden.
* Currently, properties which are used by the task runtime, such as the task to AM
* heartbeat interval, cannot be changed using this method.
* Note: This API does not add any configuration to runtime components such as InputInitializers,
* OutputCommitters, Inputs and Outputs.
* @param property the property name
* @param value the value for the property
* @return the current DAG being constructed
public DAG setConf(String property, String value) {
TezConfiguration.validateProperty(property, Scope.DAG);
dagConf.put(property, value);
return this;
* Set history log level for this DAG. This config overrides the default or one set at the session
* level.
* @param historyLogLevel The ATS history log level for this DAG.
* @return this DAG
public DAG setHistoryLogLevel(HistoryLogLevel historyLogLevel) {
return this.setConf(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL, historyLogLevel.name());
* Sets the default execution context for the DAG. This can be overridden at a per Vertex level.
* See {@link org.apache.tez.dag.api.Vertex#setExecutionContext(VertexExecutionContext)}
* @param vertexExecutionContext the default execution context for the DAG
* @return this DAG
public synchronized DAG setExecutionContext(VertexExecutionContext vertexExecutionContext) {
this.defaultExecutionContext = vertexExecutionContext;
return this;
VertexExecutionContext getDefaultExecutionContext() {
return this.defaultExecutionContext;
public Map getDagConf() {
return dagConf;
public Map getTaskLocalFiles() {
return commonTaskLocalFiles;
// Infers vertex parallelism across ONE_TO_ONE edges (per the inline comments and the log
// message below), checks that 1-1 connected vertices agree, and finally rejects any vertex
// left at -1 parallelism that has no visible way of being configured at runtime.
// Throws TezUncheckedException for a mismatched 1-1 edge and IllegalStateException for an
// unresolvable vertex.
// NOTE(review): in this view closing braces are absent and several branches have no visible
// body; the added comments describe only the statements that are present.
void checkAndInferOneToOneParallelism() {
// infer all 1-1 via dependencies
// collect all 1-1 edges where the source parallelism is set
Set newKnownTasksVertices = Sets.newHashSet();
for (Vertex vertex : vertices.values()) {
if (vertex.getParallelism() > -1) {
// NOTE(review): no statement is visible for this branch; presumably the vertex is recorded
// in newKnownTasksVertices - confirm.
// walk through all known source 1-1 edges and infer parallelism
// add newly inferred vertices for consideration as known sources
// the outer loop will run for every new level of inferring the parallelism
// however, the entire logic will process each vertex only once
while(!newKnownTasksVertices.isEmpty()) {
// work on a snapshot of the current frontier
Set knownTasksVertices = Sets.newHashSet(newKnownTasksVertices);
for (Vertex v : knownTasksVertices) {
for (Edge e : v.getOutputEdges()) {
if (e.getEdgeProperty().getDataMovementType() == DataMovementType.ONE_TO_ONE) {
Vertex outVertex = e.getOutputVertex();
if (outVertex.getParallelism() == -1) {
LOG.info("Inferring parallelism for vertex: "
+ outVertex.getName() + " to be " + v.getParallelism()
+ " from 1-1 connection with vertex " + v.getName());
// NOTE(review): the statement that applies the inferred parallelism to outVertex is not
// visible in this view - confirm.
// check for inconsistency and errors
for (Edge e : edges) {
Vertex inputVertex = e.getInputVertex();
Vertex outputVertex = e.getOutputVertex();
if (e.getEdgeProperty().getDataMovementType() == DataMovementType.ONE_TO_ONE) {
if (inputVertex.getParallelism() != outputVertex.getParallelism()) {
// both should be equal or equal to -1.
// a destination still at -1 is tolerated; only an explicit mismatch is an error
if (outputVertex.getParallelism() != -1) {
throw new TezUncheckedException(
"1-1 Edge. Destination vertex parallelism must match source vertex. "
+ "Vertex: " + inputVertex.getName() + " does not match vertex: "
+ outputVertex.getName());
// check the vertices with -1 parallelism, currently only 3 cases are allowed to have -1 parallelism.
// It is OK not using topological order to check vertices here.
// 1. has input initializers
// 2. 1-1 uninited sources
// 3. has custom vertex manager
for (Vertex vertex : vertices.values()) {
if (vertex.getParallelism() == -1) {
// case 1: look for a data source that carries an input initializer
boolean hasInputInitializer = false;
if (vertex.getDataSources() != null && !vertex.getDataSources().isEmpty()) {
for (DataSourceDescriptor ds : vertex.getDataSources()) {
if (ds.getInputInitializerDescriptor() != null) {
hasInputInitializer = true;
if (hasInputInitializer) {
} else {
// Account for the case where the vertex has a data source with a determined number of
// shards e.g. splits calculated on the client and not in the AM
// In this case, vertex parallelism is setup later using the data source's numShards
// and as a result, an initializer is not needed.
if (vertex.getDataSources() != null
&& vertex.getDataSources().size() == 1
&& vertex.getDataSources().get(0).getNumberOfShards() > -1) {
// case 2: look for a source vertex that is itself still at -1
// NOTE(review): no check of the edge type is visible here, although the case is described
// as "1-1 uninited sources" - confirm.
boolean has1to1UninitedSources = false;
if (vertex.getInputVertices()!= null && !vertex.getInputVertices().isEmpty()) {
for (Vertex srcVertex : vertex.getInputVertices()) {
if (srcVertex.getParallelism() == -1) {
has1to1UninitedSources = true;
if (has1to1UninitedSources) {
// case 3: a custom vertex manager plugin is configured
if (vertex.getVertexManagerPlugin() != null) {
// none of the allowed cases applied
throw new IllegalStateException(vertex.getName() +
" has -1 tasks but does not have input initializers, " +
"1-1 uninited sources or custom vertex manager to set it at runtime");
// AnnotatedVertex is used by verify()
private static class AnnotatedVertex {
Vertex v;
int index; //for Tarjan's algorithm
int lowlink; //for Tarjan's algorithm
boolean onstack; //for Tarjan's algorithm
private AnnotatedVertex(Vertex v) {
this.v = v;
index = -1;
lowlink = -1;
// verify()
// Default rules
// Illegal:
// - duplicate vertex id
// - cycles
// Ok:
// - orphaned vertex. Occurs in map-only
// - islands. Occurs if job has unrelated workflows.
// Not yet categorized:
// - orphaned vertex in DAG of >1 vertex. Could be unrelated map-only job.
// - v1->v2 via two edges. perhaps some self-join job would use this?
// "restricted" mode:
// In short term, the supported DAGs are limited. Call with restricted=true for these verifications.
// Illegal:
// - any vertex with more than one input or output edge. (n-ary input, n-ary merge)
void verify() throws IllegalStateException {
Deque verify(boolean restricted) throws IllegalStateException {
if (vertices.isEmpty()) {
throw new IllegalStateException("Invalid dag containing 0 vertices");
// check for valid vertices, duplicate vertex names,
// and prepare for cycle detection
Map vertexMap = new HashMap();
Map> inboundVertexMap = new HashMap>();
Map> outboundVertexMap = new HashMap>();
for (Vertex v : vertices.values()) {
if (vertexMap.containsKey(v.getName())) {
throw new IllegalStateException("DAG contains multiple vertices"
+ " with name: " + v.getName());
vertexMap.put(v.getName(), new AnnotatedVertex(v));
Map> edgeMap = new HashMap>();
for (Edge e : edges) {
// Construct structure for cycle detection
Vertex inputVertex = e.getInputVertex();
Vertex outputVertex = e.getOutputVertex();
List edgeList = edgeMap.get(inputVertex);
if (edgeList == null) {
edgeList = new ArrayList();
edgeMap.put(inputVertex, edgeList);
// Construct map for Input name verification
Set inboundSet = inboundVertexMap.get(outputVertex);
if (inboundSet == null) {
inboundSet = new HashSet();
inboundVertexMap.put(outputVertex, inboundSet);
// Construct map for Output name verification
Set outboundSet = outboundVertexMap.get(inputVertex);
if (outboundSet == null) {
outboundSet = new HashSet();
outboundVertexMap.put(inputVertex, outboundSet);
// check input and output names don't collide with vertex names
for (Vertex vertex : vertices.values()) {
for (RootInputLeafOutput
input : vertex.getInputs()) {
if (vertexMap.containsKey(input.getName())) {
throw new IllegalStateException("Vertex: "
+ vertex.getName()
+ " contains an Input with the same name as vertex: "
+ input.getName());
for (RootInputLeafOutput
output : vertex.getOutputs()) {
if (vertexMap.containsKey(output.getName())) {
throw new IllegalStateException("Vertex: "
+ vertex.getName()
+ " contains an Output with the same name as vertex: "
+ output.getName());
// Check for valid InputNames
for (Entry> entry : inboundVertexMap.entrySet()) {
Vertex vertex = entry.getKey();
for (RootInputLeafOutput
input : vertex.getInputs()) {
if (entry.getValue().contains(input.getName())) {
throw new IllegalStateException("Vertex: "
+ vertex.getName()
+ " contains an incoming vertex and Input with the same name: "
+ input.getName());
// Check for valid OutputNames
for (Entry> entry : outboundVertexMap.entrySet()) {
Vertex vertex = entry.getKey();
for (RootInputLeafOutput
output : vertex.getOutputs()) {
if (entry.getValue().contains(output.getName())) {
throw new IllegalStateException("Vertex: "
+ vertex.getName()
+ " contains an outgoing vertex and Output with the same name: "
+ output.getName());
// Not checking for repeated input names / output names vertex names on the same vertex,
// since we only allow 1 at the moment.
// When additional inputs are supported, this can be chceked easily (and early)
// within the addInput / addOutput call itself.
Deque topologicalVertexStack = detectCycles(edgeMap, vertexMap);
if (restricted) {
for (Edge e : edges) {
DataSourceType dataSourceType = e.getEdgeProperty().getDataSourceType();
if (dataSourceType != DataSourceType.PERSISTED &&
dataSourceType != DataSourceType.EPHEMERAL) {
throw new IllegalStateException(
"Unsupported source type on edge. " + e);
// check for conflicts between dag level local resource and vertex level local resource
return topologicalVertexStack;
void verifyLocalResources(Configuration tezConf) {
for (Vertex v : vertices.values()) {
for (Map.Entry localResource : v
.getTaskLocalFiles().entrySet()) {
String resourceName = localResource.getKey();
LocalResource resource = localResource.getValue();
if (commonTaskLocalFiles.containsKey(resourceName)
&& !commonTaskLocalFiles.get(resourceName).equals(resource)) {
// Different for some reason. Compare size, and then eventually hash
try {
LocalResource commonLr = commonTaskLocalFiles.get(resourceName);
if (resource.getSize() != commonLr.getSize()) {
throw new IllegalStateException(
"There is conflicting local resource (size mismatch) (" +
+ ") between dag local resource and vertex " +
v.getName() + " local resource. "
+ "\nResource of dag : " +
+ "\nResource of vertex: " + resource);
Path vertexResourcePath =
Path commonResourcePath = ConverterUtils.getPathFromYarnURL(
byte[] vertexResourceSha = TezClientUtils
.getResourceSha(vertexResourcePath.toUri(), tezConf);
byte[] commonResourceSha = TezClientUtils
.getResourceSha(commonResourcePath.toUri(), tezConf);
if (!Arrays.equals(vertexResourceSha, commonResourceSha)) {
throw new IllegalStateException(
"There is conflicting local resource (sha mismatch) (" +
+ ") between dag local resource and vertex " +
v.getName() + " local resource. "
+ "\nResource of dag : " +
+ "\nResource of vertex: " + resource);
} catch (URISyntaxException | IOException e) {
throw new RuntimeException(
"Failed while attempting to validate sha for conflicting resources (" +
+ ") between dag local resource and vertex " + v.getName() +
" local resource. "
+ "\nResource of dag : " +
+ "\nResource of vertex: " + resource);
// Adaptation of Tarjan's algorithm for connected components.
// http://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
private Deque detectCycles(Map> edgeMap,
Map vertexMap)
throws IllegalStateException {
Deque topologicalVertexStack = new LinkedList();
Integer nextIndex = 0; // boxed integer so it is passed by reference.
Stack stack = new Stack();
for (AnnotatedVertex av : vertexMap.values()) {
if (av.index == -1) {
assert stack.empty();
strongConnect(av, vertexMap, edgeMap, stack, nextIndex, topologicalVertexStack);
return topologicalVertexStack;
// part of Tarjan's algorithm for connected components.
// http://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
private void strongConnect(
AnnotatedVertex av,
Map vertexMap,
Map> edgeMap,
Stack stack,
Integer nextIndex,
Deque topologicalVertexStack) throws IllegalStateException {
av.index = nextIndex;
av.lowlink = nextIndex;
av.onstack = true;
List edges = edgeMap.get(av.v);
if (edges != null) {
for (Edge e : edgeMap.get(av.v)) {
AnnotatedVertex outVertex = vertexMap.get(e.getOutputVertex().getName());
if (outVertex.index == -1) {
strongConnect(outVertex, vertexMap, edgeMap, stack, nextIndex, topologicalVertexStack);
av.lowlink = Math.min(av.lowlink, outVertex.lowlink);
} else if (outVertex.onstack) {
// strongly connected component detected, but we will wait till later so that the full cycle can be displayed.
// update lowlink in case outputVertex should be considered the root of this component.
av.lowlink = Math.min(av.lowlink, outVertex.index);
if (av.lowlink == av.index) {
AnnotatedVertex pop = stack.pop();
pop.onstack = false;
if (pop != av) {
// there was something on the stack other than this "av".
// this indicates there is a scc/cycle. It comprises all nodes from top of stack to "av"
StringBuilder message = new StringBuilder();
message.append(av.v.getName()).append(" <- ");
for (; pop != av; pop = stack.pop()) {
message.append(pop.v.getName()).append(" <- ");
pop.onstack = false;
throw new IllegalStateException("DAG contains a cycle: " + message);
} else {
// detect self-cycle
if (edgeMap.containsKey(pop.v)) {
for (Edge edge : edgeMap.get(pop.v)) {
if (edge.getOutputVertex().equals(pop.v)) {
throw new IllegalStateException("DAG contains a self-cycle on vertex:" + pop.v.getName());
// create protobuf message describing DAG
public DAGPlan createDag(Configuration tezConf, Credentials extraCredentials,
Map tezJarResources, LocalResource binaryConfig,
boolean tezLrsAsArchive) {
return createDag(tezConf, extraCredentials, tezJarResources, binaryConfig, tezLrsAsArchive,
null, null);
// create protobuf message describing DAG
/**
 * Verifies this DAG (restricted mode) and builds the {@link DAGPlan} protobuf message
 * describing it.
 * NOTE(review): in this view closing braces are absent and many branches and loops have no
 * visible body (most statements that populate the builders are not shown). The comments
 * below describe only the statements that are present.
 *
 * @param tezConf configuration consulted for default task resources, task launch options,
 *        the task environment and the history log level
 * @param extraCredentials additional credentials; checked against null before use
 * @param tezJarResources local resources added to every vertex; checked against null
 * @param binaryConfig resource stored under {@code TezConstants.TEZ_PB_BINARY_CONF_NAME} in
 *        each vertex's local resources; checked against null
 * @param tezLrsAsArchive NOTE(review): not referenced by any statement visible in this view
 * @param servicePluginsDescriptor used to validate the DAG-level and per-vertex execution
 *        contexts
 * @param javaOptsChecker NOTE(review): not referenced by any statement visible in this view
 * @return the built {@link DAGPlan}
 * @throws IllegalStateException if DAG verification or execution-context validation fails
 * @throws IllegalArgumentException if the history log level is set to an invalid value
 * @throws TezUncheckedException for invalid task launch options or task-based affinity hints
 */
public synchronized DAGPlan createDag(Configuration tezConf, Credentials extraCredentials,
Map tezJarResources, LocalResource binaryConfig,
boolean tezLrsAsArchive, ServicePluginsDescriptor servicePluginsDescriptor,
JavaOptsChecker javaOptsChecker) {
// verify(true) both validates the DAG and yields the vertex ordering consumed further below
Deque topologicalVertexStack = verify(true);
DAGPlan.Builder dagBuilder = DAGPlan.newBuilder();
if (this.callerContext != null) {
if (this.dagInfo != null && !this.dagInfo.isEmpty()) {
// Setup default execution context.
VertexExecutionContext defaultContext = getDefaultExecutionContext();
verifyExecutionContext(defaultContext, servicePluginsDescriptor, "DAGDefault");
if (defaultContext != null) {
DAGProtos.VertexExecutionContextProto contextProto = DagTypeConverters.convertToProto(
// vertex groups
if (!vertexGroups.isEmpty()) {
for (VertexGroup av : vertexGroups) {
GroupInfo groupInfo = av.getGroupInfo();
PlanVertexGroupInfo.Builder groupBuilder = PlanVertexGroupInfo.newBuilder();
for (Vertex v : groupInfo.getMembers()) {
for (Map.Entry entry :
groupInfo.edgeMergedInputs.entrySet()) {
// credentials collected for the DAG
Credentials dagCredentials = new Credentials();
if (extraCredentials != null) {
if (!commonTaskLocalFiles.isEmpty()) {
// every vertex must appear in the ordering returned by verify
Preconditions.checkArgument(topologicalVertexStack.size() == vertices.size(),
"size of topologicalVertexStack is:" + topologicalVertexStack.size() +
" while size of vertices is:" + vertices.size() +
", make sure they are the same in order to sort the vertices");
// process vertices in the order in which they are popped from the stack
while(!topologicalVertexStack.isEmpty()) {
Vertex vertex = vertices.get(topologicalVertexStack.pop());
// infer credentials, resources and parallelism from data source
Resource vertexTaskResource = vertex.getTaskResource();
if (vertexTaskResource == null) {
// fall back to values read from tezConf
vertexTaskResource = Resource.newInstance(tezConf.getInt(
TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB_DEFAULT), tezConf.getInt(
// local resources for this vertex: data-source files, tez jars and the binary config
Map vertexLRs = Maps.newHashMap();
List dataSources = vertex.getDataSources();
for (DataSourceDescriptor dataSource : dataSources) {
if (dataSource.getCredentials() != null) {
if (dataSource.getAdditionalLocalFiles() != null) {
.addAdditionalLocalResources(dataSource.getAdditionalLocalFiles(), vertexLRs,
"Vertex " + vertex.getName());
if (tezJarResources != null) {
.addAdditionalLocalResources(tezJarResources, vertexLRs, "Vertex " + vertex.getName());
if (binaryConfig != null) {
vertexLRs.put(TezConstants.TEZ_PB_BINARY_CONF_NAME, binaryConfig);
int vertexParallelism = vertex.getParallelism();
VertexLocationHint vertexLocationHint = vertex.getLocationHint();
// with exactly one data source, its shard count and location hint fill in values the
// vertex itself left unset
if (dataSources.size() == 1) {
DataSourceDescriptor dataSource = dataSources.get(0);
if (vertexParallelism == -1 && dataSource.getNumberOfShards() > -1) {
vertexParallelism = dataSource.getNumberOfShards();
if (vertexLocationHint == null && dataSource.getLocationHint() != null) {
vertexLocationHint = dataSource.getLocationHint();
// a location hint requires a known parallelism, and must have one entry per task
if (vertexParallelism == -1) {
Preconditions.checkState(vertexLocationHint == null,
"Cannot specify vertex location hint without specifying vertex parallelism. Vertex: "
+ vertex.getName());
} else if (vertexLocationHint != null) {
Preconditions.checkState(vertexParallelism == vertexLocationHint.getTaskLocationHints().size(),
"vertex task location hint must equal vertex parallelism. Vertex: " + vertex.getName());
for (DataSinkDescriptor dataSink : vertex.getDataSinks()) {
if (dataSink.getCredentials() != null) {
VertexPlan.Builder vertexBuilder = VertexPlan.newBuilder();
vertexBuilder.setType(PlanVertexType.NORMAL); // vertex type is implicitly NORMAL until TEZ-46.
// Vertex ExecutionContext setup
VertexExecutionContext execContext = vertex.getVertexExecutionContext();
verifyExecutionContext(execContext, servicePluginsDescriptor, vertex.getName());
if (execContext != null) {
DAGProtos.VertexExecutionContextProto contextProto =
// End of VertexExecutionContext setup.
if (vertex.getInputs().size() > 0) {
for (RootInputLeafOutput input : vertex.getInputs()) {
if (vertex.getOutputs().size() > 0) {
for (RootInputLeafOutput output : vertex.getOutputs()) {
// vertex-level configuration
if (vertex.getConf()!= null && vertex.getConf().size() > 0) {
ConfigurationProto.Builder confBuilder = ConfigurationProto.newBuilder();
TezUtils.populateConfProtoFromEntries(vertex.getConf().entrySet(), confBuilder);
//task config
PlanTaskConfiguration.Builder taskConfigBuilder = PlanTaskConfiguration.newBuilder();
try {
TezClientUtils.addDefaultsToTaskLaunchCmdOpts(vertex.getTaskLaunchCmdOpts(), tezConf,
} catch (TezException e) {
throw new TezUncheckedException("Invalid TaskLaunchCmdOpts defined for Vertex "
+ vertex.getName() + " : " + e.getMessage(), e);
if (!vertexLRs.isEmpty()) {
// task environment: starts from a copy of the vertex's own settings
Map taskEnv = Maps.newHashMap(vertex.getTaskEnvironment());
TezYARNUtils.setupDefaultEnv(taskEnv, tezConf,
for (Map.Entry entry : taskEnv.entrySet()) {
PlanKeyValuePair.Builder envSettingBuilder = PlanKeyValuePair.newBuilder();
// task location hints
if (vertexLocationHint != null) {
if (vertexLocationHint.getTaskLocationHints() != null) {
for (TaskLocationHint hint : vertexLocationHint.getTaskLocationHints()) {
PlanTaskLocationHint.Builder taskLocationHintBuilder = PlanTaskLocationHint.newBuilder();
// we can allow this later on if needed
if (hint.getAffinitizedTask() != null) {
throw new TezUncheckedException(
"Task based affinity may not be specified via the DAG API");
if (hint.getHosts() != null) {
if (hint.getRacks() != null) {
if (vertex.getVertexManagerPlugin() != null) {
for (Edge inEdge : vertex.getInputEdges()) {
for (Edge outEdge : vertex.getOutputEdges()) {
// edges
for (Edge edge : edges) {
EdgePlan.Builder edgeBuilder = EdgePlan.newBuilder();
if (edge.getEdgeProperty().getDataMovementType() == DataMovementType.CUSTOM) {
if (edge.getEdgeProperty().getEdgeManagerDescriptor() != null) {
} // else the AM will deal with this.
if (dagAccessControls != null) {
// DAG-level configuration
ConfigurationProto.Builder confProtoBuilder = ConfigurationProto.newBuilder();
if (!this.dagConf.isEmpty()) {
TezUtils.populateConfProtoFromEntries(this.dagConf.entrySet(), confProtoBuilder);
// Copy historyLogLevel from tezConf into dagConf if its not overridden in dagConf.
String logLevel = this.dagConf.get(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
if (logLevel != null) {
// The config is from dagConf, we have already added it to the proto above, just check if
// the value is valid.
if (!HistoryLogLevel.validateLogLevel(logLevel)) {
throw new IllegalArgumentException(
"Config: " + TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL +
" is set to invalid value: " + logLevel);
} else {
// Validate and set value from tezConf.
logLevel = tezConf.get(TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL);
if (logLevel != null) {
if (!HistoryLogLevel.validateLogLevel(logLevel)) {
throw new IllegalArgumentException(
"Config: " + TezConfiguration.TEZ_HISTORY_LOGGING_LOGLEVEL +
" is set to invalid value: " + logLevel);
PlanKeyValuePair.Builder kvp = PlanKeyValuePair.newBuilder();
// NOTE(review): dagCredentials is assigned a new instance above and not reassigned in the
// visible code, so this null check looks always-true - confirm.
if (dagCredentials != null) {
TezCommonUtils.logCredentials(LOG, dagCredentials, "dag");
return dagBuilder.build();
/**
 * Checks that an execution context only asks for facilities that the service plugins
 * descriptor provides: container execution, in-AM execution, and any named task scheduler,
 * container launcher or task communicator.
 * NOTE(review): closing braces are absent in this view; the first null check presumably
 * guards the whole body, so a null execution context passes without any checks - confirm.
 *
 * @param executionContext the execution context to validate; may be null
 * @param servicePluginsDescriptor the configured service plugins; when null, container and
 *        AM execution are treated as disabled and named entities are treated as not found
 * @param context label identifying the owner of the execution context in error messages
 * @throws IllegalStateException if the execution context asks for something that is
 *         disabled in, or missing from, the service plugins descriptor
 */
private void verifyExecutionContext(VertexExecutionContext executionContext,
ServicePluginsDescriptor servicePluginsDescriptor,
String context) {
if (executionContext != null) {
// container execution must be enabled in the descriptor
if (executionContext.shouldExecuteInContainers()) {
if (servicePluginsDescriptor == null || !servicePluginsDescriptor.areContainersEnabled()) {
throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context +
" specifies container execution but this is disabled in the ServicePluginDescriptor");
// in-AM execution must be enabled in the descriptor
if (executionContext.shouldExecuteInAm()) {
if (servicePluginsDescriptor == null || !servicePluginsDescriptor.isUberEnabled()) {
throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context +
" specifies AM execution but this is disabled in the ServicePluginDescriptor");
// a named task scheduler must exist among the descriptor's entities
// NOTE(review): the second argument of each checkNamedEntityExists call below is not
// visible in this view.
if (executionContext.getTaskSchedulerName() != null) {
boolean found = false;
if (servicePluginsDescriptor != null) {
found = checkNamedEntityExists(executionContext.getTaskSchedulerName(),
if (!found) {
throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context +
" specifies task scheduler as " + executionContext.getTaskSchedulerName() +
" which is not part of the ServicePluginDescriptor");
// a named container launcher must exist among the descriptor's entities
if (executionContext.getContainerLauncherName() != null) {
boolean found = false;
if (servicePluginsDescriptor != null) {
found = checkNamedEntityExists(executionContext.getContainerLauncherName(),
if (!found) {
throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context +
" specifies container launcher as " + executionContext.getContainerLauncherName() +
" which is not part of the ServicePluginDescriptor");
// a named task communicator must exist among the descriptor's entities
if (executionContext.getTaskCommName() != null) {
boolean found = false;
if (servicePluginsDescriptor != null) {
found = checkNamedEntityExists(executionContext.getTaskCommName(),
if (!found) {
throw new IllegalStateException("Invalid configuration. ExecutionContext for " + context +
" specifies task communicator as " + executionContext.getTaskCommName() +
" which is not part of the ServicePluginDescriptor");
private boolean checkNamedEntityExists(String expected, NamedEntityDescriptor[] namedEntities) {
if (namedEntities == null) {
return false;
for (NamedEntityDescriptor named : namedEntities) {
if (named.getEntityName().equals(expected)) {
return true;
return false;
public synchronized CallerContext getCallerContext() {
return this.callerContext;
© 2015 - 2025 Weber Informatics LLC | Privacy Policy