net.achalaggarwal.arbiter.workflow.WorkflowGraphBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of arbiter Show documentation
Show all versions of arbiter Show documentation
Utility for generating Oozie workflows
/*
* Copyright 2015-2016 Etsy
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* -----------------------------------------------------------------------
*
* This file has been modified from its original licensed form.
* Modifications are Copyright (C) 2016 Achal Aggarwal (achalaggarwal.net).
*/
package net.achalaggarwal.arbiter.workflow;
import net.achalaggarwal.arbiter.Action;
import net.achalaggarwal.arbiter.Workflow;
import net.achalaggarwal.arbiter.config.Config;
import net.achalaggarwal.arbiter.exception.WorkflowGraphException;
import net.achalaggarwal.arbiter.util.GraphvizGenerator;
import net.achalaggarwal.arbiter.util.NamedArgumentInterpolator;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
import org.jgrapht.Graphs;
import org.jgrapht.alg.ConnectivityInspector;
import org.jgrapht.experimental.dag.DirectedAcyclicGraph;
import org.jgrapht.graph.DefaultEdge;
import java.util.*;
/**
* Takes a workflow graph specified by YAML and builds a graph in an Oozie-friendly format
* This includes inserting start/end nodes, fork/join pairs, etc
*
* @author Andrew Johnson
*/
public class WorkflowGraphBuilder {
private WorkflowGraphBuilder() { }
// Every fork/join pair needs a unique name
// To keep the names short, we just number them sequentially
private static int forkCount = 0;
/**
* Build a workflow graph from the workflow definition, inserting fork/join pairs as appropriate for parallel
*
* @param workflow Arbiter Workflow object
* @param config Arbiter Config object
* @param outputFileAbsolutePath Output file path without extension for Graphviz graphs
* @param generateGraphviz Indicate if Graphviz graphs should be generated for workflows
* @param graphvizFormat The format in which Graphviz graphs should be generated if enabled
* @return DirectedAcyclicGraph DAG of the workflow
* @throws WorkflowGraphException
*/
public static DirectedAcyclicGraph buildWorkflowGraph(Workflow workflow, Config config, String outputFileAbsolutePath, boolean generateGraphviz, String graphvizFormat) throws WorkflowGraphException {
forkCount = 0;
Map actionsByName = new HashMap<>();
List workflowActions = workflow.getActions();
// Add all the actions to a map of string -> action
for (Action a : workflowActions) {
actionsByName.put(a.getName(), a);
}
// DAG of the workflow in its raw un-optimized state.
DirectedAcyclicGraph inputGraph = new DirectedAcyclicGraph<>(DefaultEdge.class);
// Add all the actions as vertices. At this point there are no connections within the graph, just vertices.
for (Action a : workflowActions) {
inputGraph.addVertex(a);
}
// We need to traverse a second time so all the vertices are present when adding edges
for (Action a : workflowActions) {
if (a.getDependencies() != null) {
for (String d : a.getDependencies()) {
Action source = actionsByName.get(d);
if (source == null) {
throw new WorkflowGraphException("Missing action for dependency " + d);
}
// Add the edge between the dep and the action
try {
inputGraph.addDagEdge(source, a);
} catch (DirectedAcyclicGraph.CycleFoundException e) {
throw new WorkflowGraphException("Cycle found while building original graph", e);
}
}
}
}
if (generateGraphviz) {
GraphvizGenerator.generateGraphviz(inputGraph, outputFileAbsolutePath + "-input.dot", graphvizFormat);
}
// Final DAG we will be returning
DirectedAcyclicGraph workflowGraph;
Action startTransitionNode;
Action endTransitionNode;
try {
// Process the graph into its properly connected and organized structure.
Triple, Action, Action> workflowGraphTriple = processSubcomponents(inputGraph);
workflowGraph = workflowGraphTriple.getLeft();
startTransitionNode = workflowGraphTriple.getMiddle();
endTransitionNode = workflowGraphTriple.getRight();
// These are the standard control flow nodes that must be present in every workflow
Action start = new Action();
start.setName("start");
start.setType("start");
workflowGraph.addVertex(start);
workflowGraph.addDagEdge(start, startTransitionNode);
Action end = new Action();
end.setName("end");
end.setType("end");
workflowGraph.addVertex(end);
if (workflow.getErrorHandler() != null) {
workflowGraph.addVertex(workflow.getErrorHandler());
workflowGraph.addDagEdge(workflow.getErrorHandler(), end);
workflowGraph.addDagEdge(endTransitionNode, workflow.getErrorHandler());
} else {
workflowGraph.addDagEdge(endTransitionNode, end);
}
// The kill node will be used as the error transition when generating the XML as appropriate
// These is no need to add any edges to it now
if (config.getKillMessage() != null && config.getKillName() != null) {
Action kill = new Action();
kill.setType("kill");
kill.setName(config.getKillName());
kill.setProperty("message", NamedArgumentInterpolator.interpolate(config.getKillMessage(), ImmutableMap.of("name", workflow.getName()), null));
workflowGraph.addVertex(kill);
}
} catch (DirectedAcyclicGraph.CycleFoundException e) {
throw new WorkflowGraphException("Cycle found while generating workflow", e);
}
return workflowGraph;
}
/**
* Recursively insert fork/joins for connected subcomponents of a graph
*
* @param vertices The set of vertices to process
* @param parentGraph The parentGraph graph of these vertices
* @return DirectedAcyclicGraph A new graph containing all the given vertices with appropriate fork/join pairs inserted
* @throws WorkflowGraphException
* @throws DirectedAcyclicGraph.CycleFoundException
*/
private static DirectedAcyclicGraph buildComponentGraph(Set vertices, DirectedAcyclicGraph parentGraph) throws WorkflowGraphException, DirectedAcyclicGraph.CycleFoundException {
DirectedAcyclicGraph subgraph = buildSubgraph(parentGraph, vertices);
// Start by pulling out the vertices with no incoming edges
// These can run in parallel in a fork-join
Set initialNodes = new HashSet<>();
for (Action vertex : subgraph.vertexSet()) {
if (subgraph.inDegreeOf(vertex) == 0) {
initialNodes.add(vertex);
}
}
DirectedAcyclicGraph result = new DirectedAcyclicGraph<>(DefaultEdge.class);
if (initialNodes.isEmpty()) {
// This is a very odd case, but just in case we'll fail if it happens
throw new WorkflowGraphException("No nodes with inDegree = 0 found. This shouldn't happen.");
} else if (initialNodes.size() == 1) {
// If there is only one node, we can't put it in a fork/join
// In this case we'll add just that vertex to the resulting graph
Action vertex = initialNodes.iterator().next();
result.addVertex(vertex);
// Remove the processed vertex so that we have new unprocessed subcomponents
subgraph.removeVertex(vertex);
} else {
// If there are multiple nodes, insert a fork/join pair to run them in parallel
Pair forkJoin = addForkJoin(result);
Action fork = forkJoin.getLeft();
Action join = forkJoin.getRight();
for (Action vertex : initialNodes) {
result.addVertex(vertex);
result.addDagEdge(fork, vertex);
result.addDagEdge(vertex, join);
// Remove the processed vertex so that we have new unprocessed subcomponents
subgraph.removeVertex(vertex);
}
}
// Now recursively process the graph with the processed nodes removed
Triple, Action, Action> subComponentGraphTriple = processSubcomponents(subgraph);
DirectedAcyclicGraph subComponentGraph = subComponentGraphTriple.getLeft();
// Having processed the subcomponents, we attach the "last" node of the graph created here to
// the "first" node of the subcomponent graph
Action noIncoming = subComponentGraphTriple.getMiddle();
Action noOutgoing = null;
for (Action vertex : result.vertexSet()) {
if (noOutgoing == null && result.outDegreeOf(vertex) == 0) {
noOutgoing = vertex;
}
}
Graphs.addGraph(result, subComponentGraph);
if (noOutgoing != null && noIncoming != null && !noOutgoing.equals(noIncoming)) {
result.addDagEdge(noOutgoing, noIncoming);
}
return result;
}
/**
* Processes all connected subcomponents of a given graph
*
* @param parentGraph The graph for which to process subcomponents
* @return A Triple with these elements - A new graph with fork/join pairs inserted, the "first" node in this graph, and the "last" node in this graph
* @throws WorkflowGraphException
* @throws DirectedAcyclicGraph.CycleFoundException
*/
private static Triple, Action, Action> processSubcomponents(DirectedAcyclicGraph parentGraph) throws WorkflowGraphException, DirectedAcyclicGraph.CycleFoundException {
ConnectivityInspector inspector = new ConnectivityInspector<>(parentGraph);
List> connectedComponents = inspector.connectedSets();
// Recursively process each connected subcomponent of the graph
List> componentGraphs = new ArrayList<>(connectedComponents.size());
for (Set subComponent : connectedComponents) {
componentGraphs.add(buildComponentGraph(subComponent, parentGraph));
}
DirectedAcyclicGraph result = new DirectedAcyclicGraph<>(DefaultEdge.class);
for (DirectedAcyclicGraph subSubgraph : componentGraphs) {
Graphs.addGraph(result, subSubgraph);
}
// If we have more than one subcomponent, we must insert a fork/join to run them in parallel
if (componentGraphs.size() > 1) {
Pair forkJoin = addForkJoin(result);
Action fork = forkJoin.getLeft();
Action join = forkJoin.getRight();
for (DirectedAcyclicGraph subSubgraph : componentGraphs) {
for (Action vertex : subSubgraph.vertexSet()) {
// Vertices with no incoming edges attach directly to the fork
if (subSubgraph.inDegreeOf(vertex) == 0) {
result.addDagEdge(fork, vertex);
}
// Vertices with no outgoing edges attach directly to the join
if (subSubgraph.outDegreeOf(vertex) == 0) {
result.addDagEdge(vertex, join);
}
}
}
}
// The graph will now have one node with no outgoing edges and one node with no incoming edges
// The node with no outgoing edges is the "last" node in the resulting graph
// The node with no incoming edges is the "first" node in the resulting graph
// These are pulled out specifically to make it easier to attach the resulting graph into another one
Action noOutgoing = null;
Action noIncoming = null;
for (Action vertex : result.vertexSet()) {
if (noIncoming == null && result.inDegreeOf(vertex) == 0) {
noIncoming = vertex;
}
}
for (Action vertex : result.vertexSet()) {
if (noOutgoing == null && result.outDegreeOf(vertex) == 0) {
noOutgoing = vertex;
}
}
return Triple.of(result, noIncoming, noOutgoing);
}
/**
* Build a subgraph of a parentGraph graph given a set of vertices
* This is a new object and not a view on the parentGraph graph
*
* @param parentGraph The parentGraph component for the new subgraph to start at
* @param vertices The set of actions (vertices) making up the subgraph
* @return DirectedAcyclicGraph A new graph containing only the given vertices
*/
private static DirectedAcyclicGraph buildSubgraph(DirectedAcyclicGraph parentGraph, Set vertices) throws DirectedAcyclicGraph.CycleFoundException {
// Create a new DAG to serve as the subgraph
DirectedAcyclicGraph subgraph = new DirectedAcyclicGraph<>(DefaultEdge.class);
// Add all the vertices for this subgraph
for (Action vertex : vertices) {
subgraph.addVertex(vertex);
}
// All vertices must exist in the graph before any edges can be added
for (Action vertex : vertices) {
// Grab the parentGraph's edges between the parentGraph and this vertex and then add an edge in the subgraph to
// match what the parentGraph has.
for (DefaultEdge edge : parentGraph.edgesOf(vertex)) {
subgraph.addDagEdge(parentGraph.getEdgeSource(edge), parentGraph.getEdgeTarget(edge), edge);
}
}
return subgraph;
}
/**
* Create a fork/join pair and add it to a graph
*
* @param parentGraph The graph to which to add the fork/join actions
* @return A Pair of actions. The left action is the fork and the right action is the join
*/
private static Pair addForkJoin(DirectedAcyclicGraph parentGraph) {
Action fork = new Action();
fork.setName("fork-" + forkCount);
fork.setType("fork");
Action join = new Action();
join.setName("join-" + forkCount);
join.setType("join");
forkCount++;
parentGraph.addVertex(fork);
parentGraph.addVertex(join);
return Pair.of(fork, join);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy