/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.datapipeline;
import co.cask.cdap.api.app.ApplicationConfigurer;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.workflow.AbstractWorkflow;
import co.cask.cdap.api.workflow.WorkflowConfigurer;
import co.cask.cdap.api.workflow.WorkflowContext;
import co.cask.cdap.api.workflow.WorkflowForkConfigurer;
import co.cask.cdap.etl.api.LookupProvider;
import co.cask.cdap.etl.api.batch.BatchActionContext;
import co.cask.cdap.etl.api.batch.PostAction;
import co.cask.cdap.etl.api.batch.SparkCompute;
import co.cask.cdap.etl.api.batch.SparkSink;
import co.cask.cdap.etl.batch.ActionSpec;
import co.cask.cdap.etl.batch.BatchPhaseSpec;
import co.cask.cdap.etl.batch.BatchPipelineSpec;
import co.cask.cdap.etl.batch.WorkflowBackedActionContext;
import co.cask.cdap.etl.batch.connector.ConnectorSource;
import co.cask.cdap.etl.batch.mapreduce.ETLMapReduce;
import co.cask.cdap.etl.batch.spark.ETLSpark;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.DatasetContextLookupProvider;
import co.cask.cdap.etl.common.PipelinePhase;
import co.cask.cdap.etl.planner.ControlDag;
import co.cask.cdap.etl.planner.PipelinePlan;
import co.cask.cdap.etl.planner.StageInfo;
import co.cask.cdap.etl.proto.Engine;
import co.cask.cdap.internal.io.SchemaTypeAdapter;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
/**
 * Data Pipeline Smart Workflow: turns the planned pipeline phases into workflow nodes
 * (MapReduce or Spark programs, forked and joined according to the control dag) and runs
 * the pipeline's post-actions once all phases have finished.
 */
public class SmartWorkflow extends AbstractWorkflow {
public static final String NAME = "DataPipelineWorkflow";
public static final String DESCRIPTION = "Data Pipeline Workflow";
private static final Logger LOG = LoggerFactory.getLogger(SmartWorkflow.class);
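// Gson that understands CDAP's Schema type; used to write the pipeline spec into the workflow
// properties in configure() and to read it back in initialize().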
private static final Gson GSON = new GsonBuilder()
.registerTypeAdapter(Schema.class, new SchemaTypeAdapter()).create();
private final BatchPipelineSpec spec;
private final PipelinePlan plan;
private final ApplicationConfigurer applicationConfigurer;
// connector stage -> local dataset name
private final Map<String, String> connectorDatasets;
private final Engine engine;
private ControlDag dag;
private int phaseNum;
private Map<String, PostAction> postActions;
// injected by cdap
@SuppressWarnings("unused")
private Metrics workflowMetrics;
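// A usage sketch (an assumption about the calling application, not something this class enforces):
// the pipeline app is expected to run its planner over the pipeline spec and then register this
// workflow from the app's own configure() method, along the lines of:
//
//   BatchPipelineSpec spec = ...;   // built from the user's pipeline configuration
//   PipelinePlan plan = ...;        // produced by the pipeline planner
//   addWorkflow(new SmartWorkflow(spec, plan, getConfigurer(), Engine.MAPREDUCE));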
public SmartWorkflow(BatchPipelineSpec spec,
PipelinePlan plan,
ApplicationConfigurer applicationConfigurer,
Engine engine) {
this.spec = spec;
this.plan = plan;
this.applicationConfigurer = applicationConfigurer;
this.phaseNum = 1;
this.connectorDatasets = new HashMap<>();
this.engine = engine;
}
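// Translates the pipeline plan into the workflow topology: a single phase is added directly,
// multiple phases without connections are run in parallel inside one fork/join, and a connected
// plan is flattened into a control dag and walked from its (single) source node.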
@Override
protected void configure() {
setName(NAME);
setDescription(DESCRIPTION);
// set the pipeline spec as a property in case somebody like the UI wants to read it
Map<String, String> properties = new HashMap<>();
properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
setProperties(properties);
// single phase, just add the program directly
if (plan.getPhases().size() == 1) {
addProgram(plan.getPhases().keySet().iterator().next(), new TrunkProgramAdder(getConfigurer()));
return;
}
// Dag classes don't allow a 'dag' without connections
if (plan.getPhaseConnections().isEmpty()) {
WorkflowProgramAdder programAdder;
// multiple phases, do a fork then join
WorkflowForkConfigurer<? extends WorkflowConfigurer> forkConfigurer = getConfigurer().fork();
programAdder = new BranchProgramAdder(forkConfigurer);
for (String phaseName : plan.getPhases().keySet()) {
addProgram(phaseName, programAdder);
}
if (forkConfigurer != null) {
forkConfigurer.join();
}
return;
}
dag = new ControlDag(plan.getPhaseConnections());
// after flattening, there is guaranteed to be just one source
dag.flatten();
String start = dag.getSources().iterator().next();
addPrograms(start, getConfigurer());
}
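// For illustration only (hypothetical phase names): a flattened plan with connections
//   p1 -> p2, p1 -> p3, p2 -> p4, p3 -> p4
// is added above as, roughly, p1; fork(p2 | p3); join; p4, with each phase backed by an
// ETLMapReduce or ETLSpark program that addProgram() registers under the name "phase-<n>".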
@Override
public void initialize(WorkflowContext context) throws Exception {
super.initialize(context);
postActions = new LinkedHashMap<>();
BatchPipelineSpec batchPipelineSpec =
GSON.fromJson(context.getWorkflowSpecification().getProperty(Constants.PIPELINE_SPEC_KEY),
BatchPipelineSpec.class);
for (ActionSpec actionSpec : batchPipelineSpec.getEndingActions()) {
postActions.put(actionSpec.getName(), (PostAction) context.newPluginInstance(actionSpec.getName()));
}
}
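// Runs after the workflow finishes: each configured PostAction (the pipeline's "ending actions")
// is executed with a WorkflowBackedActionContext. A failure in one action is logged but does not
// prevent the remaining actions from running.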
@Override
public void destroy() {
WorkflowContext workflowContext = getContext();
LookupProvider lookupProvider = new DatasetContextLookupProvider(workflowContext);
Map<String, String> runtimeArgs = workflowContext.getRuntimeArguments();
long logicalStartTime = workflowContext.getLogicalStartTime();
for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
String name = endingActionEntry.getKey();
PostAction action = endingActionEntry.getValue();
BatchActionContext context = new WorkflowBackedActionContext(workflowContext, workflowMetrics, lookupProvider,
name, logicalStartTime, runtimeArgs);
try {
action.run(context);
} catch (Throwable t) {
LOG.error("Error while running ending action {}.", name, t);
}
}
}
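// Walks the flattened control dag starting at 'node', adding each phase to the workflow trunk.
// When a node has more than one output, a fork is opened and each branch is handed off to
// addBranchPrograms(), which joins the fork once every branch has reached the common join node.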
private void addPrograms(String node, WorkflowConfigurer configurer) {
addProgram(node, new TrunkProgramAdder(configurer));
Set<String> outputs = dag.getNodeOutputs(node);
if (outputs.isEmpty()) {
return;
}
// if this is a fork
if (outputs.size() > 1) {
WorkflowForkConfigurer<? extends WorkflowConfigurer> forkConfigurer = configurer.fork();
for (String output : outputs) {
// need to make sure we don't call also() if this is the final branch
if (!addBranchPrograms(output, forkConfigurer)) {
forkConfigurer = forkConfigurer.also();
}
}
} else {
addPrograms(outputs.iterator().next(), configurer);
}
}
// returns whether this is the final branch of the fork
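// A branch counts as final when dag.visit(node) - presumably the running count of how many times
// the join node has been visited - equals the number of inputs to that join node, i.e. every
// branch of the fork has reached it and it is safe to call join().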
private boolean addBranchPrograms(String node, WorkflowForkConfigurer<? extends WorkflowConfigurer> forkConfigurer) {
// if this is a join node
Set<String> inputs = dag.getNodeInputs(node);
if (dag.getNodeInputs(node).size() > 1) {
// if we've reached the join from the final branch of the fork
if (dag.visit(node) == inputs.size()) {
// join the fork and continue on
addPrograms(node, forkConfigurer.join());
return true;
} else {
return false;
}
}
// a flattened control dag guarantees that if this is not a join node, there is exactly one output
addProgram(node, new BranchProgramAdder(forkConfigurer));
return addBranchPrograms(dag.getNodeOutputs(node).iterator().next(), forkConfigurer);
}
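// Registers the CDAP program that will execute a single phase and adds it to the workflow through
// the given adder. Any connector stages in the phase are backed by local datasets named with a
// random UUID; a dataset is created once per connector and reused by every phase that shares it.
// Phases containing Spark plugins always run as ETLSpark; everything else follows the configured engine.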
private void addProgram(String phaseName, WorkflowProgramAdder programAdder) {
PipelinePhase phase = plan.getPhase(phaseName);
// if the origin plan didn't have this name, it means it is a join node
// artificially added by the control dag flattening process. So nothing to add, skip it
if (phase == null) {
return;
}
// can't use phase name as a program name because it might contain invalid characters
String programName = "phase-" + phaseNum;
phaseNum++;
// TODO: add support for other program types based on the plugin types in the phase
// if this phase uses connectors, add the local dataset for that connector if we haven't already
for (StageInfo connectorInfo : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
String connectorName = connectorInfo.getName();
String datasetName = connectorDatasets.get(connectorName);
if (datasetName == null) {
datasetName = UUID.randomUUID().toString();
connectorDatasets.put(connectorName, datasetName);
// add the local dataset
ConnectorSource connectorSource = new ConnectorSource(datasetName, null);
connectorSource.configure(getConfigurer());
}
}
Map<String, String> phaseConnectorDatasets = new HashMap<>();
for (StageInfo connectorStage : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
phaseConnectorDatasets.put(connectorStage.getName(), connectorDatasets.get(connectorStage.getName()));
}
BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(programName, phase, spec.getResources(),
spec.isStageLoggingEnabled(), phaseConnectorDatasets);
boolean hasSparkCompute = !batchPhaseSpec.getPhase().getStagesOfType(SparkCompute.PLUGIN_TYPE).isEmpty();
boolean hasSparkSink = !batchPhaseSpec.getPhase().getStagesOfType(SparkSink.PLUGIN_TYPE).isEmpty();
if (hasSparkCompute || hasSparkSink) {
// always use ETLSpark if this phase has a spark plugin
applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
programAdder.addSpark(programName);
} else {
switch (engine) {
case MAPREDUCE:
applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec));
programAdder.addMapReduce(programName);
break;
case SPARK:
applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
programAdder.addSpark(programName);
break;
default:
// should never happen
throw new IllegalStateException("Unsupported engine " + engine);
}
}
}
}