/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.batch;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.macro.MacroEvaluator;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.preview.DataTracer;
import co.cask.cdap.etl.api.ErrorTransform;
import co.cask.cdap.etl.api.StageLifecycle;
import co.cask.cdap.etl.api.StageMetrics;
import co.cask.cdap.etl.api.Transformation;
import co.cask.cdap.etl.api.batch.BatchJoiner;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.batch.mapreduce.ConnectorSourceEmitter;
import co.cask.cdap.etl.batch.mapreduce.ErrorOutputWriter;
import co.cask.cdap.etl.batch.mapreduce.OutputWriter;
import co.cask.cdap.etl.batch.mapreduce.PipeTransformExecutor;
import co.cask.cdap.etl.batch.mapreduce.SinkEmitter;
import co.cask.cdap.etl.batch.mapreduce.TransformEmitter;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.PipelinePhase;
import co.cask.cdap.etl.common.TrackedTransform;
import co.cask.cdap.etl.common.TransformExecutor;
import co.cask.cdap.etl.planner.StageInfo;
import com.google.common.collect.Sets;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
/**
* Helps create {@link TransformExecutor TransformExecutors}.
*
* @param <T> the type of input for the created transform executors
*/
public abstract class TransformExecutorFactory<T> {
protected final Map<String, Map<String, Schema>> perStageInputSchemas;
private final String sourceStageName;
private final MacroEvaluator macroEvaluator;
protected final PipelinePluginInstantiator pluginInstantiator;
protected final Metrics metrics;
protected final Map<String, Schema> outputSchemas;
protected boolean isMapPhase;
public TransformExecutorFactory(JobContext hadoopContext, PipelinePluginInstantiator pluginInstantiator,
Metrics metrics, @Nullable String sourceStageName, MacroEvaluator macroEvaluator) {
this.pluginInstantiator = pluginInstantiator;
this.metrics = metrics;
this.perStageInputSchemas = new HashMap<>();
this.outputSchemas = new HashMap<>();
this.sourceStageName = sourceStageName;
this.macroEvaluator = macroEvaluator;
this.isMapPhase = hadoopContext instanceof Mapper.Context;
}
protected abstract BatchRuntimeContext createRuntimeContext(StageInfo stageInfo);
protected abstract TrackedTransform getTransformation(StageInfo stageInfo) throws Exception;
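/*
 * The two hooks above are all a concrete factory has to supply: a per-stage runtime context and a
 * per-stage transformation. A minimal sketch, assuming a hypothetical mapper-side subclass (the class
 * name and the helpers marked "hypothetical" are illustrative, not part of this codebase):
 *
 *   class MapperTransformExecutorFactory extends TransformExecutorFactory<KeyValue<Object, Object>> {
 *     @Override
 *     protected BatchRuntimeContext createRuntimeContext(StageInfo stageInfo) {
 *       // context scoped to one stage: its runtime arguments, metrics and datasets
 *       return buildRuntimeContext(stageInfo);                                   // hypothetical helper
 *     }
 *
 *     @Override
 *     protected TrackedTransform getTransformation(StageInfo stageInfo) throws Exception {
 *       // instantiate and initialize the stage's plugin, then wrap it so records in and out are metered
 *       Transformation transform = instantiateAndInitialize(stageInfo);          // hypothetical helper
 *       StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageInfo.getName());
 *       return new TrackedTransform<>(transform, stageMetrics, getDataTracer(stageInfo));  // hypothetical tracer lookup
 *     }
 *   }
 */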
/**
* Create a transform executor for the specified pipeline. Will instantiate and initialize all sources,
* transforms, and sinks in the pipeline.
*
* @param pipeline the pipeline to create a transform executor for
* @return executor for the pipeline
* @throws InstantiationException if there was an error instantiating a plugin
* @throws Exception if there was an error initializing a plugin
*/
public PipeTransformExecutor<T> create(PipelinePhase pipeline, OutputWriter<Object, Object> outputWriter,
Map<String, ErrorOutputWriter<Object, Object>> transformErrorSinkMap) throws Exception {
Map<String, PipeTransformDetail> transformations = new HashMap<>();
Set<String> sources = pipeline.getSources();
// Set the input and output schemas for each stage in the phase
for (String pluginType : pipeline.getPluginTypes()) {
for (StageInfo stageInfo : pipeline.getStagesOfType(pluginType)) {
String stageName = stageInfo.getName();
outputSchemas.put(stageName, stageInfo.getOutputSchema());
perStageInputSchemas.put(stageName, stageInfo.getInputSchemas());
}
}
// recursively set PipeTransformDetail for all the stages
for (String source : sources) {
setPipeTransformDetail(pipeline, source, transformations, transformErrorSinkMap, outputWriter);
}
// sourceStageName will be null in reducers, so we need to handle that case
Set<String> startingPoints = (sourceStageName == null) ? pipeline.getSources() : Sets.newHashSet(sourceStageName);
return new PipeTransformExecutor<>(transformations, startingPoints);
}
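/*
 * Usage sketch (illustrative only; "factory", "phase" and "inputRecords" are hypothetical locals): a map
 * or reduce task builds the executor once and then pushes each input record through it. Per the
 * startingPoints logic above, a map task starts from the single named source stage, while a reduce task
 * (sourceStageName == null) starts from every source of the phase. The loop assumes the
 * TransformExecutor-style runOneIteration/destroy lifecycle:
 *
 *   PipeTransformExecutor<Object> executor = factory.create(phase, outputWriter, transformErrorSinkMap);
 *   for (Object record : inputRecords) {
 *     executor.runOneIteration(record);   // the record fans out through the wired PipeTransformDetails
 *   }
 *   executor.destroy();
 */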
private void setPipeTransformDetail(PipelinePhase pipeline, String stageName,
Map<String, PipeTransformDetail> transformations,
Map<String, ErrorOutputWriter<Object, Object>> transformErrorSinkMap,
OutputWriter<Object, Object> outputWriter) throws Exception {
if (pipeline.getSinks().contains(stageName)) {
StageInfo stageInfo = pipeline.getStage(stageName);
// If the stage at the end of the pipeline is a connector sink or a joiner, keep the stage name:
// the connector sink saves the stage name along with each record, and a joiner needs to know which
// stage each of its input records came from.
String pluginType = stageInfo.getPluginType();
boolean removeStageName = !(pluginType.equals(Constants.CONNECTOR_TYPE) ||
pluginType.equals(BatchJoiner.PLUGIN_TYPE));
boolean isErrorConsumer = pluginType.equals(ErrorTransform.PLUGIN_TYPE);
transformations.put(stageName, new PipeTransformDetail(stageName, removeStageName, isErrorConsumer,
getTransformation(stageInfo),
new SinkEmitter<>(stageName, outputWriter)));
return;
}
addTransformation(pipeline, stageName, transformations, transformErrorSinkMap);
for (String output : pipeline.getDag().getNodeOutputs(stageName)) {
setPipeTransformDetail(pipeline, output, transformations, transformErrorSinkMap, outputWriter);
transformations.get(stageName).addTransformation(output, transformations.get(output));
}
}
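/*
 * To illustrate the recursion (hypothetical pipeline, not from this file): for a phase
 * source1 -> parse -> sink1 entered at "source1", the calls unfold roughly as
 *
 *   setPipeTransformDetail("source1")
 *     addTransformation("source1")                        // source detail, emits to its outputs
 *     setPipeTransformDetail("parse")
 *       addTransformation("parse")
 *       setPipeTransformDetail("sink1")                    // sink branch: PipeTransformDetail + SinkEmitter, recursion stops
 *       "parse" detail links to the "sink1" detail
 *     "source1" detail links to the "parse" detail
 *
 * so sink details are created first, and every stage is then chained to the details of its downstream stages.
 */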
private void addTransformation(PipelinePhase pipeline, String stageName,
Map<String, PipeTransformDetail> transformations,
Map<String, ErrorOutputWriter<Object, Object>> transformErrorSinkMap) throws Exception {
StageInfo stageInfo = pipeline.getStage(stageName);
String pluginType = stageInfo.getPluginType();
ErrorOutputWriter<Object, Object> errorOutputWriter = transformErrorSinkMap.get(stageName);