// co.cask.cdap.etl.batch.TransformExecutorFactory (Maven / Gradle / Ivy)
/*
* Copyright © 2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.batch;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.macro.MacroEvaluator;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.etl.api.StageLifecycle;
import co.cask.cdap.etl.api.StageMetrics;
import co.cask.cdap.etl.api.Transformation;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.common.DefaultStageMetrics;
import co.cask.cdap.etl.common.PipelinePhase;
import co.cask.cdap.etl.common.TrackedTransform;
import co.cask.cdap.etl.common.TransformDetail;
import co.cask.cdap.etl.common.TransformExecutor;
import co.cask.cdap.etl.planner.StageInfo;
import com.google.common.collect.Sets;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
/**
* Helps create {@link TransformExecutor TransformExecutors}.
*
* @param the type of input for the created transform executors
*/
public abstract class TransformExecutorFactory {
protected final Map> perStageInputSchemas;
private final String sourceStageName;
private final MacroEvaluator macroEvaluator;
protected final PipelinePluginInstantiator pluginInstantiator;
protected final Metrics metrics;
protected Schema outputSchema;
protected boolean isMapPhase;
public TransformExecutorFactory(JobContext hadoopContext, PipelinePluginInstantiator pluginInstantiator,
Metrics metrics, @Nullable String sourceStageName, MacroEvaluator macroEvaluator) {
this.pluginInstantiator = pluginInstantiator;
this.metrics = metrics;
this.perStageInputSchemas = new HashMap<>();
this.outputSchema = null;
this.sourceStageName = sourceStageName;
this.macroEvaluator = macroEvaluator;
this.isMapPhase = hadoopContext instanceof Mapper.Context;
}
protected abstract BatchRuntimeContext createRuntimeContext(String stageName);
protected TrackedTransform getTransformation(String pluginType, String stageName) throws Exception {
return new TrackedTransform(KVTransformations.getKVTransformation(stageName, pluginType,
isMapPhase,
getInitializedTransformation(stageName)),
new DefaultStageMetrics(metrics, stageName));
}
/**
* Create a transform executor for the specified pipeline. Will instantiate and initialize all sources,
* transforms, and sinks in the pipeline.
*
* @param pipeline the pipeline to create a transform executor for
* @return executor for the pipeline
* @throws InstantiationException if there was an error instantiating a plugin
* @throws Exception if there was an error initializing a plugin
*/
public TransformExecutor create(PipelinePhase pipeline) throws Exception {
Map transformations = new HashMap<>();
for (String pluginType : pipeline.getPluginTypes()) {
for (StageInfo stageInfo : pipeline.getStagesOfType(pluginType)) {
String stageName = stageInfo.getName();
outputSchema = stageInfo.getOutputSchema();
perStageInputSchemas.put(stageName, stageInfo.getInputSchemas());
// Wrap each transformation so that each stage is emitting stageName along with the record
transformations.put(stageName,
new TransformDetail(getTransformation(pluginType, stageName),
pipeline.getStageOutputs(stageName)));
}
}
// sourceStageName will be null in reducers, so need to handle that case
Set startingPoints = (sourceStageName == null) ? pipeline.getSources() : Sets.newHashSet(sourceStageName);
return new TransformExecutor<>(transformations, startingPoints);
}
/**
* Instantiates and initializes the plugin for the stage.
*
* @param stageName the stage name.
* @return the initialized Transformation
* @throws InstantiationException if the plugin for the stage could not be instantiated
* @throws Exception if there was a problem initializing the plugin
*/
protected > Transformation
getInitializedTransformation(String stageName) throws Exception {
BatchRuntimeContext runtimeContext = createRuntimeContext(stageName);
T plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
plugin.initialize(runtimeContext);
return plugin;
}
protected static TrackedTransform getTrackedEmitKeyStep(Transformation transform,
StageMetrics stageMetrics) {
return new TrackedTransform<>(transform, stageMetrics, TrackedTransform.RECORDS_IN, null);
}
protected static TrackedTransform getTrackedAggregateStep(Transformation transform,
StageMetrics stageMetrics) {
// 'aggregator.groups' is the number of groups output by the aggregator
return new TrackedTransform<>(transform, stageMetrics, "aggregator.groups", TrackedTransform.RECORDS_OUT);
}
protected static TrackedTransform getTrackedMergeStep(Transformation transform,
StageMetrics stageMetrics) {
return new TrackedTransform<>(transform, stageMetrics, null, TrackedTransform.RECORDS_OUT);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy