/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package co.cask.cdap.template.etl.batch;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.template.etl.api.Transform;
import co.cask.cdap.template.etl.api.Transformation;
import co.cask.cdap.template.etl.api.batch.BatchSink;
import co.cask.cdap.template.etl.api.batch.BatchSinkContext;
import co.cask.cdap.template.etl.api.batch.BatchSource;
import co.cask.cdap.template.etl.api.batch.BatchSourceContext;
import co.cask.cdap.template.etl.batch.config.ETLBatchConfig;
import co.cask.cdap.template.etl.common.Constants;
import co.cask.cdap.template.etl.common.Destroyables;
import co.cask.cdap.template.etl.common.ETLStage;
import co.cask.cdap.template.etl.common.StageMetrics;
import co.cask.cdap.template.etl.common.TransformExecutor;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.List;
import java.util.Map;

/**
* MapReduce driver for Batch ETL Adapters.
*/
public class ETLMapReduce extends AbstractMapReduce {
  private static final Logger LOG = LoggerFactory.getLogger(ETLMapReduce.class);
  private static final Gson GSON = new Gson();

  private BatchSource batchSource;
  private BatchSink batchSink;

  private String sourcePluginId;
  private String sinkPluginId;

  private Metrics mrMetrics;

  @Override
  public void configure() {
    setName(ETLMapReduce.class.getSimpleName());
    setDescription("MapReduce driver for Batch ETL Adapters");
  }
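
  // beforeSubmit validates that all required runtime arguments are present, deserializes the
  // ETLBatchConfig from the runtime arguments, prepares the source and sink plugins for the run,
  // and configures the Hadoop job as a map-only job (no reducers).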
  @Override
  public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    Map<String, String> runtimeArgs = context.getRuntimeArguments();
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.ADAPTER_NAME));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.CONFIG_KEY));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.Source.PLUGINID));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.Sink.PLUGINID));
    Preconditions.checkArgument(runtimeArgs.containsKey(Constants.Transform.PLUGINIDS));

    ETLBatchConfig etlBatchConfig = GSON.fromJson(runtimeArgs.get(Constants.CONFIG_KEY), ETLBatchConfig.class);
    prepareSource(context, etlBatchConfig.getSource());
    prepareSink(context, etlBatchConfig.getSink());

    if (etlBatchConfig.getResources() != null) {
      context.setMapperResources(etlBatchConfig.getResources());
    }
    job.setMapperClass(ETLMapper.class);
    job.setNumReduceTasks(0);
  }
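
  // Illustrative only (not from this file): the CONFIG_KEY runtime argument carries an
  // ETLBatchConfig serialized as JSON. Assuming each ETLStage is a (name, properties) pair,
  // the payload has roughly this shape; plugin names and properties are placeholders:
  //
  //   {
  //     "source":     { "name": "<source-plugin>",    "properties": { ... } },
  //     "transforms": [ { "name": "<transform-plugin>", "properties": { ... } } ],
  //     "sink":       { "name": "<sink-plugin>",      "properties": { ... } }
  //   }
  //
  // The exact fields are defined by ETLBatchConfig and ETLStage elsewhere in the template,
  // so treat this as a sketch.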

  private void prepareSource(MapReduceContext context, ETLStage sourceStage) throws Exception {
    sourcePluginId = context.getRuntimeArguments().get(Constants.Source.PLUGINID);
    batchSource = context.newPluginInstance(sourcePluginId);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics, sourcePluginId);
    LOG.info("Source Stage : {}", sourceStage);
    LOG.info("Source Class : {}", batchSource.getClass().getName());
    batchSource.prepareRun(sourceContext);
  }

  private void prepareSink(MapReduceContext context, ETLStage sinkStage) throws Exception {
    sinkPluginId = context.getRuntimeArguments().get(Constants.Sink.PLUGINID);
    batchSink = context.newPluginInstance(sinkPluginId);
    BatchSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics, sinkPluginId);
    LOG.info("Sink Stage : {}", sinkStage);
    LOG.info("Sink Class : {}", batchSink.getClass().getName());
    batchSink.prepareRun(sinkContext);
  }
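
  // onFinish gives both the source and the sink a chance to clean up after the run; a failure in
  // either callback is logged but does not prevent the other from running.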
  @Override
  public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
    onRunFinishSource(context, succeeded);
    onRunFinishSink(context, succeeded);
    LOG.info("Batch Run for Adapter {} : {}", context.getRuntimeArguments().get(Constants.ADAPTER_NAME), succeeded);
  }

  private void onRunFinishSource(MapReduceContext context, boolean succeeded) {
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics, sourcePluginId);
    LOG.info("On RunFinish Source : {}", batchSource.getClass().getName());
    try {
      batchSource.onRunFinish(succeeded, sourceContext);
    } catch (Throwable t) {
      LOG.warn("Exception when calling onRunFinish on {}", batchSource, t);
    }
  }

  private void onRunFinishSink(MapReduceContext context, boolean succeeded) {
    BatchSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics, sinkPluginId);
    LOG.info("On RunFinish Sink : {}", batchSink.getClass().getName());
    try {
      batchSink.onRunFinish(succeeded, sinkContext);
    } catch (Throwable t) {
      LOG.warn("Exception when calling onRunFinish on {}", batchSink, t);
    }
  }

  /**
   * Mapper Driver for ETL Transforms.
   */
  public static class ETLMapper extends Mapper implements ProgramLifecycle<MapReduceContext> {
    private static final Gson GSON = new Gson();
    private static final Type STRING_LIST_TYPE = new TypeToken<List<String>>() { }.getType();

    private List<Transform> transforms;
    private TransformExecutor<KeyValue, KeyValue> transformExecutor;
    private Metrics mapperMetrics;

    @Override
    public void initialize(MapReduceContext context) throws Exception {
      Map<String, String> runtimeArgs = context.getRuntimeArguments();
      ETLBatchConfig etlConfig = GSON.fromJson(runtimeArgs.get(Constants.CONFIG_KEY), ETLBatchConfig.class);
      String sourcePluginId = runtimeArgs.get(Constants.Source.PLUGINID);
      String sinkPluginId = runtimeArgs.get(Constants.Sink.PLUGINID);
      List<String> transformIds = GSON.fromJson(runtimeArgs.get(Constants.Transform.PLUGINIDS), STRING_LIST_TYPE);

      List<ETLStage> stageList = etlConfig.getTransforms();
      LOG.info("Transform Stages : {}", stageList);

      List<Transformation> pipeline = Lists.newArrayListWithCapacity(stageList.size() + 2);
      List<StageMetrics> stageMetrics = Lists.newArrayListWithCapacity(stageList.size() + 2);
      transforms = Lists.newArrayListWithCapacity(stageList.size());

      BatchSource source = context.newPluginInstance(sourcePluginId);
      BatchSourceContext batchSourceContext = new MapReduceSourceContext(context, mapperMetrics, sourcePluginId);
      source.initialize(batchSourceContext);
      pipeline.add(source);
      stageMetrics.add(new StageMetrics(mapperMetrics, StageMetrics.Type.SOURCE, etlConfig.getSource().getName()));

      addTransforms(stageList, pipeline, stageMetrics, transformIds, context);

      BatchSink sink = context.newPluginInstance(sinkPluginId);
      BatchSinkContext batchSinkContext = new MapReduceSinkContext(context, mapperMetrics, sinkPluginId);
      sink.initialize(batchSinkContext);
      pipeline.add(sink);
      stageMetrics.add(new StageMetrics(mapperMetrics, StageMetrics.Type.SINK, etlConfig.getSink().getName()));

      transformExecutor = new TransformExecutor<>(pipeline, stageMetrics);
    }
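
    // addTransforms instantiates each transform plugin by its plugin id, initializes it with a
    // per-stage context, and registers per-stage metrics; the stageConfigs and transformIds lists
    // are expected to be parallel (same length, same order).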
    private void addTransforms(List<ETLStage> stageConfigs, List<Transformation> pipeline,
                               List<StageMetrics> stageMetrics, List<String> transformIds,
                               MapReduceContext context) throws Exception {
      Preconditions.checkArgument(stageConfigs.size() == transformIds.size());
      for (int i = 0; i < stageConfigs.size(); i++) {
        ETLStage stageConfig = stageConfigs.get(i);
        String transformId = transformIds.get(i);
        Transform transform = context.newPluginInstance(transformId);
        BatchTransformContext transformContext = new BatchTransformContext(context, mapperMetrics, transformId);
        transform.initialize(transformContext);
        pipeline.add(transform);
        transforms.add(transform);
        stageMetrics.add(new StageMetrics(mapperMetrics, StageMetrics.Type.TRANSFORM, stageConfig.getName()));
      }
    }
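
    // Each input record is wrapped in a KeyValue, run through the source/transform/sink chain via
    // the TransformExecutor, and every KeyValue it emits is written out through the Hadoop context.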
    @Override
    public void map(Object key, Object value, Context context) throws IOException, InterruptedException {
      try {
        KeyValue input = new KeyValue(key, value);
        for (KeyValue output : transformExecutor.runOneIteration(input)) {
          context.write(output.getKey(), output.getValue());
        }
      } catch (Exception e) {
        LOG.error("Exception thrown in BatchDriver Mapper : {}", e);
        Throwables.propagate(e);
      }
    }

    @Override
    public void destroy() {
      // Both BatchSource and BatchSink implement Transform, hence they are inside the transformExecutor as well
      Destroyables.destroyQuietly(transformExecutor);
    }
  }
}
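
// Illustrative sketch (not part of this file): a CDAP MapReduce program like ETLMapReduce is
// registered from an application's configure() method, for example:
//
//   public class ETLBatchApp extends co.cask.cdap.api.app.AbstractApplication {
//     @Override
//     public void configure() {
//       addMapReduce(new ETLMapReduce());
//     }
//   }
//
// The class name ETLBatchApp is hypothetical; in this codebase the wiring is done by the batch
// ETL application template, which also supplies the runtime arguments consumed above.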