/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.batch;
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.etl.api.InvalidEntry;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.batch.BatchConfigurable;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.etl.batch.config.ETLBatchConfig;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.DefaultEmitter;
import co.cask.cdap.etl.common.Destroyables;
import co.cask.cdap.etl.common.Pipeline;
import co.cask.cdap.etl.common.PipelineRegisterer;
import co.cask.cdap.etl.common.PluginID;
import co.cask.cdap.etl.common.SinkInfo;
import co.cask.cdap.etl.common.StageMetrics;
import co.cask.cdap.etl.common.StructuredRecordStringConverter;
import co.cask.cdap.etl.common.TransformDetail;
import co.cask.cdap.etl.common.TransformExecutor;
import co.cask.cdap.etl.common.TransformInfo;
import co.cask.cdap.etl.common.TransformResponse;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
/**
* MapReduce Driver for ETL Batch Applications.
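* Registers the pipeline's source, transform, and sink plugins at configure time,
* prepares each of them before the MapReduce job is submitted, and runs the transform
* chain inside {@link ETLMapper} as a map-only job.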
*/
public class ETLMapReduce extends AbstractMapReduce {
public static final String NAME = ETLMapReduce.class.getSimpleName();
private static final Logger LOG = LoggerFactory.getLogger(ETLMapReduce.class);
private static final String SINK_OUTPUTS_KEY = "cdap.etl.sink.outputs";
private static final Type SINK_OUTPUTS_TYPE = new TypeToken<List<SinkOutput>>() { }.getType();
private static final Type SINK_INFO_TYPE = new TypeToken<List<SinkInfo>>() { }.getType();
private static final Type TRANSFORMINFO_LIST_TYPE = new TypeToken<List<TransformInfo>>() { }.getType();
private static final Gson GSON = new Gson();
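// Schema for records written to an error dataset: a numeric error code, an optional
// error message, and the invalid entry itself, serialized as a string.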
@VisibleForTesting
static final Schema ERROR_SCHEMA = Schema.recordOf(
"error",
Schema.Field.of(Constants.ErrorDataset.ERRCODE, Schema.of(Schema.Type.INT)),
Schema.Field.of(Constants.ErrorDataset.ERRMSG, Schema.unionOf(Schema.of(Schema.Type.STRING),
Schema.of(Schema.Type.NULL))),
Schema.Field.of(Constants.ErrorDataset.INVALIDENTRY, Schema.of(Schema.Type.STRING)));
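// The same schema parsed into its Avro form; error records are written out as Avro
// GenericRecords using this schema (see the avro.schema.literal table property below).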
private static final org.apache.avro.Schema AVRO_ERROR_SCHEMA =
new org.apache.avro.Schema.Parser().parse(ERROR_SCHEMA.toString());
private BatchConfigurable<BatchSourceContext> batchSource;
private List<BatchConfigurable<BatchSinkContext>> batchSinks;
// injected by CDAP
@SuppressWarnings("unused")
private Metrics mrMetrics;
// this is only visible at configure time, not at runtime
private final ETLBatchConfig config;
public ETLMapReduce(ETLBatchConfig config) {
this.config = config;
}
@Override
public void configure() {
setName(NAME);
setDescription("MapReduce Driver for ETL Batch Applications");
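// registers all plugins in the pipeline and sets up their datasets; the dataset class
// and properties given here describe error datasets: TimePartitionedFileSets that
// store Avro records with ERROR_SCHEMA and are queryable through Explore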
PipelineRegisterer pipelineRegisterer = new PipelineRegisterer(getConfigurer(), "batch");
Pipeline pipelineIds =
pipelineRegisterer.registerPlugins(
config, TimePartitionedFileSet.class,
FileSetProperties.builder()
.setInputFormat(AvroKeyInputFormat.class)
.setOutputFormat(AvroKeyOutputFormat.class)
.setEnableExploreOnCreate(true)
.setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
.setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
.setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
.setTableProperty("avro.schema.literal", ERROR_SCHEMA.toString())
.build(), true);
if (config.getResources() != null) {
setMapperResources(config.getResources());
}
// add source, sink, transform ids to the properties. These are needed at runtime to instantiate the plugins
Map<String, String> properties = new HashMap<>();
properties.put(Constants.Source.PLUGINID, pipelineIds.getSource());
properties.put(Constants.Sink.PLUGINIDS, GSON.toJson(pipelineIds.getSinks()));
properties.put(Constants.Transform.PLUGINIDS, GSON.toJson(pipelineIds.getTransforms()));
setProperties(properties);
}
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
Job job = context.getHadoopJob();
Map<String, String> properties = context.getSpecification().getProperties();
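// instantiate the source plugin and give it a chance to prepare the run, for example
// to configure the job's input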
String sourcePluginId = properties.get(Constants.Source.PLUGINID);
batchSource = context.newPluginInstance(sourcePluginId);
BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics, sourcePluginId);
batchSource.prepareRun(sourceContext);
String transformInfosStr = properties.get(Constants.Transform.PLUGINIDS);
Preconditions.checkNotNull(transformInfosStr, "Transform plugin ids not found in program properties.");
List<TransformInfo> transformInfos = GSON.fromJson(transformInfosStr, TRANSFORMINFO_LIST_TYPE);
// setup time partition for each error dataset
for (TransformInfo transformInfo : transformInfos) {
if (transformInfo.getErrorDatasetName() != null) {
addPropertiesToErrorDataset(transformInfo.getErrorDatasetName(), context);
}
}
List<SinkOutput> sinkOutputs = new ArrayList<>();
String sinkPluginIdsStr = properties.get(Constants.Sink.PLUGINIDS);
// should never happen
Preconditions.checkNotNull(sinkPluginIdsStr, "Sink plugin ids could not be found in program properties.");
List<SinkInfo> sinkInfos = GSON.fromJson(sinkPluginIdsStr, SINK_INFO_TYPE);
batchSinks = Lists.newArrayListWithCapacity(sinkInfos.size());
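// instantiate and prepare each sink, recording which job outputs belong to which sink
// (and its error dataset, if any) so the mapper can route records to the right outputs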
for (SinkInfo sinkInfo : sinkInfos) {
BatchConfigurable<BatchSinkContext> batchSink = context.newPluginInstance(sinkInfo.getSinkId());
MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics, sinkInfo.getSinkId());
batchSink.prepareRun(sinkContext);
batchSinks.add(batchSink);
sinkOutputs.add(new SinkOutput(sinkInfo.getSinkId(), sinkContext.getOutputNames(),
sinkInfo.getErrorDatasetName()));
if (sinkInfo.getErrorDatasetName() != null) {
addPropertiesToErrorDataset(sinkInfo.getErrorDatasetName(), context);
}
}
job.getConfiguration().set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
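// map-only job: the entire source -> transforms -> sinks pipeline runs in the mapper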
job.setMapperClass(ETLMapper.class);
job.setNumReduceTasks(0);
}
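/**
* Registers the given error dataset as a job output, using the Avro error schema as the
* output key schema and the run's logical start time as the output partition time.
*/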
private void addPropertiesToErrorDataset(String errorDatasetName, MapReduceContext context) {
Map<String, String> args = new HashMap<>();
args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key", ERROR_SCHEMA.toString());
TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
context.addOutput(errorDatasetName, args);
}
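// called by the platform once the job completes; lets the source and each sink react
// to success or failure (an exception from one plugin is logged and does not block the others)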
@Override
public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
onRunFinishSource(context, succeeded);
onRunFinishSink(context, succeeded);
LOG.info("Batch Run finished : succeeded = {}", succeeded);
}
private void onRunFinishSource(MapReduceContext context, boolean succeeded) {
String sourcePluginId = context.getSpecification().getProperty(Constants.Source.PLUGINID);
BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics, sourcePluginId);
LOG.info("On RunFinish Source : {}", batchSource.getClass().getName());
try {
batchSource.onRunFinish(succeeded, sourceContext);
} catch (Throwable t) {
LOG.warn("Exception when calling onRunFinish on {}", batchSource, t);
}
}
private void onRunFinishSink(MapReduceContext context, boolean succeeded) {
String sinkPluginIdsStr = context.getSpecification().getProperty(Constants.Sink.PLUGINIDS);
// should never happen
Preconditions.checkNotNull(sinkPluginIdsStr, "Sink plugin ids could not be found in program properties.");
List<SinkInfo> sinkInfos = GSON.fromJson(sinkPluginIdsStr, SINK_INFO_TYPE);
for (int i = 0; i < sinkInfos.size(); i++) {
BatchConfigurable<BatchSinkContext> batchSink = batchSinks.get(i);
String sinkPluginId = sinkInfos.get(i).getSinkId();
BatchSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics, sinkPluginId);
try {
batchSink.onRunFinish(succeeded, sinkContext);
} catch (Throwable t) {
LOG.warn("Exception when calling onRunFinish on {}", batchSink, t);
}
}
}
/**
* Mapper Driver for ETL Transforms.
*/
public static class ETLMapper extends Mapper implements ProgramLifecycle<MapReduceTaskContext<Object, Object>> {
private static final Logger LOG = LoggerFactory.getLogger(ETLMapper.class);
private static final Gson GSON = new Gson();
private static final Type TRANSFORMDETAILS_LIST_TYPE = new TypeToken<List<TransformInfo>>() { }.getType();
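// plugin ids of transforms that have no error dataset configured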
private Set<String> transformsWithoutErrorDataset;
private TransformExecutor<KeyValue<Object, Object>> transformExecutor;
// injected by CDAP
@SuppressWarnings("unused")
private Metrics mapperMetrics;
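// each sink wrapped with its output routing information, and a map from transform id
// to the error sink that receives its invalid entries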
private List<WrappedSink<Object, Object, Object>> sinks;
private Map<String, ErrorSink<Object, Object>> transformErrorSinkMap;
@Override
public void initialize(MapReduceTaskContext<Object, Object> context) throws Exception {