/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.batch;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.etl.api.InvalidEntry;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.batch.BatchConfigurable;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.etl.batch.config.ETLBatchConfig;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.DefaultEmitter;
import co.cask.cdap.etl.common.Destroyables;
import co.cask.cdap.etl.common.Pipeline;
import co.cask.cdap.etl.common.PipelineRegisterer;
import co.cask.cdap.etl.common.PluginID;
import co.cask.cdap.etl.common.SinkInfo;
import co.cask.cdap.etl.common.StageMetrics;
import co.cask.cdap.etl.common.StructuredRecordStringConverter;
import co.cask.cdap.etl.common.TransformDetail;
import co.cask.cdap.etl.common.TransformExecutor;
import co.cask.cdap.etl.common.TransformInfo;
import co.cask.cdap.etl.common.TransformResponse;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;

/**
 * MapReduce Driver for ETL Batch Applications.
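 *
 * At configure time the source, transform, and sink plugins are registered and their ids are saved in the
 * program properties. Before the job is submitted, the source and sinks are instantiated and prepare their run,
 * and the sink output information is serialized into the Hadoop job configuration. The per-record flow
 * (source, then transforms, then sinks) happens in {@link ETLMapper}.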
 */
public class ETLMapReduce extends AbstractMapReduce {
  public static final String NAME = ETLMapReduce.class.getSimpleName();
  private static final Logger LOG = LoggerFactory.getLogger(ETLMapReduce.class);
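  // sink output information is serialized under this key in beforeSubmit() and read back by each mapper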
  private static final String SINK_OUTPUTS_KEY = "cdap.etl.sink.outputs";
  private static final Type SINK_OUTPUTS_TYPE = new TypeToken<List<SinkOutput>>() { }.getType();
  private static final Type SINK_INFO_TYPE = new TypeToken<List<SinkInfo>>() { }.getType();
  private static final Type TRANSFORMINFO_LIST_TYPE = new TypeToken<List<TransformInfo>>() { }.getType();

  private static final Gson GSON = new Gson();

  @VisibleForTesting
  static final Schema ERROR_SCHEMA = Schema.recordOf(
    "error",
    Schema.Field.of(Constants.ErrorDataset.ERRCODE, Schema.of(Schema.Type.INT)),
    Schema.Field.of(Constants.ErrorDataset.ERRMSG, Schema.unionOf(Schema.of(Schema.Type.STRING),
                                                                  Schema.of(Schema.Type.NULL))),
    Schema.Field.of(Constants.ErrorDataset.INVALIDENTRY, Schema.of(Schema.Type.STRING)));

  private static final org.apache.avro.Schema AVRO_ERROR_SCHEMA =
    new org.apache.avro.Schema.Parser().parse(ERROR_SCHEMA.toString());

  private BatchConfigurable<BatchSourceContext> batchSource;
  private List<BatchConfigurable<BatchSinkContext>> batchSinks;
  // injected by CDAP
  @SuppressWarnings("unused")
  private Metrics mrMetrics;

  // this is only visible at configure time, not at runtime
  private final ETLBatchConfig config;

  public ETLMapReduce(ETLBatchConfig config) {
    this.config = config;
  }

  @Override
  public void configure() {
    setName(NAME);
    setDescription("MapReduce Driver for ETL Batch Applications");

    PipelineRegisterer pipelineRegisterer = new PipelineRegisterer(getConfigurer(), "batch");

    Pipeline pipelineIds =
      pipelineRegisterer.registerPlugins(
        config, TimePartitionedFileSet.class,
        FileSetProperties.builder()
          .setInputFormat(AvroKeyInputFormat.class)
          .setOutputFormat(AvroKeyOutputFormat.class)
          .setEnableExploreOnCreate(true)
          .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
          .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
          .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
          .setTableProperty("avro.schema.literal", ERROR_SCHEMA.toString())
          .build(), true);

    if (config.getResources() != null) {
      setMapperResources(config.getResources());
    }

    // add source, sink, transform ids to the properties. These are needed at runtime to instantiate the plugins
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.Source.PLUGINID, pipelineIds.getSource());
    properties.put(Constants.Sink.PLUGINIDS, GSON.toJson(pipelineIds.getSinks()));
    properties.put(Constants.Transform.PLUGINIDS, GSON.toJson(pipelineIds.getTransforms()));
    setProperties(properties);
  }

  @Override
  public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();

    Map<String, String> properties = context.getSpecification().getProperties();
    String sourcePluginId = properties.get(Constants.Source.PLUGINID);

    batchSource = context.newPluginInstance(sourcePluginId);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics, sourcePluginId);
    batchSource.prepareRun(sourceContext);

    String transformInfosStr = properties.get(Constants.Transform.PLUGINIDS);
    Preconditions.checkNotNull(transformInfosStr, "Transform plugin ids not found in program properties.");

    List<TransformInfo> transformInfos = GSON.fromJson(transformInfosStr, TRANSFORMINFO_LIST_TYPE);

    // setup time partition for each error dataset
    for (TransformInfo transformInfo : transformInfos) {
      if (transformInfo.getErrorDatasetName() != null) {
        addPropertiesToErrorDataset(transformInfo.getErrorDatasetName(), context);
      }
    }

    List<SinkOutput> sinkOutputs = new ArrayList<>();
    String sinkPluginIdsStr = properties.get(Constants.Sink.PLUGINIDS);
    // should never happen
    Preconditions.checkNotNull(sinkPluginIdsStr, "Sink plugin ids could not be found in program properties.");

    List<SinkInfo> sinkInfos = GSON.fromJson(sinkPluginIdsStr, SINK_INFO_TYPE);
    batchSinks = Lists.newArrayListWithCapacity(sinkInfos.size());
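    // instantiate each sink, let it prepare the run, and record which outputs it writes to
    // so the mappers know where to route transformed records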
    for (SinkInfo sinkInfo : sinkInfos) {
      BatchConfigurable<BatchSinkContext> batchSink = context.newPluginInstance(sinkInfo.getSinkId());
      MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics, sinkInfo.getSinkId());
      batchSink.prepareRun(sinkContext);
      batchSinks.add(batchSink);
      sinkOutputs.add(new SinkOutput(sinkInfo.getSinkId(), sinkContext.getOutputNames(),
                                     sinkInfo.getErrorDatasetName()));

      if (sinkInfo.getErrorDatasetName() != null) {
        addPropertiesToErrorDataset(sinkInfo.getErrorDatasetName(), context);
      }

    }
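    // hand the sink output information to the mappers through the job configuration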
    job.getConfiguration().set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    job.setMapperClass(ETLMapper.class);
    job.setNumReduceTasks(0);
  }

  private void addPropertiesToErrorDataset(String errorDatasetName, MapReduceContext context) {
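    // error datasets are TimePartitionedFileSets; errors are written as Avro records into a partition
    // keyed by the logical start time of this run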
    Map<String, String> args = new HashMap<>();
    args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key", ERROR_SCHEMA.toString());
    TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
    context.addOutput(errorDatasetName, args);
  }

  @Override
  public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
    onRunFinishSource(context, succeeded);
    onRunFinishSink(context, succeeded);
    LOG.info("Batch Run finished : succeeded = {}", succeeded);
  }

  private void onRunFinishSource(MapReduceContext context, boolean succeeded) {
    String sourcePluginId = context.getSpecification().getProperty(Constants.Source.PLUGINID);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics, sourcePluginId);
    LOG.info("On RunFinish Source : {}", batchSource.getClass().getName());
    try {
      batchSource.onRunFinish(succeeded, sourceContext);
    } catch (Throwable t) {
      LOG.warn("Exception when calling onRunFinish on {}", batchSource, t);
    }
  }

  private void onRunFinishSink(MapReduceContext context, boolean succeeded) {
    String sinkPluginIdsStr = context.getSpecification().getProperty(Constants.Sink.PLUGINIDS);
    // should never happen
    Preconditions.checkNotNull(sinkPluginIdsStr, "Sink plugin ids could not be found in program properties.");

    List<SinkInfo> sinkInfos = GSON.fromJson(sinkPluginIdsStr, SINK_INFO_TYPE);
    for (int i = 0; i < sinkInfos.size(); i++) {
      BatchConfigurable<BatchSinkContext> batchSink = batchSinks.get(i);
      String sinkPluginId = sinkInfos.get(i).getSinkId();
      BatchSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics, sinkPluginId);
      try {
        batchSink.onRunFinish(succeeded, sinkContext);
      } catch (Throwable t) {
        LOG.warn("Exception when calling onRunFinish on {}", batchSink, t);
      }
    }
  }

  /**
   * Mapper Driver for ETL Transforms.
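   *
   * Instantiates the source, transforms, and sinks from the program properties and the Hadoop configuration,
   * runs each input record through the transform pipeline, and writes the emitted records to every sink.
   * Invalid entries emitted by a transform are written to that transform's error dataset, if one is configured.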
   */
  public static class ETLMapper extends Mapper implements ProgramLifecycle<MapReduceTaskContext<Object, Object>> {
    private static final Logger LOG = LoggerFactory.getLogger(ETLMapper.class);
    private static final Gson GSON = new Gson();
    private static final Type TRANSFORMDETAILS_LIST_TYPE = new TypeToken<List<TransformInfo>>() { }.getType();
    private Set<String> transformsWithoutErrorDataset;

    private TransformExecutor transformExecutor;
    // injected by CDAP
    @SuppressWarnings("unused")
    private Metrics mapperMetrics;
    private List<WrappedSink<Object, Object, Object>> sinks;
    private Map<String, ErrorSink<Object>> transformErrorSinkMap;

    @Override
    public void initialize(MapReduceTaskContext<Object, Object> context) throws Exception {
      // get source, transform, sink ids from program properties
      Map<String, String> properties = context.getSpecification().getProperties();

      String sourcePluginId = properties.get(Constants.Source.PLUGINID);
      // should never happen
      String transformInfosStr = properties.get(Constants.Transform.PLUGINIDS);
      Preconditions.checkNotNull(transformInfosStr, "Transform plugin ids not found in program properties.");

      List<TransformInfo> transformInfos = GSON.fromJson(transformInfosStr, TRANSFORMDETAILS_LIST_TYPE);
      List<TransformDetail> pipeline = Lists.newArrayListWithCapacity(transformInfos.size() + 2);
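      // the pipeline is the source followed by each transform, in order; sinks are wrapped separately below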

      BatchSource source = context.newPluginInstance(sourcePluginId);
      BatchRuntimeContext runtimeContext = new MapReduceRuntimeContext(context, mapperMetrics, sourcePluginId);
      source.initialize(runtimeContext);
      pipeline.add(new TransformDetail(sourcePluginId, source,
        new StageMetrics(mapperMetrics, PluginID.from(sourcePluginId))));

      transformErrorSinkMap = new HashMap<>();
      transformsWithoutErrorDataset = new HashSet<>();
      addTransforms(pipeline, transformInfos, context);

      // get the list of sinks, and the names of the outputs each sink writes to
      Context hadoopContext = context.getHadoopContext();
      String sinkOutputsStr = hadoopContext.getConfiguration().get(SINK_OUTPUTS_KEY);
      // should never happen, this is set in beforeSubmit
      Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");

      List<SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, SINK_OUTPUTS_TYPE);

      // should never happen, this is checked and set in beforeSubmit
      Preconditions.checkArgument(!sinkOutputs.isEmpty(), "Sink outputs not found in Hadoop conf.");

      boolean hasOneOutput = hasOneOutput(transformInfos, sinkOutputs);
      sinks = new ArrayList<>(sinkOutputs.size());
      for (SinkOutput sinkOutput : sinkOutputs) {
        String sinkPluginId = sinkOutput.getSinkPluginId();
        Set<String> sinkOutputNames = sinkOutput.getSinkOutputs();

        BatchSink sink = context.newPluginInstance(sinkPluginId);
        runtimeContext = new MapReduceRuntimeContext(context, mapperMetrics, sinkPluginId);
        sink.initialize(runtimeContext);
        if (hasOneOutput) {
          sinks.add(new SingleOutputSink<>(sinkPluginId, sink, context, mapperMetrics));
        } else {
          sinks.add(new MultiOutputSink<>(sinkPluginId, sink, context, mapperMetrics, sinkOutputNames,
                                          sinkOutput.getErrorDatasetName()));
        }
      }

      transformExecutor = new TransformExecutor<>(pipeline);
    }

    // this is needed because we need to write to the context differently depending on the number of outputs
    private boolean hasOneOutput(List<TransformInfo> transformInfos, List<SinkOutput> sinkOutputs) {
      // if any transform writes to an error dataset, there are at least two outputs: a sink output plus that error dataset
      for (TransformInfo info : transformInfos) {
        if (info.getErrorDatasetName() != null) {
          return false;
        }
      }
      // no error datasets; check whether the sinks, between them, write to exactly one output
      Set<String> allOutputs = new HashSet<>();

      for (SinkOutput sinkOutput : sinkOutputs) {
        if (sinkOutput.getErrorDatasetName() != null) {
          return false;
        }
        allOutputs.addAll(sinkOutput.getSinkOutputs());
      }
      return allOutputs.size() == 1;
    }

    private void addTransforms(List<TransformDetail> pipeline,
                               List<TransformInfo> transformInfos,
                               MapReduceTaskContext context) throws Exception {

      for (TransformInfo transformInfo : transformInfos) {
        String transformId = transformInfo.getTransformId();
        Transform transform = context.newPluginInstance(transformId);
        BatchRuntimeContext transformContext = new MapReduceRuntimeContext(context, mapperMetrics, transformId);
        LOG.debug("Transform Class : {}", transform.getClass().getName());
        transform.initialize(transformContext);
        pipeline.add(new TransformDetail(transformId, transform,
                                               new StageMetrics(mapperMetrics, PluginID.from(transformId))));
        if (transformInfo.getErrorDatasetName() != null) {
          transformErrorSinkMap.put(transformId,
                                    new ErrorSink<>(context, transformInfo.getErrorDatasetName()));
        }
      }
    }

    @Override
    public void map(Object key, Object value, Context context) throws IOException, InterruptedException {
      try {
        KeyValue input = new KeyValue<>(key, value);
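        // run the raw key/value through the source and transform stages,
        // then fan the emitted records out to every sink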
        TransformResponse recordsAndErrors = transformExecutor.runOneIteration(input);
        Iterator transformedRecords = recordsAndErrors.getEmittedRecords();
        while (transformedRecords.hasNext()) {
          Object transformedRecord = transformedRecords.next();
          for (WrappedSink sink : sinks) {
            sink.write(transformedRecord);
          }
        }
        // write the error entries to transform error datasets.
        for (Map.Entry<String, Collection<Object>> entry :
          recordsAndErrors.getMapTransformIdToErrorEmitter().entrySet()) {
          if (transformErrorSinkMap.containsKey(entry.getKey())) {
            transformErrorSinkMap.get(entry.getKey()).write(entry.getValue());
          } else {
            if (!transformsWithoutErrorDataset.contains(entry.getKey())) {
              LOG.warn("Transform : {} has error records, but does not have a error dataset configured.",
                       entry.getKey());
              transformsWithoutErrorDataset.add(entry.getKey());
            }
          }
        }

        transformExecutor.resetEmitters();
      } catch (Exception e) {
        LOG.error("Exception thrown in BatchDriver Mapper: {}", e);
        Throwables.propagate(e);
      }
    }

    @Override
    public void destroy() {
      // BatchSource implements Transform, hence is inside the transformExecutor as well
      Destroyables.destroyQuietly(transformExecutor);
      // Cleanup BatchSinks separately, since they are not part of the transformExecutor
      LOG.debug("Number of sinks to destroy: {}", sinks.size());
      for (WrappedSink sink : sinks) {
        LOG.trace("Destroying sink: {}", sink.sink);
        Destroyables.destroyQuietly(sink.sink);
      }
    }
  }

  private static class ErrorSink<IN> {
    private final MapReduceTaskContext context;
    private final String errorDatasetName;

    private ErrorSink(MapReduceTaskContext context, String errorDatasetName) {
      this.context = context;
      this.errorDatasetName = errorDatasetName;
    }

    public void write(Collection<IN> input) throws Exception {
      for (IN entry : input) {
        context.write(errorDatasetName, new AvroKey<>(getGenericRecordForInvalidEntry((InvalidEntry) entry)),
                      NullWritable.get());
      }
    }
  }

  // wrapper around sinks to help writing sink output to the correct named output
  private abstract static class WrappedSink<IN, KEY_OUT, VAL_OUT> {
    protected final BatchSink<IN, KEY_OUT, VAL_OUT> sink;
    protected final DefaultEmitter<KeyValue<KEY_OUT, VAL_OUT>> emitter;
    protected final MapReduceTaskContext context;

    protected WrappedSink(String sinkPluginId,
                          BatchSink<IN, KEY_OUT, VAL_OUT> sink,
                          MapReduceTaskContext context,
                          Metrics metrics) {
      this.sink = sink;
      this.emitter = new DefaultEmitter<>(new StageMetrics(metrics, PluginID.from(sinkPluginId)));
      this.context = context;
    }

    protected abstract void write(IN input) throws Exception;
  }

  // need to write with a different method if there is only one output for the mapreduce
  // TODO: remove if the fix to CDAP-3628 allows us to write using the same method
  private static class SingleOutputSink<IN, KEY_OUT, VAL_OUT> extends WrappedSink<IN, KEY_OUT, VAL_OUT> {

    protected SingleOutputSink(String sinkPluginId, BatchSink<IN, KEY_OUT, VAL_OUT> sink,
                               MapReduceTaskContext context, Metrics metrics) {
      super(sinkPluginId, sink, context, metrics);
    }

    public void write(IN input) throws Exception {
      sink.transform(input, emitter);
      for (KeyValue outputRecord : emitter) {
        context.write(outputRecord.getKey(), outputRecord.getValue());
      }
      emitter.reset();
    }
  }

  // writes sink output to the correct named output
  private static class MultiOutputSink<IN, KEY_OUT, VAL_OUT> extends WrappedSink<IN, KEY_OUT, VAL_OUT> {
    private final Set<String> outputNames;
    private final String errorDatasetName;

    private MultiOutputSink(String sinkPluginId,
                            BatchSink<IN, KEY_OUT, VAL_OUT> sink,
                            MapReduceTaskContext context,
                            Metrics metrics,
                            Set<String> outputNames,
                            @Nullable String errorDatasetName) {
      super(sinkPluginId, sink, context, metrics);
      this.outputNames = outputNames;
      this.errorDatasetName = errorDatasetName;
    }

    public void write(IN input) throws Exception {
      sink.transform(input, emitter);
      for (KeyValue outputRecord : emitter) {
        for (String outputName : outputNames) {
          context.write(outputName, outputRecord.getKey(), outputRecord.getValue());
        }
      }

      if (errorDatasetName != null && !emitter.getErrors().isEmpty()) {
        for (InvalidEntry entry : emitter.getErrors()) {
          context.write(errorDatasetName, new AvroKey<>(getGenericRecordForInvalidEntry(entry)), NullWritable.get());
        }
      }

      emitter.reset();
    }
  }

  private static GenericRecord getGenericRecordForInvalidEntry(InvalidEntry invalidEntry) {
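    // build an Avro record matching ERROR_SCHEMA; the offending record itself is stored as a JSON string
    // when it is a StructuredRecord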
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AVRO_ERROR_SCHEMA);
    recordBuilder.set(Constants.ErrorDataset.ERRCODE, invalidEntry.getErrorCode());
    recordBuilder.set(Constants.ErrorDataset.ERRMSG, invalidEntry.getErrorMsg());

    String errorMsg;
    if (invalidEntry.getInvalidRecord() instanceof StructuredRecord) {
      StructuredRecord record = (StructuredRecord) invalidEntry.getInvalidRecord();
      try {
        errorMsg = StructuredRecordStringConverter.toJsonString(record);
      } catch (IOException e) {
        errorMsg = "Exception while converting StructuredRecord to String, " + e.getCause();
      }
    } else {
      errorMsg = String.format("Error Entry is of type %s, only a record of type %s is supported currently",
                                  invalidEntry.getInvalidRecord().getClass().getName(),
                                  StructuredRecord.class.getName());
    }
    recordBuilder.set(Constants.ErrorDataset.INVALIDENTRY, errorMsg);
    return recordBuilder.build();
  }

  private static class SinkOutput {
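    // the information a mapper needs about one sink: its plugin id, the named outputs it writes to,
    // and an optional error dataset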
    private String sinkPluginId;
    private Set<String> sinkOutputs;
    private String errorDatasetName;

    private SinkOutput(String sinkPluginId, Set<String> sinkOutputs, String errorDatasetName) {
      this.sinkPluginId = sinkPluginId;
      this.sinkOutputs = sinkOutputs;
      this.errorDatasetName = errorDatasetName;
    }

    public String getSinkPluginId() {
      return sinkPluginId;
    }

    public Set<String> getSinkOutputs() {
      return sinkOutputs;
    }

    public String getErrorDatasetName() {
      return errorDatasetName;
    }

  }
}