/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.batch.mapreduce;
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.etl.api.InvalidEntry;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.batch.BatchConfigurable;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.etl.batch.LoggedBatchConfigurable;
import co.cask.cdap.etl.batch.LoggedBatchSink;
import co.cask.cdap.etl.batch.LoggedBatchSource;
import co.cask.cdap.etl.batch.config.ETLBatchConfig;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.DatasetContextLookupProvider;
import co.cask.cdap.etl.common.DefaultEmitter;
import co.cask.cdap.etl.common.DefaultStageMetrics;
import co.cask.cdap.etl.common.Destroyables;
import co.cask.cdap.etl.common.LoggedTransform;
import co.cask.cdap.etl.common.Pipeline;
import co.cask.cdap.etl.common.PipelineRegisterer;
import co.cask.cdap.etl.common.SinkInfo;
import co.cask.cdap.etl.common.TransformDetail;
import co.cask.cdap.etl.common.TransformExecutor;
import co.cask.cdap.etl.common.TransformInfo;
import co.cask.cdap.etl.common.TransformResponse;
import co.cask.cdap.etl.log.LogStageInjector;
import co.cask.cdap.format.StructuredRecordStringConverter;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* MapReduce Driver for ETL Batch Applications.
*/
public class ETLMapReduce extends AbstractMapReduce {
public static final String NAME = ETLMapReduce.class.getSimpleName();
private static final Logger LOG = LoggerFactory.getLogger(ETLMapReduce.class);
private static final String SINK_OUTPUTS_KEY = "cdap.etl.sink.outputs";
private static final Type SINK_OUTPUTS_TYPE = new TypeToken<List<SinkOutput>>() { }.getType();
private static final Type RUNTIME_ARGS_TYPE = new TypeToken<Map<String, String>>() { }.getType();
private static final String RUNTIME_ARGS_KEY_PREFIX = "cdap.etl.runtime.args.";
private static final Gson GSON = new Gson();
private static final org.apache.avro.Schema AVRO_ERROR_SCHEMA =
new org.apache.avro.Schema.Parser().parse(Constants.ERROR_SCHEMA.toString());
private BatchConfigurable<BatchSourceContext> batchSource;
private MapReduceSourceContext sourceContext;
private Map<String, BatchConfigurable<BatchSinkContext>> batchSinks;
private Map<String, MapReduceSinkContext> sinkContexts;
// injected by CDAP
@SuppressWarnings("unused")
private Metrics mrMetrics;
// this is only visible at configure time, not at runtime
private final ETLBatchConfig config;
public ETLMapReduce(ETLBatchConfig config) {
this.config = config;
}
@Override
public void configure() {
setName(NAME);
setDescription("MapReduce Driver for ETL Batch Applications");
PipelineRegisterer pipelineRegisterer = new PipelineRegisterer(getConfigurer(), "batch");
Pipeline pipeline =
pipelineRegisterer.registerPlugins(
config, TimePartitionedFileSet.class,
FileSetProperties.builder()
.setInputFormat(AvroKeyInputFormat.class)
.setOutputFormat(AvroKeyOutputFormat.class)
.setEnableExploreOnCreate(true)
.setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
.setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
.setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
.setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
.build(), true);
Resources resources = config.getResources();
if (resources != null) {
setMapperResources(resources);
}
Resources driverResources = config.getDriverResources();
if (driverResources != null) {
setDriverResources(driverResources);
}
// add source, sink, transform ids to the properties. These are needed at runtime to instantiate the plugins
Map<String, String> properties = new HashMap<>();
properties.put(Constants.PIPELINEID, GSON.toJson(pipeline));
properties.put(Constants.STAGE_LOGGING_ENABLED, String.valueOf(config.isStageLoggingEnabled()));
setProperties(properties);
}
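// Instantiates the source and sink plugins, calls prepareRun on each, and stashes their runtime
// arguments and the sink output mapping in the Hadoop configuration for the mappers to read.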
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
Job job = context.getHadoopJob();
Configuration hConf = job.getConfiguration();
Map<String, String> properties = context.getSpecification().getProperties();
Pipeline pipeline = GSON.fromJson(properties.get(Constants.PIPELINEID), Pipeline.class);
// following should never happen
Preconditions.checkNotNull(pipeline, "Pipeline is null");
Preconditions.checkNotNull(pipeline.getSinks(), "Sinks could not be found in program properties");
// empty transform list is created during pipeline register
Preconditions.checkNotNull(pipeline.getTransforms());
Preconditions.checkNotNull(pipeline.getConnections(), "Connections could not be found in program properties");
String sourcePluginId = pipeline.getSource();
batchSource = context.newPluginInstance(sourcePluginId);
batchSource = new LoggedBatchConfigurable<>(sourcePluginId, batchSource);
sourceContext = new MapReduceSourceContext(context, mrMetrics, new DatasetContextLookupProvider(context),
sourcePluginId, context.getRuntimeArguments());
batchSource.prepareRun(sourceContext);
hConf.set(RUNTIME_ARGS_KEY_PREFIX + sourcePluginId,
GSON.toJson(sourceContext.getRuntimeArguments(), RUNTIME_ARGS_TYPE));
List<TransformInfo> transformInfos = pipeline.getTransforms();
// setup time partition for each error dataset
for (TransformInfo transformInfo : transformInfos) {
if (transformInfo.getErrorDatasetName() != null) {
addPropertiesToErrorDataset(transformInfo.getErrorDatasetName(), context);
}
}
List<SinkOutput> sinkOutputs = new ArrayList<>();
List<SinkInfo> sinkInfos = pipeline.getSinks();
batchSinks = new HashMap<>(sinkInfos.size());
sinkContexts = new HashMap<>(sinkInfos.size());
for (SinkInfo sinkInfo : sinkInfos) {
BatchConfigurable<BatchSinkContext> batchSink = context.newPluginInstance(sinkInfo.getSinkId());
batchSink = new LoggedBatchConfigurable<>(sinkInfo.getSinkId(), batchSink);
MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
new DatasetContextLookupProvider(context),
sinkInfo.getSinkId(), context.getRuntimeArguments());
sinkContexts.put(sinkInfo.getSinkId(), sinkContext);
batchSinks.put(sinkInfo.getSinkId(), batchSink);
batchSink.prepareRun(sinkContext);
sinkOutputs.add(new SinkOutput(sinkInfo.getSinkId(), sinkContext.getOutputNames(),
sinkInfo.getErrorDatasetName()));
if (sinkInfo.getErrorDatasetName() != null) {
addPropertiesToErrorDataset(sinkInfo.getErrorDatasetName(), context);
}
hConf.set(RUNTIME_ARGS_KEY_PREFIX + sinkInfo.getSinkId(),
GSON.toJson(sinkContext.getRuntimeArguments(), RUNTIME_ARGS_TYPE));
}
hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
job.setMapperClass(ETLMapper.class);
job.setNumReduceTasks(0);
}
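// Registers the error dataset as an additional job output: sets the Avro error schema on the
// TimePartitionedFileSet and partitions it by the logical start time of the run.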
private void addPropertiesToErrorDataset(String errorDatasetName, MapReduceContext context) {
Map<String, String> args = new HashMap<>();
args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
Constants.ERROR_SCHEMA.toString());
TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
context.addOutput(errorDatasetName, args);
}
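// Propagates the run result to the source and all sinks so each can perform its onRunFinish cleanup.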
@Override
public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
onRunFinishSource(succeeded);
onRunFinishSinks(context, succeeded);
LOG.info("Batch Run finished : succeeded = {}", succeeded);
}
private void onRunFinishSource(boolean succeeded) {
LOG.info("On RunFinish Source : {}", batchSource.getClass().getName());
try {
batchSource.onRunFinish(succeeded, sourceContext);
} catch (Throwable t) {
LOG.warn("Exception when calling onRunFinish on {}", batchSource, t);
}
}
private void onRunFinishSinks(MapReduceContext context, boolean succeeded) {
String pipelineStr = context.getSpecification().getProperty(Constants.PIPELINEID);
// should never happen
Preconditions.checkNotNull(pipelineStr, "pipeline could not be found in program properties.");
List<SinkInfo> sinkInfos = GSON.fromJson(pipelineStr, Pipeline.class).getSinks();
for (SinkInfo sinkInfo : sinkInfos) {
BatchConfigurable<BatchSinkContext> batchSink = batchSinks.get(sinkInfo.getSinkId());
MapReduceSinkContext sinkContext = sinkContexts.get(sinkInfo.getSinkId());
try {
batchSink.onRunFinish(succeeded, sinkContext);
} catch (Throwable t) {
LOG.warn("Exception when calling onRunFinish on {}", batchSink, t);
}
}
}
/**
* Mapper Driver for ETL Transforms.
*/
public static class ETLMapper extends Mapper implements ProgramLifecycle<MapReduceTaskContext<Object, Object>> {
private static final Logger LOG = LoggerFactory.getLogger(ETLMapper.class);
private static final Gson GSON = new Gson();
private Set<String> transformsWithoutErrorDataset;
private TransformExecutor transformExecutor;
// injected by CDAP
@SuppressWarnings("unused")
private Metrics mapperMetrics;
private Map<String, WrappedSink<Object, Object, Object>> sinks;
private Map<String, ErrorSink<Object, Object>> transformErrorSinkMap;
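// Rebuilds the pipeline inside each mapper: instantiates and initializes the source, transforms, and
// sinks from the program properties and Hadoop configuration, then wires them into a TransformExecutor.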
@Override
public void initialize(MapReduceTaskContext<Object, Object> context) throws Exception {
// get source, transform, sink ids from program properties
Map<String, String> properties = context.getSpecification().getProperties();
if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
// get the list of sinks, and the names of the outputs each sink writes to
Context hadoopContext = context.getHadoopContext();
Configuration hConf = hadoopContext.getConfiguration();
Pipeline pipeline = GSON.fromJson(properties.get(Constants.PIPELINEID), Pipeline.class);
// following should never happen
Preconditions.checkNotNull(pipeline, "Pipeline is null");
Preconditions.checkNotNull(pipeline.getSinks(), "Sinks could not be found in program properties");
// empty transform list is created during pipeline register
Preconditions.checkNotNull(pipeline.getTransforms());
Preconditions.checkNotNull(pipeline.getConnections(), "Connections could not be found in program properties");
String sourcePluginId = pipeline.getSource();
DefaultEmitter defaultEmitter = new DefaultEmitter(mapperMetrics);
List<TransformInfo> transformInfos = pipeline.getTransforms();
Map<String, List<String>> connectionsMap = pipeline.getConnections();
Map<String, TransformDetail> transformations = new HashMap<>();
BatchSource<Object, Object, Object> source = context.newPluginInstance(sourcePluginId);
source = new LoggedBatchSource<>(sourcePluginId, source);
BatchRuntimeContext runtimeContext = new MapReduceRuntimeContext(
context, mapperMetrics, new DatasetContextLookupProvider(context), sourcePluginId,
GSON.<Map<String, String>>fromJson(hConf.get(RUNTIME_ARGS_KEY_PREFIX + sourcePluginId), RUNTIME_ARGS_TYPE));
source.initialize(runtimeContext);
transformations.put(sourcePluginId,
new TransformDetail(source, new DefaultStageMetrics(mapperMetrics, sourcePluginId),
connectionsMap.get(sourcePluginId)));
transformErrorSinkMap = new HashMap<>();
transformsWithoutErrorDataset = new HashSet<>();
addTransforms(transformations, connectionsMap, transformInfos, context);
String sinkOutputsStr = hadoopContext.getConfiguration().get(SINK_OUTPUTS_KEY);
// should never happen, this is set in beforeSubmit
Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");
List<SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, SINK_OUTPUTS_TYPE);
// should never happen, this is checked and set in beforeSubmit
Preconditions.checkArgument(!sinkOutputs.isEmpty(), "Sink outputs not found in Hadoop conf.");
boolean hasOneOutput = hasOneOutput(transformInfos, sinkOutputs);
sinks = new HashMap<>(sinkOutputs.size());
for (SinkOutput sinkOutput : sinkOutputs) {
String sinkPluginId = sinkOutput.getSinkPluginId();
Set<String> sinkOutputNames = sinkOutput.getSinkOutputs();
BatchSink<Object, Object, Object> sink = context.newPluginInstance(sinkPluginId);
sink = new LoggedBatchSink<>(sinkPluginId, sink);
runtimeContext = new MapReduceRuntimeContext(
context, mapperMetrics, new DatasetContextLookupProvider(context), sinkPluginId,
GSON.<Map<String, String>>fromJson(hConf.get(RUNTIME_ARGS_KEY_PREFIX + sinkPluginId), RUNTIME_ARGS_TYPE));
sink.initialize(runtimeContext);
if (hasOneOutput) {
sinks.put(sinkPluginId, new SingleOutputSink<>(sink, context));
} else {
sinks.put(sinkPluginId, new MultiOutputSink<>(sink, context, sinkOutputNames));
}
transformations.put(sinkPluginId,
new TransformDetail(sink, new DefaultStageMetrics(mapperMetrics, sinkPluginId),
new ArrayList<String>()));
}
transformExecutor = new TransformExecutor<>(transformations, ImmutableList.of(sourcePluginId));
}
// this is needed because we need to write to the context differently depending on the number of outputs
private boolean hasOneOutput(List<TransformInfo> transformInfos, List<SinkOutput> sinkOutputs) {
// if there are any error datasets, we know we have at least one sink, and one error dataset
for (TransformInfo info : transformInfos) {
if (info.getErrorDatasetName() != null) {
return false;
}
}
// if no error datasets, check if we have more than one sink
Set<String> allOutputs = new HashSet<>();
for (SinkOutput sinkOutput : sinkOutputs) {
if (sinkOutput.getErrorDatasetName() != null) {
return false;
}
allOutputs.addAll(sinkOutput.getSinkOutputs());
}
return allOutputs.size() == 1;
}
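// Instantiates and initializes each transform plugin, adds it to the transformations map, and
// registers an ErrorSink for any transform that has an error dataset configured.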
private void addTransforms(Map<String, TransformDetail> pipeline,
Map<String, List<String>> connectionsMap,
List<TransformInfo> transformInfos,
MapReduceTaskContext<Object, Object> context) throws Exception {
for (TransformInfo transformInfo : transformInfos) {
String transformId = transformInfo.getTransformId();
Transform<?, ?> transform = context.newPluginInstance(transformId);
transform = new LoggedTransform<>(transformId, transform);
BatchRuntimeContext transformContext = new MapReduceRuntimeContext(
context, mapperMetrics, new DatasetContextLookupProvider(context), transformId,
context.getRuntimeArguments());
LOG.debug("Transform Class : {}", transform.getClass().getName());
transform.initialize(transformContext);
pipeline.put(transformId,
new TransformDetail(transform, new DefaultStageMetrics(mapperMetrics, transformId),
connectionsMap.get(transformId)));
if (transformInfo.getErrorDatasetName() != null) {
transformErrorSinkMap.put(transformId,
new ErrorSink<>(context, transformInfo.getErrorDatasetName()));
}
}
}
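// Runs one input record through the transform executor, writes each sink's results to its output(s),
// routes error records to their configured error datasets, and resets the emitters for the next record.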
@Override
public void map(Object key, Object value, Context context) throws IOException, InterruptedException {
try {
KeyValue<Object, Object> input = new KeyValue<>(key, value);
TransformResponse transformResponse = transformExecutor.runOneIteration(input);
for (Map.Entry<String, Collection<Object>> transformedEntry : transformResponse.getSinksResults().entrySet()) {
WrappedSink<Object, Object, Object> sink = sinks.get(transformedEntry.getKey());
for (Object transformedRecord : transformedEntry.getValue()) {
sink.write((KeyValue<Object, Object>) transformedRecord);
}
}
for (Map.Entry<String, Collection<InvalidEntry<Object>>> errorEntries :
transformResponse.getMapTransformIdToErrorEmitter().entrySet()) {
if (transformsWithoutErrorDataset.contains(errorEntries.getKey())) {
continue;
}
if (!errorEntries.getValue().isEmpty()) {
if (!transformErrorSinkMap.containsKey(errorEntries.getKey())) {
LOG.warn("Transform : {} has error records, but does not have an error dataset configured.",
errorEntries.getKey());
transformsWithoutErrorDataset.add(errorEntries.getKey());
} else {
transformErrorSinkMap.get(errorEntries.getKey()).write(errorEntries.getValue());
}
}
}
transformExecutor.resetEmitter();
} catch (Exception e) {
LOG.error("Exception thrown in BatchDriver Mapper.", e);
Throwables.propagate(e);
}
}
@Override
public void destroy() {
// BatchSource implements Transform, hence is inside the transformExecutor as well
Destroyables.destroyQuietly(transformExecutor);
// Cleanup BatchSinks separately, since they are not part of the transformExecutor
LOG.debug("Number of sinks to destroy: {}", sinks.size());
for (WrappedSink<Object, Object, Object> sink : sinks.values()) {
LOG.trace("Destroying sink: {}", sink.sink);
Destroyables.destroyQuietly(sink.sink);
}
}
}
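// Writes invalid entries to a time-partitioned error dataset as Avro generic records conforming to the error schema.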
private static class ErrorSink<KEY_OUT, VAL_OUT> {
private final MapReduceTaskContext<KEY_OUT, VAL_OUT> context;
private final String errorDatasetName;
private ErrorSink(MapReduceTaskContext<KEY_OUT, VAL_OUT> context, String errorDatasetName) {
this.context = context;
this.errorDatasetName = errorDatasetName;
}
public void write(Collection<InvalidEntry<Object>> input) throws Exception {
for (InvalidEntry<Object> entry : input) {
context.write(errorDatasetName, new AvroKey<>(getGenericRecordForInvalidEntry(entry)),
NullWritable.get());
}
}
}
// wrapper around sinks to help writing sink output to the correct named output
private abstract static class WrappedSink<IN, KEY_OUT, VAL_OUT> {
protected final BatchSink<IN, KEY_OUT, VAL_OUT> sink;
protected final MapReduceTaskContext<KEY_OUT, VAL_OUT> context;
protected WrappedSink(BatchSink<IN, KEY_OUT, VAL_OUT> sink,
MapReduceTaskContext<KEY_OUT, VAL_OUT> context) {
this.sink = sink;
this.context = context;
}
protected abstract void write(KeyValue<KEY_OUT, VAL_OUT> output) throws Exception;
}
// need to write with a different method if there is only one output for the mapreduce
// TODO: remove if the fix to CDAP-3628 allows us to write using the same method
private static class SingleOutputSink<IN, KEY_OUT, VAL_OUT> extends WrappedSink<IN, KEY_OUT, VAL_OUT> {
protected SingleOutputSink(BatchSink<IN, KEY_OUT, VAL_OUT> sink,
MapReduceTaskContext<KEY_OUT, VAL_OUT> context) {
super(sink, context);
}
public void write(KeyValue<KEY_OUT, VAL_OUT> output) throws Exception {
context.write(output.getKey(), output.getValue());
}
}
// writes sink output to the correct named output
private static class MultiOutputSink<IN, KEY_OUT, VAL_OUT> extends WrappedSink<IN, KEY_OUT, VAL_OUT> {
private final Set<String> outputNames;
private MultiOutputSink(BatchSink<IN, KEY_OUT, VAL_OUT> sink,
MapReduceTaskContext<KEY_OUT, VAL_OUT> context,
Set<String> outputNames) {
super(sink, context);
this.outputNames = outputNames;
}
public void write(KeyValue<KEY_OUT, VAL_OUT> output) throws Exception {
for (String outputName : outputNames) {
context.write(outputName, output.getKey(), output.getValue());
}
}
}
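// Builds a GenericRecord for the error dataset from an InvalidEntry: error code, error message, and a JSON
// rendering of the invalid record (only StructuredRecord payloads are serialized; other types are noted as unsupported).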
private static GenericRecord getGenericRecordForInvalidEntry(InvalidEntry invalidEntry) {
GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AVRO_ERROR_SCHEMA);
recordBuilder.set(Constants.ErrorDataset.ERRCODE, invalidEntry.getErrorCode());
recordBuilder.set(Constants.ErrorDataset.ERRMSG, invalidEntry.getErrorMsg());
String errorMsg;
if (invalidEntry.getInvalidRecord() instanceof StructuredRecord) {
StructuredRecord record = (StructuredRecord) invalidEntry.getInvalidRecord();
try {
errorMsg = StructuredRecordStringConverter.toJsonString(record);
} catch (IOException e) {
errorMsg = "Exception while converting StructuredRecord to String, " + e.getCause();
}
} else {
errorMsg = String.format("Error Entry is of type %s, only a record of type %s is supported currently",
invalidEntry.getInvalidRecord().getClass().getName(),
StructuredRecord.class.getName());
}
recordBuilder.set(Constants.ErrorDataset.INVALIDENTRY, errorMsg);
return recordBuilder.build();
}
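// Value object pairing a sink plugin id with its named outputs and optional error dataset;
// serialized to JSON in beforeSubmit and deserialized in the mapper.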
private static class SinkOutput {
private String sinkPluginId;
private Set<String> sinkOutputs;
private String errorDatasetName;
private SinkOutput(String sinkPluginId, Set<String> sinkOutputs, String errorDatasetName) {
this.sinkPluginId = sinkPluginId;
this.sinkOutputs = sinkOutputs;
this.errorDatasetName = errorDatasetName;
}
public String getSinkPluginId() {
return sinkPluginId;
}
public Set<String> getSinkOutputs() {
return sinkOutputs;
}
public String getErrorDatasetName() {
return errorDatasetName;
}
}
}