/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.etl.batch.mapreduce;
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.ProgramStatus;
import co.cask.cdap.api.annotation.TransactionControl;
import co.cask.cdap.api.annotation.TransactionPolicy;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.macro.MacroEvaluator;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.workflow.WorkflowToken;
import co.cask.cdap.etl.api.AlertPublisher;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.batch.BatchAggregator;
import co.cask.cdap.etl.api.batch.BatchConfigurable;
import co.cask.cdap.etl.api.batch.BatchJoiner;
import co.cask.cdap.etl.api.batch.BatchSink;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.etl.api.lineage.field.FieldOperation;
import co.cask.cdap.etl.batch.BatchPhaseSpec;
import co.cask.cdap.etl.batch.DefaultAggregatorContext;
import co.cask.cdap.etl.batch.DefaultJoinerContext;
import co.cask.cdap.etl.batch.PipelinePluginInstantiator;
import co.cask.cdap.etl.batch.StageFailureException;
import co.cask.cdap.etl.batch.connector.MultiConnectorFactory;
import co.cask.cdap.etl.batch.conversion.WritableConversion;
import co.cask.cdap.etl.batch.conversion.WritableConversions;
import co.cask.cdap.etl.common.Constants;
import co.cask.cdap.etl.common.DefaultMacroEvaluator;
import co.cask.cdap.etl.common.FieldOperationTypeAdapter;
import co.cask.cdap.etl.common.LocationAwareMDCWrapperLogger;
import co.cask.cdap.etl.common.PipelinePhase;
import co.cask.cdap.etl.common.PipelineRuntime;
import co.cask.cdap.etl.common.SetMultimapCodec;
import co.cask.cdap.etl.common.TypeChecker;
import co.cask.cdap.etl.common.submit.AggregatorContextProvider;
import co.cask.cdap.etl.common.submit.CompositeFinisher;
import co.cask.cdap.etl.common.submit.ContextProvider;
import co.cask.cdap.etl.common.submit.Finisher;
import co.cask.cdap.etl.common.submit.JoinerContextProvider;
import co.cask.cdap.etl.common.submit.SubmitterPlugin;
import co.cask.cdap.etl.log.LogStageInjector;
import co.cask.cdap.etl.spec.StageSpec;
import co.cask.cdap.internal.io.SchemaTypeAdapter;
import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.common.collect.SetMultimap;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* MapReduce Driver for ETL Batch Applications.
*/
public class ETLMapReduce extends AbstractMapReduce {
public static final String NAME = ETLMapReduce.class.getSimpleName();
public static final String MAP_KEY_CLASS = "cdap.etl.map.key.class";
public static final String MAP_VAL_CLASS = "cdap.etl.map.val.class";
static final String RUNTIME_ARGS_KEY = "cdap.etl.runtime.args";
static final String INPUT_ALIAS_KEY = "cdap.etl.source.alias.key";
static final String SINK_OUTPUTS_KEY = "cdap.etl.sink.outputs";
static final Type RUNTIME_ARGS_TYPE = new TypeToken<Map<String, String>>() { }.getType();
static final Type INPUT_ALIAS_TYPE = new TypeToken<Map<String, String>>() { }.getType();
static final Type SINK_OUTPUTS_TYPE = new TypeToken<Map<String, SinkOutput>>() { }.getType();
static final Type CONNECTOR_DATASETS_TYPE = new TypeToken<HashSet<String>>() { }.getType();
private static final Logger LOG = LoggerFactory.getLogger(ETLMapReduce.class);
private static final Logger PIPELINE_LOG = new LocationAwareMDCWrapperLogger(LOG, Constants.EVENT_TYPE_TAG,
Constants.PIPELINE_LIFECYCLE_TAG_VALUE);
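// shared Gson instance used both at configure time and at runtime to serialize the phase spec,
// sink outputs, input aliases, field operations, and runtime arguments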
private static final Gson GSON = new GsonBuilder()
.registerTypeAdapter(Schema.class, new SchemaTypeAdapter())
.registerTypeAdapter(SetMultimap.class, new SetMultimapCodec<>())
.registerTypeAdapter(FieldOperation.class, new FieldOperationTypeAdapter())
.create();
private Finisher finisher;
// injected by CDAP
@SuppressWarnings("unused")
private Metrics mrMetrics;
// this is only visible at configure time, not at runtime
private final BatchPhaseSpec phaseSpec;
private final Set<String> connectorDatasets;
public ETLMapReduce(BatchPhaseSpec phaseSpec) {
this(phaseSpec, new HashSet<String>());
}
public ETLMapReduce(BatchPhaseSpec phaseSpec, Set<String> connectorDatasets) {
this.phaseSpec = phaseSpec;
this.connectorDatasets = connectorDatasets;
}
@Override
public void configure() {
setName(phaseSpec.getPhaseName());
setDescription("MapReduce phase executor. " + phaseSpec.getDescription());
// register the plugins at program level so that the program can be failed by the platform early
// in case plugin requirements are not met
phaseSpec.getPhase().registerPlugins(getConfigurer());
// Set resources for mapper, reducer and driver
setMapperResources(phaseSpec.getResources());
setReducerResources(phaseSpec.getResources());
setDriverResources(phaseSpec.getDriverResources());
Set<String> sources = phaseSpec.getPhase().getSources();
// Planner should make sure this never happens
if (sources.isEmpty()) {
throw new IllegalArgumentException(String.format(
"Pipeline phase '%s' must contain at least one source but it has no sources.", phaseSpec.getPhaseName()));
}
if (phaseSpec.getPhase().getSinks().isEmpty()) {
throw new IllegalArgumentException(String.format(
"Pipeline phase '%s' must contain at least one sink but does not have any.", phaseSpec.getPhaseName()));
}
Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE,
BatchJoiner.PLUGIN_TYPE);
if (reducers.size() > 1) {
throw new IllegalArgumentException(String.format(
"Pipeline phase '%s' cannot contain more than one reducer but it has reducers '%s'.",
phaseSpec.getPhaseName(), Joiner.on(',').join(reducers)));
}
// add source, sink, transform ids to the properties. These are needed at runtime to instantiate the plugins
Map<String, String> properties = new HashMap<>();
properties.put(Constants.PIPELINEID, GSON.toJson(phaseSpec));
properties.put(Constants.CONNECTOR_DATASETS, GSON.toJson(connectorDatasets));
setProperties(properties);
}
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
final MapReduceContext context = getContext();
Map<String, String> properties = context.getSpecification().getProperties();
if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
List<Finisher> finishers = new ArrayList<>();
final Job job = context.getHadoopJob();
final Configuration hConf = job.getConfiguration();
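// disable speculative execution: duplicate task attempts could cause sinks with side effects
// to write the same output more than once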
hConf.setBoolean("mapreduce.map.speculative", false);
hConf.setBoolean("mapreduce.reduce.speculative", false);
// plugin name -> runtime args for that plugin
MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
context.getLogicalStartTime(),
context, context.getNamespace());
BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS),
CONNECTOR_DATASETS_TYPE);
for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
}
final PipelinePhase phase = phaseSpec.getPhase();
PipelinePluginInstantiator pluginInstantiator =
new PipelinePluginInstantiator(context, mrMetrics, phaseSpec, new MultiConnectorFactory());
// should never happen if planner is correct
Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE,
BatchJoiner.PLUGIN_TYPE);
if (reducers.size() > 1) {
Iterator<StageSpec> reducerIter = reducers.iterator();
StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
while (reducerIter.hasNext()) {
reducersStr.append(",");
reducersStr.append(reducerIter.next().getName());
}
throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " +
"This means there was a bug in planning the pipeline when it was deployed. ");
}
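// the phase runs as a map-only job unless it contains an aggregator or joiner, which adds a reduce stage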
job.setMapperClass(ETLMapper.class);
if (reducers.isEmpty()) {
job.setNumReduceTasks(0);
} else {
job.setReducerClass(ETLReducer.class);
}
final Map<String, SinkOutput> sinkOutputs = new HashMap<>();
final Map<String, String> inputAliasToStage = new HashMap<>();
// Collect field operations emitted by various stages in this MapReduce program
final Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
// call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
for (final String stageName : phase.getDag().getTopologicalOrder()) {
final StageSpec stageSpec = phase.getStage(stageName);
String pluginType = stageSpec.getPluginType();
boolean isConnectorSource =
Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
boolean isConnectorSink =
Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
SubmitterPlugin submitterPlugin = null;
if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider =
new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(
stageName, context, batchSource, contextProvider,
new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext sourceContext) {
for (String inputAlias : sourceContext.getInputNames()) {
inputAliasToStage.put(inputAlias, stageName);
}
stageOperations.put(stageName, sourceContext.getFieldOperations());
}
});
} else if (BatchSink.PLUGIN_TYPE.equals(pluginType) ||
AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider =
new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(
stageName, context, batchSink, contextProvider,
new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext sinkContext) {
sinkOutputs.put(stageName, new SinkOutput(sinkContext.getOutputNames()));
stageOperations.put(stageName, sinkContext.getFieldOperations());
}
});
} else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider =
new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(
stageName, context, transform, contextProvider,
new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext context) {
stageOperations.put(stageName, context.getFieldOperations());
}
});
} else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
final BatchAggregator<?, ?, ?> aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultAggregatorContext> contextProvider =
new AggregatorContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
submitterPlugin = new SubmitterPlugin<>(
stageName, context, aggregator, contextProvider,
new SubmitterPlugin.PrepareAction<DefaultAggregatorContext>() {
@Override
public void act(DefaultAggregatorContext aggregatorContext) {
if (aggregatorContext.getNumPartitions() != null) {
job.setNumReduceTasks(aggregatorContext.getNumPartitions());
}
Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
Class<?> outputValClass = aggregatorContext.getGroupValueClass();
if (outputKeyClass == null) {
outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
}
if (outputValClass == null) {
outputValClass = TypeChecker.getGroupValueClass(aggregator);
}
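// record the original key/value class names in the configuration; the classes handed to the job below
// may be swapped for their Writable equivalents by getOutputKeyClass/getOutputValClass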
hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
hConf.set(MAP_VAL_CLASS, outputValClass.getName());
job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
job.setMapOutputValueClass(getOutputValClass(stageName, outputValClass));
stageOperations.put(stageName, aggregatorContext.getFieldOperations());
}
});
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
final BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultJoinerContext> contextProvider =
new JoinerContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
submitterPlugin = new SubmitterPlugin<>(
stageName, context, batchJoiner, contextProvider,
new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
@Override
public void act(DefaultJoinerContext joinerContext) {
if (joinerContext.getNumPartitions() != null) {
job.setNumReduceTasks(joinerContext.getNumPartitions());
}
Class<?> outputKeyClass = joinerContext.getJoinKeyClass();
Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
if (outputKeyClass == null) {
outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
}
if (inputRecordClass == null) {
inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
}
hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
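// the return value is ignored here; the call only validates that the record class can be used as a Writable value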
getOutputValClass(stageName, inputRecordClass);
// for joiner plugin map output is tagged with stageName
job.setMapOutputValueClass(TaggedWritable.class);
stageOperations.put(stageName, joinerContext.getFieldOperations());
}
});
}
if (submitterPlugin != null) {
submitterPlugin.prepareRun();
finishers.add(submitterPlugin);
}
}
hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
finisher = new CompositeFinisher(finishers);
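// the composite finisher is invoked from destroy() and delegates to each stage prepared above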
job.setMapperClass(ETLMapper.class);
WorkflowToken token = context.getWorkflowToken();
if (token != null) {
for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
token.put(entry.getKey(), entry.getValue());
}
// Put the collected field operations in workflow token
token.put(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN, GSON.toJson(stageOperations));
}
// token is null when just the mapreduce job is run but not the entire workflow
// we still want things to work in that case.
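// serializing the arguments into the job configuration makes them available to every map and reduce task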
hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
}
private Class<?> getOutputKeyClass(String reducerName, Class<?> outputKeyClass) {
// the class may not be a WritableComparable, but may be a common type we support,
// for example, a String or a StructuredRecord
WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
// if the conversion is null, it means the user is using their own object.
if (writableConversion != null) {
outputKeyClass = writableConversion.getWritableClass();
}
// check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
throw new IllegalArgumentException(String.format(
"Invalid reducer %s. The key class %s must implement Hadoop's WritableComparable.",
reducerName, outputKeyClass));
}
return outputKeyClass;
}
private Class<?> getOutputValClass(String reducerName, Class<?> outputValClass) {
WritableConversion writableConversion = WritableConversions.getConversion(outputValClass.getName());
if (writableConversion != null) {
outputValClass = writableConversion.getWritableClass();
}
if (!Writable.class.isAssignableFrom(outputValClass)) {
throw new IllegalArgumentException(String.format(
"Invalid reducer %s. The value class %s must implement Hadoop's Writable.",
reducerName, outputValClass));
}
return outputValClass;
}
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void destroy() {
boolean isSuccessful = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
// finisher can be null if the initialize() method failed
if (finisher != null) {
finisher.onFinish(isSuccessful);
}
LOG.info("Batch Run finished : status = {}", getContext().getState());
}
/**
* Provider for MapReduceBatchContexts.
*/
private static class MapReduceBatchContextProvider implements ContextProvider<MapReduceBatchContext> {
private final MapReduceContext context;
private final PipelineRuntime pipelineRuntime;
private final StageSpec stageSpec;
private final Set<String> connectorDatasets;
private MapReduceBatchContextProvider(MapReduceContext context, PipelineRuntime pipelineRuntime,
StageSpec stageSpec, Set<String> connectorDatasets) {
this.context = context;
this.pipelineRuntime = pipelineRuntime;
this.stageSpec = stageSpec;
this.connectorDatasets = connectorDatasets;
}
@Override
public MapReduceBatchContext getContext(DatasetContext datasetContext) {
return new MapReduceBatchContext(context, pipelineRuntime, stageSpec, connectorDatasets, datasetContext);
}
}
/**
* Mapper Driver for ETL Transforms.
*/
public static class ETLMapper extends Mapper implements ProgramLifecycle<MapReduceTaskContext<Object, Object>> {
private TransformRunner<Object, Object> transformRunner;
// injected by CDAP
@SuppressWarnings("unused")
private Metrics mapperMetrics;
@Override
public void initialize(MapReduceTaskContext<Object, Object> context) throws Exception {
// get source, transform, sink ids from program properties
Map<String, String> properties = context.getSpecification().getProperties();
if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
transformRunner = new TransformRunner<>(context, mapperMetrics);
}
@Override
public void map(Object key, Object value, Mapper.Context context) throws IOException, InterruptedException {
try {
transformRunner.transform(key, value);
} catch (StageFailureException e) {
PIPELINE_LOG.error("{}", e.getMessage(), e.getCause());
Throwables.propagate(e.getCause());
} catch (Exception e) {
Throwables.propagate(e);
}
}
@Override
public void destroy() {
transformRunner.destroy();
}
}
/**
* Reducer for a phase of an ETL pipeline.
*/
public static class ETLReducer extends Reducer implements ProgramLifecycle<MapReduceTaskContext<Object, Object>> {
// injected by CDAP
@SuppressWarnings("unused")
private Metrics reducerMetrics;
private TransformRunner<Object, Object> transformRunner;
@Override
public void initialize(MapReduceTaskContext<Object, Object> context) throws Exception {
// get source, transform, sink ids from program properties
Map<String, String> properties = context.getSpecification().getProperties();
if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
transformRunner = new TransformRunner<>(context, reducerMetrics);
}
@Override
protected void reduce(Object key, Iterable values, Context context) throws IOException, InterruptedException {
try {
transformRunner.transform(key, values.iterator());
} catch (StageFailureException e) {
PIPELINE_LOG.error("{}", e.getMessage(), e.getCause());
Throwables.propagate(e.getCause());
} catch (Exception e) {
Throwables.propagate(e);
}
}
@Override
public void destroy() {
transformRunner.destroy();
}
}
}