io.cdap.cdap.internal.app.runtime.batch.dataset.AbstractBatchWritableOutputFormat Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of cdap-app-fabric Show documentation
There is a newer version: 6.10.1
/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.cdap.internal.app.runtime.batch.dataset;

import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import io.cdap.cdap.api.data.batch.BatchWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.Map;

/**
 * An abstract base implementation of {@link OutputFormat} for writing to {@link BatchWritable} from batch job.
 *
 * @param  type of the key
 * @param  type of the value
 */
public abstract class AbstractBatchWritableOutputFormat extends OutputFormat {

  private static final Gson GSON = new Gson();
  private static final Type DATASET_ARGS_TYPE = new TypeToken>() { }.getType();

  private static final String DATASET_NAMESPACE = "output.datasetoutputformat.dataset.namespace";
  private static final String DATASET_NAME = "output.datasetoutputformat.dataset.name";
  private static final String DATASET_ARGS = "output.datasetoutputformat.dataset.args";

  /**
   * Sets dataset information into the given {@link Configuration}.
   *
   * @param hConf       configuration to modify
   * @param datasetName name of the dataset
   * @param datasetArgs arguments for the dataset
   */
  public static void setDataset(Configuration hConf, String namespace,
                                String datasetName, Map datasetArgs) {
    hConf.set(DATASET_NAMESPACE, namespace);
    hConf.set(DATASET_NAME, datasetName);
    hConf.set(DATASET_ARGS, GSON.toJson(datasetArgs, DATASET_ARGS_TYPE));
  }

  @Override
  public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    String namespace = conf.get(DATASET_NAMESPACE);
    String datasetName = conf.get(DATASET_NAME);
    Map datasetArgs = GSON.fromJson(conf.get(DATASET_ARGS), DATASET_ARGS_TYPE);
    return new BatchWritableRecordWriter<>(createBatchWritable(context, namespace, datasetName, datasetArgs));
  }

  @Override
  public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    Configuration hConf = context.getConfiguration();
    if (hConf.get(DATASET_NAMESPACE) == null || hConf.get(DATASET_NAME) == null || hConf.get(DATASET_ARGS) == null) {
      throw new IOException("Dataset configurations are missing in the job configuration");
    }
  }

  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    return new NoopOutputCommitter();
  }

  /**
   * Subclass needs to implementation this method to return a {@link BatchWritable} for writing records to
   * the given dataset.
   *
   * @param context the hadoop task context
   * @param namespace namespace of the dataset to write to
   * @param datasetName name of the dataset to write to
   * @param datasetArgs arguments of the dataset to write to
   */
  protected abstract CloseableBatchWritable createBatchWritable(TaskAttemptContext context,
                                                                            String namespace,
                                                                            String datasetName,
                                                                            Map datasetArgs);

  /**
   * Implementation of {@link RecordWriter} to write through a {@link CloseableBatchWritable}.
   */
  private static final class BatchWritableRecordWriter extends RecordWriter {

    private final CloseableBatchWritable delegate;

    private BatchWritableRecordWriter(CloseableBatchWritable delegate) {
      this.delegate = delegate;
    }

    @Override
    public void write(K key, V value) throws IOException, InterruptedException {
      delegate.write(key, value);
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
      delegate.close();
    }
  }

  /**
   * A no-op implementation of {@link OutputCommitter}.
   */
  private static final class NoopOutputCommitter extends OutputCommitter {
    @Override
    public void setupJob(final JobContext jobContext) throws IOException {
      // DO NOTHING, see needsTaskCommit() comment
    }

    @Override
    public boolean needsTaskCommit(final TaskAttemptContext taskContext) throws IOException {
      // Don't do commit of individual task work. Work is committed on job level. Ops are flushed on RecordWriter.close.
      return false;
    }

    @Override
    public void setupTask(final TaskAttemptContext taskContext) throws IOException {
      // DO NOTHING, see needsTaskCommit() comment
    }

    @Override
    public void commitTask(final TaskAttemptContext taskContext) throws IOException {
      // DO NOTHING, see needsTaskCommit() comment
    }

    @Override
    public void abortTask(final TaskAttemptContext taskContext) throws IOException {
      // DO NOTHING, see needsTaskCommit() comment
    }
  }
}