
io.cdap.cdap.internal.app.runtime.batch.dataset.AbstractBatchReadableInputFormat

/*
 * Copyright © 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package io.cdap.cdap.internal.app.runtime.batch.dataset;

import com.google.common.io.ByteArrayDataInput;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import io.cdap.cdap.api.common.Bytes;
import io.cdap.cdap.api.data.batch.BatchReadable;
import io.cdap.cdap.api.data.batch.Split;
import io.cdap.cdap.api.data.batch.SplitReader;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * An abstract {@link InputFormat} implementation that reads from {@link BatchReadable}.
 *
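 * <p>Dataset coordinates and the pre-computed splits are carried through the job {@link
 * Configuration}: {@link #setDatasetSplits} stores them when the job is configured, {@link
 * #getSplits} decodes them again, and {@link #createRecordReader} reads each split through a
 * {@link SplitReader} obtained from the {@link BatchReadable}.
 *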
 * @param <KEY> Type of key.
 * @param <VALUE> Type of value.
 */
public abstract class AbstractBatchReadableInputFormat<KEY, VALUE> extends InputFormat<KEY, VALUE> {

  private static final Gson GSON = new Gson();
  private static final Type DATASET_ARGS_TYPE = new TypeToken<Map<String, String>>() {
  }.getType();

  // Keys for storing values in Hadoop Configuration
  private static final String DATASET_NAMESPACE = "input.datasetinputformat.dataset.namespace";
  private static final String DATASET_NAME = "input.datasetinputformat.dataset.name";
  private static final String DATASET_ARGS = "input.datasetinputformat.dataset.args";
  private static final String SPLITS = "input.datasetinputformat.splits";
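  // These keys are written by setDatasetSplits(...) when the job is configured and read back
  // by getSplits(...) and createRecordReader(...) when the job executes.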

  /**
   * Sets dataset and splits information into the given {@link Configuration}.
   *
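   * <p>Typical usage when configuring a job (illustrative; {@code job}, the namespace
   * {@code "default"}, the dataset name {@code "purchases"}, and the {@code readable}
   * instance, a {@link BatchReadable} obtained elsewhere, are placeholders):
   * <pre>{@code
   * List<Split> splits = readable.getSplits();
   * AbstractBatchReadableInputFormat.setDatasetSplits(job.getConfiguration(),
   *     "default", "purchases", Collections.emptyMap(), splits);
   * }</pre>
   *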
   * @param hConf configuration to modify
   * @param datasetNamespace namespace of the dataset
   * @param datasetName name of the dataset
   * @param datasetArguments arguments for the dataset
   * @param splits list of splits on the dataset
   */
  public static void setDatasetSplits(Configuration hConf, @Nullable String datasetNamespace,
      String datasetName, Map<String, String> datasetArguments,
      List<Split> splits) throws IOException {
    if (datasetNamespace != null) {
      hConf.set(DATASET_NAMESPACE, datasetNamespace);
    }
    hConf.set(DATASET_NAME, datasetName);
    hConf.set(DATASET_ARGS, GSON.toJson(datasetArguments, DATASET_ARGS_TYPE));

    // Encode the list of splits as a count followed by that many serialized DataSetInputSplit objects.
    ByteArrayDataOutput dataOutput = ByteStreams.newDataOutput();
    dataOutput.writeInt(splits.size());
    for (Split split : splits) {
      new DataSetInputSplit(split).write(dataOutput);
    }
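    // Configuration values must be strings, so encode the raw bytes in a binary-safe, printable form.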
    hConf.set(SPLITS, Bytes.toStringBinary(dataOutput.toByteArray()));
  }

  @Override
  public List<InputSplit> getSplits(final JobContext context)
      throws IOException, InterruptedException {
    // Decode splits from Configuration
    String splitsConf = context.getConfiguration().get(SPLITS);
    if (splitsConf == null) {
      throw new IOException("No input splits available from job configuration.");
    }
    ByteArrayDataInput dataInput = ByteStreams.newDataInput(Bytes.toBytesBinary(splitsConf));
    int size = dataInput.readInt();
    List<InputSplit> splits = new ArrayList<>(size);
    for (int i = 0; i < size; i++) {
      DataSetInputSplit inputSplit = new DataSetInputSplit();
      inputSplit.readFields(dataInput);
      splits.add(inputSplit);
    }
    return splits;
  }

  @Override
  public RecordReader<KEY, VALUE> createRecordReader(InputSplit split,
      TaskAttemptContext context)
      throws IOException, InterruptedException {

    DataSetInputSplit inputSplit = (DataSetInputSplit) split;
    Configuration conf = context.getConfiguration();

    // Recover the dataset coordinates that were recorded by setDatasetSplits(...).
    String datasetNamespace = conf.get(DATASET_NAMESPACE);
    String datasetName = conf.get(DATASET_NAME);
    Map<String, String> datasetArgs = GSON.fromJson(conf.get(DATASET_ARGS), DATASET_ARGS_TYPE);

    @SuppressWarnings("unchecked")
    BatchReadable<KEY, VALUE> batchReadable = createBatchReadable(context, datasetNamespace,
        datasetName, datasetArgs);
    SplitReader<KEY, VALUE> splitReader = batchReadable.createSplitReader(inputSplit.getSplit());
    return new SplitReaderRecordReader<>(splitReader);
  }

  /**
   * Subclasses need to implement this method to return a {@link BatchReadable} for reading
   * records from the given dataset.
   *
   * @param context the hadoop task context
   * @param datasetName name of the dataset to read from
   * @param datasetArgs arguments of the dataset to read from
   */
  protected abstract BatchReadable<KEY, VALUE> createBatchReadable(TaskAttemptContext context,
      String datasetName, Map<String, String> datasetArgs);

  /**
   * Subclasses need to implement this method to return a {@link BatchReadable} for reading
   * records from the given dataset in another namespace.
   *
   * @param context the hadoop task context
   * @param datasetNamespace namespace of the dataset
   * @param datasetName name of the dataset to read from
   * @param datasetArgs arguments of the dataset to read from
   */
  protected abstract BatchReadable<KEY, VALUE> createBatchReadable(TaskAttemptContext context,
      @Nullable String datasetNamespace,
      String datasetName, Map<String, String> datasetArgs);
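
  // A minimal subclass sketch (illustrative): the DatasetAccessor helper below is hypothetical
  // and stands in for whatever mechanism the runtime provides to instantiate datasets.
  //
  //   public final class MyInputFormat<K, V> extends AbstractBatchReadableInputFormat<K, V> {
  //
  //     @Override
  //     protected BatchReadable<K, V> createBatchReadable(TaskAttemptContext context,
  //         String datasetName, Map<String, String> datasetArgs) {
  //       // No explicit namespace given: delegate to the namespace-aware variant.
  //       return createBatchReadable(context, null, datasetName, datasetArgs);
  //     }
  //
  //     @Override
  //     protected BatchReadable<K, V> createBatchReadable(TaskAttemptContext context,
  //         @Nullable String datasetNamespace, String datasetName,
  //         Map<String, String> datasetArgs) {
  //       return DatasetAccessor.get(context, datasetNamespace, datasetName, datasetArgs);
  //     }
  //   }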


  /**
   * Implementation of {@link RecordReader} by delegating to the underlying {@link SplitReader}.
   *
   * @param <KEY> type of key returned by this reader
   * @param <VALUE> type of value returned by this reader
   */
  private static final class SplitReaderRecordReader<KEY, VALUE> extends RecordReader<KEY, VALUE> {

    private final SplitReader<KEY, VALUE> splitReader;

    SplitReaderRecordReader(final SplitReader<KEY, VALUE> splitReader) {
      this.splitReader = splitReader;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      DataSetInputSplit inputSplit = (DataSetInputSplit) split;
      splitReader.initialize(inputSplit.getSplit());
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      return splitReader.nextKeyValue();
    }

    @Override
    public KEY getCurrentKey() throws IOException, InterruptedException {
      return splitReader.getCurrentKey();
    }

    @Override
    public VALUE getCurrentValue() throws IOException, InterruptedException {
      return splitReader.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return splitReader.getProgress();
    }

    @Override
    public void close() throws IOException {
      splitReader.close();
    }
  }
}