/**
* Copyright 2014 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.mapreduce;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.kitesdk.compat.Hadoop;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.spi.DefaultConfiguration;
import org.kitesdk.data.spi.PartitionKey;
import org.kitesdk.data.spi.PartitionedDataset;
import org.kitesdk.data.TypeNotFoundException;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.DataModelUtil;
import org.kitesdk.data.spi.InputFormatAccessor;
import org.kitesdk.data.spi.filesystem.FileSystemDataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A MapReduce {@code InputFormat} for reading from a {@link Dataset}.
*
* Since a {@code Dataset} only contains entities (not key/value pairs), this input
* format ignores the value.
*
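* A minimal usage sketch (assumes Hadoop 2's {@code Job.getInstance}; the dataset
* URI and the {@code MyEvent} entity class are illustrative, not part of this API):
* <pre>
* Job job = Job.getInstance(new Configuration());
* DatasetKeyInputFormat.configure(job)
*     .readFrom("dataset:hdfs:/datasets/events")   // hypothetical URI
*     .withType(MyEvent.class);                    // hypothetical entity class
* // map input keys are MyEvent instances; the values are ignored
* </pre>
*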
* @param <E> The type of entities in the {@code Dataset}.
*/
public class DatasetKeyInputFormat<E> extends InputFormat<E, Void>
implements Configurable {
private static final Logger LOG =
LoggerFactory.getLogger(DatasetKeyInputFormat.class);
public static final String KITE_INPUT_URI = "kite.inputUri";
public static final String KITE_PARTITION_DIR = "kite.inputPartitionDir";
public static final String KITE_TYPE = "kite.inputEntityType";
public static final String KITE_READER_SCHEMA = "kite.readerSchema";
private Configuration conf;
private InputFormat<E, Void> delegate;
public static class ConfigBuilder {
private final Configuration conf;
private ConfigBuilder(Configuration conf) {
this.conf = conf;
}
/**
* Adds configuration for {@code DatasetKeyInputFormat} to read from the
* given dataset or view URI.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
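*
* For example, {@code readFrom(URI.create("dataset:hdfs:/datasets/events"))},
* where the URI is illustrative; the schemes actually available depend on the
* {@code Dataset} implementations on the classpath.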
*
* @param uri a dataset or view URI
* @return this for method chaining
*/
public ConfigBuilder readFrom(URI uri) {
return readFrom(Datasets.load(uri));
}
/**
* Adds configuration for {@code DatasetKeyInputFormat} to read from the
* given {@link Dataset} or {@link View} instance.
*
* @param view a dataset or view
* @return this for method chaining
*/
public ConfigBuilder readFrom(View<?> view) {
DatasetDescriptor descriptor = view.getDataset().getDescriptor();
// if this is a partitioned dataset, add the partition location
if (view instanceof FileSystemDataset) {
conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
}
// add descriptor properties to the config
for (String property : descriptor.listProperties()) {
conf.set(property, descriptor.getProperty(property));
}
if (DataModelUtil.isGeneric(view.getType())) {
Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
// only set the read schema if the view is a projection
if (!datasetSchema.equals(view.getSchema())) {
withSchema(view.getSchema());
}
} else {
withType(view.getType());
}
conf.set(KITE_INPUT_URI, view.getUri().toString());
return this;
}
/**
* Adds configuration for {@code DatasetKeyInputFormat} to read from the
* given dataset or view URI string.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI string
* @return this for method chaining
*/
public ConfigBuilder readFrom(String uri) {
return readFrom(URI.create(uri));
}
/**
* Sets the entity Class that the input Dataset should produce.
*
* This Class is used to configure the input {@code Dataset}. If this class
* cannot be found during job setup, the job will fail and throw a
* {@link org.kitesdk.data.TypeNotFoundException}.
*
* If the type is set, then the type's schema is used for the expected
* schema and {@link #withSchema(Schema)} should not be called. This may,
* however, be used at the same time if the type is a generic record
* subclass.
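*
* For example, {@code withType(MyEvent.class)}, where {@code MyEvent} is an
* illustrative Avro specific record class.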
*
* @param type the entity Class that will be produced
* @return this for method chaining
*/
public <E> ConfigBuilder withType(Class<E> type) {
String readerSchema = conf.get(KITE_READER_SCHEMA);
Preconditions.checkArgument(
DataModelUtil.isGeneric(type) || readerSchema == null,
"Can't configure a type when a reader schema is already set: {}",
readerSchema);
conf.setClass(KITE_TYPE, type, type);
return this;
}
/**
* Sets the expected schema to use when reading records from the Dataset.
*
* If this schema is set, {@link #withType(Class)} should only be called
* with a generic record subclass.
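*
* A common use is projection: a reader schema containing a subset of the
* dataset's fields causes only those fields to be read. An illustrative
* sketch (the schema literal and {@code job} are placeholders):
* <pre>
* Schema projection = new Schema.Parser().parse(
*     "{\"type\": \"record\", \"name\": \"Event\", \"fields\": ["
*     + "{\"name\": \"id\", \"type\": \"long\"}]}");
* DatasetKeyInputFormat.configure(job)
*     .readFrom("dataset:hdfs:/datasets/events")
*     .withSchema(projection);
* </pre>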
*
* @param readerSchema the expected entity schema
* @return this for method chaining
* @since 1.1.0
*/
public ConfigBuilder withSchema(Schema readerSchema) {
Class<?> type = conf.getClass(KITE_TYPE, null);
Preconditions.checkArgument(
type == null || DataModelUtil.isGeneric(type),
"Can't configure a reader schema when a type is already set: {}",
type);
conf.set(KITE_READER_SCHEMA, readerSchema.toString());
return this;
}
}
/**
* Configures the {@code Job} to use the {@code DatasetKeyInputFormat} and
* returns a helper to add further configuration.
*
* @param job the {@code Job} to configure
*
* @since 0.15.0
*/
public static ConfigBuilder configure(Job job) {
job.setInputFormatClass(DatasetKeyInputFormat.class);
Configuration conf = Hadoop.JobContext.getConfiguration.invoke(job);
return new ConfigBuilder(conf);
}
/**
* Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
* and returns a helper to add further configuration.
*
* @param conf a {@code Configuration}
*
* @since 0.15.0
*/
public static ConfigBuilder configure(Configuration conf) {
setInputFormatClass(conf);
return new ConfigBuilder(conf);
}
private static void setInputFormatClass(Configuration conf) {
if (Hadoop.isHadoop1()) {
conf.set("mapreduce.inputformat.class",
DatasetKeyInputFormat.class.getName());
} else {
// build a job with an empty conf
Job fakeJob = Hadoop.Job.newInstance.invoke(new Configuration(false));
fakeJob.setInputFormatClass(DatasetKeyInputFormat.class);
// then copy any created entries into the real conf
for (Map.Entry<String, String> entry : fakeJob.getConfiguration()) {
conf.set(entry.getKey(), entry.getValue());
}
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration configuration) {
conf = configuration;
View<E> view = load(configuration);
String partitionDir = conf.get(KITE_PARTITION_DIR);
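// when a partition directory is configured for a partitioned dataset, delegate
// to an input format that reads only that partition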
if (view.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
delegate = getDelegateInputFormatForPartition(view.getDataset(), partitionDir, conf);
} else {
delegate = getDelegateInputFormat(view, conf);
}
}
@SuppressWarnings("unchecked")
private InputFormat<E, Void> getDelegateInputFormat(View<E> view, Configuration conf) {
if (view instanceof InputFormatAccessor) {
return ((InputFormatAccessor<E>) view).getInputFormat(conf);
}
throw new UnsupportedOperationException("Implementation " +
"does not provide InputFormat support. View: " + view);
}
private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
String partitionDir, Configuration conf) {
if (!(dataset instanceof FileSystemDataset)) {
throw new UnsupportedOperationException("Partitions only supported for " +
"FileSystemDataset. Dataset: " + dataset);
}
FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
dataset, partitionDir);
PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
LOG.debug("Partition key: {}", key);
if (key != null) {
PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
LOG.debug("Partition: {}", partition);
return getDelegateInputFormat(partition, conf);
}
throw new DatasetException("Cannot find partition " + partitionDir);
}
@SuppressWarnings({"deprecation", "unchecked"})
private static <E> View<E> load(Configuration conf) {
Class<E> type;
try {
type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
} catch (RuntimeException e) {
if (e.getCause() instanceof ClassNotFoundException) {
throw new TypeNotFoundException(String.format(
"The Java class %s for the entity type could not be found",
conf.get(KITE_TYPE)),
e.getCause());
} else {
throw e;
}
}
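// an explicit reader schema, when present, is applied as a projection over the
// dataset's schema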
String schemaStr = conf.get(KITE_READER_SCHEMA);
Schema projection = null;
if (schemaStr != null) {
projection = new Schema.Parser().parse(schemaStr);
}
String inputUri = conf.get(KITE_INPUT_URI);
if (projection != null) {
return Datasets.load(inputUri).asSchema(projection).asType(type);
} else {
return Datasets.load(inputUri, type);
}
}
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
justification="Delegate set by setConf")
public List<InputSplit> getSplits(JobContext jobContext) throws IOException,
InterruptedException {
return delegate.getSplits(jobContext);
}
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
justification="Delegate set by setConf")
public RecordReader<E, Void> createRecordReader(InputSplit inputSplit,
TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
Configuration conf = Hadoop.TaskAttemptContext.getConfiguration.invoke(taskAttemptContext);
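// register the task-side Configuration as Kite's default so dataset loading in
// the delegate record reader picks it up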
DefaultConfiguration.init(conf);
return delegate.createRecordReader(inputSplit, taskAttemptContext);
}
}