org.kitesdk.data.mapreduce.DatasetKeyInputFormat

The Kite Data MapReduce module provides MapReduce support for working with Kite datasets.
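
As a quick illustration of how the module is typically used, the sketch below wires DatasetKeyInputFormat into a MapReduce job. The job name and the dataset URI are hypothetical placeholders; any "dataset:" or "view:" URI accepted by Datasets.load would work.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.kitesdk.data.mapreduce.DatasetKeyInputFormat;

public class ReadEventsDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "read-events");
    // configure(job) sets DatasetKeyInputFormat as the job's InputFormat and
    // returns a ConfigBuilder for further settings; readFrom(...) stores the
    // dataset/view URI (a hypothetical path here) in the job configuration.
    DatasetKeyInputFormat.configure(job)
        .readFrom("dataset:hdfs:/tmp/data/events");
    // withType(...) or withSchema(...) could be chained here to control the
    // entity class or a projection schema; mapper/reducer/output setup follows.
  }
}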

/**
 * Copyright 2014 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.mapreduce;

import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.kitesdk.compat.Hadoop;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.spi.DefaultConfiguration;
import org.kitesdk.data.spi.PartitionKey;
import org.kitesdk.data.spi.PartitionedDataset;
import org.kitesdk.data.TypeNotFoundException;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.DataModelUtil;
import org.kitesdk.data.spi.InputFormatAccessor;
import org.kitesdk.data.spi.filesystem.FileSystemDataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A MapReduce {@code InputFormat} for reading from a {@link Dataset}.
 *
 * Since a {@code Dataset} only contains entities (not key/value pairs), this input
 * format ignores the value.
 *
 * @param <E> The type of entities in the {@code Dataset}.
 */
public class DatasetKeyInputFormat<E> extends InputFormat<E, Void>
    implements Configurable {

  private static final Logger LOG =
      LoggerFactory.getLogger(DatasetKeyInputFormat.class);

  public static final String KITE_INPUT_URI = "kite.inputUri";
  public static final String KITE_PARTITION_DIR = "kite.inputPartitionDir";
  public static final String KITE_TYPE = "kite.inputEntityType";
  public static final String KITE_READER_SCHEMA = "kite.readerSchema";

  private Configuration conf;
  private InputFormat<E, Void> delegate;

  public static class ConfigBuilder {
    private final Configuration conf;

    private ConfigBuilder(Configuration conf) {
      this.conf = conf;
    }

    /**
     * Adds configuration for {@code DatasetKeyInputFormat} to read from the
     * given dataset or view URI.
     * <p>
     * URI formats are defined by {@link Dataset} implementations, but must
     * begin with "dataset:" or "view:". For more information, see
     * {@link Datasets}.
     *
     * @param uri a dataset or view URI
     * @return this for method chaining
     */
    public ConfigBuilder readFrom(URI uri) {
      return readFrom(Datasets.load(uri));
    }

    /**
     * Adds configuration for {@code DatasetKeyInputFormat} to read from the
     * given {@link Dataset} or {@link View} instance.
     *
     * @param view a dataset or view
     * @return this for method chaining
     */
    public ConfigBuilder readFrom(View<?> view) {
      DatasetDescriptor descriptor = view.getDataset().getDescriptor();
      // if this is a partitioned dataset, add the partition location
      if (view instanceof FileSystemDataset) {
        conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
      }
      // add descriptor properties to the config
      for (String property : descriptor.listProperties()) {
        conf.set(property, descriptor.getProperty(property));
      }

      if (DataModelUtil.isGeneric(view.getType())) {
        Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
        // only set the read schema if the view is a projection
        if (!datasetSchema.equals(view.getSchema())) {
          withSchema(view.getSchema());
        }
      } else {
        withType(view.getType());
      }

      conf.set(KITE_INPUT_URI, view.getUri().toString());
      return this;
    }

    /**
     * Adds configuration for {@code DatasetKeyInputFormat} to read from the
     * given dataset or view URI string.
     * <p>
     * URI formats are defined by {@link Dataset} implementations, but must
     * begin with "dataset:" or "view:". For more information, see
     * {@link Datasets}.
     *
     * @param uri a dataset or view URI string
     * @return this for method chaining
     */
    public ConfigBuilder readFrom(String uri) {
      return readFrom(URI.create(uri));
    }

    /**
     * Sets the entity Class that the input Dataset should produce.
     * <p>
     * This Class is used to configure the input {@code Dataset}. If this class
     * cannot be found during job setup, the job will fail and throw a
     * {@link org.kitesdk.data.TypeNotFoundException}.
     * <p>
     * If the type is set, then the type's schema is used for the expected
     * schema and {@link #withSchema(Schema)} should not be called. This may,
     * however, be used at the same time if the type is a generic record
     * subclass.
     *
     * @param type the entity Class that will be produced
     * @return this for method chaining
     */
    public <E> ConfigBuilder withType(Class<E> type) {
      String readerSchema = conf.get(KITE_READER_SCHEMA);
      Preconditions.checkArgument(
          DataModelUtil.isGeneric(type) || readerSchema == null,
          "Can't configure a type when a reader schema is already set: {}",
          readerSchema);
      conf.setClass(KITE_TYPE, type, type);
      return this;
    }

    /**
     * Sets the expected schema to use when reading records from the Dataset.
     * <p>
     * If this schema is set, {@link #withType(Class)} should only be called
     * with a generic record subclass.
     *
     * @param readerSchema the expected entity schema
     * @return this for method chaining
     * @since 1.1.0
     */
    public ConfigBuilder withSchema(Schema readerSchema) {
      Class<?> type = conf.getClass(KITE_TYPE, null);
      Preconditions.checkArgument(
          type == null || DataModelUtil.isGeneric(type),
          "Can't configure a reader schema when a type is already set: {}",
          type);
      conf.set(KITE_READER_SCHEMA, readerSchema.toString());
      return this;
    }
  }

  /**
   * Configures the {@code Job} to use the {@code DatasetKeyInputFormat} and
   * returns a helper to add further configuration.
   *
   * @param job the {@code Job} to configure
   *
   * @since 0.15.0
   */
  public static ConfigBuilder configure(Job job) {
    job.setInputFormatClass(DatasetKeyInputFormat.class);
    Configuration conf = Hadoop.JobContext.getConfiguration.invoke(job);
    return new ConfigBuilder(conf);
  }

  /**
   * Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
   * and returns a helper to add further configuration.
   *
   * @param conf a {@code Configuration}
   *
   * @since 0.15.0
   */
  public static ConfigBuilder configure(Configuration conf) {
    setInputFormatClass(conf);
    return new ConfigBuilder(conf);
  }

  private static void setInputFormatClass(Configuration conf) {
    if (Hadoop.isHadoop1()) {
      conf.set("mapreduce.inputformat.class",
          DatasetKeyInputFormat.class.getName());
    } else {
      // build a job with an empty conf
      Job fakeJob = Hadoop.Job.newInstance.invoke(new Configuration(false));
      fakeJob.setInputFormatClass(DatasetKeyInputFormat.class);
      // then copy any created entries into the real conf
      for (Map.Entry<String, String> entry : fakeJob.getConfiguration()) {
        conf.set(entry.getKey(), entry.getValue());
      }
    }
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration configuration) {
    conf = configuration;

    View<E> view = load(configuration);

    String partitionDir = conf.get(KITE_PARTITION_DIR);
    if (view.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
      delegate = getDelegateInputFormatForPartition(view.getDataset(),
          partitionDir, conf);
    } else {
      delegate = getDelegateInputFormat(view, conf);
    }
  }

  @SuppressWarnings("unchecked")
  private InputFormat<E, Void> getDelegateInputFormat(View<E> view, Configuration conf) {
    if (view instanceof InputFormatAccessor) {
      return ((InputFormatAccessor<E>) view).getInputFormat(conf);
    }
    throw new UnsupportedOperationException("Implementation " +
        "does not provide InputFormat support. View: " + view);
  }

  private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
      String partitionDir, Configuration conf) {
    if (!(dataset instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + dataset);
    }
    FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
    LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
        dataset, partitionDir);
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    LOG.debug("Partition key: {}", key);
    if (key != null) {
      PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
      LOG.debug("Partition: {}", partition);
      return getDelegateInputFormat(partition, conf);
    }
    throw new DatasetException("Cannot find partition " + partitionDir);
  }

  @SuppressWarnings({"deprecation", "unchecked"})
  private static <E> View<E> load(Configuration conf) {
    Class<E> type;
    try {
      type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
      if (e.getCause() instanceof ClassNotFoundException) {
        throw new TypeNotFoundException(String.format(
            "The Java class %s for the entity type could not be found",
            conf.get(KITE_TYPE)), e.getCause());
      } else {
        throw e;
      }
    }

    String schemaStr = conf.get(KITE_READER_SCHEMA);
    Schema projection = null;
    if (schemaStr != null) {
      projection = new Schema.Parser().parse(schemaStr);
    }

    String inputUri = conf.get(KITE_INPUT_URI);
    if (projection != null) {
      return Datasets.load(inputUri).asSchema(projection).asType(type);
    } else {
      return Datasets.load(inputUri, type);
    }
  }

  @Override
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
      justification="Delegate set by setConf")
  public List<InputSplit> getSplits(JobContext jobContext) throws IOException,
      InterruptedException {
    return delegate.getSplits(jobContext);
  }

  @Override
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
      justification="Delegate set by setConf")
  public RecordReader<E, Void> createRecordReader(InputSplit inputSplit,
      TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
    Configuration conf = Hadoop.TaskAttemptContext
        .getConfiguration.invoke(taskAttemptContext);
    DefaultConfiguration.init(conf);
    return delegate.createRecordReader(inputSplit, taskAttemptContext);
  }
}
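
For context on the key/value contract above: each entity read from the dataset is handed to the mapper as the input key, and the input value is always null. A mapper over a generic-record dataset might therefore look like the following sketch; the "user_id" field and the Text/IntWritable outputs are illustrative assumptions, not part of this class.

import java.io.IOException;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Entities arrive as keys; the value from DatasetKeyInputFormat is always null.
public class EventCountMapper extends Mapper<GenericData.Record, Void, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);

  @Override
  protected void map(GenericData.Record record, Void ignored, Context context)
      throws IOException, InterruptedException {
    // "user_id" is a hypothetical field used only for illustration
    context.write(new Text(String.valueOf(record.get("user_id"))), ONE);
  }
}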




