/**
* Copyright 2014 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.mapreduce;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.kitesdk.compat.Hadoop;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.spi.DefaultConfiguration;
import org.kitesdk.data.spi.PartitionKey;
import org.kitesdk.data.spi.PartitionedDataset;
import org.kitesdk.data.TypeNotFoundException;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.DataModelUtil;
import org.kitesdk.data.spi.InputFormatAccessor;
import org.kitesdk.data.spi.filesystem.FileSystemDataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A MapReduce {@code InputFormat} for reading from a {@link Dataset}.
*
* Since a {@code Dataset} only contains entities (not key/value pairs), this input
* format ignores the value.
*
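* A minimal usage sketch (assumes Hadoop 2's {@code Job.getInstance}; the dataset
* URI and the {@code MyEvent} entity class are illustrative, not part of this API):
* <pre>
* Job job = Job.getInstance(new Configuration());
* DatasetKeyInputFormat.configure(job)
*     .readFrom("dataset:hdfs:/datasets/events")   // hypothetical URI
*     .withType(MyEvent.class);                    // hypothetical entity class
* // map input keys are MyEvent instances; the values are ignored
* </pre>
*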
* @param <E> The type of entities in the {@code Dataset}.
*/
public class DatasetKeyInputFormat<E> extends InputFormat<E, Void>
implements Configurable {
private static final Logger LOG =
LoggerFactory.getLogger(DatasetKeyInputFormat.class);
public static final String KITE_INPUT_URI = "kite.inputUri";
public static final String KITE_PARTITION_DIR = "kite.inputPartitionDir";
public static final String KITE_TYPE = "kite.inputEntityType";
public static final String KITE_READER_SCHEMA = "kite.readerSchema";
private Configuration conf;
private InputFormat<E, Void> delegate;
public static class ConfigBuilder {
private final Configuration conf;
private ConfigBuilder(Configuration conf) {
this.conf = conf;
}
/**
* Adds configuration for {@code DatasetKeyInputFormat} to read from the
* given dataset or view URI.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
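*
* For example, {@code readFrom(URI.create("dataset:hdfs:/datasets/events"))},
* where the URI is illustrative; the schemes actually available depend on the
* {@code Dataset} implementations on the classpath.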
*
* @param uri a dataset or view URI
* @return this for method chaining
*/
public ConfigBuilder readFrom(URI uri) {
return readFrom(Datasets.load(uri));
}
/**
* Adds configuration for {@code DatasetKeyInputFormat} to read from the
* given {@link Dataset} or {@link View} instance.
*
* @param view a dataset or view
* @return this for method chaining
*/
public ConfigBuilder readFrom(View<?> view) {
DatasetDescriptor descriptor = view.getDataset().getDescriptor();
// if this is a partitioned dataset, add the partition location
if (view instanceof FileSystemDataset) {
conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
}
// add descriptor properties to the config
for (String property : descriptor.listProperties()) {
conf.set(property, descriptor.getProperty(property));
}
if (DataModelUtil.isGeneric(view.getType())) {
Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
// only set the read schema if the view is a projection
if (!datasetSchema.equals(view.getSchema())) {
withSchema(view.getSchema());
}
} else {
withType(view.getType());
}
conf.set(KITE_INPUT_URI, view.getUri().toString());
return this;
}
/**
* Adds configuration for {@code DatasetKeyInputFormat} to read from the
* given dataset or view URI string.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI string
* @return this for method chaining
*/
public ConfigBuilder readFrom(String uri) {
return readFrom(URI.create(uri));
}
/**
* Sets the entity Class that the input Dataset should produce.
*
* This Class is used to configure the input {@code Dataset}. If this class
* cannot be found during job setup, the job will fail and throw a
* {@link org.kitesdk.data.TypeNotFoundException}.
*
* If the type is set, then the type's schema is used for the expected
* schema and {@link #withSchema(Schema)} should not be called. This may,
* however, be used at the same time if the type is a generic record
* subclass.
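*
* For example, {@code withType(MyEvent.class)}, where {@code MyEvent} is an
* illustrative Avro specific record class.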
*
* @param type the entity Class that will be produced
* @return this for method chaining
*/
public <E> ConfigBuilder withType(Class<E> type) {
String readerSchema = conf.get(KITE_READER_SCHEMA);
Preconditions.checkArgument(
DataModelUtil.isGeneric(type) || readerSchema == null,
"Can't configure a type when a reader schema is already set: {}",
readerSchema);
conf.setClass(KITE_TYPE, type, type);
return this;
}
/**
* Sets the expected schema to use when reading records from the Dataset.
*
* If this schema is set, {@link #withType(Class)} should only be called
* with a generic record subclass.
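*
* A common use is projection: a reader schema containing a subset of the
* dataset's fields causes only those fields to be read. An illustrative
* sketch (the schema literal and {@code job} are placeholders):
* <pre>
* Schema projection = new Schema.Parser().parse(
*     "{\"type\": \"record\", \"name\": \"Event\", \"fields\": ["
*     + "{\"name\": \"id\", \"type\": \"long\"}]}");
* DatasetKeyInputFormat.configure(job)
*     .readFrom("dataset:hdfs:/datasets/events")
*     .withSchema(projection);
* </pre>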
*
* @param readerSchema the expected entity schema
* @return this for method chaining
* @since 1.1.0
*/
public ConfigBuilder withSchema(Schema readerSchema) {
Class<?> type = conf.getClass(KITE_TYPE, null);
Preconditions.checkArgument(
type == null || DataModelUtil.isGeneric(type),
"Can't configure a reader schema when a type is already set: {}",
type);
conf.set(KITE_READER_SCHEMA, readerSchema.toString());
return this;
}
}
/**
* Configures the {@code Job} to use the {@code DatasetKeyInputFormat} and
* returns a helper to add further configuration.
*
* @param job the {@code Job} to configure
*
* @since 0.15.0
*/
public static ConfigBuilder configure(Job job) {
job.setInputFormatClass(DatasetKeyInputFormat.class);
Configuration conf = Hadoop.JobContext.getConfiguration.invoke(job);
return new ConfigBuilder(conf);
}
/**
* Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
* and returns a helper to add further configuration.
*
* @param conf a {@code Configuration}
*
* @since 0.15.0
*/
public static ConfigBuilder configure(Configuration conf) {
setInputFormatClass(conf);
return new ConfigBuilder(conf);
}
private static void setInputFormatClass(Configuration conf) {
if (Hadoop.isHadoop1()) {
conf.set("mapreduce.inputformat.class",
DatasetKeyInputFormat.class.getName());
} else {
// build a job with an empty conf
Job fakeJob = Hadoop.Job.newInstance.invoke(new Configuration(false));
fakeJob.setInputFormatClass(DatasetKeyInputFormat.class);
// then copy any created entries into the real conf
for (Map.Entry<String, String> entry : fakeJob.getConfiguration()) {
conf.set(entry.getKey(), entry.getValue());
}
}
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration configuration) {
conf = configuration;
View<E> view = load(configuration);
String partitionDir = conf.get(KITE_PARTITION_DIR);
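// when a partition directory is configured for a partitioned dataset, delegate
// to an input format that reads only that partition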
if (view.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
delegate = getDelegateInputFormatForPartition(view.getDataset(), partitionDir, conf);
} else {
delegate = getDelegateInputFormat(view, conf);
}
}
@SuppressWarnings("unchecked")
private InputFormat<E, Void> getDelegateInputFormat(View<E> view, Configuration conf) {
if (view instanceof InputFormatAccessor) {
return ((InputFormatAccessor<E>) view).getInputFormat(conf);
}
throw new UnsupportedOperationException("Implementation " +
"does not provide InputFormat support. View: " + view);
}
private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
String partitionDir, Configuration conf) {
if (!(dataset instanceof FileSystemDataset)) {
throw new UnsupportedOperationException("Partitions only supported for " +
"FileSystemDataset. Dataset: " + dataset);
}
FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
dataset, partitionDir);
PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
LOG.debug("Partition key: {}", key);
if (key != null) {
PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
LOG.debug("Partition: {}", partition);
return getDelegateInputFormat(partition, conf);
}
throw new DatasetException("Cannot find partition " + partitionDir);
}
@SuppressWarnings({"deprecation", "unchecked"})
private static <E> View<E> load(Configuration conf) {
Class<E> type;
try {
type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
} catch (RuntimeException e) {
if (e.getCause() instanceof ClassNotFoundException) {
throw new TypeNotFoundException(String.format(
"The Java class %s for the entity type could not be found",
conf.get(KITE_TYPE)),
e.getCause());
} else {
throw e;
}
}
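// an explicit reader schema, when present, is applied as a projection over the
// dataset's schema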
String schemaStr = conf.get(KITE_READER_SCHEMA);
Schema projection = null;
if (schemaStr != null) {
projection = new Schema.Parser().parse(schemaStr);
}
String inputUri = conf.get(KITE_INPUT_URI);
if (projection != null) {
return Datasets.load(inputUri).asSchema(projection).asType(type);
} else {
return Datasets.load(inputUri, type);
}
}
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
justification="Delegate set by setConf")
public List<InputSplit> getSplits(JobContext jobContext) throws IOException,
InterruptedException {
return delegate.getSplits(jobContext);
}
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UWF_FIELD_NOT_INITIALIZED_IN_CONSTRUCTOR",
justification="Delegate set by setConf")
public RecordReader<E, Void> createRecordReader(InputSplit inputSplit,
TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
Configuration conf = Hadoop.TaskAttemptContext.getConfiguration.invoke(taskAttemptContext);
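// register the task-side Configuration as Kite's default so dataset loading in
// the delegate record reader picks it up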
DefaultConfiguration.init(conf);
return delegate.createRecordReader(inputSplit, taskAttemptContext);
}
}