/**
* Copyright 2014 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.data.mapreduce;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.reflect.ReflectData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.kitesdk.compat.Hadoop;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.PartitionStrategy;
import org.kitesdk.data.TypeNotFoundException;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.AbstractDataset;
import org.kitesdk.data.spi.Constraints;
import org.kitesdk.data.spi.DataModelUtil;
import org.kitesdk.data.spi.DatasetRepositories;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.Mergeable;
import org.kitesdk.data.spi.PartitionKey;
import org.kitesdk.data.spi.Registration;
import org.kitesdk.data.spi.TemporaryDatasetRepository;
import org.kitesdk.data.spi.TemporaryDatasetRepositoryAccessor;
import org.kitesdk.data.spi.filesystem.FileSystemDataset;
import org.kitesdk.data.spi.filesystem.FileSystemProperties;
/**
* A MapReduce {@code OutputFormat} for writing to a {@link Dataset}.
*
* Since a {@code Dataset} only contains entities (not key/value pairs), this output
* format ignores the value.
*
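* A minimal configuration sketch; the job name, dataset URI, and entity
* type below are illustrative only:
* <pre>{@code
* Job job = Job.getInstance(new Configuration(), "write-events");
* DatasetKeyOutputFormat.configure(job)
*     .appendTo("dataset:hdfs:/datasets/examples/events")
*     .withType(GenericData.Record.class);
* }</pre>
*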
* @param <E> The type of entities in the {@code Dataset}.
*/
public class DatasetKeyOutputFormat<E> extends OutputFormat<E, Void> {
public static final String KITE_OUTPUT_URI = "kite.outputUri";
public static final String KITE_PARTITION_DIR = "kite.outputPartitionDir";
public static final String KITE_TYPE = "kite.outputEntityType";
private static final String KITE_WRITE_MODE = "kite.outputMode";
private static final String TEMP_NAMESPACE = "mr";
private static enum WriteMode {
DEFAULT, APPEND, OVERWRITE
}
public static class ConfigBuilder {
private final Configuration conf;
private ConfigBuilder(Job job) {
this(Hadoop.JobContext.getConfiguration.invoke(job));
}
private ConfigBuilder(Configuration conf) {
this.conf = conf;
// always use the new API for OutputCommitters, even for 0 reducers
conf.setBoolean("mapred.reducer.new-api", true);
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to write to the
* given dataset or view URI.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
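* For example, either of the following URIs could be used as a target;
* the repository locations and names are illustrative only:
* <pre>{@code
* configure(job).writeTo(URI.create("dataset:hdfs:/datasets/examples/events"));
* configure(job).writeTo(URI.create("view:hive:examples/events?year=2014"));
* }</pre>
*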
* @param uri a dataset or view URI
* @return this for method chaining
*/
public ConfigBuilder writeTo(URI uri) {
conf.set(KITE_OUTPUT_URI, uri.toString());
return this;
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to write to the
* given dataset or view URI after removing any existing data.
*
* The underlying dataset implementation must support View#deleteAll for
* the view identified by the URI or the job will fail.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI
* @return this for method chaining
*
* @since 0.16.0
*/
public ConfigBuilder overwrite(URI uri) {
setOverwrite();
return writeTo(uri);
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to append to the
* given dataset or view URI, leaving any existing data intact.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI
* @return this for method chaining
*
* @since 0.16.0
*/
public ConfigBuilder appendTo(URI uri) {
setAppend();
return writeTo(uri);
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to write to the
* given {@link Dataset} or {@link View} instance.
*
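* For example, a dataset loaded elsewhere can be passed in directly; the
* URI and entity type here are illustrative only:
* <pre>{@code
* View<GenericRecord> events =
*     Datasets.load("dataset:hive:examples/events", GenericRecord.class);
* configure(job).writeTo(events);
* }</pre>
*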
* @param view a dataset or view
* @return this for method chaining
*/
public ConfigBuilder writeTo(View<?> view) {
if (view instanceof FileSystemDataset) {
FileSystemDataset dataset = (FileSystemDataset) view;
conf.set(KITE_PARTITION_DIR,
String.valueOf(dataset.getDescriptor().getLocation()));
}
withType(view.getType());
return writeTo(view.getUri());
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to write to the
* given {@link Dataset} or {@link View} instance after removing any
* existing data.
*
* The underlying dataset implementation must support View#deleteAll for
* the {@code view} or the job will fail.
*
* @param view a dataset or view
* @return this for method chaining
*
* @since 0.16.0
*/
public ConfigBuilder overwrite(View<?> view) {
setOverwrite();
return writeTo(view);
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to append to the
* given {@link Dataset} or {@link View} instance, leaving any existing
* data intact.
*
* @param view a dataset or view
* @return this for method chaining
*
* @since 0.16.0
*/
public ConfigBuilder appendTo(View<?> view) {
setAppend();
return writeTo(view);
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to write to the
* given dataset or view URI string.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI string
* @return this for method chaining
*/
public ConfigBuilder writeTo(String uri) {
return writeTo(URI.create(uri));
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to write to the
* given dataset or view URI string after removing any existing data.
*
* The underlying dataset implementation must support View#deleteAll for
* the view identified by the URI string or the job will fail.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI string
* @return this for method chaining
*
* @since 0.16.0
*/
public ConfigBuilder overwrite(String uri) {
setOverwrite();
return writeTo(uri);
}
/**
* Adds configuration for {@code DatasetKeyOutputFormat} to append to the
* given dataset or view URI, leaving any existing data intact.
*
* URI formats are defined by {@link Dataset} implementations, but must
* begin with "dataset:" or "view:". For more information, see
* {@link Datasets}.
*
* @param uri a dataset or view URI string
* @return this for method chaining
*
* @since 0.16.0
*/
public ConfigBuilder appendTo(String uri) {
setAppend();
return writeTo(uri);
}
/**
* Sets the entity Class that will be output by the Job.
*
* This Class is used to configure the output {@code Dataset}.
*
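* For example, to write entities of a specific Avro type (the
* {@code StandardEvent} class here is hypothetical):
* <pre>{@code
* configure(job)
*     .writeTo("dataset:hive:examples/events")
*     .withType(StandardEvent.class);
* }</pre>
*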
* @param type the entity Class that will be produced
* @return this for method chaining
*/
public <E> ConfigBuilder withType(Class<E> type) {
conf.setClass(KITE_TYPE, type, type);
return this;
}
private void setOverwrite() {
String mode = conf.get(KITE_WRITE_MODE);
Preconditions.checkState(mode == null,
"Cannot replace existing write mode: " + mode);
conf.setEnum(KITE_WRITE_MODE, WriteMode.OVERWRITE);
}
private void setAppend() {
String mode = conf.get(KITE_WRITE_MODE);
Preconditions.checkState(mode == null,
"Cannot replace existing write mode: " + mode);
conf.setEnum(KITE_WRITE_MODE, WriteMode.APPEND);
}
}
/**
* Configures the {@code Job} to use the {@code DatasetKeyOutputFormat} and
* returns a helper to add further configuration.
*
* @param job the {@code Job} to configure
*
* @since 0.15.0
*/
public static ConfigBuilder configure(Job job) {
job.setOutputFormatClass(DatasetKeyOutputFormat.class);
return new ConfigBuilder(job);
}
/**
* Returns a helper to add output options to the given {@code Configuration}.
*
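* For example, when only a {@code Configuration} is at hand; the URI is
* illustrative only:
* <pre>{@code
* DatasetKeyOutputFormat.configure(conf)
*     .writeTo("dataset:hive:examples/events");
* }</pre>
*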
* @param conf a {@code Configuration}
*
* @since 0.15.0
*/
public static ConfigBuilder configure(Configuration conf) {
return new ConfigBuilder(conf);
}
static class DatasetRecordWriter<E> extends RecordWriter<E, Void> {
private DatasetWriter<E> datasetWriter;
private GenericData dataModel;
private boolean copyEntities;
private Schema schema;
public DatasetRecordWriter(View<E> view) {
this.datasetWriter = view.newWriter();
this.schema = view.getDataset().getDescriptor().getSchema();
this.dataModel = DataModelUtil.getDataModelForType(
view.getType());
if (dataModel.getClass() != ReflectData.class) {
copyEntities = true;
}
}
@Override
public void write(E key, Void v) {
if (copyEntities) {
key = copy(key);
}
datasetWriter.write(key);
}
private E copy(E key) {
return dataModel.deepCopy(schema, key);
}
@Override
public void close(TaskAttemptContext taskAttemptContext) {
datasetWriter.close();
}
}
static class NullOutputCommitter extends OutputCommitter {
@Override
public void setupJob(JobContext jobContext) { }
@Override
public void setupTask(TaskAttemptContext taskContext) { }
@Override
public boolean needsTaskCommit(TaskAttemptContext taskContext) {
return false;
}
@Override
public void commitTask(TaskAttemptContext taskContext) { }
@Override
public void abortTask(TaskAttemptContext taskContext) { }
}
static class MergeOutputCommitter<E> extends OutputCommitter {
@Override
public void setupJob(JobContext jobContext) {
createJobDataset(jobContext);
}
@Override
@SuppressWarnings("unchecked")
public void commitJob(JobContext jobContext) throws IOException {
DatasetRepository repo = getDatasetRepository(jobContext);
boolean isTemp = repo instanceof TemporaryDatasetRepository;
String jobDatasetName = getJobDatasetName(jobContext);
View<E> targetView = load(jobContext);
Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, jobDatasetName);
((Mergeable<Dataset<E>>) targetView.getDataset()).merge(jobDataset);
if (isTemp) {
((TemporaryDatasetRepository) repo).delete();
} else {
repo.delete(TEMP_NAMESPACE, jobDatasetName);
}
}
@Override
public void abortJob(JobContext jobContext, JobStatus.State state) {
deleteJobDataset(jobContext);
}
@Override
public void setupTask(TaskAttemptContext taskContext) {
// do nothing: the task attempt dataset is created in getRecordWriter
}
@Override
public boolean needsTaskCommit(TaskAttemptContext taskContext) {
return true;
}
@Override
@SuppressWarnings("unchecked")
public void commitTask(TaskAttemptContext taskContext) throws IOException {
DatasetRepository repo = getDatasetRepository(taskContext);
boolean inTempRepo = repo instanceof TemporaryDatasetRepository;
Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, getJobDatasetName(taskContext));
String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
Dataset<E> taskAttemptDataset = repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
((Mergeable<Dataset<E>>) jobDataset).merge(taskAttemptDataset);
if (!inTempRepo) {
repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName);
}
}
}
@Override
public void abortTask(TaskAttemptContext taskContext) {
deleteTaskAttemptDataset(taskContext);
}
}
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
Configuration conf = Hadoop.TaskAttemptContext
.getConfiguration.invoke(taskAttemptContext);
View<E> target = load(taskAttemptContext);
View<E> working;
if (usePerTaskAttemptDatasets(target)) {
working = loadOrCreateTaskAttemptView(taskAttemptContext);
} else {
working = target;
}
String partitionDir = conf.get(KITE_PARTITION_DIR);
if (working.getDataset().getDescriptor().isPartitioned() &&
partitionDir != null) {
if (!(target instanceof FileSystemDataset)) {
throw new UnsupportedOperationException("Partitions only supported for " +
"FileSystemDataset. Dataset: " + target);
}
FileSystemDataset fsDataset = (FileSystemDataset) target;
PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
if (key != null) {
working = fsDataset.getPartition(key, true);
}
return new DatasetRecordWriter<E>(working);
} else {
return new DatasetRecordWriter<E>(working);
}
}
@Override
public void checkOutputSpecs(JobContext jobContext) {
// The committer setup will fail if the output dataset does not exist
View<E> target = load(jobContext);
Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
case APPEND:
break;
case OVERWRITE:
if (!target.isEmpty()) {
target.deleteAll();
}
break;
default:
case DEFAULT:
if (!target.isEmpty()) {
throw new DatasetException("View is not empty: " + target);
}
break;
}
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
View<E> view = load(taskAttemptContext);
return usePerTaskAttemptDatasets(view) ?
new MergeOutputCommitter<E>() : new NullOutputCommitter();
}
private static boolean usePerTaskAttemptDatasets(View<?> target) {
// new API output committers are not called properly in Hadoop 1
return !Hadoop.isHadoop1() && target.getDataset() instanceof Mergeable;
}
// TODO: Remove the need to use DatasetRepositories.repositoryFor()
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
DatasetRepository repo = DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
if (repo instanceof TemporaryDatasetRepositoryAccessor) {
Dataset