org.kitesdk.data.crunch.DatasetSourceTarget
/**
 * Copyright 2014 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.crunch;

import java.io.IOException;
import java.net.URI;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.ImmutableSet;
import org.apache.avro.generic.GenericData;
import org.apache.crunch.ReadableData;
import org.apache.crunch.Source;
import org.apache.crunch.SourceTarget;
import org.apache.crunch.impl.mr.run.CrunchMapper;
import org.apache.crunch.impl.mr.run.RuntimeParameters;
import org.apache.crunch.io.CrunchInputs;
import org.apache.crunch.io.FormatBundle;
import org.apache.crunch.io.ReadableSourceTarget;
import org.apache.crunch.types.Converter;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.avro.AvroType;
import org.apache.crunch.types.avro.Avros;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;
import org.kitesdk.data.View;
import org.kitesdk.data.mapreduce.DatasetKeyInputFormat;
import org.kitesdk.data.spi.LastModifiedAccessor;
import org.kitesdk.data.spi.SizeAccessor;
import org.kitesdk.data.spi.filesystem.FileSystemDataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
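
/**
 * A Crunch {@link ReadableSourceTarget} backed by a Kite dataset {@link View}:
 * reads are configured through {@link DatasetKeyInputFormat}, while write
 * support is inherited from {@link DatasetTarget}.
 */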
class DatasetSourceTarget<E> extends DatasetTarget<E> implements ReadableSourceTarget<E> {

  private static final Logger LOG = LoggerFactory
      .getLogger(DatasetSourceTarget.class);

  private View<E> view;
  private FormatBundle<DatasetKeyInputFormat> formatBundle;
  private AvroType<E> avroType;
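
  /**
   * Creates a source-target that reads and writes {@code view}, deriving the
   * entity type from the view itself.
   */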
  public DatasetSourceTarget(View<E> view) {
    this(view, view.getType());
  }

  @SuppressWarnings("unchecked")
  public DatasetSourceTarget(View<E> view, Class<E> type) {
    this(view, toAvroType(view, type));
  }

  public DatasetSourceTarget(URI uri, Class<E> type) {
    this(Datasets.<E, View<E>>load(uri, type));
  }
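
  /**
   * Captures the view's read configuration in a private FormatBundle (built
   * from an empty Configuration so that only Kite's own properties are
   * copied), and disables Crunch's combined splits for anything other than
   * Avro or Parquet files.
   */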
  public DatasetSourceTarget(View<E> view, AvroType<E> avroType) {
    super(view);

    this.view = view;
    this.avroType = avroType;

    Configuration temp = new Configuration(false /* use an empty conf */);
    DatasetKeyInputFormat.configure(temp).readFrom(view);
    this.formatBundle = inputBundle(temp);

    Dataset<E> dataset = view.getDataset();
    // Disable CombineFileInputFormat in Crunch unless we're dealing with Avro or Parquet files
    Format format = dataset.getDescriptor().getFormat();
    boolean isAvroOrParquetFile = (dataset instanceof FileSystemDataset)
        && (Formats.AVRO.equals(format) || Formats.PARQUET.equals(format));
    formatBundle.set(RuntimeParameters.DISABLE_COMBINE_FILE, Boolean.toString(!isAvroOrParquetFile));
  }

  public DatasetSourceTarget(URI uri, AvroType<E> avroType) {
    this(Datasets.<E, View<E>>load(uri, avroType.getTypeClass()), avroType);
  }
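
  // If the requested class can hold a generic record, the dataset's own
  // schema is used; otherwise the class is mapped to a specific-record
  // Avro type.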
  @SuppressWarnings("unchecked")
  private static <E> AvroType<E> toAvroType(View<E> view, Class<E> type) {
    if (type.isAssignableFrom(GenericData.Record.class)) {
      return (AvroType<E>) Avros.generics(
          view.getDataset().getDescriptor().getSchema());
    } else {
      return Avros.records(type);
    }
  }

  @Override
  public Source<E> inputConf(String key, String value) {
    formatBundle.set(key, value);
    return this;
  }

  @Override
  public PType<E> getType() {
    return avroType;
  }

  @Override
  public Converter<E, ?, ?, ?> getConverter() {
    return new KeyConverter<E>(avroType);
  }
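
  // Crunch passes inputId == -1 when this is the job's only input, in which
  // case the input format can be set directly on the Job; otherwise the
  // bundle is registered as one of several inputs, keyed by a dummy path.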
  @Override
  @SuppressWarnings("unchecked")
  public void configureSource(Job job, int inputId) throws IOException {
    Configuration conf = job.getConfiguration();
    if (inputId == -1) {
      job.setMapperClass(CrunchMapper.class);
      job.setInputFormatClass(formatBundle.getFormatClass());
      formatBundle.configure(conf);
    } else {
      Path dummy = new Path("/view/" + view.getDataset().getName());
      CrunchInputs.addInputPath(job, dummy, formatBundle, inputId);
    }
  }
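
  // Size and last-modified metadata are only available when the underlying
  // view implements the corresponding SPI accessor; otherwise conservative
  // defaults are returned.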
  @Override
  public long getSize(Configuration configuration) {
    if (view instanceof SizeAccessor) {
      return ((SizeAccessor) view).getSize();
    }
    LOG.warn("Cannot determine size for view: " + toString());
    return 1000L * 1000L * 1000L; // fallback to HBase default size
  }

  @Override
  public long getLastModifiedAt(Configuration configuration) {
    if (view instanceof LastModifiedAccessor) {
      return ((LastModifiedAccessor) view).getLastModified();
    }
    LOG.warn("Cannot determine last modified time for source: " + toString());
    return -1;
  }

  @Override
  public Iterable<E> read(Configuration configuration) throws IOException {
    // TODO: what to do with Configuration? create new view?
    return view.newReader(); // TODO: who calls close?
  }
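
  // Exposes the view as ReadableData: tasks read it by opening a fresh
  // reader on the view directly, rather than going through the input format.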
  @Override
  public ReadableData<E> asReadable() {
    return new ReadableData<E>() {

      @Override
      public Set<SourceTarget<?>> getSourceTargets() {
        return ImmutableSet.<SourceTarget<?>>of(DatasetSourceTarget.this);
      }

      @Override
      public void configure(Configuration conf) {
        // TODO: optimize for file-based datasets by using distributed cache
      }

      @Override
      public Iterable<E> read(TaskInputOutputContext<?, ?, ?, ?> context) throws IOException {
        return view.newReader();
      }
    };
  }

  @Override
  public SourceTarget<E> conf(String key, String value) {
    inputConf(key, value);
    outputConf(key, value);
    return this;
  }

  /**
   * Builds a FormatBundle for DatasetKeyInputFormat by copying a temp config.
   *
   * All properties will be copied from the temporary configuration.
   *
   * @param conf a Configuration that will be copied
   * @return a FormatBundle with the contents of conf
   */
  private static FormatBundle<DatasetKeyInputFormat> inputBundle(Configuration conf) {
    FormatBundle<DatasetKeyInputFormat> bundle = FormatBundle
        .forInput(DatasetKeyInputFormat.class);
    for (Map.Entry<String, String> entry : conf) {
      bundle.set(entry.getKey(), entry.getValue());
    }
    return bundle;
  }
}
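
Usage example

For context, a minimal sketch of how this class is typically reached from user code: the CrunchDatasets helper in the same package wraps a View in a DatasetSourceTarget so a Crunch pipeline can read it. The dataset URI and driver class below are illustrative, not part of this file.

import org.apache.avro.generic.GenericData;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.hadoop.conf.Configuration;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.View;
import org.kitesdk.data.crunch.CrunchDatasets;

public class ReadEventsExample {
  public static void main(String[] args) {
    Pipeline pipeline = new MRPipeline(ReadEventsExample.class, new Configuration());

    // Load a view of an existing dataset as generic Avro records
    // (the URI is illustrative).
    View<GenericData.Record> events =
        Datasets.load("dataset:hdfs:/data/events", GenericData.Record.class);

    // asSource wraps the view in a DatasetSourceTarget behind the scenes.
    PCollection<GenericData.Record> records =
        pipeline.read(CrunchDatasets.asSource(events));

    // Materialize the records on the client to verify the read path.
    for (GenericData.Record record : records.materialize()) {
      System.out.println(record);
    }
    pipeline.done();
  }
}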