// com.twitter.scalding.parquet.cascading.ParquetValueScheme (Maven / Gradle / Ivy)
// The newest version!
package com.twitter.scalding.parquet.cascading;
import java.io.IOException;
import java.io.Serializable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.mapred.Container;
import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat;
import org.apache.parquet.hadoop.thrift.ThriftReadSupport;
import static org.apache.parquet.Preconditions.checkNotNull;
/**
* A Cascading Scheme that returns a simple Tuple with a single value, the "value" object
* coming out of the underlying InputFormat.
*
* This is an abstract class; implementations are expected to set up their Input/Output Formats
* correctly in the respective Init methods.
*/
public abstract class ParquetValueScheme extends Scheme{
public static final class Config implements Serializable {
private final FilterPredicate filterPredicate;
private final String deprecatedProjectionString;
private final String strictProjectionString;
private final Class klass;
private Config(Class klass, FilterPredicate filterPredicate, String deprecatedProjectionString, String strictProjectionString) {
this.filterPredicate = filterPredicate;
this.deprecatedProjectionString = deprecatedProjectionString;
this.strictProjectionString = strictProjectionString;
this.klass = klass;
}
public Config() {
filterPredicate = null;
deprecatedProjectionString = null;
strictProjectionString = null;
klass = null;
}
public FilterPredicate getFilterPredicate() {
return filterPredicate;
}
@Deprecated
public String getProjectionString() {
return deprecatedProjectionString;
}
public String getStrictProjectionString() {
return strictProjectionString;
}
public Class getKlass() {
return klass;
}
public Config withFilterPredicate(FilterPredicate f) {
return new Config(this.klass, checkNotNull(f, "filterPredicate"), this.deprecatedProjectionString, this.strictProjectionString);
}
@Deprecated
public Config withProjectionString(String p) {
return new Config(this.klass, this.filterPredicate, checkNotNull(p, "projectionString"), this.strictProjectionString);
}
public Config withStrictProjectionString(String p) {
return new Config(this.klass, this.filterPredicate, this.deprecatedProjectionString, checkNotNull(p, "projectionString"));
}
public Config withRecordClass(Class klass) {
return new Config(checkNotNull(klass, "recordClass"), this.filterPredicate, this.deprecatedProjectionString, this.strictProjectionString);
}
}
/** Explicit serialization version identifier for this scheme. */
private static final long serialVersionUID = 157560846420730043L;
/**
 * Read settings supplied at construction time. Stored exactly as passed to the
 * constructor (no null check is applied there).
 */
protected final Config config;
/**
 * Creates a scheme with an empty {@link Config}: no filter predicate, no
 * projection strings and no record class.
 */
public ParquetValueScheme() {
this(new Config());
}
/**
 * Creates a scheme whose {@link Config} carries only the given filter predicate.
 *
 * @param filterPredicate the predicate to configure; it is null-checked by
 *        {@link Config#withFilterPredicate(FilterPredicate)}
 */
public ParquetValueScheme(FilterPredicate filterPredicate) {
this(new Config().withFilterPredicate(filterPredicate));
}
/**
 * Creates a scheme that uses the given configuration.
 *
 * @param config the read settings to keep; stored as-is, without a null check
 */
public ParquetValueScheme(Config config) {
this.config = config;
}
/**
 * Hands the deprecated-style projection string to
 * {@code ThriftReadSupport.setProjectionPushdown}; does nothing when the
 * config holds no such string.
 *
 * @param jobConf the job configuration to update
 */
@Deprecated
private void setProjectionPushdown(JobConf jobConf) {
  final String projection = this.config.deprecatedProjectionString;
  if (projection == null) {
    return;
  }
  ThriftReadSupport.setProjectionPushdown(jobConf, projection);
}
/**
 * Hands the strict projection string to
 * {@code ThriftReadSupport.setStrictFieldProjectionFilter}; does nothing when
 * the config holds no such string.
 *
 * @param jobConf the job configuration to update
 */
private void setStrictProjectionPushdown(JobConf jobConf) {
  final String strictProjection = this.config.strictProjectionString;
  if (strictProjection == null) {
    return;
  }
  ThriftReadSupport.setStrictFieldProjectionFilter(jobConf, strictProjection);
}
/**
 * Hands the configured filter predicate to
 * {@code ParquetInputFormat.setFilterPredicate}; does nothing when the config
 * holds no predicate.
 *
 * @param jobConf the job configuration to update
 */
private void setPredicatePushdown(JobConf jobConf) {
  final FilterPredicate predicate = this.config.filterPredicate;
  if (predicate == null) {
    return;
  }
  ParquetInputFormat.setFilterPredicate(jobConf, predicate);
}
@Override
public void sourceConfInit(FlowProcess extends JobConf> jobConfFlowProcess, Tap jobConfRecordReaderOutputCollectorTap, final JobConf jobConf) {
setPredicatePushdown(jobConf);
setProjectionPushdown(jobConf);
setStrictProjectionPushdown(jobConf);
setRecordClass(jobConf);
}
/**
 * Hands the configured record class to
 * {@code ParquetThriftInputFormat.setThriftClass}; does nothing when the config
 * holds no record class.
 *
 * @param jobConf the job configuration to update
 */
private void setRecordClass(JobConf jobConf) {
  final Class recordClass = config.klass;
  if (recordClass == null) {
    return;
  }
  ParquetThriftInputFormat.setThriftClass(jobConf, recordClass);
}
@SuppressWarnings("unchecked")
@Override
public boolean source(FlowProcess extends JobConf> fp, SourceCall