gorsat.spark.GorFileFormat (gor-spark)
GORpipe allows analysis of large sets of genomic and phenotypic tabular data using a declarative query language in a parallel execution engine.
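The listing below is the GorFileFormat source. As a minimal usage sketch (not taken from this page), a Spark session with gor-spark on its classpath could read a block-compressed .gorz file through this format. The application name, master, and file path are placeholders, and resolving format("gor") assumes the class is registered as a DataSourceRegister service; otherwise the fully qualified name gorsat.spark.GorFileFormat can be passed to format(...) instead:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class GorReadExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("gor-read")   // placeholder
                .master("local[*]")    // placeholder
                .getOrCreate();
        // "gor" is the shortName() declared at the bottom of GorFileFormat
        Dataset<Row> df = spark.read().format("gor").load("example/data.gorz"); // placeholder path
        df.printSchema();
        df.show(10);
        spark.stop();
    }
}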
package gorsat.spark;
import org.gorpipe.spark.SparkGOR;
import gorsat.process.SparkRowSource;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.parquet.hadoop.codec.CodecConfig;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.datasources.OutputWriter;
import org.apache.spark.sql.execution.datasources.OutputWriterFactory;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat;
import org.apache.spark.sql.internal.SQLConf;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType;
import scala.Function1;
import scala.Option;
import scala.collection.Iterator;
import scala.collection.JavaConverters;
import scala.collection.Seq;
import scala.collection.immutable.Map;
import java.io.IOException;
import java.io.Serializable;
import java.nio.file.Paths;
import java.util.zip.DataFormatException;
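/**
 * Spark SQL FileFormat for GOR data, registered under the short name "gor".
 * Builds on the built-in CSVFileFormat: infers schemas for plain and
 * block-compressed (.gorz) files, routes .gord dictionary paths to a
 * dedicated reader, and writes compressed .gorz output.
 */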
public class GorFileFormat extends CSVFileFormat implements Serializable {
    @Override
    public Option<StructType> inferSchema(SparkSession sparkSession, Map<String, String> options, Seq<FileStatus> files) {
        String pathstr = options.get("path").get();
        java.nio.file.Path path = Paths.get(pathstr);
        StructType ret = null;
        try {
            // The trailing flag marks block-compressed .gorz input
            ret = SparkRowSource.inferSchema(path, path.getFileName().toString(), false, pathstr.endsWith(".gorz"));
        } catch (IOException | DataFormatException e) {
            // On failure ret stays null and Option.apply(null) yields None
            e.printStackTrace();
        }
        return Option.apply(ret);
    }
    @Override
    public OutputWriterFactory prepareWrite(SparkSession sparkSession, Job job, Map<String, String> options, StructType dataSchema) {
        return new OutputWriterFactory() {
            @Override
            public OutputWriter newInstance(String path, StructType dataSchema, TaskAttemptContext context) {
                try {
                    return new GorOutputWriter(path, dataSchema, options.get("path").get());
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }

            @Override
            public String getFileExtension(TaskAttemptContext context) {
                // Compression codec extension (e.g. ".snappy") followed by the .gorz suffix
                return CodecConfig.from(context).getCodec().getExtension() + ".gorz";
            }
        };
    }
    @Override
    public boolean supportBatch(SparkSession sparkSession, StructType dataSchema) {
        return super.supportBatch(sparkSession, dataSchema);
    }

    @Override
    public Option<Seq<String>> vectorTypes(StructType requiredSchema, StructType partitionSchema, SQLConf sqlConf) {
        return super.vectorTypes(requiredSchema, partitionSchema, sqlConf);
    }

    @Override
    public boolean isSplitable(SparkSession sparkSession, Map<String, String> options, Path path) {
        return super.isSplitable(sparkSession, options, path);
    }
    @Override
    public Function1<PartitionedFile, Iterator<InternalRow>> buildReader(SparkSession sparkSession, StructType dataSchema, StructType partitionSchema, StructType requiredSchema, Seq<Filter> filters, Map<String, String> options, Configuration hadoopConf) {
        Function1<PartitionedFile, Iterator<InternalRow>> func;
        String pathstr = options.get("path").get();
        boolean isGorz = pathstr.endsWith(".gorz");
        boolean isGord = pathstr.endsWith(".gord");
        if (isGord) {
            // .gord dictionary files get a dedicated reader
            func = new GordFunction(requiredSchema);
        } else {
            Map<String, String> soptions = SparkGOR.me(options);
            func = super.buildReader(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, soptions, hadoopConf);
            if (isGorz) {
                //var lgattr = JavaConverters.asJavaCollection(requiredSchema.toAttributes()).stream().map(Attribute::toAttribute).collect(Collectors.toList());
                //var sgattr = JavaConverters.asScalaBuffer(lgattr).toSeq();
                //ExpressionEncoder gorzencoder = SparkGOR.gorzencoder().resolveAndBind(sgattr, SimpleAnalyzer$.MODULE$);
                // Wrap the CSV reader so compressed .gorz blocks are decoded and pushed-down filters applied
                return new GorzFunction(func, requiredSchema, JavaConverters.asJavaCollection(filters));
            }
        }
        return func;
    }
    @Override
    public Function1<PartitionedFile, Iterator<InternalRow>> buildReaderWithPartitionValues(SparkSession sparkSession, StructType dataSchema, StructType partitionSchema, StructType requiredSchema, Seq<Filter> filters, Map<String, String> options, Configuration hadoopConf) {
        return super.buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf);
    }

    @Override
    public boolean supportDataType(DataType dataType) {
        // GOR rows are rendered as text, so any Spark SQL data type is accepted
        return true;
    }

    @Override
    public String shortName() {
        // Registered name used by spark.read().format("gor")
        return "gor";
    }
}
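A hedged write-side counterpart, given that prepareWrite above pairs GorOutputWriter with a codec-dependent ".gorz" extension (the output path is again a placeholder):

df.write().format("gor").save("example/out.gorz");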