gorsat.spark.GorPartitionReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gor-spark Show documentation
Show all versions of gor-spark Show documentation
GORpipe enables analysis of large sets of genomic and phenotypic tabular data, using a declarative query language on a parallel execution engine.
package gorsat.spark;
import gorsat.BatchedPipeStepIteratorAdaptor;
import gorsat.BatchedReadSource;
import gorsat.process.GorPipe;
import gorsat.process.PipeInstance;
import gorsat.process.PipeOptions;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.sql.types.StructType;
import org.gorpipe.gor.model.RowBase;
import org.gorpipe.model.gor.RowObj;
import org.gorpipe.model.gor.iterators.RowSource;
import org.gorpipe.spark.GorSparkSession;
import org.gorpipe.spark.SparkGorMonitor;
import org.gorpipe.spark.SparkGorRow;
import org.gorpipe.spark.SparkSessionFactory;
import org.gorpipe.spark.platform.JobField;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Collectors;
/**
 * Spark {@link PartitionReader} that produces the rows of a single
 * {@link GorRangeInputPartition} as {@link InternalRow}s.
 *
 * <p>The underlying GOR {@link RowSource} is created lazily on the first call to
 * {@link #next()}. When the partition carries a query, the query is run through a
 * {@link PipeInstance}; otherwise a GOR command is assembled from the partition's
 * path and filter settings.
 */
public class GorPartitionReader implements PartitionReader<InternalRow> {
    /** Source of GOR rows; stays {@code null} until {@link #initIterator()} has run. */
    RowSource iterator;
    /** Reusable wrapper handed to the serializer; its {@code row} is replaced on every {@link #next()}. */
    SparkGorRow sparkRow;
    /** Monitor created in {@link #initIterator()} from {@link #redisUri} and {@link #jobId}. */
    SparkGorMonitor sparkGorMonitor;
    /** The partition this reader serves. */
    GorRangeInputPartition p;
    /** Converts {@link #sparkRow} into Spark's internal row representation. */
    ExpressionEncoder.Serializer serializer;
    String redisUri;
    String jobId;
    /** Native reader is selected when this equals {@code "true"} ignoring case. */
    String useCpp;
    String projectRoot;
    String cacheDir;
    /** Set when the partition query starts with {@code "nor "}; rows are then rebuilt in {@link #next()}. */
    boolean nor = false;

    /**
     * Creates a reader for one partition. No GOR source is opened here; that is
     * deferred until the first {@link #next()} call.
     *
     * @param schema                 schema used to build the row encoder and the reusable row wrapper
     * @param gorRangeInputPartition partition describing path/query, filters and range
     * @param redisUri               passed to the {@link SparkGorMonitor}; when non-empty, file reads are batched
     * @param jobId                  passed to the {@link SparkGorMonitor}
     * @param projectRoot            passed to the {@link SparkSessionFactory}
     * @param cacheDir               passed to the {@link SparkSessionFactory}
     * @param useCpp                 {@code "true"} (any case) to build a native {@code cgor} command
     */
    public GorPartitionReader(StructType schema, GorRangeInputPartition gorRangeInputPartition, String redisUri, String jobId, String projectRoot, String cacheDir, String useCpp) {
        ExpressionEncoder encoder = RowEncoder.apply(schema);
        serializer = encoder.createSerializer();
        sparkRow = new SparkGorRow(schema);
        p = gorRangeInputPartition;
        this.redisUri = redisUri;
        this.jobId = jobId;
        this.useCpp = useCpp;
        this.projectRoot = projectRoot;
        this.cacheDir = cacheDir;
    }

    /**
     * Expands a directory into a space-separated list of the {@code .gorz} files
     * found beneath it.
     *
     * @param epath the partition path as a {@link Path}
     * @return the joined {@code .gorz} paths when {@code epath} is a directory;
     *         otherwise, or when the directory cannot be walked, the partition's
     *         original path string
     */
    private String parseMultiplePaths(Path epath) {
        if (!Files.isDirectory(epath)) {
            return p.path;
        }
        // Files.walk holds open directory handles until the stream is closed.
        try (java.util.stream.Stream<Path> entries = Files.walk(epath)) {
            return entries
                    .skip(1) // the first element is the directory itself
                    .map(Path::toString)
                    .filter(name -> name.endsWith(".gorz"))
                    .collect(Collectors.joining(" "));
        } catch (IOException ignored) {
            // Best effort: if the directory cannot be opened, hand the unexpanded
            // path to GOR unchanged, exactly as for a non-directory path.
            return p.path;
        }
    }

    /**
     * Positions {@code rowSource} at the partition's start when a chromosome is set.
     */
    private void seekToRangeStart(RowSource rowSource) {
        if (p.chr != null && p.chr.length() > 0) {
            rowSource.setPosition(p.chr, p.start);
        }
    }

    /**
     * Builds a row source by assembling a GOR command from the partition's path
     * and filter settings.
     *
     * @param pi pipe instance used to parse and open the command
     * @return the opened source, wrapped in a {@link BatchedReadSource} when
     *         {@link #redisUri} is non-empty
     */
    private RowSource iteratorFromFile(PipeInstance pi) {
        boolean useNative = useCpp != null && useCpp.equalsIgnoreCase("true");
        String seek = useNative ? "cmd " : "gor ";

        // The directory expansion is only used by the non-native command, so it
        // is skipped entirely in native mode.
        // NOTE(review): the native command ends with "}" but contains no opening
        // "{" — confirm this is the syntax the cmd source expects.
        String spath = useNative
                ? "cgor #(S:-p chr:pos) " + p.path + "}"
                : parseMultiplePaths(Paths.get(p.path));

        String s = p.filterColumn != null && p.filterColumn.length() > 0 ? "-s " + p.filterColumn + " " : "";

        // A filter file takes precedence over an inline filter.
        String filterArgs;
        if (p.filterFile != null) {
            filterArgs = "-ff " + p.filterFile + " ";
        } else if (p.filter != null) {
            filterArgs = "-f " + p.filter + " ";
        } else {
            filterArgs = "";
        }

        String[] args = {seek + s + filterArgs + spath};
        PipeOptions options = new PipeOptions();
        options.parseOptions(args);
        pi.subProcessArguments(options);

        RowSource rowSource = pi.theInputSource();
        seekToRangeStart(rowSource);
        if (redisUri != null && redisUri.length() > 0) {
            return new BatchedReadSource(rowSource, GorPipe.brsConfig(), rowSource.getHeader(), sparkGorMonitor);
        }
        return rowSource;
    }

    /**
     * Builds a row source by initialising {@code pi} with the partition's query
     * and attaching the resulting pipe step to the input source.
     *
     * @param pi pipe instance to initialise with the query
     * @return an adaptor that feeds the input source through the query's pipe step
     */
    private RowSource iteratorWithPipeSteps(PipeInstance pi) {
        pi.init(p.query, false, null);
        RowSource rowSource = pi.theInputSource();
        seekToRangeStart(rowSource);
        return new BatchedPipeStepIteratorAdaptor(rowSource, pi.getPipeStep(), rowSource.getHeader(), GorPipe.brsConfig());
    }

    /**
     * Creates the monitor, the GOR session and the row source for this partition,
     * and records whether the query is a {@code nor} query.
     */
    void initIterator() {
        sparkGorMonitor = new SparkGorMonitor(redisUri, jobId) {
            @Override
            public boolean isCancelled() {
                // Query this monitor directly rather than through the enclosing
                // reader's field, which is still null until construction finishes.
                return getValue(JobField.CancelFlag) != null;
            }
        };
        SparkSessionFactory sessionFactory = new SparkSessionFactory(null, projectRoot, cacheDir, sparkGorMonitor);
        GorSparkSession gorPipeSession = (GorSparkSession) sessionFactory.create();
        PipeInstance pi = new PipeInstance(gorPipeSession.getGorContext());
        if (p.query != null) {
            iterator = iteratorWithPipeSteps(pi);
            // Case-insensitive prefix test that does not depend on the default locale.
            nor = p.query.regionMatches(true, 0, "nor ", 0, 4);
        } else {
            iterator = iteratorFromFile(pi);
        }
    }

    /**
     * Advances to the next row of the partition, initialising the source on first use.
     *
     * <p>A row is fetched from the source before the range check, so the first
     * out-of-range row is consumed and ends the iteration.
     *
     * @return {@code true} when a row inside the partition's range is available via {@link #get()}
     */
    @Override
    public boolean next() {
        if (iterator == null) {
            initIterator();
        }
        if (!iterator.hasNext()) {
            return false;
        }
        org.gorpipe.gor.model.Row gorrow = iterator.next();
        if (nor) {
            // Rebuild the row from its non-positional columns under a placeholder position.
            String rowstr = gorrow.otherCols();
            int[] sa = RowObj.splitArray(rowstr);
            gorrow = new RowBase("chrN", 0, rowstr, sa, null);
        }
        if (p.tag != null) {
            gorrow = gorrow.rowWithAddedColumn(p.tag);
        }
        // No chromosome means no range restriction; an end of -1 means open-ended.
        // NOTE(review): an empty (non-null) p.chr skips the seek during
        // initialisation but rejects every row here — confirm that is intended.
        boolean inRange = p.chr == null || (gorrow.chr.equals(p.chr) && (p.end == -1 || gorrow.pos <= p.end));
        sparkRow.row = gorrow;
        return inRange;
    }

    /**
     * Serializes the row selected by the most recent {@link #next()} call.
     *
     * @return the current row in Spark's internal representation
     */
    @Override
    public InternalRow get() {
        return serializer.apply(sparkRow);
    }

    /** Closes the underlying row source if one was ever opened. */
    @Override
    public void close() {
        if (iterator != null) {
            iterator.close();
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy