gorsat.spark.GordFunction Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of gor-spark Show documentation

GORpipe allows analysis of large sets of genomic and phenotypic tabular data using a declarative query language in a parallel execution engine

There is a newer version: 4.3.2

Show newest version

package gorsat.spark;

import gorsat.BatchedPipeStepIteratorAdaptor;
import gorsat.BatchedReadSourceConfig;
import gorsat.process.GenericSessionFactory;
import gorsat.process.GorPipe;
import gorsat.process.PipeInstance;
import gorsat.process.PipeOptions;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
import org.apache.spark.sql.types.StructType;
import org.gorpipe.gor.session.GorSession;
import org.gorpipe.spark.SparkGorRow;
import scala.Function1;
import scala.collection.Iterator;
import scala.collection.JavaConverters;
import scala.collection.Seq;

import java.io.Serializable;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

public class GordFunction implements Function1>, Serializable {
    String header;
    ExpressionEncoder encoder;
    ExpressionEncoder.Serializer serializer;

    public GordFunction(StructType schema) {
        this.header = String.join("\t",schema.fieldNames());
        List lattr = JavaConverters.asJavaCollection(schema.toAttributes()).stream().map(Attribute::toAttribute).collect(Collectors.toList());
        Seq sattr = JavaConverters.asScalaBuffer(lattr).toSeq();
        encoder = RowEncoder.apply(schema).resolveAndBind(sattr, SimpleAnalyzer$.MODULE$);
        serializer = encoder.createSerializer();
    }

    @Override
    public Iterator apply(PartitionedFile v1) {
        GenericSessionFactory sessionFactory = new GenericSessionFactory("/gorproject","result_cache");
        GorSession gorPipeSession = sessionFactory.create();
        PipeInstance pi = new PipeInstance(gorPipeSession.getGorContext());

        String[] args = {v1.filePath()};
        PipeOptions options = new PipeOptions();
        options.parseOptions(args);
        pi.subProcessArguments(options);

        BatchedReadSourceConfig brsConfig = GorPipe.brsConfig();
        BatchedPipeStepIteratorAdaptor batchedPipeStepIteratorAdaptor = new BatchedPipeStepIteratorAdaptor(pi.getIterator(), pi.thePipeStep(), true, header, brsConfig);
        Stream ir = StreamSupport.stream(batchedPipeStepIteratorAdaptor,false).map(r -> new SparkGorRow(r, encoder.schema())).map(r -> serializer.apply(r));

        return JavaConverters.asScalaIterator(ir.iterator());
    }

    @Override
    public  Function1> compose(Function1 g) {
        return null; //super.compose(g);
    }

    @Override
    public  Function1 andThen(Function1, A> g) {
        return null; //super.andThen(g);
    }

    @Override
    public String toString() {
        return super.toString();
    }
}