
gorsat.spark.GorzFunction

GORpipe allows analysis of large sets of genomic and phenotypic tabular data using a declarative query language in a parallel execution engine

There is a newer version: 4.3.2
package gorsat.spark;

import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
import org.apache.spark.sql.sources.EqualTo;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.GreaterThan;
import org.apache.spark.sql.sources.LessThan;
import org.apache.spark.sql.types.StructType;
import org.gorpipe.gor.binsearch.Unzipper;
import org.gorpipe.model.gor.RowObj;
import org.gorpipe.spark.SparkGorRow;
import scala.Function1;
import scala.collection.Iterator;
import scala.collection.JavaConverters;
import scala.collection.Seq;

class GorzFunction implements Function1<PartitionedFile, Iterator<InternalRow>>, Serializable {
    Function1<PartitionedFile, Iterator<InternalRow>> func;
    ExpressionEncoder<Row> encoder;
    ExpressionEncoder.Serializer<Row> serializer;
    Unzipper unzip;
    String chrom;
    int start;
    int stop;
    byte[] unzipped;

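    /**
     * @param func    underlying reader yielding one InternalRow per raw line of the partition
     * @param schema  Spark schema for the decoded GOR rows
     * @param filters pushed-down predicates; EqualTo("chrom") and GreaterThan/LessThan("pos") are extracted below
     */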
    GorzFunction(Function1<PartitionedFile, Iterator<InternalRow>> func, StructType schema, Collection<Filter> filters) {
        this.func = func;
        this.unzipped = new byte[1<<17];

        List<Attribute> lattr = JavaConverters.asJavaCollection(schema.toAttributes()).stream().map(Attribute::toAttribute).collect(Collectors.toList());
        Seq<Attribute> sattr = JavaConverters.asScalaBuffer(lattr).toSeq();

        this.encoder = RowEncoder.apply(schema).resolveAndBind(sattr, SimpleAnalyzer$.MODULE$);
        this.serializer = encoder.createSerializer();
        this.unzip = new Unzipper();
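        // Extract the genomic-range predicates Spark pushed down; null/-1 means "no bound".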
        this.chrom = filters.stream().filter(f -> f instanceof EqualTo).map(f -> (EqualTo) f)
                .filter(f -> f.attribute().equalsIgnoreCase("chrom"))
                .map(EqualTo::value).map(Object::toString).findFirst().orElse(null);
        this.start = filters.stream().filter(f -> f instanceof GreaterThan).map(f -> (GreaterThan) f)
                .filter(f -> f.attribute().equalsIgnoreCase("pos"))
                .map(GreaterThan::value).map(v -> (Integer) v).findFirst().orElse(-1);
        this.stop = filters.stream().filter(f -> f instanceof LessThan).map(f -> (LessThan) f)
                .filter(f -> f.attribute().equalsIgnoreCase("pos"))
                .map(LessThan::value).map(v -> (Integer) v).findFirst().orElse(-1);
    }

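    /** Streams the partition's lines, parses each into a GOR row and serializes it back to an InternalRow. */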
    @Override
    public Iterator<InternalRow> apply(PartitionedFile v1) {
        Iterator<InternalRow> it = func.apply(v1);
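        // Bridge the Scala iterator to an ordered Java stream of the raw line strings (column 0 of each row).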
        Stream<String> stream = StreamSupport
                .stream(Spliterators.spliteratorUnknownSize(JavaConverters.asJavaIterator(it), Spliterator.ORDERED), false)
                .map(ir -> ir.getString(0));

        // Do not remove; reinstate when building with JDK 11+ (uses Stream.dropWhile/takeWhile).
        /*stream = chrom != null ? stream.dropWhile(f -> {
            int i = f.indexOf('\t');
            return chrom.compareTo(f.substring(0, i)) > 0;
        }).takeWhile(f -> {
            int i = f.indexOf('\t');
            return chrom.equals(f.substring(0, i));
        }) : stream;

        stream = start != -1 ? stream.dropWhile(f -> {
            int i = f.indexOf('\t')+1;
            int e = f.indexOf('\t',i);
            return start > Integer.parseInt(f.substring(i, e));
        }) : stream;

        stream = stream.map(res -> {
            int i = res.indexOf('\t');
            i = res.indexOf('\t', i+1);
            return res.substring(i+2);
        });

        stream = stream.flatMap(s -> {
            byte[] bb = Base64.getDecoder().decode(s);
            unzip.setType(CompressionType.ZLIB);
            unzip.setRawInput(bb,0,bb.length);
            int unzipLen = 0; //unzipToNewBuffer(bb, 0, bb.length, (byte)0, null);
            try {
                unzipLen = unzip.decompress(unzipped,0,unzipped.length);
            } catch (DataFormatException | IOException e) {
                throw new GorSystemException("gorz write failed",e);
            }
            ByteArrayInputStream bais = new ByteArrayInputStream(unzipped,0,unzipLen);
            InputStreamReader isr = new InputStreamReader(bais);
            BufferedReader br = new BufferedReader(isr);
            return br.lines();
        });

        stream = start != -1 ? stream.dropWhile(f -> {
            int i = f.indexOf('\t')+1;
            int e = f.indexOf('\t',i);
            return start > Integer.parseInt(f.substring(i, e));
        }) : stream;

        stream = stop != -1 ? stream.takeWhile(f -> {
            int i = f.indexOf('\t')+1;
            int e = f.indexOf('\t',i);
            return stop > Integer.parseInt(f.substring(i, e));
        }) : stream;*/

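        // Parse each tab-separated line into a gor row, wrap it as a Spark Row and serialize;
        // copy() detaches each result from the buffer the serializer reuses between rows.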
        Stream<InternalRow> istream = stream.map(RowObj::apply)
                .map(r -> new SparkGorRow(r, encoder.schema()))
                .map(r -> serializer.apply(r).copy());
        java.util.Iterator<InternalRow> iterator = istream.iterator();
        return JavaConverters.asScalaIterator(iterator);
    }

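    // Function1 combinators simply delegate to the wrapped reader function.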
    @Override
    public <A> Function1<A, Iterator<InternalRow>> compose(Function1<A, PartitionedFile> g) {
        return func.compose(g);
    }

    @Override
    public <A> Function1<PartitionedFile, A> andThen(Function1<Iterator<InternalRow>, A> g) {
        return func.andThen(g);
    }
}



