All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gorsat.spark.GorBatchTable Maven / Gradle / Ivy

Go to download

GORpipe allows analysis of large sets of genomic and phenotypic tabular data using a declarative query language in a parallel execution engine

There is a newer version: 4.3.2
Show newest version
package gorsat.spark;

import gorsat.Script.ScriptEngineFactory;
import gorsat.Script.ScriptExecutionEngine;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.connector.write.LogicalWriteInfo;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.gorpipe.spark.GorSparkSession;
import gorsat.process.SparkRowSource;
import org.apache.spark.sql.connector.catalog.SupportsRead;
import org.apache.spark.sql.connector.catalog.SupportsWrite;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableCapability;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.connector.read.InputPartition;
import org.apache.spark.sql.connector.read.Scan;
import org.apache.spark.sql.connector.read.ScanBuilder;
import org.apache.spark.sql.connector.read.SupportsPushDownFilters;
import org.apache.spark.sql.connector.write.WriteBuilder;
import org.apache.spark.sql.sources.*;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import org.gorpipe.spark.SparkSessionFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.zip.DataFormatException;

import static org.apache.spark.sql.types.DataTypes.StringType;

public abstract class GorBatchTable implements Table, SupportsRead, SupportsWrite, SupportsPushDownFilters {
    String[] commands;
    String query;
    String path;
    String inputfilter;
    String filterFile;
    String filterColumn;
    String splitFile;
    String fchrom;
    int fstart = 0;
    int fstop = -1;
    String redisUri;
    String jobId;
    String cacheFile;
    String useCpp;
    StructType schema;
    boolean tag;
    String projectRoot;
    String cacheDir;

    public GorBatchTable(String query, boolean tag, String path, String filter, String filterFile, String filterColumn, String splitFile, String seek, String redisUri, String jobId, String cacheFile, String useCpp) throws IOException, DataFormatException {
        init(query,tag,path,filter,filterFile,filterColumn,splitFile,seek,redisUri,jobId,cacheFile,useCpp);
    }

    public GorBatchTable(String query, boolean tag, String path, String filter, String filterFile, String filterColumn, String splitFile, String seek, StructType schema, String redisUri, String jobId, String cacheFile, String useCpp) {
        init(query,tag,path,filter,filterFile,filterColumn,splitFile,seek,redisUri,jobId,cacheFile,useCpp);
        this.schema = schema;
    }

    public void setProjectRoot(String projectRoot) {
        this.projectRoot = projectRoot;
    }

    public void setCacheDir(String cacheDir) {
        this.cacheDir = cacheDir;
    }

    void checkSeek(String seek) {
        if( seek != null && seek.length() > 0 ) {
            String[] spl = seek.split(":");
            fchrom = spl[0];
            if( spl.length > 1 ) {
                String[] sspl = spl[1].split("-");
                fstart = Integer.parseInt(sspl[0]);
                if( sspl.length > 1 ) {
                    fstop = Integer.parseInt(sspl[1]);
                }
            }
        }
    }

    void init(String query, boolean tag, String path, String filter, String filterFile, String filterColumn, String splitFile, String seek, String redisUri, String jobId, String cacheFile, String useCpp) {
        this.query = query;
        this.projectRoot = Paths.get(".").toAbsolutePath().normalize().toString();
        this.cacheDir = "result_cache";
        this.tag = tag;
        this.path = path;
        this.inputfilter = filter;
        this.filterFile = filterFile;
        this.filterColumn = filterColumn;
        this.splitFile = splitFile;
        this.redisUri = redisUri;
        this.jobId = jobId;
        this.cacheFile = cacheFile;
        this.useCpp = useCpp;
        checkSeek(seek);
    }

    private String[] initCommands(String query) {
        String[] commands = null;
        if(query!=null) {
            if(query.toLowerCase().startsWith("pgor") || query.toLowerCase().startsWith("partgor") || query.toLowerCase().startsWith("parallel")) {
                ReceiveQueryHandler receiveQueryHandler = new ReceiveQueryHandler();
                SparkSessionFactory sessionFactory = new SparkSessionFactory(null, projectRoot, cacheDir, null, receiveQueryHandler);
                GorSparkSession gorPipeSession = (GorSparkSession) sessionFactory.create();
                ScriptExecutionEngine see = ScriptEngineFactory.create(gorPipeSession.getGorContext());
                see.execute(new String[]{query}, false);
                commands = receiveQueryHandler.getCommandsToExecute();
            } else commands = new String[] {query};
        }
        return commands;
    }

    void inferSchema() {
        commands = initCommands(query);
        schema = Encoders.STRING().schema();
        if(path!=null) {
            Path ppath = Paths.get(path);
            try {
                schema = SparkRowSource.inferSchema(ppath, ppath.getFileName().toString(), false, path.toLowerCase().endsWith(".gorz"));
            } catch (IOException | DataFormatException e) {
                throw new RuntimeException("Unable to infer schema from "+ppath.toString(), e);
            }
        } else if(commands!=null) {
            String query = commands[0];
            boolean nor = query.toLowerCase().startsWith("nor ");
            SparkSessionFactory sessionFactory = new SparkSessionFactory(null, projectRoot, cacheDir, null);
            GorSparkSession gorPipeSession = (GorSparkSession) sessionFactory.create();
            SparkRowSource.GorDataType gdt = SparkRowSource.gorCmdSchema(query,gorPipeSession, nor);

            String[] headerArray = gdt.header;
            DataType[] dataTypes = new DataType[headerArray.length];
            int start = 0;
            for (int i = start; i < dataTypes.length; i++) {
                dataTypes[i] = gdt.dataTypeMap.getOrDefault(i, StringType);
            }

            Stream fieldStream = IntStream.range(0, headerArray.length).mapToObj(i -> new StructField(headerArray[i], dataTypes[i], true, Metadata.empty()));
            StructField[] fields = (tag ? Stream.concat(fieldStream,Stream.of(new StructField("Tag", StringType, true, Metadata.empty()))) : fieldStream).toArray(StructField[]::new);
            schema = new StructType(fields);
        }
    }

    private static final Set CAPABILITIES = new HashSet<>(Arrays.asList(
            TableCapability.BATCH_READ,
            TableCapability.BATCH_WRITE,
            TableCapability.TRUNCATE));

    @Override
    public Scan build() {
        return null;
    }

    @Override
    public Filter[] pushFilters(Filter[] filters) {
        return new Filter[0];
    }

    @Override
    public Filter[] pushedFilters() {
        return new Filter[0];
    }

    @Override
    public StructType schema() {
        if(schema==null) inferSchema();
        return schema;
    }

    @Override
    public String name() {
        return this.getClass().toString();
    }

    @Override
    public Set capabilities() {
        return CAPABILITIES;
    }

    @Override
    public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
        return new GorWriteBuilder() {};
    }

    @Override
    public ScanBuilder newScanBuilder(CaseInsensitiveStringMap caseInsensitiveStringMap) {
        if(schema==null) inferSchema();
        return new GorScanBuilder(schema, redisUri, jobId, cacheFile, projectRoot, cacheDir, useCpp) {
            Filter[] pushedFilters = new Filter[0];
            String filterChrom = fchrom;
            int start = fstart;
            int stop = fstop;
            String filter = GorBatchTable.this.inputfilter;

            @Override
            public Filter[] pushFilters(Filter[] filters) {
                Filter[] ret;
                if( schema.size() > 2 ) {
                    String posName = schema.fieldNames()[1];
                    String lastName = schema.fieldNames()[schema.size() - 1];

                    Optional chrOpt = Arrays.stream(filters).filter(f -> f instanceof EqualTo).filter(f -> ((EqualTo) f).attribute().equalsIgnoreCase("CHROM")).findFirst();
                    chrOpt.map(f -> ((EqualTo) f).value().toString()).ifPresent(s -> filterChrom = s);

                    Optional inOpt = Arrays.stream(filters).filter(f -> f instanceof In).filter(f -> ((In) f).attribute().equalsIgnoreCase(lastName)).findFirst();
                    if( !inOpt.isPresent() ) {
                        inOpt = Arrays.stream(filters).filter(f -> f instanceof EqualTo).filter(f -> ((EqualTo) f).attribute().equalsIgnoreCase(lastName)).findFirst();
                        inOpt.map(f -> ((EqualTo) f).value()).map(Object::toString).ifPresent(s -> filter = s);
                    } else {
                        inOpt.map(f -> Arrays.stream(((In) f).values()).map(Object::toString).collect(Collectors.joining(","))).ifPresent(s -> filter = s);
                    }

                    /* jdk9+ required
                    Optional posGreatOpt = Arrays.stream(filters).filter(f -> f instanceof GreaterThan).filter(f -> ((GreaterThan) f).attribute().equalsIgnoreCase(posName)).findFirst().or(Arrays.stream(filters).filter(f -> f instanceof GreaterThanOrEqual).filter(f -> ((GreaterThanOrEqual) f).attribute().equalsIgnoreCase(posName))::findFirst);
                    posGreatOpt.ifPresent(f -> {
                        if (f instanceof GreaterThan) {
                            start = ((Number) ((GreaterThan) f).value()).intValue() - 1;
                        } else {
                            start = ((Number) ((LessThanOrEqual) f).value()).intValue();
                        }
                    });

                    Optional posLessOpt = Arrays.stream(filters).filter(f -> f instanceof LessThan).filter(f -> ((LessThan) f).attribute().equalsIgnoreCase(posName)).findFirst().or(Arrays.stream(filters).filter(f -> f instanceof LessThanOrEqual).filter(f -> ((LessThanOrEqual) f).attribute().equalsIgnoreCase(posName))::findFirst);
                    posLessOpt.ifPresent(f -> {
                        if (f instanceof LessThan) {
                            stop = ((Number) ((LessThan) f).value()).intValue();
                        } else {
                            stop = ((Number) ((LessThanOrEqual) f).value()).intValue() + 1;
                        }
                    });*/

                    Optional posGreatOpt = Arrays.stream(filters).filter(f -> f instanceof GreaterThan).filter(f -> ((GreaterThan) f).attribute().equalsIgnoreCase(posName)).findFirst();
                    if (!posGreatOpt.isPresent()) posGreatOpt = Arrays.stream(filters).filter(f -> f instanceof GreaterThanOrEqual).filter(f -> ((GreaterThanOrEqual) f).attribute().equalsIgnoreCase(posName)).findFirst();
                    posGreatOpt.ifPresent(f -> {
                        if (f instanceof GreaterThan) {
                            start = ((Number) ((GreaterThan) f).value()).intValue() - 1;
                        } else {
                            start = ((Number) ((LessThanOrEqual) f).value()).intValue();
                        }
                    });
                    final Optional posGreatOptFinal = posGreatOpt;

                    Optional posLessOpt = Arrays.stream(filters).filter(f -> f instanceof LessThan).filter(f -> ((LessThan) f).attribute().equalsIgnoreCase(posName)).findFirst();
                    if (!posLessOpt.isPresent()) posLessOpt = Arrays.stream(filters).filter(f -> f instanceof LessThanOrEqual).filter(f -> ((LessThanOrEqual) f).attribute().equalsIgnoreCase(posName)).findFirst();
                    posLessOpt.ifPresent(f -> {
                        if (f instanceof LessThan) {
                            stop = ((Number) ((LessThan) f).value()).intValue();
                        } else {
                            stop = ((Number) ((LessThanOrEqual) f).value()).intValue() + 1;
                        }
                    });
                    final Optional posLessOptFinal = posLessOpt;

                    ret =  Arrays.stream(filters).filter(f -> !chrOpt.isPresent() || (!f.equals(chrOpt.get()) && (!posGreatOptFinal.isPresent() || !f.equals(posGreatOptFinal.get())) && (!posLessOptFinal.isPresent() || !f.equals(posLessOptFinal.get())))).toArray(Filter[]::new);
                } else ret = new Filter[0];

                Set fset = new HashSet<>(Arrays.asList(ret));
                pushedFilters = Arrays.stream(filters).filter(f -> !fset.contains(f)).toArray(Filter[]::new);
                return ret;
            }

            @Override
            public Filter[] pushedFilters() {
                return pushedFilters;
            }

            @Override
            public InputPartition[] planInputPartitions() {
                InputPartition[] partitions = null;
                if( commands != null ) {
                    partitions = Arrays.stream(commands).map(cmd -> {
                        String tagstr = null;
                        if(tag) {
                            int i = cmd.indexOf("-p ") + 3;
                            if (i != -1) {
                                while (i < cmd.length() && cmd.charAt(i) == ' ') i++;
                                int k = i + 1;
                                while (k < cmd.length() && cmd.charAt(k) != ' ') k++;
                                tagstr = cmd.substring(i, k);
                            }
                        }
                        return new GorRangeInputPartition(cmd, tagstr);
                    }).toArray(GorRangeInputPartition[]::new);
                } else if( filterChrom != null ) {
                    partitions = new InputPartition[1];
                    partitions[0] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, filterChrom, start, stop, filterChrom);
                } else {
                    if (splitFile != null) {
                        try {
                            partitions = Files.lines(Paths.get(splitFile)).skip(1).map(line -> line.split("\t")).map(s -> new GorRangeInputPartition(path, filter, filterFile, filterColumn, s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), s.length > 3 ? s[3] : s[0] + ":" + s[1] + "-" + s[2])).toArray(InputPartition[]::new);
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }

                    if (partitions == null) {
                        partitions = new InputPartition[24];
                        partitions[0] = new GorRangeInputPartition(path, filter, filterFile, filterColumn,"chr1", 0, 249250621, "chr1");
                        partitions[1] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr10", 0, 135534747, "chr10");
                        partitions[2] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr11", 0, 135006516, "chr11");
                        partitions[3] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr12", 0, 133851895, "chr12");
                        partitions[4] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr13", 0, 115169878, "chr13");
                        partitions[5] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr14", 0, 107349540, "chr14");
                        partitions[6] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr15", 0, 102531392, "chr15");
                        partitions[7] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr16", 0, 90354753, "chr16");
                        partitions[8] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr17", 0, 81195210, "chr17");
                        partitions[9] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr18", 0, 78077248, "chr18");
                        partitions[10] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr19", 0, 59128983, "chr19");
                        partitions[11] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr2", 0, 243199373, "chr2");
                        partitions[12] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr20", 0, 63025520, "chr20");
                        partitions[13] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr21", 0, 48129895, "chr21");
                        partitions[14] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr22", 0, 51304566, "chr22");
                        partitions[15] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr3", 0, 198022430, "chr3");
                        partitions[16] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr4", 0, 191154276, "chr4");
                        partitions[17] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr5", 0, 180915260, "chr5");
                        partitions[18] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr6", 0, 171115067, "chr6");
                        partitions[19] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr7", 0, 159138663, "chr7");
                        partitions[20] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr8", 0, 146364022, "chr8");
                        partitions[21] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chr9", 0, 141213431, "chr9");
                        partitions[22] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chrX", 0, 155270560, "chrX");
                        partitions[23] = new GorRangeInputPartition(path, filter, filterFile, filterColumn, "chrY", 0, 59373566, "chrY");
                    }
                }
                return partitions;
            }
        };
    }

    @Override
    public Transform[] partitioning() {
        return new Transform[] {
            Expressions.identity("i")
        };
    }

    /*@Override
    public String name() {
        return null;
    }

    @Override
    public StructType schema() {
        return null;
    }

    @Override
    public Transform[] partitioning() {
        return new Transform[0];
    }

    @Override
    public Map properties() {
        return null;
    }

    @Override
    public Set capabilities() {
        return null;
    }*/
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy