package org.gorpipe.spark;

import breeze.linalg.DenseMatrix;
import com.google.common.collect.Iterators;
import gorsat.process.GenericSessionFactory;
import gorsat.process.PipeInstance;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.feature.PCA;
import org.apache.spark.mllib.feature.PCAModel;
import org.apache.spark.mllib.linalg.Matrices;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.distributed.*;
import org.apache.spark.sql.*;
import org.gorpipe.gor.session.GorContext;
import org.gorpipe.gor.session.GorSession;
import scala.Tuple2;

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.function.DoubleFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

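/**
 * Standalone Spark driver that computes a PCA over genotypes stored in a GOR
 * freeze. Variants are pulled through GOR queries (partgor/csvsel) into Spark
 * Datasets, transposed into per-sample feature vectors, reduced with
 * spark-mllib's PCA, and the projected components are written to a TSV file.
 *
 * A sketch of a typical invocation (the jar name and paths are illustrative,
 * not taken from this code):
 *
 *   spark-submit --class org.gorpipe.spark.SparkPCA gor-spark.jar \
 *       --projectroot /gorproject --freeze plink_wes \
 *       --variants testvars2.gorz --pnlist testpns.txt \
 *       --partsize 10 --pcacomponents 3 --outfile out.txt
 */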
public class SparkPCA {
    static String[] testargs = {"--projectroot","/gorproject","--freeze","plink_wes","--variants","testvars2.gorz","--pnlist","testpns.txt","--partsize","10","--pcacomponents","3","--outfile","out.txt"};//,"--sparse"};

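    /**
     * Entry point. Flags mirror testargs above: --projectroot, --freeze,
     * --variants, --pnlist and --outfile locate the inputs and the result file;
     * --appname (default "pca"), --partsize (default 10) and --pcacomponents
     * (default 3) are optional; --sparse selects the sparse matrix path, which
     * is currently only used by the commented-out RowMatrix variant in pca().
     * --instances, --cores and --memory size the executors; passing 0 for
     * --instances or --memory derives a value from the sample and variant counts.
     */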
    public static void main(String[] args) throws IOException {
        //args = testargs;
        List<String> argList = Arrays.asList(args);
        int i = argList.indexOf("--appname");
        String appName = i != -1 ? argList.get(i+1) : "pca";
        i = argList.indexOf("--freeze");
        String freeze = i != -1 ? argList.get(i+1) : null;
        if(freeze!=null&&freeze.startsWith("'")) freeze = freeze.substring(1,freeze.length()-1);
        i = argList.indexOf("--projectroot");
        String projectRoot = argList.get(i+1);
        i = argList.indexOf("--variants");
        String variants = argList.get(i+1);
        i = argList.indexOf("--pnlist");
        String pnlist = argList.get(i+1);
        i = argList.indexOf("--partsize");
        int partsize = i != -1 ? Integer.parseInt(argList.get(i+1)) : 10;
        i = argList.indexOf("--pcacomponents");
        int pcacomponents = i != -1 ? Integer.parseInt(argList.get(i+1)) : 3;
        i = argList.indexOf("--outfile");
        String outfile = i != -1 ? argList.get(i+1) : null;
        boolean sparse = argList.indexOf("--sparse") != -1;

        i = argList.indexOf("--instances");
        int instances = i != -1 ? Integer.parseInt(argList.get(i+1)) : -1;
        i = argList.indexOf("--cores");
        int cores = i != -1 ? Integer.parseInt(argList.get(i+1)) : -1;
        i = argList.indexOf("--memory");
        String memory = i != -1 ? argList.get(i+1) : "-1"; // "-1" means leave spark.executor.memory unset

        Path root = Paths.get(projectRoot);

        Path outpath = Paths.get(outfile);
        if(!outpath.isAbsolute()) outpath = root.resolve(outpath);

        Path freezepath = Paths.get(freeze);
        if(!freezepath.isAbsolute()) freezepath = root.resolve(freezepath);

        Path pnpath = Paths.get(pnlist);
        if(!pnpath.isAbsolute()) pnpath = root.resolve(pnpath);

        Path varpath = Paths.get(variants);
        if(!varpath.isAbsolute()) varpath = root.resolve(varpath);

        Stream<String> str;
        if(varpath.getFileName().toString().endsWith(".gorz")) {
            GenericSessionFactory gsf = new GenericSessionFactory(".", "result_cache");
            GorSession gs = gsf.create();
            GorContext gc = gs.getGorContext();
            PipeInstance pi = new PipeInstance(gc);
            pi.init("gor "+varpath.toString(), false, "");
            str = StreamSupport.stream(Spliterators.spliteratorUnknownSize(pi.theInputSource(), 0), false).map(Object::toString);

            /*byte[] output = new byte[65536];
            byte[] input = new byte[65536];
            InputStream in = Files.newInputStream(varpath);
            int r = in.read();
            while(r != '\n') r = in.read();
            r = in.read();
            while(r != -1) {
                while(r != '\t') r = in.read();
                r = in.read();
                while(r != '\t') r = in.read();
                r = in.read();
                //r = in.read();
                //in.read();
                r = in.read();
                int k = 0;
                while(r != '\n') {
                    input[k++] = (byte)r;
                    r = in.read();
                }
                r = in.read();

                Inflater ifl = new Inflater();
                ifl.setInput(input,0,k);
                try {
                    ifl.inflate(output);
                } catch (DataFormatException e) {
                    e.printStackTrace();
                }
                String bb = new String(output);
                System.err.println(bb);
            }*/
            /*str = str.flatMap(f -> {
                byte[] gzip = f.getBytes(StandardCharsets.ISO_8859_1);
                int k = 0;
                while(gzip[k++]!='\t');
                while(gzip[k++]!='\t');
                Inflater ifl = new Inflater();
                ifl.setInput(Arrays.copyOfRange(gzip,k,gzip.length));
                try {
                    ifl.inflate(output);
                } catch (DataFormatException e) {
                    e.printStackTrace();
                }
                String bb = new String(output);
                String[] spl = bb.split("\t");
                return Arrays.stream(spl);
            });*/
        } else {
            str = Files.lines(varpath).skip(1);
        }
        long varcount = str.count();
        long samplecount = Files.lines(pnpath).dropWhile(l -> l.startsWith("#")).count();

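        // Executor sizing heuristics: --instances 0 requests roughly one executor
        // per sample partition (samplecount/partsize + 1), and --memory 0 scales
        // executor memory with the genotype block each task holds
        // (about varcount*partsize/1e6 + 1 GB).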
        SparkSession.Builder ssBuilder = SparkSession.builder();
        if(instances>=0) {
            ssBuilder = ssBuilder.config("spark.executor.instances",instances == 0 ? samplecount / partsize + 1 : instances);
        }
        if(!memory.equals("-1")) {
            ssBuilder = ssBuilder.config("spark.executor.memory",memory.equals("0") ? (varcount*partsize/1000000 + 1)+"g" : memory);
        }
        if(cores>0) {
            ssBuilder = ssBuilder.config("spark.executor.cores",cores);
        }

        try(SparkSession spark = ssBuilder/*.master("local[*]")*/.appName(appName).getOrCreate()) {
        //try(SparkSession spark = SparkSession.builder().master("local[*]").appName(appName).getOrCreate()) {
            pca(spark, projectRoot, freeze, pnlist, variants, partsize, pcacomponents, pnpath, varpath, freezepath, (int)varcount, outpath, sparse);
            spark.stop();
        }
    }

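    /**
     * Dense path: packs each partition's variant rows into a single
     * samples-by-variants block (the leading marker character of each values
     * string is stripped), wraps the per-partition blocks in a BlockMatrix and
     * converts it to a RowMatrix. Assumes each partition carries all varcount
     * variants for at most partsize samples. Only referenced from the
     * commented-out code in pca().
     */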
    private static RowMatrix blockMatrixToRowMatrix(Dataset<Row> ds, int varcount, int partsize) {
        JavaRDD<Tuple2<Tuple2<Object,Object>,Matrix>> dbm = ds.select("chrom","pos","values").javaRDD().mapPartitionsWithIndex((Function2<Integer, Iterator<Row>, Iterator<Tuple2<Tuple2<Object,Object>,Matrix>>>) (pi, input) -> {
            double[] mat = null;
            Iterator<Tuple2<Tuple2<Object,Object>,Matrix>> it = Collections.emptyIterator();
            int start = 0;
            while(input.hasNext()) {
                Row row = input.next();
                String strvec = row.getString(2).substring(1);
                int len = strvec.length();
                if(mat==null) {
                    mat = new double[varcount*len];
                }
                if(start >= varcount) throw new RuntimeException("len " + len + " " + mat.length + "  " + varcount);
                for(int i = 0; i < len; i++) {
                    // column-major fill of the len x varcount block: sample i, variant 'start'
                    mat[i + len*start] = strvec.charAt(i)-'0';
                }
                start++;
            }
            if(mat!=null) {
                Matrix matrix = Matrices.dense(mat.length/varcount,varcount,mat);
                Tuple2<Object,Object> index = new Tuple2<>(pi,0);
                Tuple2<Tuple2<Object,Object>,Matrix> tupmat = new Tuple2<>(index,matrix);
                return Iterators.singletonIterator(tupmat);
            }
            return it;
        },true);

        BlockMatrix mat = new BlockMatrix(dbm.rdd(),partsize,varcount);
        IndexedRowMatrix irm = mat.toIndexedRowMatrix();

        DenseMatrix dmb = irm.toBreeze();
        System.err.println( dmb );

        return irm.toRowMatrix();
    }

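    /**
     * Sparse path: emits one MatrixEntry per non-zero genotype, with the row
     * index derived from the partition number (pi*partsize + sample offset)
     * and the column index from the variant's position in the stream, then
     * builds a samplecount-by-varcount CoordinateMatrix. Like the dense path,
     * only referenced from the commented-out code in pca().
     */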
    private static RowMatrix coordMatrixToRowMatrix(Dataset<Row> ds, int varcount, int samplecount, int partsize) {
        JavaRDD<MatrixEntry> dbm = ds.select("chrom","pos","values").javaRDD().zipWithIndex().flatMap((FlatMapFunction<Tuple2<Row,Long>,MatrixEntry>) (tup) -> {
            Row row = tup._1;
            long idx = tup._2;
            long pi = idx/varcount;
            long ip = idx%varcount;

            String strvec = row.getString(2).substring(1);
            int len = strvec.length();
            return IntStream.range(0,len).filter(i -> strvec.charAt(i)!='0').mapToObj(i -> new MatrixEntry(pi*partsize+i, ip,strvec.charAt(i)-'0')).iterator();
        });

        CoordinateMatrix mat = new CoordinateMatrix(dbm.rdd(),samplecount,varcount);

        DenseMatrix dmb = mat.toBreeze();
        System.err.println( dmb );

        IndexedRowMatrix irm = mat.toIndexedRowMatrix();

        DenseMatrix dmb2 = irm.toBreeze(); // print again after conversion to verify the IndexedRowMatrix
        System.err.println( dmb2 );

        return irm.toRowMatrix();
    }

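    /**
     * Builds the genotype Dataset with partgor/csvsel queries against the
     * freeze dictionary (variants.gord plus buckets.tsv), one tag group per
     * partsize samples, and delegates the decomposition to labelPoint(). The
     * older RowMatrix-based variant is kept below in comments.
     */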
    private static void pca(SparkSession spark, String projectRoot, String freeze, String pnlist, String variants, int partsize, int pcacomponents, Path pnpath, Path varpath, Path freezepath, int varcount, Path outpath, boolean sparse) throws IOException {
        GorSparkSession gorSparkSession = SparkGOR.createSession(spark, projectRoot, "result_cache", 0);

        System.err.println("parameters: " + projectRoot + " " + freeze + " " + pnlist + " " + variants + " " + partsize + " " + pcacomponents + " " + outpath);
        System.err.println("varcount: " + varcount);

        String freezevariants = freezepath.resolve("variants.gord").toString();

        Dataset<Row> pnidx = (Dataset<Row>)gorSparkSession.spark("spark <(partgor -ff "+pnpath.toString()+" -partsize "+partsize+" -dict "+freezevariants+" <(gorrow 1,1 | calc pn '#{tags}' | split pn))",null);
        Dataset<Row> ds = (Dataset<Row>)gorSparkSession.spark("spark -tag <(partgor -ff "+pnpath.toString()+" -partsize "+partsize+" -dict "+freezevariants+" <(gor "+varpath.toString() +
                        "| varjoin -r -l -e '?' <(gor "+freezevariants+" -nf -f #{tags})" +
                        "| rename Chrom CHROM | rename ref REF | rename alt ALT " +
                        "| calc ID chrom+'_'+pos+'_'+ref+'_'+alt " +
                        "| csvsel "+freezepath.resolve("buckets.tsv").toString()+" <(nor <(gorrow 1,1 | calc pn '#{tags}' | split pn) | select pn) -u 3 -gc id,ref,alt -vs 1 | replace values 'u'+values))"
                , null);

        labelPoint(spark, ds, pnidx, varcount, pcacomponents, outpath);

        /*RowMatrix rowMatrix = sparse ? coordMatrixToRowMatrix(ds, varcount, samplecount, partsize) : blockMatrixToRowMatrix(ds, varcount, partsize);
        DenseMatrix dm = rowMatrix.toBreeze();
        System.err.println("dm\n " + dm.toString());

        Matrix pc = rowMatrix.computePrincipalComponents( pcacomponents);

        // Project the rows to the linear space spanned by the top 4 principal components.
        RowMatrix projected = rowMatrix.multiply(pc);
        //System.err.println("dim: "+dm.cols() + "x" + dm.rows());

        try (BufferedWriter bw = Files.newBufferedWriter(outpath)) {
            bw.write("#PN\t"+ IntStream.rangeClosed(1,pcacomponents).mapToObj(i -> "col"+i).collect(Collectors.joining("\t"))+"\n");
            for (int i = 0; i < pns.size(); i++) {
                bw.write(pns.get(i));
                for (int k = 0; k < pcacomponents; k++) {
                    bw.write("\t" + dm.apply(i, k));
                }
                bw.write("\n");
            }
        }*/
    }
    
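    /**
     * Transposes each partition's variant rows (the 'u' sentinel prepended by
     * the GOR query is stripped from every values string) into one dense
     * vector per sample, fits spark-mllib's PCA on those vectors, projects
     * every sample and writes PN name plus components to outpath. Note that
     * matching vectors to PN names via zipWithIndex assumes pnidx and ds
     * enumerate the sample partitions in the same order.
     */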
    private static void labelPoint(SparkSession spark, Dataset<Row> ds, Dataset<Row> pnidx, int varcount, int pcacomponents, Path outpath) throws IOException {
        Dataset<Vector> dv = ds.select("values").mapPartitions((MapPartitionsFunction<Row,Vector>) ir -> {
            double[][] mat = null;
            Iterator<Vector> it = Collections.emptyIterator();
            int start = 0;
            while(ir.hasNext()) {
                Row row = ir.next();
                String strvec = row.getString(0).substring(1);
                int len = strvec.length();
                if(mat==null) {
                    mat = new double[len][];
                    for(int i = 0; i < len; i++) {
                        mat[i] = new double[varcount];
                    }
                }
                //if(start*len > mat.length) throw new RuntimeException("len " + len + " " + mat.length + "  " + varcount);
                for(int i = 0; i < len; i++) {
                    mat[i][start] = strvec.charAt(i)-'0';
                }
                start++;
            }
            if(mat!=null) {
                List<Vector> lv = new ArrayList<>(mat.length);
                for(int i = 0; i < mat.length; i++) {
                    lv.add(Vectors.dense(mat[i]));
                }
                return lv.stream().iterator();
            }
            return it;
        }, Encoders.kryo(Vector.class));

        //Map<Long,String> idx2Pn = pnidx.select("pn").map((MapFunction<Row,String>) r -> r.get(0).toString(),Encoders.STRING()).javaRDD().zipWithIndex().map(Tuple2::swap).mapToPair((PairFunction<Tuple2<Long,String>, Long, String>) longStringTuple2 -> longStringTuple2).collectAsMap();
        //spark.sparkContext().broadcast(idx2Pn, Encoders.bean(Map));

        JavaPairRDD<Long,String> jprs = pnidx.select("pn").map((MapFunction<Row,String>) r -> r.get(0).toString(),Encoders.STRING()).javaRDD().zipWithIndex().mapToPair(Tuple2::swap);
        JavaPairRDD<Long,Vector> jprv = dv.javaRDD().zipWithIndex().mapToPair((PairFunction<Tuple2<Vector,Long>, Long, Vector>) Tuple2::swap);

        /*JavaPairRDD<Long,Tuple2<Vector,String>> prdd = jprv.join(jprs);
        //prdd.mapValues(f -> new LabeledPoint());

        prdd.collect().forEach(System.err::println);*/

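        // Fit a PCA model on the sample vectors and project each indexed vector
        // onto the top pcacomponents components; PCA.fit takes an RDD of vectors,
        // so only the values of the (index, vector) pairs are used for fitting.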
        PCA pca = new PCA(pcacomponents);
        PCAModel pcamodel = pca.fit(jprv.values());

        JavaPairRDD<Long,Vector> jprr = jprv.mapToPair(f -> new Tuple2<>(f._1,pcamodel.transform(f._2)));

        //jprv.map
        JavaPairRDD<String,Vector> projected = jprs.join(jprr).mapToPair((PairFunction<Tuple2<Long,Tuple2<String,Vector>>, String, Vector>) f -> f._2);
        //JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
        //Broadcast<Map<Long,String>> bc = javaSparkContext.broadcast(jprs.collectAsMap());
        //JavaPairRDD<String,Vector> projected = jprv.mapToPair(p -> new Tuple2<>(bc.getValue().get(p._1), pcamodel.transform(p._2)));
        Map<String,Vector> result = projected.collectAsMap();

        try (BufferedWriter bw = Files.newBufferedWriter(outpath)) {
            for(String pn : result.keySet()) {
                bw.write(pn);
                Vector pcacomp = result.get(pn);
                for(int i = 0; i < pcacomp.size(); i++) {
                    bw.write('\t');
                    bw.write(Double.toString(pcacomp.apply(i)));
                }
                bw.write('\n');
            }
        }

        /*DenseMatrix dmb = rowMatrix.toBreeze();
        System.err.println( dmb );*/
    }

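    /**
     * Experimental sparse PCA over functionally annotated variants; not
     * reachable from main() and built on hardcoded /gorproject paths. It counts
     * qualifying variants per chromosome, broadcasts per-tag sizes and offsets,
     * flattens the genotype strings into MatrixEntry objects, and prints the
     * projection onto the top three principal components.
     */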
    private static void coordpca(String[] args, SparkSession spark) {
        GorSparkSession gorSparkSession = SparkGOR.createSession(spark);
        JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());

        Dataset<Row> dsmap = (Dataset<Row>)gorSparkSession.spark("spark -tag <(pgor -split <(gor /gorproject/brca.gor) /gorproject/plink_wes/metadata/AF.gorz" +
                "| varjoin -r -l -e '?' /gorproject/plink_wes/vep_single.gorz" +
                "| where max_consequence in ('frameshift_variant','splice_acceptor_variant','splice_donor_variant','start_lost','stop_gained','stop_lost','incomplete_terminal_codon_variant','inframe_deletion','inframe_insertion','missense_variant','protein_altering_variant','splice_region_variant')" +
                "| group chrom -count)",null);
        Map<String,Integer> rangeCount = dsmap.collectAsList().stream().collect(Collectors.toMap(r -> r.getString(4),r -> r.getInt(3)));
        // Running prefix sum over the per-chromosome counts; this relies on the
        // stream being sequential and processed in encounter order.
        Map<String,Integer> rangeSum = dsmap.collectAsList().stream().collect(Collectors.toMap(r -> r.getString(4), new Function<Row,Integer>() {
            int sum = 0;

            @Override
            public Integer apply(Row r) {
                int ret = sum;
                sum+=r.getInt(3);
                return ret;
            }
        })); //map(r -> new Map.Entry<>(r.getString(4), r.getInt(3)) {})
        Broadcast<Map<String,Integer>> bcsize = javaSparkContext.broadcast(rangeCount);
        Broadcast<Map<String,Integer>> bcsum = javaSparkContext.broadcast(rangeSum);

        //spark.udf().register( "lookupSize", new SizeLookup( bcsize ) , DataTypes.IntegerType );
        //spark.udf().register( "lookupSum", new SizeLookup( bcsum ) , DataTypes.IntegerType );
        //Dataset<Row> ds = gorSparkSession.spark("spark <(pgor ref/genes.gorz | group chrom -count)",null);

        Dataset<Row> ds = (Dataset<Row>)gorSparkSession.spark("spark -tag <(partgor -ff <(nor -h /gorproject/plink_wes/buckets.tsv | select 1 | top 50) -partsize 10 -dict /gorproject/plink_wes/variants.gord <(pgor -split <(gor /gorproject/brca.gor) /gorproject/plink_wes/variants.gord -nf -f #{tags} " +
                "| varjoin -r -l -e '?' /gorproject/plink_wes/vep_single.gorz" +
                "| where max_consequence in ('frameshift_variant','splice_acceptor_variant','splice_donor_variant','start_lost','stop_gained','stop_lost','incomplete_terminal_codon_variant','inframe_deletion','inframe_insertion','missense_variant','protein_altering_variant','splice_region_variant')" +
                "| rename Chrom CHROM | rename ref REF | rename alt ALT " +
                //"| varjoin -r -l -e '?' /gorproject/plink_wes/vep_single.gorz " +
                //"| where max_consequence in ('frameshift_variant','splice_acceptor_variant','splice_donor_variant','start_lost','stop_gained','stop_lost','incomplete_terminal_codon_variant','inframe_deletion','inframe_insertion','missense_variant','protein_altering_variant','splice_region_variant') "
                //"| varjoin -r -l -e 0.0 <(gor /gorproject/plink_wes/metadata/AF.gorz) " +
                //"| where isfloat(AF) and float(AF) <= 0.05 " +
                "| calc ID chrom+'_'+pos+'_'+ref+'_'+alt " +
                "| csvsel /gorproject/plink_wes/buckets.tsv <(nor -h /gorproject/plink_wes/buckets.tsv | select 1 | top 50) -u 3 -gc id,ref,alt -vs 1))"
                , null);

        //ds = ds.select("values","tag");
                //withColumn("count",org.apache.spark.sql.functions.callUDF("lookupSize", org.apache.spark.sql.functions.col("tag"))).
                //withColumn("sum",org.apache.spark.sql.functions.callUDF("lookupSum", org.apache.spark.sql.functions.col("tag")));

        /*Row[] rr = (Row[])ds2.collect();
        Arrays.stream(rr).forEach(System.err::println);

        //ds.collectAsList().forEach(System.err::println);
        //System.err.println(ds.count());

        ArrayList<Map<String,Integer>> lmi = new ArrayList<>();
        Path p = Paths.get("buckets.tsv");
        Files.lines(p).map(l -> l.split("\t")).forEach(s -> {
            int i = Integer.parseInt(s[1]);
            lmi.ensureCapacity(i);
            Map<String,Integer> mi = lmi.get(i-1);
            if(mi==null) {
                mi = new HashMap<>();
                lmi.set(i-1,mi);
            }
            mi.put(s[0],mi.size());
        });

        Broadcast brdc = javaSparkContext.broadcast(javaSparkContext);*/

        Encoder<MatrixEntry> menc = Encoders.bean(MatrixEntry.class);
        Dataset<MatrixEntry> dsm = ds.select("values","tag").mapPartitions((MapPartitionsFunction<Row,MatrixEntry>) input -> {
            if(input.hasNext()) {
                Row r = input.next();
                String tag = r.getString(1);
                int size = bcsize.getValue().get(tag);
                int sum = bcsum.getValue().get(tag);
                return Stream.concat(Stream.of(r), StreamSupport.stream(Spliterators.spliterator(input, size, Spliterator.SIZED), false)).flatMap(new Function<Row, Stream<MatrixEntry>>() {
                    int k = 0;

                    @Override
                    public Stream<MatrixEntry> apply(Row row) {
                        assert row.getString(1).equals(tag);
                        Stream<MatrixEntry> sme = row.getString(0).chars().map(c -> c-'0').asDoubleStream().mapToObj(new DoubleFunction<MatrixEntry>() {
                            int i = 0;

                            @Override
                            public MatrixEntry apply(double d) {
                                //System.err.println(i + " " + (sum+k));
                                return new MatrixEntry(sum+k,i++,d);
                            }
                        });
                        k++;
                        return sme;
                    }
                }).iterator();
            }
            return Collections.emptyIterator();
        },menc);
        /*int sum = r.getInt(1);
            return r.getString(0).chars().asDoubleStream()new MatrixEntry(0,0,1.0);
        }, menc);*/
        CoordinateMatrix cm = new CoordinateMatrix(dsm.rdd());

        /*Encoder<BlockMatrix> bmenc = Encoders.kryo(BlockMatrix.class);
        Encoder<Vector> enc = Encoders.kryo(Vector.class);
        Dataset<Vector> sparserows = ds.select("values").map((MapFunction<Row,Vector>) value -> {
            String strVector = value.getString(0);
            int[] ii = IntStream.range(0,strVector.length()).filter(i -> strVector.charAt(i)!='0').toArray();
            double[] dd = value.getString(0).chars().filter(c -> c != '0').asDoubleStream().toArray();
            return Vectors.sparse(ii.length, ii, dd);
        }, enc);*/

        /*Dataset<Matrix> dbm = ds.select("values","count").mapPartitions((MapPartitionsFunction<Row,Matrix>) input -> {
            double[] mat = null;
            Iterator<Matrix> it = Collections.emptyIterator();
            int start = 0;
            int count = -1;
            while(input.hasNext()) {
                Row row = input.next();
                String strvec = row.getString(0);
                if(mat==null) {
                    count = row.getInt(1);
                    mat = new double[count*strvec.length()];
                }
                for(int i = start; i < start+strvec.length(); i++) {
                    mat[i] = strvec.charAt(i)-'0';
                }
                //double[] vec = strvec.chars().asDoubleStream().forEach(d -> mat[i++]);
                start += strvec.length();
            }
            if(mat!=null) {
                Matrix matrix = Matrices.dense(mat.length/count,count,mat);
                Iterators.singletonIterator(matrix);
            }
            return it;
        },bmenc);*/

        /*Dataset<Vector> denserows = ds.select("values").map((MapFunction<Row,Vector>) value -> Vectors.dense(value.getString(0).chars().asDoubleStream().toArray()), enc);
        JavaRDD<IndexedRow> rddi = denserows.javaRDD().zipWithIndex().map(t -> new IndexedRow(t._2, t._1));
        IndexedRowMatrix irm = new IndexedRowMatrix(rddi.rdd());
        CoordinateMatrix com = irm.toCoordinateMatrix();
        RowMatrix mat = com.transpose().toRowMatrix();*/

        // Compute the top 3 principal components.
        // Principal components are stored in a local dense matrix.

        //BlockMatrix mat = new BlockMatrix(dbm.rdd());
        //mat.transpose().toIndexedRowMatrix().toRowMatrix().
        RowMatrix rowMatrix = cm.transpose().toRowMatrix();
        Matrix pc = rowMatrix.computePrincipalComponents(3);

        // Project the rows to the linear space spanned by the top 3 principal components.
        RowMatrix projected = rowMatrix.multiply(pc);

        DenseMatrix dm = projected.toBreeze();
        System.err.println(dm.toString(20,20));
                //.collectAsList().forEach(System.err::println);
    }

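    /**
     * Ad-hoc smoke test, not reachable from main(): reads a tab-separated GOR
     * file as CSV, filters a single rsid via Spark SQL and writes the result
     * to a parquet file.
     */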
    private static void test1(String[] args, SparkSession spark) {
        Dataset<Row> ds = spark.read().format("csv").option("header","true").option("delimiter","\t").option("inferSchema","true").load("/gorproject/ref/dbsnp/dbsnp.gor");
        ds.createOrReplaceTempView("dbsnp");
        Dataset<Row> sqlds = spark.sql("select * from dbsnp where rsids = 'rs22'");
        sqlds.write().save("/gorproject/mu.parquet");
        //System.err.println(sqlds.count());
        spark.close();
    }
}