package net.maizegenetics.pangenome.api;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import htsjdk.samtools.util.Tuple;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.hapCalling.PathsToVCFPlugin;
import net.maizegenetics.pangenome.hapCalling.ReadMappingUtils;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GenerateRCode;
import net.maizegenetics.plugindef.ParameterCache;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.LoggingUtils;
import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Array;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * The purpose of this class is to hold static methods that generate results that can be used by R.
 */
public class RMethods {

    private static Logger myLogger = Logger.getLogger(RMethods.class);

    /**
     *
     * @param graph a HaplotypeGraph
     * @return  a MatrixWithNames containing the haplotype ids from the graph with taxa as rows and reference ranges as columns
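     *
     * <p>A minimal usage sketch (the graph is assumed to have been built elsewhere,
     * e.g. by a graph-building plugin; the helper method name is hypothetical):</p>
     * <pre>{@code
     * static void summarizeHapids(HaplotypeGraph graph) {
     *     RMethods.MatrixWithNames mwn = RMethods.hapidTableAsMatrix(graph);
     *     int[][] hapids = (int[][]) mwn.matrix;
     *     // hapids[i][j] is the haplotype id for taxon mwn.rowNames[i] in
     *     // reference range mwn.columnNames[j], or -1 if that taxon has no haplotype there
     *     System.out.println(mwn.rowNames.length + " taxa x " + mwn.columnNames.length + " ranges");
     * }
     * }</pre>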
     */
    public static MatrixWithNames hapidTableAsMatrix(HaplotypeGraph graph) {
        List<Taxon> taxa = graph.taxaInGraph().stream().collect(Collectors.toList());
        Collections.sort(taxa);
        String[] taxaNames = taxa.stream().map(Taxon::getName).toArray(String[]::new);
        Map<Taxon, Integer> taxaIndex = new HashMap<>();
        for (int i = 0; i < taxa.size(); i++) {
            taxaIndex.put(taxa.get(i), i);
        }

        int nranges = graph.numberOfRanges();
        int[][] hapidMatrix = new int[taxa.size()][nranges];

        //initialize to -1, which indicates no haplotype for a taxon
        for (int[] row : hapidMatrix) {
            Arrays.fill(row, -1);
        }

        int columnIndex = 0;
        for (ReferenceRange refrange : graph.referenceRanges()) {
            System.out.println("columnIndex = " + columnIndex);
            for (HaplotypeNode hn : graph.nodes(refrange)) {
                int hapid = hn.id();
                for (Taxon taxon : hn.taxaList()) {
                    hapidMatrix[taxaIndex.get(taxon)][columnIndex] = hapid;
                }
            }
            columnIndex++;
        }

        MatrixWithNames mwn = new MatrixWithNames();
        mwn.columnNames = graph.referenceRanges().stream().map(rr -> "R" + rr.id()).toArray(String[]::new);
        mwn.rowNames = taxaNames;
        mwn.matrix = hapidMatrix;

        return mwn;
    }

    /**
     *
     * @param graph a HaplotypeGraph
     * @return  a DataFrameVectors containing the haplotype ids from the graph with taxa as the first column
     * and reference ranges as the remaining columns
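     *
     * <p>A sketch of reading the result back (hypothetical helper method; the first data
     * vector is a String[] of taxa names and the remaining vectors are int[] of haplotype ids):</p>
     * <pre>{@code
     * static void firstRangeHapids(HaplotypeGraph graph) {
     *     RMethods.DataFrameVectors dfv = RMethods.hapidTableAsDF(graph);
     *     String[] taxa = (String[]) dfv.dataVectors.get(0);
     *     int[] firstRange = (int[]) dfv.dataVectors.get(1); // column dfv.columnNames[1]
     *     System.out.println(taxa[0] + " -> " + firstRange[0]);
     * }
     * }</pre>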
     */
    public static DataFrameVectors hapidTableAsDF(HaplotypeGraph graph) {
        List<Taxon> taxa = graph.taxaInGraph().stream().collect(Collectors.toList());
        Collections.sort(taxa);
        String[] taxaNames = taxa.stream().map(Taxon::getName).toArray(String[]::new);

        Map<Taxon, Integer> taxaIndex = new HashMap<>();
        for (int i = 0; i < taxa.size(); i++) {
            taxaIndex.put(taxa.get(i), i);
        }

        int ntaxa = taxaNames.length;
        int nranges = graph.numberOfRanges();

        DataFrameVectors dfv = new DataFrameVectors();
        dfv.columnNames = new String[nranges + 1];
        dfv.columnNames[0] = "taxon";
        int colIndex = 1;
        for (ReferenceRange rr : graph.referenceRanges()) {
            dfv.columnNames[colIndex++] = "R" + rr.id();
        }

        dfv.dataVectors.add(taxaNames);
        for (ReferenceRange refrange : graph.referenceRanges()) {
            int[] hapids = new int[ntaxa];
            Arrays.fill(hapids, -1);
            for (HaplotypeNode hn : graph.nodes(refrange)) {
                int nodeHapid = hn.id();
                for (Taxon taxon : hn.taxaList()) {
                    hapids[taxaIndex.get(taxon)] = nodeHapid;
                }
            }
            dfv.dataVectors.add(hapids);
        }

        return dfv;
    }

    /**
     *
     * @param graph a HaplotypeGraph
     * @return  a DataFrameVectors containing the reference range id, chromosome, start, and end positions
     * for all of the reference ranges in the graph
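     *
     * <p>Usage sketch; the four data vectors line up by index:</p>
     * <pre>{@code
     * static void printFirstRange(HaplotypeGraph graph) {
     *     RMethods.DataFrameVectors dfv = RMethods.referenceRanges(graph);
     *     String[] ids = (String[]) dfv.dataVectors.get(0);
     *     String[] chrs = (String[]) dfv.dataVectors.get(1);
     *     int[] starts = (int[]) dfv.dataVectors.get(2);
     *     int[] ends = (int[]) dfv.dataVectors.get(3);
     *     System.out.println(ids[0] + ": " + chrs[0] + ":" + starts[0] + "-" + ends[0]);
     * }
     * }</pre>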
     */
    public static DataFrameVectors referenceRanges(HaplotypeGraph graph) {
        DataFrameVectors dfv = new DataFrameVectors();
        dfv.columnNames = new String[]{"id","chr","start","end"};
        int nranges = graph.numberOfRanges();
        String[] id = new String[nranges];
        String[] chrname = new String[nranges];
        int[] startpos = new int[nranges];
        int[] endpos = new int[nranges];
        int rangeIndex = 0;
        for (ReferenceRange rr : graph.referenceRanges()) {
            id[rangeIndex] = "R" + rr.id();
            chrname[rangeIndex] = rr.chromosome().getName();
            startpos[rangeIndex] = rr.start();
            endpos[rangeIndex] = rr.end();
            rangeIndex++;
        }

        dfv.dataVectors.add(id);
        dfv.dataVectors.add(chrname);
        dfv.dataVectors.add(startpos);
        dfv.dataVectors.add(endpos);
        return dfv;
    }

    /**
     *
     * @param configFile    a config file of PHG database connection parameters
     * @param pathFileNames     a String array of path file names
     * @return  a MatrixWithNames of haplotype ids with taxa names as row names and reference range ids as column names.
     * The matrix contains one row for each path in pathFileNames. The taxa names are parsed from the file names
     * by removing the directory path and everything from the first occurrence of the suffix _path.txt onward.
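     *
     * <p>A usage sketch; the config and path file locations below are hypothetical placeholders:</p>
     * <pre>{@code
     * String[] pathFiles = {"/my/paths/lineA_path.txt", "/my/paths/lineB_path.txt"};
     * RMethods.MatrixWithNames mwn = RMethods.pathHapids("/my/config.txt", pathFiles);
     * // mwn.rowNames would be {"lineA", "lineB"}, parsed from the file names
     * }</pre>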
     */
    public static MatrixWithNames pathHapids(String configFile, String[] pathFileNames) {
        //get the taxon name from the file name
        String extension = "_path.txt";
        Connection conn = DBLoadingUtils.connection(configFile, false);

        //create a hapid -> ref range map
        Map<Integer, Integer> hapidToRefRangeMap = new HashMap<>();
        String sqlstr = "SELECT haplotypes_id, ref_range_id FROM haplotypes";
        try (ResultSet rs = conn.createStatement().executeQuery(sqlstr)) {
            while (rs.next()) {
                hapidToRefRangeMap.put(rs.getInt("haplotypes_id"), rs.getInt("ref_range_id"));
            }
        } catch (SQLException se) {
            throw new IllegalArgumentException("Error executing query: " + sqlstr, se);
        }
        System.out.println("executed hapid query and loaded hapids to map");

        //get a Map of id -> ReferenceRange from the db
        Map<Integer, ReferenceRange> referenceRangeMap = new HashMap<>();

        sqlstr = "SELECT ref_range_id, chrom, range_start, range_end FROM reference_ranges";
        try (ResultSet rs = conn.createStatement().executeQuery(sqlstr)) {
            while (rs.next()) {
                int id = rs.getInt("ref_range_id");
                ReferenceRange rr = new ReferenceRange("NA", Chromosome.instance(rs.getString("chrom")),
                        rs.getInt("range_start"), rs.getInt("range_end"), id);
                referenceRangeMap.put(id, rr);
            }
        } catch (SQLException se) {
            throw new IllegalArgumentException("Error executing query: " + sqlstr, se);
        }

        //no more queries; close the database connection
        try {
            conn.close();
        } catch (SQLException se) {
            myLogger.info("Error closing db connection in pathHapids.");
        }

        //need a list of all reference ranges. Do not assume every file has every reference range.
        //for each file, build a ref range -> hapid map
        List<Map<Integer, Integer>> rangeToHapidMapList = new ArrayList<>();
        List<String> taxaNames = new ArrayList<>();
        TreeSet<Integer> refRangeSet = new TreeSet<>();

        for (String filename : pathFileNames) {
            Map<Integer, Integer> rangeToHapidMap = new HashMap<>();
            try (BufferedReader br = Files.newBufferedReader(Paths.get(filename))) {
                String input = br.readLine();
                while (input != null && input.startsWith("#")) input = br.readLine();
                while (input != null) {
                    try {
                        Integer hapid = Integer.valueOf(input);
                        Integer refRangeId = hapidToRefRangeMap.get(hapid);
                        if (refRangeId != null) {
                            rangeToHapidMap.put(refRangeId, hapid);
                            refRangeSet.add(refRangeId);
                        }
                    } catch (NumberFormatException nfe) {
                        System.err.println("Non-fatal error parsing '" + input + "' from " + filename);
                    }
                    input = br.readLine();
                }
                rangeToHapidMapList.add(rangeToHapidMap);
                int start = filename.lastIndexOf("/") + 1;
                int end = filename.indexOf(extension);
                taxaNames.add(filename.substring(start, end));
            } catch (IOException e) {
                throw new IllegalArgumentException("problem reading " + filename, e);
            }
            System.out.println("Processed " + filename);
        }

        int ntaxa = taxaNames.size();
        int nranges = refRangeSet.size();

        int[][] hapidMatrix = new int[ntaxa][nranges];
        List<ReferenceRange> sortedReferenceRangeList = refRangeSet.stream().map(referenceRangeMap::get).collect(Collectors.toList());
        Collections.sort(sortedReferenceRangeList);

        for (int i = 0; i < ntaxa; i++) {
            Map<Integer, Integer> rangeToHapid = rangeToHapidMapList.get(i);
            int colIndex = 0;
            for (ReferenceRange refRange : sortedReferenceRangeList) {
                Integer hapid = rangeToHapid.get(refRange.id());
                hapidMatrix[i][colIndex] = (hapid != null) ? hapid : -1;
                colIndex++;
            }
        }

        MatrixWithNames mwn = new MatrixWithNames();
        mwn.matrix = hapidMatrix;
        mwn.rowNames = taxaNames.toArray(new String[taxaNames.size()]);
        mwn.columnNames = sortedReferenceRangeList.stream().map(rr -> "R" + rr.id()).toArray(String[]::new);


        return mwn;
    }

    /**
     *
     * @param configFile    the database configuration file
     * @param pathMethod    the name of the path method in the PHG DB
     * @return a MatrixWithNames of haplotype ids with taxa names as row names and reference range ids as column names.
     *    The matrix contains one row per path stored for pathMethod.
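     *
     * <p>Usage sketch (the config path and method name are hypothetical placeholders):</p>
     * <pre>{@code
     * RMethods.MatrixWithNames paths = RMethods.pathsForMethod("/my/config.txt", "PATH_METHOD");
     * int[][] hapids = (int[][]) paths.matrix; // -1 marks ranges not covered by a path
     * }</pre>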
     */
    public static MatrixWithNames pathsForMethod(String configFile, String pathMethod) {
        ParameterCache.load(configFile);
        Connection dbConn = DBLoadingUtils.connection(false);

        //get the paths from the db for the method name (taxa name, list of haplotype ids)
        //get hapid -> refid map for hapids in db
        //get refid set of all reference ranges covered by the paths. This will be the table columns
        //create the data matrix (taxa_name rows, refid columns)

        //all paths for the path method, as tuples of (line_name, list of haplotype ids)
        List<Tuple<String, List<Integer>>> pathList = new ArrayList<>();

        String pathQuery = "SELECT line_name, paths_data FROM paths, genotypes, methods " +
                "WHERE paths.genoid=genotypes.genoid AND methods.method_id=paths.method_id AND methods.name='" + pathMethod + "'";
        try {
            ResultSet pathResult = dbConn.createStatement().executeQuery(pathQuery);
            while (pathResult.next()) {
                String lineName = pathResult.getString(1);
                List<List<Integer>> haplotypeLists = DBLoadingUtils.decodePathsForMultipleLists(pathResult.getBytes(2));
                for (List<Integer> hapList : haplotypeLists) {
                    pathList.add(new Tuple<>(lineName, hapList));
                }
                }
            }
            pathResult.close();
        } catch(SQLException se) {
            throw new IllegalArgumentException("Could not execute query: " + pathQuery, se);
        }

        //a mapping of all hapids -> ref_range_id
        Map<Integer, Integer> hapRefidMap = new HashMap<>();
        String refidQuery = "SELECT haplotypes_id, ref_range_id FROM haplotypes";
        try {
            ResultSet refidResult = dbConn.createStatement().executeQuery(refidQuery);
            while (refidResult.next()) {
                hapRefidMap.put(refidResult.getInt(1), refidResult.getInt(2));
            }
            refidResult.close();

        } catch(SQLException se) {
            throw new IllegalArgumentException("Could not execute query: " + refidQuery, se);
        }

        //no more queries. Close the database connection
        try {
            dbConn.close();
        } catch (SQLException sqlExc) {
            System.out.println("Error closing db connection in pathsForMethod.");
        }

        //the ordered set of all covered refids and a map of refid -> column index
        TreeSet<Integer> refidSet = pathList.stream()
                .flatMap(t -> t.b.stream())
                .distinct()
                .map(hapRefidMap::get)
                .collect(Collectors.toCollection(TreeSet::new));

        Map<Integer, Integer> refidIndex = new HashMap<>();
        int colCount = 0;
        for (Integer refid : refidSet) refidIndex.put(refid, colCount++);

        //create and fill the hapid matrix. Use -1 for no haplotype.
        int[][] hapidMatrix = new int[pathList.size()][refidSet.size()];
        for (int i = 0; i < pathList.size(); i++) Arrays.fill(hapidMatrix[i], -1);

        int rowCount = 0;
        for (Tuple<String, List<Integer>> path : pathList) {
            for (Integer hapid : path.b) {
                hapidMatrix[rowCount][refidIndex.get(hapRefidMap.get(hapid))] = hapid;
            }
            rowCount++;
        }

        MatrixWithNames mwn = new MatrixWithNames();
        mwn.matrix = hapidMatrix;
        mwn.rowNames = pathList.stream().map(t -> t.a).toArray(String[]::new);
        mwn.columnNames = refidSet.stream().map(refid -> refid.toString()).toArray(String[]::new);

        return mwn;

    }

    /**
     *
     * @param configFile    a database configuration file
     * @param lineName  the name of the line (taxon) for which the read mapping information is to be retrieved.
     *                  If there are multiple read mappings with different file_group_names, they will be combined.
     * @param readMappingMethodName     the method name for the read mappings (only takes a single method)
     * @param haplotypeMethodName   optional: one or more colon-delimited haplotype method names
     * @return  A set of vectors and column names that can be used to create an R data.frame.
     *  Column labels are ref_range_id, chrom, start, hapid, taxa, taxa_count, range_count
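     *
     * <p>Usage sketch (all argument values are hypothetical; pass null for haplotypeMethodName
     * to use every haplotype in the haplotypes table):</p>
     * <pre>{@code
     * RMethods.DataFrameVectors dfv = RMethods.readMappingsForLineName(
     *         "/my/config.txt", "lineA", "READ_METHOD", "HAP_METHOD_1:HAP_METHOD_2");
     * int[] taxaCount = (int[]) dfv.dataVectors.get(5); // reads mapped to each haplotype
     * }</pre>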
     */
    public static DataFrameVectors readMappingsForLineName(String configFile, String lineName, String readMappingMethodName, String haplotypeMethodName) {
        ParameterCache.load(configFile);
        LoggingUtils.setupLogging();

        Connection conn = DBLoadingUtils.connection(false);

        //get the method ids for the haplotype method names

        List<String> hapMethodIdArray = new ArrayList<>();
        if (haplotypeMethodName != null) {
            //wrap each method name in single quotes and join with commas for the SQL IN clause
            String hapMethods = Arrays.stream(haplotypeMethodName.split(":"))
                    .map(method -> "'" + method + "'")
                    .collect(Collectors.joining(","));

            String hapMethodQuery = "SELECT method_id FROM methods WHERE name IN (" + hapMethods + ");";

            myLogger.info("readMappingsForLineName: sending hap method query: " + hapMethodQuery);
            try {
                ResultSet hapMethodResult = conn.createStatement().executeQuery(hapMethodQuery);
                while (hapMethodResult.next()) {
                    hapMethodIdArray.add(Integer.toString(hapMethodResult.getInt(1)));
                }
                hapMethodResult.close();
            } catch (SQLException e) {
                throw new IllegalArgumentException("failed to execute sql: " + hapMethodQuery, e);
            }
        }


        Map<List<Integer>, Integer> readMappings = new HashMap<>();
        String sqlstr = "SELECT mapping_data FROM read_mapping, methods, genotypes " +
                "WHERE read_mapping.method_id=methods.method_id AND methods.name='" + readMappingMethodName + "' " +
                "AND read_mapping.genoid=genotypes.genoid AND genotypes.line_name='" + lineName + "'";
        try {
            ResultSet sqlResult = conn.createStatement().executeQuery(sqlstr);
            //this loop collects all the counts for each hapid set, in case there is more than one read mapping for a path
            while (sqlResult.next()) {
                //decodeHapIdMapping returns a Map<List<Integer>, Integer>: a hapid set and its read count
                byte[] bytesFromDB = sqlResult.getBytes(1);
                Map<List<Integer>, Integer> reads = ReadMappingUtils.decodeHapIdMapping(bytesFromDB);

                //sum the counts for each hapid list across read mappings
                for (Map.Entry<List<Integer>, Integer> entry : reads.entrySet()) {
                    readMappings.merge(entry.getKey(), entry.getValue(), Integer::sum);
                }
            }
            sqlResult.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("Failed to execute sql: " + sqlstr, e);
        }

        if (readMappings.isEmpty())
            myLogger.info("No read mappings returned for " + lineName + " with read method = " + readMappingMethodName);
        //to evaluate read mappings we need hapid -> line_name (possibly multiple), hapid -> ref_range (id, chrom, start),
        //hapid -> read count, and ref_range -> read count

        //hapid -> ref range id. If no haplotype methods were specified, this code
        //will pull all haplotype ids from the haplotypes table
        Map<Integer, Integer> hapRangeMap = new HashMap<>();
        StringBuilder sqlHapRangeSB = new StringBuilder();
        sqlHapRangeSB.append("SELECT haplotypes_id, ref_range_id FROM haplotypes");
        if (hapMethodIdArray.size() > 0) {
            sqlHapRangeSB.append(" WHERE haplotypes.method_id IN (");
            sqlHapRangeSB.append(String.join(",", hapMethodIdArray));
            sqlHapRangeSB.append(")");
        }
        sqlHapRangeSB.append(";");

        myLogger.info("readMappingsForLineName: sending haplotypes and reference range query: " + sqlHapRangeSB);
        try {
            ResultSet resultHapRange = conn.createStatement().executeQuery(sqlHapRangeSB.toString());
            while (resultHapRange.next())
                hapRangeMap.put(resultHapRange.getInt("haplotypes_id"), resultHapRange.getInt("ref_range_id"));
            resultHapRange.close();

        } catch (SQLException e) {
            throw new IllegalArgumentException("failed to execute sql: " + sqlHapRangeSB.toString(), e);
        }

        //rangeCountMultiset: total reads per reference range, range -> read count
        //hapidCountMultiset: reads per hapid, hapid -> read count
        HashMultiset<Integer> rangeCountMultiset = HashMultiset.create();
        HashMultiset<Integer> hapidCountMultiset = HashMultiset.create();
        for (Map.Entry<List<Integer>, Integer> entry : readMappings.entrySet()) {
            Integer refRangeId = hapRangeMap.get(entry.getKey().get(0));
            rangeCountMultiset.add(refRangeId, entry.getValue());
            for (Integer hapid : entry.getKey()) hapidCountMultiset.add(hapid, entry.getValue());
        }

        //ref range id -> (chrom, start)
        Map<Integer, Tuple<String, Integer>> rangeMap = new HashMap<>();
        String sqlRange = "SELECT ref_range_id, chrom, range_start FROM reference_ranges";
        ResultSet resultRange = null;
        try {
            resultRange = conn.createStatement().executeQuery(sqlRange);
            while (resultRange.next())
                rangeMap.put(resultRange.getInt("ref_range_id"), new Tuple<>(resultRange.getString("chrom"), resultRange.getInt("range_start")));
            resultRange.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("failed to execute sql: " + sqlRange, e);
        }

        //hapid -> line_name(s)
        HashMultimap<Integer, String> hapidLineNamesMap = HashMultimap.create();
        String sqlHapLine = "SELECT distinct haplotypes_id, line_name FROM haplotypes h, gamete_haplotypes gh, gametes g, genotypes geno " +
                "WHERE h.gamete_grp_id=gh.gamete_grp_id AND gh.gameteid=g.gameteid AND g.genoid=geno.genoid";

        ResultSet resultHapLine = null;
        try {
            resultHapLine = conn.createStatement().executeQuery(sqlHapLine);
            while (resultHapLine.next()) {
                hapidLineNamesMap.put(resultHapLine.getInt(1), resultHapLine.getString(2));
            }
            resultHapLine.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("Failed to execute sql: " + sqlHapLine,e);
        }

        try {
            conn.close();
        } catch (SQLException e) {
            System.out.println("Failed to close db connection.");
        }

        //"ref_range_id\tchrom\tstart\thapid\ttaxa\tcount\ttotal"
        DataFrameVectors myResultVectors = new DataFrameVectors();
        myResultVectors.columnNames = new String[] {"ref_range_id", "chrom", "start", "hapid", "taxa", "taxa_count", "range_count"};
        int numberOfRows = hapRangeMap.size();
        int[] refRanges = new int[numberOfRows];
        String[] chromosomes = new String[numberOfRows];
        int[] startPositions = new int[numberOfRows];
        int[] haplotypeIds = new int[numberOfRows];
        String[] taxa = new String[numberOfRows];
        int[] taxaCount = new int[numberOfRows];
        int[] rangeCount = new int[numberOfRows];
        int rowCount = 0;
        for (Map.Entry<Integer, Integer> entry : hapRangeMap.entrySet()) {
            Integer refid = entry.getValue();
            Integer hapid = entry.getKey();
            refRanges[rowCount] = refid;
            Tuple<String, Integer> rangeInfo = rangeMap.get(refid);
            chromosomes[rowCount] = rangeInfo.a;
            startPositions[rowCount] = rangeInfo.b;
            haplotypeIds[rowCount] = hapid;
            taxa[rowCount] = hapidLineNamesMap.get(hapid).stream().collect(Collectors.joining(","));
            taxaCount[rowCount] = hapidCountMultiset.count(hapid);
            rangeCount[rowCount] = rangeCountMultiset.count(refid);

            rowCount++;
        }

        myResultVectors.dataVectors.add(refRanges);
        myResultVectors.dataVectors.add(chromosomes);
        myResultVectors.dataVectors.add(startPositions);
        myResultVectors.dataVectors.add(haplotypeIds);
        myResultVectors.dataVectors.add(taxa);
        myResultVectors.dataVectors.add(taxaCount);
        myResultVectors.dataVectors.add(rangeCount);

        return myResultVectors;
    }

    /**
     *
     * @param configFile    the database configuration file
     * @param lineName  the name of the line (taxon) for which the read mapping information is to be retrieved.
     * @param readMappingMethodName the name of the read mapping method from the database
     * @param haplotypeMethodName   the name of the haplotype method used to write the pangenome fasta used to map reads
     * @param fileGroup the name of the file group for the line from the database.
     *                  This parameter is only necessary if the line (taxon) has more than one file group and only the reads
     *                  for a specific file group are wanted.
     * @return  A set of vectors and column names that can be used to create an R data.frame. The column names are
     * "ref_range_id", "chrom", "start", "hapid", "taxa", "taxa_count", and "range_count".
     */
    public static DataFrameVectors readMappingsForLineName(String configFile, String lineName, String readMappingMethodName, String haplotypeMethodName, String fileGroup) {
        if (fileGroup == null) {
            return readMappingsForLineName(configFile, lineName, readMappingMethodName, haplotypeMethodName);
        }

        LoggingUtils.setupLogging();

        Connection conn = DBLoadingUtils.connection(configFile, false);

        //get haplotypeMethodId for this method name
        String hapMethodQuery = "SELECT method_id FROM methods WHERE name = '" + haplotypeMethodName + "'";
        int haplotypeMethodId = -1;
        try {
            ResultSet hapMethodResult = conn.createStatement().executeQuery(hapMethodQuery);
            if (!hapMethodResult.next()) throw new IllegalArgumentException("No haplotype method returned by " + hapMethodQuery);
            haplotypeMethodId = hapMethodResult.getInt(1);
            hapMethodResult.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("failed to execute sql: " + hapMethodQuery, e);
        }

        Map<List<Integer>, Integer> readMappings = new HashMap<>();
        String sqlstr = "SELECT mapping_data FROM read_mapping, methods, genotypes " +
                "WHERE read_mapping.method_id=methods.method_id AND methods.name='" + readMappingMethodName + "' " +
                "AND read_mapping.genoid=genotypes.genoid AND genotypes.line_name='" + lineName + "' " +
                "AND file_group_name = '" + fileGroup + "'";
        try {
            ResultSet sqlResult = conn.createStatement().executeQuery(sqlstr);
            //this loop collects all the counts for each hapid set, in case there is more than one read mapping for a path
            while (sqlResult.next()) {
                //decodeHapIdMapping returns a Map<List<Integer>, Integer>: a hapid set and its read count
                byte[] bytesFromDB = sqlResult.getBytes(1);
                Map<List<Integer>, Integer> reads = ReadMappingUtils.decodeHapIdMapping(bytesFromDB);

                //sum the counts for each hapid list across read mappings
                for (Map.Entry<List<Integer>, Integer> entry : reads.entrySet()) {
                    readMappings.merge(entry.getKey(), entry.getValue(), Integer::sum);
                }
            }
            sqlResult.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("Failed to execute sql: " + sqlstr, e);
        }

        if (readMappings.isEmpty())
            myLogger.info("No read mappings returned for " + lineName + " with read method = " + readMappingMethodName);
        //to evaluate read mappings we need hapid -> line_name (possibly multiple), hapid -> ref_range (id, chrom, start),
        //hapid -> read count, and ref_range -> read count

        //hapid -> ref range id
        Map<Integer, Integer> hapRangeMap = new HashMap<>();
        String sqlHapRange = "SELECT haplotypes_id, ref_range_id FROM haplotypes WHERE haplotypes.method_id = " + haplotypeMethodId;
        try {
            ResultSet resultHapRange = conn.createStatement().executeQuery(sqlHapRange);
            while (resultHapRange.next())
                hapRangeMap.put(resultHapRange.getInt("haplotypes_id"), resultHapRange.getInt("ref_range_id"));
            resultHapRange.close();

        } catch (SQLException e) {
            throw new IllegalArgumentException("failed to execute sql: " + sqlHapRange, e);
        }

        //rangeCountMultiset: total reads per reference range, range -> read count
        //hapidCountMultiset: reads per hapid, hapid -> read count
        HashMultiset<Integer> rangeCountMultiset = HashMultiset.create();
        HashMultiset<Integer> hapidCountMultiset = HashMultiset.create();
        for (Map.Entry<List<Integer>, Integer> entry : readMappings.entrySet()) {
            Integer refRangeId = hapRangeMap.get(entry.getKey().get(0));
            rangeCountMultiset.add(refRangeId, entry.getValue());
            for (Integer hapid : entry.getKey()) hapidCountMultiset.add(hapid, entry.getValue());
        }

        //ref range id -> (chrom, start)
        Map<Integer, Tuple<String, Integer>> rangeMap = new HashMap<>();
        String sqlRange = "SELECT ref_range_id, chrom, range_start FROM reference_ranges";
        ResultSet resultRange = null;
        try {
            resultRange = conn.createStatement().executeQuery(sqlRange);
            while (resultRange.next())
                rangeMap.put(resultRange.getInt("ref_range_id"), new Tuple<>(resultRange.getString("chrom"), resultRange.getInt("range_start")));
            resultRange.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("failed to execute sql: " + sqlRange, e);
        }

        //hapid -> line_name(s)
        HashMultimap<Integer, String> hapidLineNamesMap = HashMultimap.create();
        String sqlHapLine = "SELECT distinct haplotypes_id, line_name FROM haplotypes h, gamete_haplotypes gh, gametes g, genotypes geno " +
                "WHERE h.gamete_grp_id=gh.gamete_grp_id AND gh.gameteid=g.gameteid AND g.genoid=geno.genoid";

        ResultSet resultHapLine = null;
        try {
            resultHapLine = conn.createStatement().executeQuery(sqlHapLine);
            while (resultHapLine.next()) {
                hapidLineNamesMap.put(resultHapLine.getInt(1), resultHapLine.getString(2));
            }
            resultHapLine.close();
        } catch (SQLException e) {
            throw new IllegalArgumentException("Failed to execute sql: " + sqlHapLine,e);
        }

        try {
            conn.close();
        } catch (SQLException e) {
            System.out.println("Failed to close db connection.");
        }

        //"ref_range_id\tchrom\tstart\thapid\ttaxa\tcount\ttotal"
        DataFrameVectors myResultVectors = new DataFrameVectors();
        myResultVectors.columnNames = new String[] {"ref_range_id", "chrom", "start", "hapid", "taxa", "taxa_count", "range_count"};
        int numberOfRows = hapRangeMap.size();
        int[] refRanges = new int[numberOfRows];
        String[] chromosomes = new String[numberOfRows];
        int[] startPositions = new int[numberOfRows];
        int[] haplotypeIds = new int[numberOfRows];
        String[] taxa = new String[numberOfRows];
        int[] taxaCount = new int[numberOfRows];
        int[] rangeCount = new int[numberOfRows];
        int rowCount = 0;
        for (Map.Entry<Integer, Integer> entry : hapRangeMap.entrySet()) {
            Integer refid = entry.getValue();
            Integer hapid = entry.getKey();
            refRanges[rowCount] = refid;
            Tuple<String, Integer> rangeInfo = rangeMap.get(refid);
            chromosomes[rowCount] = rangeInfo.a;
            startPositions[rowCount] = rangeInfo.b;
            haplotypeIds[rowCount] = hapid;
            taxa[rowCount] = hapidLineNamesMap.get(hapid).stream().collect(Collectors.joining(","));
            taxaCount[rowCount] = hapidCountMultiset.count(hapid);
            rangeCount[rowCount] = rangeCountMultiset.count(refid);

            rowCount++;
        }

        myResultVectors.dataVectors.add(refRanges);
        myResultVectors.dataVectors.add(chromosomes);
        myResultVectors.dataVectors.add(startPositions);
        myResultVectors.dataVectors.add(haplotypeIds);
        myResultVectors.dataVectors.add(taxa);
        myResultVectors.dataVectors.add(taxaCount);
        myResultVectors.dataVectors.add(rangeCount);

        return myResultVectors;
    }

    /**
     *
     * @param configFile    a config file containing the DB name and connection information
     * @return  information for all the records in the PHG DB read_mapping table as a DataFrameVectors object.
     *   The column names are "readMappingId", "lineName", "method", and "fileGroup".
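     *
     * <p>Usage sketch (the config path is a hypothetical placeholder); all four data vectors
     * are String[] so rJava can build a data.frame directly:</p>
     * <pre>{@code
     * RMethods.DataFrameVectors info = RMethods.readMappingTableInfo("/my/config.txt");
     * String[] lineNames = (String[]) info.dataVectors.get(1);
     * }</pre>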
     */
    public static DataFrameVectors readMappingTableInfo(String configFile) {
        LoggingUtils.setupLogging();
        Connection dbConn = DBLoadingUtils.connection(configFile, false);

        String infoQuery = "SELECT read_mapping_id, line_name, methods.name, file_group_name " +
                "FROM read_mapping, genotypes, methods " +
                "WHERE read_mapping.genoid=genotypes.genoid " +
                "AND read_mapping.method_id=methods.method_id " +
                "ORDER BY line_name, methods.name, file_group_name";
        try {
            List<Integer> readMappingIdList = new ArrayList<>();
            List<String> lineNameList = new ArrayList<>();
            List<String> methodNameList = new ArrayList<>();
            List<String> fileGroupList = new ArrayList<>();

            ResultSet infoResult = dbConn.createStatement().executeQuery(infoQuery);
            while (infoResult.next()) {
                readMappingIdList.add(infoResult.getInt(1));
                lineNameList.add(infoResult.getString(2));
                methodNameList.add(infoResult.getString(3));
                fileGroupList.add(infoResult.getString(4));
            }
            infoResult.close();

            DataFrameVectors dfv = new DataFrameVectors();
            dfv.columnNames = new String[]{"readMappingId","lineName","method","fileGroup"};
            dfv.dataVectors.add(readMappingIdList.stream().map(id -> id.toString()).toArray(String[]::new));
            dfv.dataVectors.add(lineNameList.toArray(new String[0]));
            dfv.dataVectors.add(methodNameList.toArray(new String[0]));
            dfv.dataVectors.add(fileGroupList.toArray(new String[0]));

            return dfv;

        } catch (SQLException e) {
            throw new IllegalArgumentException("Error executing: " + infoQuery);
        } finally {
            try {
                dbConn.close();
            } catch (SQLException e) {
                myLogger.info("Unable to close db connection in readMappingTableInfo");
            }
        }

    }

    /**
     * @param readMappings  a map of hapid lists to read counts
     * @return  the set of all haplotype ids that appear in any read mapping
     */
    public static Set<Integer> hapidSetFromReadMappings(Map<List<Integer>, Integer> readMappings) {
        return readMappings.keySet().stream().flatMap(List::stream).collect(Collectors.toSet());
    }

    /**
     * @param hapids    a set of haplotype ids
     * @param hapidRangeMap     a map of haplotype id -> reference range id
     * @return  the set of reference range ids hit by the haplotype ids; ids with no mapping are skipped
     */
    public static Set<Integer> rangeSetFromHapidSet(Set<Integer> hapids, Map<Integer, Integer> hapidRangeMap) {
        return hapids.stream().map(hapidRangeMap::get).filter(Objects::nonNull).collect(Collectors.toSet());
    }
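
    // A worked example with hypothetical values: for readMappings {[1,2]=5, [2,3]=2}
    // and hapidRangeMap {1=10, 2=10, 3=11}, hapidSetFromReadMappings returns {1, 2, 3}
    // and rangeSetFromHapidSet returns {10, 11}.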

    /**
     * Converts the paths in a HaplotypeGraph to genotypes with PathsToVCFPlugin and writes them
     * in Flapjack format via GenerateRCode.exportToFlapjack.
     * @param graph a HaplotypeGraph built from paths
     * @param filename  the output file name passed to GenerateRCode.exportToFlapjack
     */
    public static void exportPHGToFlapjack(HaplotypeGraph graph, String filename) {

        GenotypeTable myGenotypeTable = (GenotypeTable) new PathsToVCFPlugin(null, false)
                .performFunction(DataSet.getDataSet(graph)).getData(0).getData();
        System.out.println("my genotype table has reference ranges " + graph.referenceRangeStream().map(rr -> Integer.toString(rr.id())).collect(Collectors.joining(",")));
        GenerateRCode.exportToFlapjack(myGenotypeTable, filename);
    }

    /**
     * A test function to verify that MatrixWithNames works correctly for R
     * @return  a test MatrixWithNames
     */
    public static MatrixWithNames testMatrix() {
        MatrixWithNames mwn = new MatrixWithNames();
        mwn.columnNames = new String[]{"col1","col2","col3"};
        mwn.rowNames = new String[] {"row1","row2","row3","row4"};
        mwn.matrix = new int[][] {{11,12,13},{21,22,23},{31,32,33},{41,42,43}};
        return mwn;
    }

    /**
     *  A test function to verify that DataFrameVectors works correctly for R
     * @return  a test DataFrameVectors
     */
    public static DataFrameVectors testDataFrame() {
        DataFrameVectors dfv = new DataFrameVectors();
        dfv.columnNames = new String[]{"col1","col2","col3"};
        dfv.dataVectors.add(new int[]{11,21,31,41});
        dfv.dataVectors.add(new int[]{12,22,32,42});
        dfv.dataVectors.add(new int[]{13,23,33,43});
        return dfv;
    }

    /**
     * A data class holding columnNames, rowNames, and a dataVectors List as public fields in a format
     * that can be easily used by rJava to create R objects
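     *
     * <p>A minimal construction sketch; each data vector must have the same length and the
     * number of vectors must match columnNames:</p>
     * <pre>{@code
     * DataFrameVectors dfv = new DataFrameVectors();
     * dfv.columnNames = new String[]{"name", "count"};
     * dfv.dataVectors.add(new String[]{"a", "b"});
     * dfv.dataVectors.add(new int[]{1, 2});
     * }</pre>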
     */
    public static class DataFrameVectors {
        public String[] columnNames = null;
        public String[] rowNames = null;
        public List<Object> dataVectors = new ArrayList<>();

        public void toTextFile(String filename) {
            if (dataVectors.size() == 0) {
                myLogger.info("No data in dataVectors, not writing to " + filename);
                return;
            }
            try (PrintWriter pw = new PrintWriter(filename)) {
                if (rowNames != null) pw.print("RowName\t");
                pw.println(Arrays.stream(columnNames).collect(Collectors.joining("\t")));

                //write one row per index; the vectors may be primitive arrays, so use reflection
                int nrows = Array.getLength(dataVectors.get(0));
                for (int row = 0; row < nrows; row++) {
                    StringBuilder sb = new StringBuilder();
                    if (rowNames != null) sb.append(rowNames[row]).append("\t");
                    for (int col = 0; col < dataVectors.size(); col++) {
                        if (col > 0) sb.append("\t");
                        sb.append(Array.get(dataVectors.get(col), row));
                    }
                    pw.println(sb);
                }
            } catch (FileNotFoundException e) {
                throw new IllegalArgumentException("Unable to write to " + filename, e);
            }
        }
    }

    /**
     * A data class holding columnNames, rowNames, and a matrix as public fields in a format
     * that can be easily used by rJava to create R objects. The matrix is expected to be a 2-D
     * primitive Java array of some type that rJava converts correctly.
     */
    public static class MatrixWithNames {
        //the first dimension of the matrix is rows, the second is columns
        public String[] columnNames = null;
        public String[] rowNames = null;
        public Object matrix;
    }
}



