package net.maizegenetics.pangenome.api;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import htsjdk.samtools.util.Tuple;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.hapCalling.PathsToVCFPlugin;
import net.maizegenetics.pangenome.hapCalling.ReadMappingUtils;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GenerateRCode;
import net.maizegenetics.plugindef.ParameterCache;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.LoggingUtils;
import org.apache.log4j.Logger;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Collectors;
/**
* The purpose of this class is to hold static methods that generate results that can be used by R
*/
public class RMethods {
private static Logger myLogger = Logger.getLogger(RMethods.class);
/**
*
* @param graph a HaplotypeGraph
* @return a MatrixWithNames containing the haplotype ids from the graph with taxa as rows and reference ranges as columns
*/
public static MatrixWithNames hapidTableAsMatrix(HaplotypeGraph graph) {
List<Taxon> taxa = graph.taxaInGraph().stream().collect(Collectors.toList());
Collections.sort(taxa);
String[] taxaNames = taxa.stream().map(Taxon::getName).toArray(String[]::new);
Map<Taxon, Integer> taxaIndex = new HashMap<>();
for (int i = 0; i < taxa.size(); i++) {
taxaIndex.put(taxa.get(i), i);
}
int nranges = graph.numberOfRanges();
int[][] hapidMatrix = new int[taxa.size()][nranges];
//initialize to -1, which indicates no haplotype for a taxon
for (int[] row : hapidMatrix) {
Arrays.fill(row, -1);
}
int columnIndex = 0;
for (ReferenceRange refrange : graph.referenceRanges()) {
System.out.println("columnIndex = " + columnIndex);
for (HaplotypeNode hn : graph.nodes(refrange)) {
int hapid = hn.id();
for (Taxon taxon : hn.taxaList()) {
hapidMatrix[taxaIndex.get(taxon)][columnIndex] = hapid;
}
}
columnIndex++;
}
MatrixWithNames mwn = new MatrixWithNames();
mwn.columnNames = graph.referenceRanges().stream().map(rr -> "R" + rr.id()).toArray(String[]::new);
mwn.rowNames = taxaNames;
mwn.matrix = hapidMatrix;
return mwn;
}
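/*
A minimal usage sketch (illustrative, not part of the original source). It assumes a
HaplotypeGraph named graph has already been built elsewhere in a TASSEL/PHG pipeline.
The matrix field holds an int[][] here, so a cast recovers the typed array:

MatrixWithNames mwn = RMethods.hapidTableAsMatrix(graph);
int[][] hapids = (int[][]) mwn.matrix;
//hapids[row][col] is the haplotype id, or -1, for taxon mwn.rowNames[row]
//in reference range mwn.columnNames[col]
System.out.println(mwn.rowNames[0] + " " + mwn.columnNames[0] + " " + hapids[0][0]);
*/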
/**
*
* @param graph a HaplotypeGraph
* @return a DataFrameVectors containing the haplotype ids from the graph with taxa as the first column
* and reference ranges as the remaining columns
*/
public static DataFrameVectors hapidTableAsDF(HaplotypeGraph graph) {
List<Taxon> taxa = graph.taxaInGraph().stream().collect(Collectors.toList());
Collections.sort(taxa);
String[] taxaNames = taxa.stream().map(Taxon::getName).toArray(String[]::new);
Map<Taxon, Integer> taxaIndex = new HashMap<>();
for (int i = 0; i < taxa.size(); i++) {
taxaIndex.put(taxa.get(i), i);
}
int ntaxa = taxaNames.length;
int nranges = graph.numberOfRanges();
DataFrameVectors dfv = new DataFrameVectors();
dfv.columnNames = new String[nranges + 1];
dfv.columnNames[0] = "taxon";
int colIndex = 1;
for (ReferenceRange rr : graph.referenceRanges()) {
dfv.columnNames[colIndex++] = "R" + rr.id();
}
dfv.dataVectors.add(taxaNames);
for (ReferenceRange refrange : graph.referenceRanges()) {
int[] hapids = new int[ntaxa];
Arrays.fill(hapids, -1);
for (HaplotypeNode hn : graph.nodes(refrange)) {
int nodeHapid = hn.id();
for (Taxon taxon : hn.taxaList()) {
hapids[taxaIndex.get(taxon)] = nodeHapid;
}
}
dfv.dataVectors.add(hapids);
}
return dfv;
}
/**
*
* @param graph a HaplotypeGraph
* @return a DataFrameVectors containing the reference range id, chromosome, start, and end positions
* for all of the reference ranges in the graph
*/
public static DataFrameVectors referenceRanges(HaplotypeGraph graph) {
DataFrameVectors dfv = new DataFrameVectors();
dfv.columnNames = new String[]{"id","chr","start","end"};
int nranges = graph.numberOfRanges();
String[] id = new String[nranges];
String[] chrname = new String[nranges];
int[] startpos = new int[nranges];
int[] endpos = new int[nranges];
int rangeIndex = 0;
for (ReferenceRange rr : graph.referenceRanges()) {
id[rangeIndex] = "R" + rr.id();
chrname[rangeIndex] = rr.chromosome().getName();
startpos[rangeIndex] = rr.start();
endpos[rangeIndex] = rr.end();
rangeIndex++;
}
dfv.dataVectors.add(id);
dfv.dataVectors.add(chrname);
dfv.dataVectors.add(startpos);
dfv.dataVectors.add(endpos);
return dfv;
}
/**
*
* @param configFile a config file of PHG database connection parameters
* @param pathFileNames a String array of path file names
* @return a MatrixWithNames of haplotype ids with taxon names as row names and reference range ids as column names.
* The method returns a matrix covering all the paths in pathFileNames. Each taxon name is parsed from its
* file name by removing the directory path and everything from the suffix _path.txt onward.
*/
public static MatrixWithNames pathHapids(String configFile, String[] pathFileNames) {
//get the taxon name from the file name
String extension = "_path.txt";
Connection conn = DBLoadingUtils.connection(configFile, false);
//create a hapid -> ref range map
Map<Integer, Integer> hapidToRefRangeMap = new HashMap<>();
String sqlstr = "SELECT haplotypes_id, ref_range_id FROM haplotypes";
try (ResultSet rs = conn.createStatement().executeQuery(sqlstr)) {
while (rs.next()) {
hapidToRefRangeMap.put(rs.getInt("haplotypes_id"), rs.getInt("ref_range_id"));
}
} catch (SQLException se) {
throw new IllegalArgumentException("Error executing query: " + sqlstr, se);
}
System.out.println("executed hapid query and loaded hapids to map");
//get a Map of id -> ReferenceRange from the db
Map<Integer, ReferenceRange> referenceRangeMap = new HashMap<>();
sqlstr = "SELECT ref_range_id, chrom, range_start, range_end FROM reference_ranges";
try (ResultSet rs = conn.createStatement().executeQuery(sqlstr)) {
while (rs.next()) {
int id = rs.getInt("ref_range_id");
ReferenceRange rr = new ReferenceRange("NA", Chromosome.instance(rs.getString("chrom")),
rs.getInt("range_start"), rs.getInt("range_end"), id);
referenceRangeMap.put(id, rr);
}
} catch (SQLException se) {
throw new IllegalArgumentException("Error executing query: " + sqlstr, se);
}
//need list of all reference ranges. Do not assume every file has every reference range
//for each file need a ref range -> hapid map
List<Map<Integer, Integer>> rangeToHapidMapList = new ArrayList<>();
List<String> taxaNames = new ArrayList<>();
TreeSet<Integer> refRangeSet = new TreeSet<>();
for (String filename : pathFileNames) {
Map<Integer, Integer> rangeToHapidMap = new HashMap<>();
try (BufferedReader br = Files.newBufferedReader(Paths.get(filename))) {
String input = br.readLine();
while (input != null && input.startsWith("#")) input = br.readLine();
while (input != null) {
try {
Integer hapid = Integer.valueOf(input);
Integer refRangeId = hapidToRefRangeMap.get(hapid);
if (refRangeId != null) {
rangeToHapidMap.put(refRangeId, hapid);
refRangeSet.add(refRangeId);
}
} catch(NumberFormatException nfe) {
System.err.println("Non-fatal error parsing '" + input + "' from " + filename);
}
input = br.readLine();
}
rangeToHapidMapList.add(rangeToHapidMap);
int start = filename.lastIndexOf("/") + 1;
int end = filename.indexOf(extension);
taxaNames.add(filename.substring(start, end));
} catch(IOException e) {
e.printStackTrace();
throw new IllegalArgumentException("problem reading " + filename, e);
}
System.out.println("Processed " + filename);
}
int ntaxa = taxaNames.size();
int nranges = refRangeSet.size();
int[][] hapidMatrix = new int[ntaxa][nranges];
List<ReferenceRange> sortedReferenceRangeList = refRangeSet.stream().map(referenceRangeMap::get).collect(Collectors.toList());
Collections.sort(sortedReferenceRangeList);
for (int i = 0; i < ntaxa; i++) {
Map<Integer, Integer> rangeToHapid = rangeToHapidMapList.get(i);
int colIndex = 0;
for (ReferenceRange refRange : sortedReferenceRangeList) {
Integer hapid = rangeToHapid.get(refRange.id());
if (hapid != null) hapidMatrix[i][colIndex] = hapid;
else hapidMatrix[i][colIndex] = -1;
colIndex++;
}
}
MatrixWithNames mwn = new MatrixWithNames();
mwn.matrix = hapidMatrix;
mwn.rowNames = taxaNames.toArray(new String[taxaNames.size()]);
mwn.columnNames = sortedReferenceRangeList.stream().map(rr -> "R" + rr.id()).toArray(String[]::new);
return mwn;
}
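/*
Illustrative example of the file-name convention assumed above (hypothetical path):
for a path file "/some/dir/lineA_path.txt", the characters between the final '/' and the
first occurrence of "_path.txt" become the taxon name, here "lineA".
*/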
/**
*
* @param configFile the database configuration file
* @param pathMethod the name of the path method in the PHG DB
* @return a MatrixWithNames of haplotype ids with taxon names as row names and reference range ids as column names.
* The method returns a matrix covering all the paths stored for pathMethod.
*/
public static MatrixWithNames pathsForMethod(String configFile, String pathMethod) {
ParameterCache.load(configFile);
Connection dbConn = DBLoadingUtils.connection(false);
//get the paths from the db for the method name (taxa name, list of haplotype ids)
//get hapid -> refid map for hapids in db
//get refid set of all reference ranges covered by the paths. This will be the table columns
//create the data matrix (taxa_name rows, refid columns)
//all paths for path method as tuples of line_name, list of haplotypes
List<Tuple<String, List<Integer>>> pathList = new ArrayList<>();
String pathQuery = "SELECT line_name, paths_data FROM paths, genotypes, methods " +
"WHERE paths.genoid=genotypes.genoid AND methods.method_id=paths.method_id AND methods.name='" + pathMethod + "'";
try {
ResultSet pathResult = dbConn.createStatement().executeQuery(pathQuery);
while (pathResult.next()) {
String lineName = pathResult.getString(1);
List<List<Integer>> haplotypeLists = DBLoadingUtils.decodePathsForMultipleLists(pathResult.getBytes(2));
for (List<Integer> hapList : haplotypeLists) {
pathList.add(new Tuple<>(lineName, hapList));
}
}
pathResult.close();
} catch(SQLException se) {
throw new IllegalArgumentException("Could not execute query: " + pathQuery, se);
}
//a mapping of all hapids -> ref_range_id
Map<Integer, Integer> hapRefidMap = new HashMap<>();
String refidQuery = "SELECT haplotypes_id, ref_range_id FROM haplotypes";
try {
ResultSet refidResult = dbConn.createStatement().executeQuery(refidQuery);
while (refidResult.next()) {
hapRefidMap.put(refidResult.getInt(1), refidResult.getInt(2));
}
refidResult.close();
} catch(SQLException se) {
throw new IllegalArgumentException("Could not execute query: " + refidQuery, se);
}
//no more queries. Close the database connection
try {
dbConn.close();
} catch (SQLException sqlExc) {
System.out.println("Error closing db connection in pathsForMethod.");
}
//the ordered set of all covered refids and a map of refid -> column index
TreeSet<Integer> refidSet = pathList.stream()
.flatMap(t -> t.b.stream())
.map(hapRefidMap::get)
.collect(Collectors.toCollection(TreeSet::new));
Map<Integer, Integer> refidIndex = new HashMap<>();
int colCount = 0;
for (Integer refid : refidSet) refidIndex.put(refid, colCount++);
//create and fill the hapid matrix. Use -1 for no haplotype.
int[][] hapidMatrix = new int[pathList.size()][refidSet.size()];
for (int i = 0; i < pathList.size(); i++) Arrays.fill(hapidMatrix[i], -1);
int rowCount = 0;
for (Tuple<String, List<Integer>> path : pathList) {
for (Integer hapid : path.b) {
hapidMatrix[rowCount][refidIndex.get(hapRefidMap.get(hapid))] = hapid;
}
rowCount++;
}
MatrixWithNames mwn = new MatrixWithNames();
mwn.matrix = hapidMatrix;
mwn.rowNames = pathList.stream().map(t -> t.a).toArray(String[]::new);
mwn.columnNames = refidSet.stream().map(refid -> refid.toString()).toArray(String[]::new);
return mwn;
}
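/*
A minimal consumption sketch (illustrative config and method names, not part of the
original source), assuming pathsForMethod returned at least two paths. Rows are paths,
columns are the reference range ids covered by those paths:

MatrixWithNames paths = RMethods.pathsForMethod("config.txt", "myPathMethod");
int[][] hapids = (int[][]) paths.matrix;
//count the ranges where the first two paths chose the same haplotype
int shared = 0;
for (int col = 0; col < hapids[0].length; col++) {
if (hapids[0][col] != -1 && hapids[0][col] == hapids[1][col]) shared++;
}
*/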
/**
*
* @param configFile a database configuration file
* @param lineName the name of the line (taxon) for which the read mapping information is to be retrieved.
* If there are multiple read mappings with different file_group_names, they will be combined.
* @param readMappingMethodName the method name for the read mappings (only takes a single method)
* @param haplotypeMethodName optional: one or more colon-delimited haplotype method names
* @return A set of vectors and column names that can be used to create an R data.frame.
* Column labels are ref_range_id, chrom, start, hapid, taxa, taxa_count, range_count
*/
public static DataFrameVectors readMappingsForLineName(String configFile, String lineName, String readMappingMethodName, String haplotypeMethodName) {
ParameterCache.load(configFile);
LoggingUtils.setupLogging();
Connection conn = DBLoadingUtils.connection(false);
//get haplotypeMethodId for this method name
List<String> hapMethodIdArray = new ArrayList<>();
if (haplotypeMethodName != null) {
String[] hapMethodNames = haplotypeMethodName.split(":");
//wrap each name in single quotes and join with commas for the SQL IN clause
String hapMethods = "'" + String.join("','", hapMethodNames) + "'";
String hapMethodQuery = "SELECT method_id FROM methods WHERE name IN (" + hapMethods + ");";
myLogger.info("readMappingsForLineName: sending hap method query: " + hapMethodQuery);
try {
ResultSet hapMethodResult = conn.createStatement().executeQuery(hapMethodQuery);
while (hapMethodResult.next()) {
int hapId = hapMethodResult.getInt(1);
hapMethodIdArray.add(Integer.toString(hapId));
}
hapMethodResult.close();
} catch (SQLException e) {
throw new IllegalArgumentException("failed to execute sql: " + hapMethodQuery, e);
}
}
Map<List<Integer>, Integer> readMappings = new HashMap<>();
String sqlstr = "SELECT mapping_data FROM read_mapping, methods, genotypes " +
"WHERE read_mapping.method_id=methods.method_id AND methods.name='" + readMappingMethodName + "' " +
"AND read_mapping.genoid=genotypes.genoid AND genotypes.line_name='" + lineName + "'";
try {
ResultSet sqlResult = conn.createStatement().executeQuery(sqlstr);
//this loop collects all the counts for each hapid set, in case there is more than one read mapping for a path
while (sqlResult.next()) {
//the following returns a Map<List<Integer>, Integer>: a hapid list and its read count
byte[] bytesFromDB = sqlResult.getBytes(1);
Map<List<Integer>, Integer> reads = ReadMappingUtils.decodeHapIdMapping(bytesFromDB);
//reads maps a list of hapids (key) to the number of reads with that mapping (value)
for (Map.Entry<List<Integer>, Integer> entry : reads.entrySet()) {
Integer existingCount = readMappings.get(entry.getKey());
if (existingCount == null) readMappings.put(entry.getKey(), entry.getValue());
else readMappings.put(entry.getKey(), entry.getValue() + existingCount);
}
}
sqlResult.close();
} catch (SQLException e) {
throw new IllegalArgumentException("Failed to execute sql: " + sqlstr, e);
}
if (readMappings.size() == 0) myLogger.info("No read mappings returned for " + lineName + " with read method = " + readMappingMethodName);
//to evaluate read mappings need hapid -> line_name (multiple?), hapid -> ref_range(id, chrom, start), hapid -> read count
//ref_range -> read count
//hap -> ref range id. If no haplotype methods were specified this code
// will pull all haplotypeIds from the haplotypes table
Map<Integer, Integer> hapRangeMap = new HashMap<>();
StringBuilder sqlHapRangeSB = new StringBuilder();
sqlHapRangeSB.append("SELECT haplotypes_id, ref_range_id FROM haplotypes");
if (hapMethodIdArray.size() > 0) {
String hapMethodsIds = String.join(",", hapMethodIdArray);
sqlHapRangeSB.append(" WHERE haplotypes.method_id IN (");
sqlHapRangeSB.append(hapMethodsIds);
sqlHapRangeSB.append(")");
}
sqlHapRangeSB.append(";");
myLogger.info("readMappingsForLineName: sending haplotypes and referece range query: " + sqlHapRangeSB.toString());
try {
ResultSet resultHapRange = conn.createStatement().executeQuery(sqlHapRangeSB.toString());
while (resultHapRange.next())
hapRangeMap.put(resultHapRange.getInt("haplotypes_id"), resultHapRange.getInt("ref_range_id"));
resultHapRange.close();
} catch (SQLException e) {
throw new IllegalArgumentException("failed to execute sql: " + sqlHapRangeSB.toString(), e);
}
//rangeCountMultiset: total reads per reference range, range -> read count
//hapidCountMultiset: reads per hapid, hapid -> read count
HashMultiset<Integer> rangeCountMultiset = HashMultiset.create();
HashMultiset<Integer> hapidCountMultiset = HashMultiset.create();
for (Map.Entry<List<Integer>, Integer> entry : readMappings.entrySet()) {
Integer refRangeId = hapRangeMap.get(entry.getKey().get(0));
rangeCountMultiset.add(refRangeId, entry.getValue());
for (Integer hapid : entry.getKey()) hapidCountMultiset.add(hapid, entry.getValue());
}
//ref range id -> chrom,start
Map<Integer, Tuple<String, Integer>> rangeMap = new HashMap<>();
String sqlRange = "SELECT ref_range_id, chrom, range_start FROM reference_ranges";
ResultSet resultRange = null;
try {
resultRange = conn.createStatement().executeQuery(sqlRange);
while (resultRange.next())
rangeMap.put(resultRange.getInt("ref_range_id"), new Tuple<>(resultRange.getString("chrom"), resultRange.getInt("range_start")));
resultRange.close();
} catch (SQLException e) {
throw new IllegalArgumentException("failed to execute sql: " + sqlRange, e);
}
//hapid -> line_name(s)
HashMultimap<Integer, String> hapidLineNamesMap = HashMultimap.create();
String sqlHapLine = "SELECT distinct haplotypes_id, line_name FROM haplotypes h, gamete_haplotypes gh, gametes g, genotypes geno " +
"WHERE h.gamete_grp_id=gh.gamete_grp_id AND gh.gameteid=g.gameteid AND g.genoid=geno.genoid";
ResultSet resultHapLine = null;
try {
resultHapLine = conn.createStatement().executeQuery(sqlHapLine);
while (resultHapLine.next()) {
hapidLineNamesMap.put(resultHapLine.getInt(1), resultHapLine.getString(2));
}
resultHapLine.close();
} catch (SQLException e) {
throw new IllegalArgumentException("Failed to execute sql: " + sqlHapLine,e);
}
try {
conn.close();
} catch (SQLException e) {
System.out.println("Failed to close db connection.");
}
//"ref_range_id\tchrom\tstart\thapid\ttaxa\tcount\ttotal"
DataFrameVectors myResultVectors = new DataFrameVectors();
myResultVectors.columnNames = new String[] {"ref_range_id", "chrom", "start", "hapid", "taxa", "taxa_count", "range_count"};
int numberOfRows = hapRangeMap.size();
int[] refRanges = new int[numberOfRows];
String[] chromosomes = new String[numberOfRows];
int[] startPositions = new int[numberOfRows];
int[] haplotypeIds = new int[numberOfRows];
String[] taxa = new String[numberOfRows];
int[] taxaCount = new int[numberOfRows];
int[] rangeCount = new int[numberOfRows];
int rowCount = 0;
for (Map.Entry<Integer, Integer> entry : hapRangeMap.entrySet()) {
Integer refid = entry.getValue();
Integer hapid = entry.getKey();
refRanges[rowCount] = refid;
Tuple<String, Integer> rangeInfo = rangeMap.get(refid);
chromosomes[rowCount] = rangeInfo.a;
startPositions[rowCount] = rangeInfo.b;
haplotypeIds[rowCount] = hapid;
taxa[rowCount] = hapidLineNamesMap.get(hapid).stream().collect(Collectors.joining(","));
taxaCount[rowCount] = hapidCountMultiset.count(hapid);
rangeCount[rowCount] = rangeCountMultiset.count(refid);
rowCount++;
}
myResultVectors.dataVectors.add(refRanges);
myResultVectors.dataVectors.add(chromosomes);
myResultVectors.dataVectors.add(startPositions);
myResultVectors.dataVectors.add(haplotypeIds);
myResultVectors.dataVectors.add(taxa);
myResultVectors.dataVectors.add(taxaCount);
myResultVectors.dataVectors.add(rangeCount);
return myResultVectors;
}
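/*
A minimal sketch of reading the result columns (illustrative argument values). Each entry
of dataVectors is a primitive int[] or a String[], in the same order as columnNames:

DataFrameVectors dfv = RMethods.readMappingsForLineName("config.txt", "lineA",
"myReadMethod", "myHapMethod");
int[] refRangeIds = (int[]) dfv.dataVectors.get(0); //"ref_range_id" column
String[] chroms = (String[]) dfv.dataVectors.get(1); //"chrom" column
int[] taxaCounts = (int[]) dfv.dataVectors.get(5); //"taxa_count" column
*/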
/**
*
* @param configFile the database configuration file
* @param lineName the name of the line (taxon) for which the read mapping information is to be retrieved.
* @param readMappingMethodName the name of the read mapping method from the database
* @param haplotypeMethodName the name of the haplotype method used to write the pangenome fasta used to map reads
* @param fileGroup the name of the file group for the line from the database.
* This parameter is only necessary if the line (taxon) has more than one file group and only the reads
* for a specific file group are wanted.
* @return A set of vectors and column names that can be used to create an R data.frame. The column names are
* "ref_range_id", "chrom", "start", "hapid", "taxa", "taxa_count", and "range_count".
*/
public static DataFrameVectors readMappingsForLineName(String configFile, String lineName, String readMappingMethodName, String haplotypeMethodName, String fileGroup) {
if (fileGroup == null) {
return readMappingsForLineName(configFile, lineName, readMappingMethodName, haplotypeMethodName);
}
LoggingUtils.setupLogging();
Connection conn = DBLoadingUtils.connection(configFile, false);
//get haplotypeMethodId for this method name
String hapMethodQuery = "SELECT method_id FROM methods WHERE name = '" + haplotypeMethodName + "'";
int haplotypeMethodId = -1;
try {
ResultSet hapMethodResult = conn.createStatement().executeQuery(hapMethodQuery);
if (!hapMethodResult.next()) throw new IllegalArgumentException("No haplotype method returned by " + hapMethodQuery);
haplotypeMethodId = hapMethodResult.getInt(1);
hapMethodResult.close();
} catch (SQLException e) {
throw new IllegalArgumentException("failed to execute sql: " + hapMethodQuery, e);
}
Map<List<Integer>, Integer> readMappings = new HashMap<>();
String sqlstr = "SELECT mapping_data FROM read_mapping, methods, genotypes " +
"WHERE read_mapping.method_id=methods.method_id AND methods.name='" + readMappingMethodName + "' " +
"AND read_mapping.genoid=genotypes.genoid AND genotypes.line_name='" + lineName + "' " +
"AND file_group_name = '" + fileGroup + "'";
try {
ResultSet sqlResult = conn.createStatement().executeQuery(sqlstr);
//this loop collects all the counts for each hapid set, in case there is more than one read mapping for a path
while (sqlResult.next()) {
//the following returns a Map<List<Integer>, Integer>: a hapid list and its read count
byte[] bytesFromDB = sqlResult.getBytes(1);
Map<List<Integer>, Integer> reads = ReadMappingUtils.decodeHapIdMapping(bytesFromDB);
//reads maps a list of hapids (key) to the number of reads with that mapping (value)
for (Map.Entry<List<Integer>, Integer> entry : reads.entrySet()) {
Integer existingCount = readMappings.get(entry.getKey());
if (existingCount == null) readMappings.put(entry.getKey(), entry.getValue());
else readMappings.put(entry.getKey(), entry.getValue() + existingCount);
}
}
sqlResult.close();
} catch (SQLException e) {
throw new IllegalArgumentException("Failed to execute sql: " + sqlstr, e);
}
if (readMappings.size() == 0) myLogger.info("No read mappings returned for " + lineName + " with read method = " + readMappingMethodName);
//to evaluate read mappings need hapid -> line_name (multiple?), hapid -> ref_range(id, chrom, start), hapid -> read count
//ref_range -> read count
//hap -> ref range id
Map<Integer, Integer> hapRangeMap = new HashMap<>();
String sqlHapRange = "SELECT haplotypes_id, ref_range_id FROM haplotypes WHERE haplotypes.method_id = " + haplotypeMethodId;
try {
ResultSet resultHapRange = conn.createStatement().executeQuery(sqlHapRange);
while (resultHapRange.next())
hapRangeMap.put(resultHapRange.getInt("haplotypes_id"), resultHapRange.getInt("ref_range_id"));
resultHapRange.close();
} catch (SQLException e) {
throw new IllegalArgumentException("failed to execute sql: " + sqlHapRange, e);
}
//rangeCountMultiset: total reads per reference range, range -> read count
//hapidCountMultiset: reads per hapid, hapid -> read count
HashMultiset<Integer> rangeCountMultiset = HashMultiset.create();
HashMultiset<Integer> hapidCountMultiset = HashMultiset.create();
for (Map.Entry<List<Integer>, Integer> entry : readMappings.entrySet()) {
Integer refRangeId = hapRangeMap.get(entry.getKey().get(0));
rangeCountMultiset.add(refRangeId, entry.getValue());
for (Integer hapid : entry.getKey()) hapidCountMultiset.add(hapid, entry.getValue());
}
//ref range id -> chrom,start
Map<Integer, Tuple<String, Integer>> rangeMap = new HashMap<>();
String sqlRange = "SELECT ref_range_id, chrom, range_start FROM reference_ranges";
ResultSet resultRange = null;
try {
resultRange = conn.createStatement().executeQuery(sqlRange);
while (resultRange.next())
rangeMap.put(resultRange.getInt("ref_range_id"), new Tuple<>(resultRange.getString("chrom"), resultRange.getInt("range_start")));
resultRange.close();
} catch (SQLException e) {
throw new IllegalArgumentException("failed to execute sql: " + sqlRange, e);
}
//hapid -> line_name(s)
HashMultimap<Integer, String> hapidLineNamesMap = HashMultimap.create();
String sqlHapLine = "SELECT distinct haplotypes_id, line_name FROM haplotypes h, gamete_haplotypes gh, gametes g, genotypes geno " +
"WHERE h.gamete_grp_id=gh.gamete_grp_id AND gh.gameteid=g.gameteid AND g.genoid=geno.genoid";
ResultSet resultHapLine = null;
try {
resultHapLine = conn.createStatement().executeQuery(sqlHapLine);
while (resultHapLine.next()) {
hapidLineNamesMap.put(resultHapLine.getInt(1), resultHapLine.getString(2));
}
resultHapLine.close();
} catch (SQLException e) {
throw new IllegalArgumentException("Failed to execute sql: " + sqlHapLine,e);
}
try {
conn.close();
} catch (SQLException e) {
System.out.println("Failed to close db connection.");
}
//"ref_range_id\tchrom\tstart\thapid\ttaxa\tcount\ttotal"
DataFrameVectors myResultVectors = new DataFrameVectors();
myResultVectors.columnNames = new String[] {"ref_range_id", "chrom", "start", "hapid", "taxa", "taxa_count", "range_count"};
int numberOfRows = hapRangeMap.size();
int[] refRanges = new int[numberOfRows];
String[] chromosomes = new String[numberOfRows];
int[] startPositions = new int[numberOfRows];
int[] haplotypeIds = new int[numberOfRows];
String[] taxa = new String[numberOfRows];
int[] taxaCount = new int[numberOfRows];
int[] rangeCount = new int[numberOfRows];
int rowCount = 0;
for (Map.Entry<Integer, Integer> entry : hapRangeMap.entrySet()) {
Integer refid = entry.getValue();
Integer hapid = entry.getKey();
refRanges[rowCount] = refid;
Tuple<String, Integer> rangeInfo = rangeMap.get(refid);
chromosomes[rowCount] = rangeInfo.a;
startPositions[rowCount] = rangeInfo.b;
haplotypeIds[rowCount] = hapid;
taxa[rowCount] = hapidLineNamesMap.get(hapid).stream().collect(Collectors.joining(","));
taxaCount[rowCount] = hapidCountMultiset.count(hapid);
rangeCount[rowCount] = rangeCountMultiset.count(refid);
rowCount++;
}
myResultVectors.dataVectors.add(refRanges);
myResultVectors.dataVectors.add(chromosomes);
myResultVectors.dataVectors.add(startPositions);
myResultVectors.dataVectors.add(haplotypeIds);
myResultVectors.dataVectors.add(taxa);
myResultVectors.dataVectors.add(taxaCount);
myResultVectors.dataVectors.add(rangeCount);
return myResultVectors;
}
/**
*
* @param configFile a config file containing the DB name and connection information
* @return information for all the records in the PHG DB read_mapping table as a DataFrameVectors object.
* The column names are "readMappingId", "lineName", "method", and "fileGroup".
*/
public static DataFrameVectors readMappingTableInfo(String configFile) {
LoggingUtils.setupLogging();
Connection dbConn = DBLoadingUtils.connection(configFile, false);
String infoQuery = "SELECT read_mapping_id, line_name, methods.name, file_group_name " +
"FROM read_mapping, genotypes, methods " +
"WHERE read_mapping.genoid=genotypes.genoid " +
"AND read_mapping.method_id=methods.method_id " +
"ORDER BY line_name, methods.name, file_group_name";
try {
List<Integer> readMappingIdList = new ArrayList<>();
List<String> lineNameList = new ArrayList<>();
List<String> methodNameList = new ArrayList<>();
List<String> fileGroupList = new ArrayList<>();
ResultSet infoResult = dbConn.createStatement().executeQuery(infoQuery);
while (infoResult.next()) {
readMappingIdList.add(infoResult.getInt(1));
lineNameList.add(infoResult.getString(2));
methodNameList.add(infoResult.getString(3));
fileGroupList.add(infoResult.getString(4));
}
infoResult.close();
DataFrameVectors dfv = new DataFrameVectors();
dfv.columnNames = new String[]{"readMappingId","lineName","method","fileGroup"};
dfv.dataVectors.add(readMappingIdList.stream().map(String::valueOf).toArray(String[]::new));
dfv.dataVectors.add(lineNameList.toArray(new String[0]));
dfv.dataVectors.add(methodNameList.toArray(new String[0]));
dfv.dataVectors.add(fileGroupList.toArray(new String[0]));
return dfv;
} catch (SQLException e) {
throw new IllegalArgumentException("Error executing: " + infoQuery);
} finally {
try {
dbConn.close();
} catch (SQLException e) {
myLogger.info("Unable to close db connection in readMappingTableInfo");
}
}
}
public static Set<Integer> hapidSetFromReadMappings(Map<List<Integer>, Integer> readMappings) {
return readMappings.keySet().stream().flatMap(List::stream).collect(Collectors.toSet());
}
public static Set<Integer> rangeSetFromHapidSet(Set<Integer> hapids, Map<Integer, Integer> hapidRangeMap) {
return hapids.stream().map(hapidRangeMap::get).filter(Objects::nonNull).collect(Collectors.toSet());
}
public static void exportPHGToFlapjack(HaplotypeGraph graph, String filename) {
GenotypeTable myGenotypeTable = (GenotypeTable) new PathsToVCFPlugin(null, false)
.performFunction(DataSet.getDataSet(graph)).getData(0).getData();
System.out.println("my genotype table has reference ranges " + graph.referenceRangeStream().map(rr -> Integer.toString(rr.id())).collect(Collectors.joining(",")));
GenerateRCode.exportToFlapjack(myGenotypeTable, filename);
}
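/*
Usage sketch (illustrative output name): converts the paths in the graph to a GenotypeTable
via PathsToVCFPlugin, then writes Flapjack-format output with GenerateRCode.exportToFlapjack:

RMethods.exportPHGToFlapjack(graph, "phgPaths");
*/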
/**
* A test function to verify that MatrixWithNames works correctly for R
* @return a test MatrixWithNames
*/
public static MatrixWithNames testMatrix() {
MatrixWithNames mwn = new MatrixWithNames();
mwn.columnNames = new String[]{"col1","col2","col3"};
mwn.rowNames = new String[] {"row1","row2","row3","row4"};
mwn.matrix = new int[][] {{11,12,13},{21,22,23},{31,32,33},{41,42,43}};
return mwn;
}
/**
* A test function to verify that DataFrameVectors works correctly for R
* @return a test DataFrameVectors
*/
public static DataFrameVectors testDataFrame() {
DataFrameVectors dfv = new DataFrameVectors();
dfv.columnNames = new String[]{"col1","col2","col3"};
dfv.dataVectors.add(new int[]{11,21,31,41});
dfv.dataVectors.add(new int[]{12,22,32,42});
dfv.dataVectors.add(new int[]{13,23,33,43});
return dfv;
}
/**
* A data class holding columnNames, rowNames, and a dataVectors List as public fields in a format
* that can be easily used by rJava to create R objects
*/
public static class DataFrameVectors {
public String[] columnNames = null;
public String[] rowNames = null;
public List<Object> dataVectors = new ArrayList<>();
public void toTextFile(String filename) {
if (dataVectors.size() == 0) {
myLogger.info("No data in dataVectors, not writing to " + filename);
return;
}
try (PrintWriter pw = new PrintWriter(filename)) {
if (rowNames != null) pw.print("RowName\t");
pw.println(Arrays.stream(columnNames).collect(Collectors.joining("\t")));
//write the data rows; each column vector may be a primitive int[] or a String[],
//so java.lang.reflect.Array is used to read elements generically
int nrows = java.lang.reflect.Array.getLength(dataVectors.get(0));
for (int row = 0; row < nrows; row++) {
StringBuilder sb = new StringBuilder();
if (rowNames != null) sb.append(rowNames[row]).append("\t");
for (int col = 0; col < dataVectors.size(); col++) {
if (col > 0) sb.append("\t");
sb.append(java.lang.reflect.Array.get(dataVectors.get(col), row));
}
pw.println(sb.toString());
}
} catch (FileNotFoundException e) {
throw new IllegalArgumentException("Unable to write to " + filename, e);
}
}
}
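/*
A small self-contained example (illustrative values) of populating a DataFrameVectors
and writing it to disk with toTextFile:

DataFrameVectors dfv = new DataFrameVectors();
dfv.columnNames = new String[]{"id", "count"};
dfv.dataVectors.add(new String[]{"R1", "R2"});
dfv.dataVectors.add(new int[]{10, 20});
dfv.toTextFile("example.txt"); //writes a tab-delimited table with a header row
*/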
/**
* A data class holding columnNames, rowNames, and a matrix as public fields in a format
* that can be easily used by rJava to create R objects. It is expected that the matrix will be a 2-D primitive
* Java array of some type that is converted correctly by rJava.
*/
public static class MatrixWithNames {
public String[] columnNames = null;
public String[] rowNames = null;
//a 2-D primitive array; the first dimension is rows, the second is columns
public Object matrix;
}
}