/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.io.File;
import java.sql.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import kotlin.Pair;
import net.maizegenetics.dna.map.Chromosome;
import org.apache.log4j.Logger;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
//import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.AnchorType;
//import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.MethodType;
import net.maizegenetics.util.Tuple;
import static net.maizegenetics.pangenome.db_loading.DBLoadingUtils.*;
import static net.maizegenetics.pangenome.db_loading.GZipCompression.compress;
/**
* Access methods for PostgreSQL or SQLite PHG dbs.
*
* When making changes, or adding new methods, test the SQL statements in both PostgreSQL
* and SQLite to ensure compatibility.
*
* @author lcj34
*
*/
public class PHGdbAccess implements PHGDataWriter, AutoCloseable {
private static final Logger myLogger = Logger.getLogger(PHGdbAccess.class);
private Connection connection = null;
private boolean isSqlite = false;
// These maps contain objects that are most queried by users.
// The logic behind this is that most of the datasets are relatively small, and caching
// prevents creation of these objects over and over again.
private BiMap<String, Integer> refRangeRefRangeIDMap; // reference_ranges
private BiMap<String, Integer> genoGenoIDMap; // genotype names (genotypes table)
private BiMap<String, Integer> methodMethodIDMap; // method and methodIDs
private BiMap<String, Integer> taxaGroupTaxaGroupIDMap; // taxaGroupName and its id
private BiMap<String, Integer> gameteGameteIDMap; // lineName_hapNumber to gameteid map
private BiMap<String, Integer> listHashGameteGrpIDMap; // list_hash to gameteGrp from gamete_groups table
private Map<String, Integer> readMapToReadMappingIDMap = new HashMap<>(); // taxon_method_fileGroup mapped to read_mapping table id
private Map<String, Integer> alleleHashAlleleIDMap = new HashMap<>(); // allele hash to alleleID
private Map<String, Integer> variantHashVariantIDMap = new HashMap<>(); // variants hash to variant_id
PreparedStatement referenceRangesInsertPS;
PreparedStatement refRangeRefRangeMethodInsertPS;
PreparedStatement haplotypesInsertPS;
PreparedStatement haplotypesCountInsertPS;
PreparedStatement pathsInsertPS;
PreparedStatement readMappingPathsPS;
PreparedStatement genomeFileDataPS;
PreparedStatement taxaGroupGenoidPS;
PreparedStatement haplotypeListPS;
public PHGdbAccess (Connection dbConnection) {
this.connection = dbConnection; // can be postgres or sqlite
myLogger.info("PHGdbAccess - db is setup, init prepared statements, load hash table");
DatabaseMetaData metadata = null;
try {
// This is needed because the "insert or ignore" syntax differs
// between PostgreSQL and SQLite.
String dbName = dbConnection.getMetaData().getDatabaseProductName();
isSqlite = dbName.equals("SQLite");
myLogger.info("\n beginning - isSqlite is " + isSqlite);
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess: could not get DBName from metadata, error: " + exc.getMessage());
}
initPreparedStatements();
loadGenotypeHash();
loadRefRangeHash();
loadMethodHash();
loadTaxaGroupHash();
loadGameteGrpHash();
loadGameteGameteIDHash();
}
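// A minimal usage sketch (commented out; the db path and method name are hypothetical).
// PHGdbAccess is AutoCloseable, so try-with-resources covers both the connection and
// the accessor:
//
//   try (Connection conn = DriverManager.getConnection("jdbc:sqlite:/path/to/phg.db");
//        PHGdbAccess db = new PHGdbAccess(conn)) {
//       int methodId = db.getMethodIdFromName("mummer4_assemblies");
//       List<String> taxa = db.getDbTaxaNames();
//   } catch (Exception exc) {
//       // handle SQLException etc.
//   }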
private void initPreparedStatements() {
try{
referenceRangesInsertPS=connection.prepareStatement(
"INSERT into reference_ranges ( chrom, range_start,range_end )" +
" values(?,?,?)");
refRangeRefRangeMethodInsertPS = connection.prepareStatement(
"INSERT into ref_range_ref_range_method (method_id, ref_range_id )" +
" values(?,?)");
haplotypesCountInsertPS = connection.prepareStatement(
"INSERT into haplotype_counts (genoid, method_id, fastq_file, data)" +
" values (?,?,?,?)");
// putPathsData needs the key returned for adding into the read_mapping_paths table
pathsInsertPS = connection.prepareStatement(
"INSERT into paths (genoid, method_id, paths_data)" +
" values (?,?,?)",Statement.RETURN_GENERATED_KEYS);
readMappingPathsPS = connection.prepareStatement(
"INSERT into read_mapping_paths (read_mapping_id, path_id)" +
" values (?,?)");
if (isSqlite) {
genomeFileDataPS = connection.prepareStatement(
"INSERT OR IGNORE into genome_file_data (genome_path, genome_file, file_checksum, genoid,type)" +
" values (?,?,?,?,?)",Statement.RETURN_GENERATED_KEYS);
taxaGroupGenoidPS = connection.prepareStatement(
"INSERT or IGNORE into taxa_groups_genoid (taxa_grp_id, genoid)" +
"values (?,?)");
haplotypeListPS=connection.prepareStatement(
"INSERT OR IGNORE into haplotype_list (list_hash, hapid_list)" +
" values(?,?)", Statement.RETURN_GENERATED_KEYS);
haplotypesInsertPS=connection.prepareStatement(
"INSERT OR IGNORE into haplotypes (gamete_grp_id,ref_range_id,genome_file_id, asm_contig, asm_start_coordinate, asm_end_coordinate, asm_strand,sequence,seq_len,seq_hash,method_id,gvcf_file_id)" +
" values(?,?,?,?,?,?,?,?,?,?,?,?)");
} else { // postgres 9.5 or later
genomeFileDataPS = connection.prepareStatement(
"INSERT into genome_file_data (genome_path, genome_file, file_checksum, genoid,type)" +
" values (?,?,?,?,?) ON CONFLICT DO NOTHING",Statement.RETURN_GENERATED_KEYS);
taxaGroupGenoidPS = connection.prepareStatement(
"INSERT into taxa_groups_genoid (taxa_grp_id, genoid)" +
"values (?,?) ON CONFLICT DO NOTHING");
haplotypeListPS=connection.prepareStatement(
"INSERT into haplotype_list (list_hash, hapid_list) " +
" values(?,?) ON CONFLICT DO NOTHING", Statement.RETURN_GENERATED_KEYS);
haplotypesInsertPS=connection.prepareStatement(
"INSERT into haplotypes (gamete_grp_id,ref_range_id,genome_file_id, asm_contig, asm_start_coordinate, asm_end_coordinate, asm_strand,sequence,seq_len,seq_hash,method_id,gvcf_file_id)" +
" values(?,?,?,?,?,?,?,?,?,?,?,?) ON CONFLICT DO NOTHING");
}
} catch (SQLException sqle) {
myLogger.error("SQLException initializing prepared statements");
throw new IllegalStateException("PHGdbAccess:initPreparedStatements: " + sqle.getMessage());
}
}
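// Dialect note for the statements above: the SQLite and PostgreSQL branches express the
// same "insert unless a uniqueness constraint is violated" intent, e.g.
//
//   SQLite:     INSERT OR IGNORE INTO taxa_groups_genoid (taxa_grp_id, genoid) VALUES (?,?)
//   PostgreSQL: INSERT INTO taxa_groups_genoid (taxa_grp_id, genoid) VALUES (?,?) ON CONFLICT DO NOTHING
//
// A general JDBC caveat (not specific to this class): when such an insert is skipped due
// to a conflict, getGeneratedKeys() on a RETURN_GENERATED_KEYS statement yields an empty
// result set, so callers must handle the no-key case.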
// Creates a biMap of refRangeId to reference range name, where
// the reference range name is a string of chrom:range_start
// This is used for quick mapping between the values when loading reference ranges
// for focus and non_focus regions.
private void loadRefRangeHash() {
int anchorsAdded = 0;
try{
int size = 0;
ResultSet rs=connection.createStatement().executeQuery("select count(*) from reference_ranges");
if (rs.next()) size=rs.getInt(1);
if(refRangeRefRangeIDMap==null || size/(refRangeRefRangeIDMap.size()+1)>3) {
if (refRangeRefRangeIDMap != null) {
myLogger.info("Creating new refRangeRefRangeIDMap bimap, old size: " + refRangeRefRangeIDMap.size() +
", new size: " + size);
} else {
myLogger.info("refRangeRefRangeIDMap is null, creating new one with size : " + size);
}
refRangeRefRangeIDMap=HashBiMap.create(size);
}
rs=connection.createStatement().executeQuery("select ref_range_id, chrom, range_start from reference_ranges");
while(rs.next()) {
// reference_ranges don't have names, create one based on chrom/start position.
String intervalName = rs.getString("chrom") + ":" + Integer.toString(rs.getInt("range_start"));
refRangeRefRangeIDMap.putIfAbsent(intervalName,rs.getInt("ref_range_id"));
anchorsAdded++;
}
} catch (SQLException exc) {
myLogger.error("SQLException loading anchor hash table");
throw new IllegalStateException("PHGdbAccess:loadAnchorHash: exception loading anchor hash table " + exc.getMessage());
}
myLogger.info("loadAnchorHash: at end, size of refRangeRefRangeIDMap: " + refRangeRefRangeIDMap.size() +
", number of rs.next processed: " + anchorsAdded);
}
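// Key format example: a reference range on chrom "1" with range_start 1200 (values
// hypothetical) is keyed in the bimap as "1:1200" -> its ref_range_id, so either
// direction can be looked up without touching the db.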
// Creates a biMap of name to genotype_id from the genotypes table.
// Allows for quick reference of either value. Used in getGameteIdsFromTaxonStrings()
// to speed up loading of gamete_groups and haplotypes. By storing values in a map, the
// db is hit less often.
private void loadGenotypeHash() {
try{
ResultSet rs=connection.createStatement().executeQuery("select count(*) from genotypes");
int size= 0;
if (rs.next()) size = rs.getInt(1);
myLogger.info("before loading hash, size of all geneotypes in genotype table=" + size);
if(genoGenoIDMap==null || size/(genoGenoIDMap.size()+1)>3) genoGenoIDMap=HashBiMap.create(size);
rs=connection.createStatement().executeQuery("select genoid,line_name from genotypes");
while(rs.next()) {
genoGenoIDMap.putIfAbsent(rs.getString("line_name"),rs.getInt("genoid"));
}
} catch (SQLException exc) {
myLogger.error("SQLException loading genotype hash table");
throw new IllegalStateException("PHGdbAccess:loadGenotypeHash: exception loading genotype hash table " + exc.getMessage());
}
}
// Creates a biMap of gamete to gameteId from the gametes table. The gamete is uniquely
// identified via genotypes.line_name and gametes.hapnumber.
// Used when loading genotypes/gametes data as well as when creating/loading gamete groups.
private void loadGameteGameteIDHash() {
try {
ResultSet rs=connection.createStatement().executeQuery("select count(*) from gametes");
int size=0;
if (rs.next()) size=rs.getInt(1);
myLogger.info("before loading hash, size of all gametes in gametes table=" + size);
if(gameteGameteIDMap==null || size/(gameteGameteIDMap.size()+1)>3) gameteGameteIDMap=HashBiMap.create(size);
String query = "select gameteid,hapNumber, line_name from gametes INNER JOIN genotypes ON genotypes.genoid=gametes.genoid";
rs = connection.createStatement().executeQuery(query);
while(rs.next()) {
String name = rs.getString("line_name") + "_" + rs.getInt("hapNumber");
gameteGameteIDMap.put(name, rs.getInt("gameteid"));
}
} catch (SQLException sqle) {
myLogger.error("SQLException loading gameteGameteID hash table");
throw new IllegalStateException("PHGdbAccess:loadGameteGameteIDHash: exception loading gameteID hash table " + sqle.getMessage());
}
}
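// Key format example: line_name "B73" with hapNumber 0 (values hypothetical) is keyed
// "B73_0" -> its gameteid, mirroring the lineName_hapNumber convention used throughout
// this class.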
// Creates biMap of a gamete_groups hash table list_hash value to the gamete_groups gamete_grp_id value
// Used to facilitate loading/creating new gamete_groups with fewer hits to the db.
private void loadGameteGrpHash() {
try{
ResultSet rs=connection.createStatement().executeQuery("select count(*) from gamete_groups");
int size=0;
if (rs.next()) size=rs.getInt(1);
myLogger.info("before loading hash, size of all groups in gamete_groups table=" + size);
if(listHashGameteGrpIDMap==null || size/(listHashGameteGrpIDMap.size()+1)>3) listHashGameteGrpIDMap=HashBiMap.create(size);
rs=connection.createStatement().executeQuery("select list_hash,gamete_grp_id from gamete_groups");
while(rs.next()) {
listHashGameteGrpIDMap.putIfAbsent(rs.getString("list_hash"),rs.getInt("gamete_grp_id"));
}
} catch (SQLException exc) {
myLogger.error("SQLException loading gamete group hash table");
throw new IllegalStateException("PHGdbAccess:loadGameteGrpHash: exception loading gamete group hash table " + exc.getMessage());
}
}
// Creates method name to methodId biMap. Used when checking if an ID exists for a specific method name,
// and when needing a methodID for loading specific tables.
private void loadMethodHash() {
try{
ResultSet rs=connection.createStatement().executeQuery("select count(*) from methods");
int size=0;
if (rs.next()) size = rs.getInt(1);
myLogger.info("before loading hash, size of all methods in method table=" + size);
if(methodMethodIDMap==null || size/(methodMethodIDMap.size()+1)>3){
methodMethodIDMap=HashBiMap.create(size);
}
rs=connection.createStatement().executeQuery("select method_id,name from methods");
while(rs.next()) {
methodMethodIDMap.putIfAbsent(rs.getString("name"),rs.getInt("method_id"));
}
} catch (SQLException exc) {
myLogger.error("SQLException loading method hash table");
throw new IllegalStateException("PHGdbAccess:loadMethodHash: exception loading method hash table " + exc.getMessage());
}
}
private void loadTaxaGroupHash() {
try{
ResultSet rs=connection.createStatement().executeQuery("select count(*) from taxa_groups");
int size=0;
if (rs.next()) size = rs.getInt(1);
myLogger.info("before loading hash, size of all groups in taxa_groups table=" + size);
if(taxaGroupTaxaGroupIDMap==null || size/(taxaGroupTaxaGroupIDMap.size()+1)>3){
taxaGroupTaxaGroupIDMap=HashBiMap.create(size);
}
rs=connection.createStatement().executeQuery("select taxa_grp_id,taxa_grp_name from taxa_groups");
while(rs.next()) {
taxaGroupTaxaGroupIDMap.putIfAbsent(rs.getString("taxa_grp_name"),rs.getInt("taxa_grp_id"));
}
} catch (SQLException exc) {
myLogger.error("SQLException loading taxaGroup hash table");
throw new IllegalStateException("PHGdbAccess:loadTaxaGroupHash: exception loading taxa_group hash table " + exc.getMessage());
}
}
@Override
public void updateReadMappingHash() {
loadReadMappingHash();
}
private void loadReadMappingHash() {
// To find a particular read_mapping_id, the caller provides the taxon (ie genotypes line_name),
// the method name, and the file_group_name. A hash key of the form lineName_methodName_fileGroupName
// (see getReadMappingId()) is mapped to the read_mapping_id.
// NOTE: this body is a reconstruction, inferred from the key format used in getReadMappingId().
readMapToReadMappingIDMap = new HashMap<>();
String rmQuery = "select read_mapping_id, line_name, name, file_group_name from read_mapping " +
"INNER JOIN genotypes ON genotypes.genoid = read_mapping.genoid " +
"INNER JOIN methods ON methods.method_id = read_mapping.method_id";
try (ResultSet rs = connection.createStatement().executeQuery(rmQuery)) {
while (rs.next()) {
String key = rs.getString("line_name") + "_" + rs.getString("name") + "_" + rs.getString("file_group_name");
readMapToReadMappingIDMap.put(key, rs.getInt("read_mapping_id"));
}
} catch (SQLException sqle) {
throw new IllegalStateException("PHGdbAccess:loadReadMappingHash: error loading read_mapping hash " + sqle.getMessage());
}
}
// NOTE: the signature and the first few lines here are inferred (assumption) from the
// variables used in the remainder of the method: idLine's trailing ";"-delimited field
// is a ":"-separated list of lineName_hapNumber entries.
public int getHaplotypeIDFromFastaIDLine(String idLine, String methodName, int genomeIntervalId) {
int semiIndex = idLine.lastIndexOf(";");
String taxaString = idLine.substring(semiIndex + 1);
List<String> taxaStringList = new ArrayList<>(Arrays.asList(taxaString.split(":")));
List<Integer> gameteIds = getGameteIdsFromTaxonStrings(taxaStringList); // called method sorts the list
String gameteListHash =
AnchorDataPHG.getChecksumForString(gameteIds.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5");
Integer gameteGrpId = listHashGameteGrpIDMap.get(gameteListHash);
if (gameteGrpId == null) {
// a missing key returns null rather than throwing, so test explicitly
myLogger.error("getHaplotypeIDFromFastaIDLine: Group not found for hash created from list: " + taxaString);
throw new IllegalArgumentException("getHaplotypeIDFromFastaIDLine: Group not found for hash created from list: " + taxaString);
}
Integer methodId = null;
try {
methodId = getMethodIdFromName(methodName);
} catch (Exception exc) {
// query threw an error
throw new IllegalArgumentException("getHaplotypeIDFromFastaIDLine: Error getting method from tables: " + methodName);
}
if (methodId < 1) {
// method not found
throw new IllegalArgumentException("getHaplotypeIDFromFastaIDLine: Method not found in tables: " + methodName);
}
// find haplotypes_id from the ref_range_id, methodID and gamete_grp_id
String query = "SELECT haplotypes_id from haplotypes where ref_range_id = " + genomeIntervalId +
" AND gamete_grp_id=" + gameteGrpId + " and method_id=" + methodId;
int haplotypeId = 0;
try (ResultSet rs = connection.createStatement().executeQuery(query)){
if (rs.next()) {
haplotypeId = rs.getInt("haplotypes_id");
}
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getHaplotypeIDFromFastaIDLine: cannot get haplotypes_id from gameteGrp "
+ gameteGrpId + " and ref_range_id " + genomeIntervalId + " " + exc.getMessage());
}
return haplotypeId;
}
@Override
public int getMethodIdFromName(String method_name) {
String query = "SELECT method_id from methods where name='" + method_name + "'";
int methodid = 0;
try {
ResultSet rs = connection.createStatement().executeQuery(query);
while (rs.next()) { // should only be 1 due to unique constraint on name
methodid = rs.getInt("method_id");
}
} catch (Exception exc) {
throw new IllegalArgumentException("getMethodIDFromName: error attempting DB access for methods table:" + exc.getMessage());
}
return methodid;
}
@Override
public Map<String, String> getMethodDescriptionFromName(String method_name) {
String query = "SELECT description from methods where name='" + method_name + "'";
String desc = null;
try {
ResultSet rs = connection.createStatement().executeQuery(query);
while (rs.next()) { // should only be 1 due to unique constraint on name
desc = rs.getString("description");
}
}catch (Exception exc) {
throw new IllegalArgumentException("getMethodDescriptionFromName: error attempting DB access for methods table:" + exc.getMessage());
}
Map<String, String> descMap = null;
if (desc != null) {
// TODO - catch the case of this not being formatted as a json string
descMap = DBLoadingUtils.parseMethodJsonParamsToString(desc);
}
return descMap;
}
// This method exists so the taxaToHapidMap can be retrieved once, while the
// Integer id lists can be created repeatedly for multiple taxa lists. It helps limit
// hits to the db.
private List<Integer> getGameteIdsFromTaxonStrings(List<String> gametes) {
List<Integer> gameteIds = new ArrayList<>();
gametes.stream().forEach(gamete -> {
Integer gameteid = gameteGameteIDMap.get(gamete);
if (gameteid == null) {
myLogger.error("Gameteid not found for taxa name, hapnumber : " + gamete);
throw new IllegalArgumentException("Gameteid not found for " + gamete);
}
gameteIds.add(gameteid);
});
Collections.sort(gameteIds);
return gameteIds;
}
@Override
public int getGameteGroupIDFromTaxaList(List<String> gametes) {
List<Integer> gameteIds = getGameteIdsFromTaxonStrings(gametes);
String gameteListSha1 =
AnchorDataPHG.getChecksumForString(gameteIds.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5");
int gameteGroupId = 0;
// if group already exists, we're done (also test for a missing key, which
// would otherwise NPE on unboxing)
if (listHashGameteGrpIDMap != null && listHashGameteGrpIDMap.get(gameteListSha1) != null) {
gameteGroupId = listHashGameteGrpIDMap.get(gameteListSha1);
}
return gameteGroupId;
}
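// Worked example (ids hypothetical): for gametes ["B73_0", "CML247_0"] with gameteids
// 12 and 5, the sorted id list [5, 12] joins to "5,12"; its MD5 checksum is the
// list_hash matched against listHashGameteGrpIDMap (i.e. the gamete_groups table).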
@Override
public int getGenoidFromLine(String line_name){
String query = "select genoid from genotypes where line_name = '" + line_name +"'";
try {
ResultSet rs= connection.createStatement().executeQuery(query);
while(rs.next()) {
int genoid = rs.getInt("genoid");
return genoid;
}
} catch (Exception exc) {
myLogger.warn("PHGdbAccess:getGenoidFromLine: could not get genoid for line_name " + line_name);
}
return -1;
}
public List<Integer> getGenoidsForTaxa(List<String> taxa) {
List<Integer> genoids = new ArrayList<>();
String taxaString = taxa.stream().map(id->"'" + id + "'").collect(Collectors.joining(","));
StringBuilder querySB = new StringBuilder();
querySB.append("select genoid from genotypes where line_name IN (");
querySB.append(taxaString);
querySB.append(");");
myLogger.info("getGEnoidsForTaxa query: " + querySB.toString());
try {
ResultSet rs= connection.createStatement().executeQuery(querySB.toString());
while(rs.next()) {
int genoid = rs.getInt(1);
genoids.add(genoid);
}
} catch (SQLException se) {
// This helps debug if batch loading had a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("error getting genoids for taxa");
} catch (Exception exc) {
myLogger.error("getGenoidsForTaxa: error getting genoids for taxa list: " + exc.getMessage());
}
return genoids;
}
// Find an existing file id from genome_file_data based on genoid
@Override
public int getGenomeFileIdFromGenoid(int genoid){
String query = "select id from genome_file_data where genoid=" + genoid;
try {
ResultSet rs= connection.createStatement().executeQuery(query);
if (rs.next()) {
int genome_file_id = rs.getInt("id");
return genome_file_id;
}
} catch (Exception exc) {
myLogger.warn("PHGdbAccess:getGenomeFileIdFromGenoid: could not get genome_file_id for genoid " + genoid);
}
return -1;
}
// Find an existing file id from genome_file_data based on genoid and file name
@Override
public int getGenomeFileIdFromGenoidAndFile(int genoid,String file){
String query = "select id from genome_file_data where genoid=" + genoid + " and genome_file='" + file + "';";
try {
ResultSet rs= connection.createStatement().executeQuery(query);
if (rs.next()) {
int genome_file_id = rs.getInt("id");
return genome_file_id;
}
} catch (Exception exc) {
myLogger.warn("PHGdbAccess:getGenomeFileIdFromGenoidAndFile: could not get genome_file_id for genoid " + genoid + " and file " + file);
}
return -1;
}
@Override
public Pair<Integer, String> getGenomeFileHashFromGenoidandFile(int genoid, String file) {
String query = "select id, file_checksum from genome_file_data where genoid=" + genoid + " and genome_file='" + file + "';";
try {
ResultSet rs= connection.createStatement().executeQuery(query);
if (rs.next()) {
int id = rs.getInt("id");
String hash = rs.getString("file_checksum");
return new Pair<>(id, hash);
}
} catch (Exception exc) {
myLogger.warn("PHGdbAccess:getGenomeFileIdFromGenoidAndFile: could not get genome_file_id for genoid " + genoid + " and file " + file);
}
return null;
}
@Override
public int getHapidForGenoidHapNumber(int genoid, int hap_number) {
String query = "select gameteid from gametes where genoid = " + genoid + " and hapNumber = " + hap_number;
try {
ResultSet rs= connection.createStatement().executeQuery(query);
while(rs.next()) {
int gameteid = rs.getInt("gameteid");
return gameteid;
}
} catch (Exception exc) {
myLogger.warn("PHGdbAccess:getHapidForGenoidHapNumber: could not find hapid for genoid " + genoid + " and hap_number " + hap_number);
}
return -1;
}
@Override
public List<Integer> getHapidsForGenoid(int genoid) {
// get all gameteids for this genoid
List<Integer> gameteids = new ArrayList<>();
String query = "select gameteid from gametes where genoid = " + genoid;
try {
ResultSet rs= connection.createStatement().executeQuery(query);
while(rs.next()) {
gameteids.add(rs.getInt("gameteid"));
}
return gameteids;
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getHapidsForGenoid: could not get gameteids " + exc.getMessage());
}
}
@Override
public Tuple<String, Integer> getLineNameHapNumberFromHapid(int gameteid) {
Tuple<String, Integer> lineHapNumber = null;
try{
String query = "select gt.line_name, ht.hapNumber from genotypes gt, gametes ht where " +
" ht.genoid = gt.genoid and ht.gameteid=" + gameteid ;
ResultSet rs = connection.createStatement().executeQuery(query);
while(rs.next()) {
String line_name = rs.getString("line_name");
int hapNum = rs.getInt("hapNumber");
return (new Tuple<>(line_name,hapNum));
}
} catch (SQLException exc) {
throw new IllegalStateException("PHGdbAccess:getLineNameHapNumberFromHapid: could not get lineName/hapNumber for gameteid "
+ gameteid + " " + exc.getMessage());
}
return lineHapNumber;
}
// This method appends the hapNumber to the lineName string. Some
// calling methods need both the line_name and the hapNumber that
// match a gameteid.
// The returned map is defined as: gameteid -> lineName_hapNumber
@Override
public Map<Integer, String> getHapidHapNumberLineNamesForLines(List<String> lineNames) {
ImmutableMap.Builder<Integer, String> hapLineName = new ImmutableMap.Builder<>();
try{
String query = "select gameteid from gametes";
List<Integer> gameteids = new ArrayList<>();
ResultSet rs=connection.createStatement().executeQuery(query);
while(rs.next()) {
gameteids.add(rs.getInt("gameteid"));
}
// Now get line names for each gameteid. The line name returned
// is a concatenation of the name and the hapNumber, e.g. B73_0
// This is needed when we deal with polyploids
for (Integer gameteid : gameteids) {
Tuple<String, Integer> nameNumber = getLineNameHapNumberFromHapid(gameteid);
if (lineNames.contains(nameNumber.x)){
String name_hapNum = nameNumber.x + "_" + Integer.toString(nameNumber.y);
hapLineName.put(gameteid,name_hapNum);
}
}
} catch (SQLException exc) {
throw new IllegalStateException("PHGdbAccess:getHapidHapNumberLineNamesForLines failed " + exc.getMessage());
}
return hapLineName.build();
}
@Override
public RangeMap<Position, Integer> getIntervalRangesWithIDForChrom(String chrom) {
// This method stores Position ranges in a sorted manner. The calling method needs chrom positions
// sorted to find inter-anchor regions on the chroms
//ImmutableRangeMap.Builder intervalData = ImmutableRangeMap.builder();
// Can ImmutableRangeMap.Builder create a TreeRangeMap? I want this sorted.
RangeMap<Position, Integer> intervalData = TreeRangeMap.create();
String query;
if (chrom.equalsIgnoreCase("all")) {
query = "select ref_range_id, chrom, range_start,range_end from reference_ranges";
} else {
query = "select ref_range_id,chrom, range_start,range_end from reference_ranges where chrom = '"
+ chrom + "'";
}
try {
ResultSet rs= connection.createStatement().executeQuery(query);
while(rs.next()) {
Chromosome currentChr= Chromosome.instance(rs.getString("chrom"));
intervalData.put(Range.closed(Position.of(currentChr,rs.getInt("range_start")),
Position.of(currentChr,rs.getInt("range_end"))), rs.getInt("ref_range_id"));
}
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getIntervalRangesWithIDForChrom: failed to get intervals " + exc.getMessage());
}
return intervalData;
}
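// Usage sketch (chrom and coordinate hypothetical): Guava's RangeMap returns the value
// of the range containing the key, or null when the position falls between ranges.
//
//   RangeMap<Position, Integer> ranges = db.getIntervalRangesWithIDForChrom("1");
//   Integer refRangeId = ranges.get(Position.of(Chromosome.instance("1"), 15000));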
@Override
public List<String> getChromNamesForHaplotype(String line_name, int hap_number, String version) {
// This used to be used for both ref and Assembly. Assembly chrom names are no longer
// stored.
ImmutableList.Builder<String> chromListBuilder = ImmutableList.builder();
try {
String query = "select DISTINCT(chrom) from reference_ranges " ;
ResultSet rs = connection.createStatement().executeQuery(query);
while(rs.next()) {
chromListBuilder.add(rs.getString("chrom"));
}
} catch (Exception exc) {
myLogger.error("PHGdbAccess:getChromNamesForHaplotype: error getting chrom name list " + exc.getMessage());
}
return chromListBuilder.build();
}
@Override
public Map<String, Integer> getHapidMapFromLinenameHapNumber() {
ImmutableMap.Builder<String, Integer> taxaNameToHapidBuilder = ImmutableMap.builder();
int count = 0;
try {
String query = "select gameteid,hapNumber, line_name from gametes INNER JOIN genotypes on genotypes.genoid=gametes.genoid";
ResultSet rs = connection.createStatement().executeQuery(query);
int totalCount = 0;
while(rs.next()) {
count++;
totalCount++;
String name = rs.getString("line_name") + "_" + rs.getInt("hapNumber");
taxaNameToHapidBuilder.put(name, rs.getInt("gameteid"));
if (count > 10000) {
myLogger.info("gameteid list: processed so far: " + totalCount);
count=0;
}
}
} catch (SQLException sqle) {
throw new IllegalArgumentException("PHGdbAccess:getHapidMapFromLinenameHapNumber: processing error " + sqle.getMessage());
}
return taxaNameToHapidBuilder.build();
}
@Override
public Map<Integer, byte[]> getHapCountsIDAndPathsForMethod(String method) {
ImmutableMap.Builder<Integer, byte[]> hapCountToPathBuilder = ImmutableMap.builder();
int methodID = getMethodIdFromName(method);
if (methodID < 1) {
throw new IllegalArgumentException("Path methodid not found in db for method : " + method);
}
try {
String query = "select haplotype_counts_id, haplotype_paths from paths where method_id=" + methodID;
ResultSet rs = connection.createStatement().executeQuery(query);
while (rs.next()) {
int haplotypes_id = rs.getInt("haplotype_counts_id");
byte[] path = rs.getBytes("haplotype_paths");
hapCountToPathBuilder.put(haplotypes_id,path);
}
} catch (SQLException sqle) {
throw new IllegalArgumentException("PHGdbAccess:getHapCountsIDAndPathsForMethod: failed to get hapCounts: " + sqle.getMessage());
}
return hapCountToPathBuilder.build();
}
@Override
public Map<Integer, Tuple<String, byte[]>> getHapCountsIDAndDataForVersionMethod(String method) {
ImmutableMap.Builder<Integer, Tuple<String, byte[]>> hapCountToDataBuilder = ImmutableMap.builder();
int methodId = getMethodIdFromName(method);
if (methodId < 1) {
throw new IllegalArgumentException("Haplotype count methodid not found in db for method : " + method);
}
// method and version are good - now get data
try {
String query = "select haplotype_counts_id, genoid, data from haplotype_counts where method_id=" + methodId ;
ResultSet rs = connection.createStatement().executeQuery(query);
while (rs.next()) {
int haplotypes_id = rs.getInt("haplotype_counts_id");
byte[] data = rs.getBytes("data");
String taxonName = genoGenoIDMap.inverse().get(rs.getInt("genoid"));
hapCountToDataBuilder.put(haplotypes_id,new Tuple<>(taxonName,data));
}
} catch (SQLException sqle) {
throw new IllegalArgumentException("PHGdbAccess:getHapCountsIDAndDataForVersionMethod: failed to get hapCounts: " + sqle.getMessage());
}
return hapCountToDataBuilder.build();
}
@Override
public HashSet<String> getTaxaForPathMethod(String method_name) {
Integer methodId = getMethodIdFromName(method_name);
if (methodId == null || methodId < 1) return new HashSet<>();
String query = "SELECT line_name FROM paths, genotypes " +
"WHERE genotypes.genoid = paths.genoid AND method_id=" + methodId;
HashSet<String> taxaNameList = new HashSet<>();
try (ResultSet rs = connection.createStatement().executeQuery(query)){
while (rs.next()) {
taxaNameList.add(rs.getString(1));
}
} catch (SQLException se) {
throw new IllegalStateException("error getting taxa for method " + method_name, se);
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getTaxonPathsForMethod: error getting paths for method " + method_name, exc);
}
return taxaNameList;
}
@Override
public Map<String, byte[]> getTaxonPathsForMethod(String method_name) {
// This method takes a method name, finds the method ID, then
// produces a map of all taxa with path data stored against that methodId in the paths table.
Integer methodId = getMethodIdFromName(method_name);
if (methodId == null || methodId < 1) {
throw new IllegalStateException("PHGdbAccess:getTaxonPathsForMethod: no methodID in database for method " + method_name);
}
StringBuilder querySB = new StringBuilder();
querySB.append("select line_name, paths_data from paths ");
querySB.append("join genotypes on genotypes.genoid = paths.genoid ");
querySB.append(" WHERE method_id=");
querySB.append(methodId);
querySB.append(";");
Map<String, byte[]> taxonToPathsMap = new HashMap<>();
try (ResultSet rs = connection.createStatement().executeQuery(querySB.toString())){
while (rs.next()) {
String taxon = rs.getString("line_name");
byte[] pathsData = rs.getBytes("paths_data");
taxonToPathsMap.put(taxon,pathsData);
}
} catch (SQLException se) {
// This helps debug if batch loading had a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("error getting paths for method " + method_name);
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getTaxonPathsForMethod: error getting paths for method " + method_name +":"+ exc.getMessage());
}
return taxonToPathsMap;
}
@Override
public int getReadMappingId(String line_name, String method_name, String file_group_name) {
Integer genoid = getGenoidFromLine(line_name);
if (genoid == null || genoid < 1) {
return -1; // genoid not in db
}
Integer methodId = getMethodIdFromName(method_name);
if (methodId == null || methodId < 1) {
return -1; // method not in db
}
// Method checks hash table for read mapping id. If not found, will query the db.
StringBuilder hashSB = new StringBuilder();
hashSB.append(line_name).append("_").append(method_name).append("_").append(file_group_name);
Integer readMappingID = readMapToReadMappingIDMap.get(hashSB.toString());
if (readMappingID == null) {
// query the db
StringBuilder querySB = new StringBuilder();
querySB.append("SELECT read_mapping_id from read_mapping where genoid = ");
querySB.append(genoid);
querySB.append(" and method_id=");
querySB.append(methodId);
querySB.append(" and file_group_name='");
querySB.append(file_group_name);
querySB.append("';");
int readMappingId = -1;
try (ResultSet rs = connection.createStatement().executeQuery(querySB.toString())){
if (rs.next()) {
readMappingId = rs.getInt("read_mapping_id");
}
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getReadMappingId: cannot query read_mapping_id using taxon "
+ line_name + ", method: " + method_name + " and file_group_name " + file_group_name + " "+ exc.getMessage());
}
readMappingID = readMappingId;
}
if (readMappingID == null ) {
readMappingID = -1;
}
return readMappingID;
}
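// Cache key format used above (and by the read mapping hash): taxon, method, and file
// group joined with underscores, e.g. line_name "B73", method "readMethod1",
// file_group_name "flowcellA" (all hypothetical) yield the key "B73_readMethod1_flowcellA".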
@Override
public List<Integer> getHaplotypeList(int haplist_id) {
StringBuilder querySB = new StringBuilder();
querySB.append("SELECT hapid_list FROM haplotype_list WHERE haplotype_list_id = ");
querySB.append(haplist_id);
querySB.append(";");
System.out.println("getHaplotypeList - query: " + querySB.toString());
try (ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
if (rs.next()) {
byte[] hapListData = rs.getBytes(1);
List<Integer> hapIdsAsList = decodeHapidList(hapListData);
return hapIdsAsList;
} else {
// something is wrong - throw an exception
throw new IllegalStateException("PHGdbAccess:getHaplotypeList: no haplotype_list entries found for id "
+ haplist_id);
}
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("getReadMappingIdsForTaxaMethod: error getting read_mapping_ids: " + se.getMessage());
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getHaplotypeList: cannot query haplotype_list using id "
+ haplist_id + " " + exc.getMessage());
}
}
@Override
public Pair<byte[], Integer> getReadMappingsForId(int readMappingId) {
StringBuilder querySB = new StringBuilder();
querySB.append("SELECT mapping_data, haplotype_list_id FROM read_mapping WHERE read_mapping_id = ");
querySB.append(readMappingId);
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
if(rs.next()) {
byte[] mappingData = rs.getBytes("mapping_data");
int haplotypeListId = rs.getInt("haplotype_list_id");
return new Pair<>(mappingData, haplotypeListId);
}
}
catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getReadMappingsForId: cannot query read_mapping using id "
+ readMappingId+" "+exc.getMessage());
}
return null;
}
@Override
public List<Integer> getReadMappingIdsForTaxaMethod(List<String> taxaList, String method) {
// This method was written for DeleteReadMappingPlugin, which requires a method parameter
// but does not require a taxa list. That Plugin enforces the presence of the method parameter.
// To keep this method suitable for other applications to call, neither parameter is required.
// If both are null, the query will return all read mapping ids from the read_mapping table.
List<Integer> readMappingIds = new ArrayList<>();
Integer methodId = -1;
if (method != null) {
methodId = getMethodIdFromName(method);
if ( methodId < 1) {
throw new IllegalArgumentException(("getReadMappingIdsForTaxaMethod: method name not found in db: " + method));
}
}
String genoidString = null;
if (taxaList != null && taxaList.size() > 0) {
List<Integer> genoidList = getGenoidsForTaxa(taxaList);
genoidString = genoidList.stream().map(id -> Integer.toString(id)).collect(Collectors.joining(","));
}
StringBuilder querySB = new StringBuilder();
querySB.append("select read_mapping_id from read_mapping ");
if (methodId > 0) {
querySB.append(" where method_id=");
querySB.append(methodId);
if (genoidString != null) {
querySB.append(" and genoid IN (");
querySB.append(genoidString);
querySB.append(")");
}
} else if (genoidString != null) {
// guard so that when both parameters are null the query returns all read mapping ids,
// as documented above; also fixes the mismatched brace in the IN clause
querySB.append(" where genoid IN (");
querySB.append(genoidString);
querySB.append(")");
}
querySB.append(";");
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
while(rs.next()) {
int readMappingId = rs.getInt(1);
readMappingIds.add(readMappingId);
}
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("getReadMappingIdsForTaxaMethod: error getting read_mapping_ids: " + se.getMessage());
} catch (Exception exc) {
throw new IllegalStateException("getReadMappingIdsForTaxaMethod: cannot query read_mapping_ids using methodName "
+ method+" and provided taxa list: "+exc.getMessage());
}
return readMappingIds;
}
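// Usage sketch (method name and taxa hypothetical):
//
//   getReadMappingIdsForTaxaMethod(null, "readMethod1");                           // all ids for a method
//   getReadMappingIdsForTaxaMethod(Arrays.asList("B73", "CML247"), "readMethod1"); // restricted to taxa
//   getReadMappingIdsForTaxaMethod(null, null);                                    // every read_mapping id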
@Override
public List<Integer> getPathIdsForReadMappingIds(List<Integer> readMappingIds) {
// Note this method does not associate a specific pathid with a specific readMapping id.
// It gets a list of all pathids that are associated with a read mapping id from the input list.
List<Integer> pathIds = new ArrayList<>();
String rmString = readMappingIds.stream().map(id->id.toString()).collect(Collectors.joining(","));
StringBuilder querySB = new StringBuilder();
querySB.append("select path_id from read_mapping_paths where read_mapping_id IN (");
querySB.append(rmString);
querySB.append(");");
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
while(rs.next()) {
int pathId = rs.getInt(1);
pathIds.add(pathId);
}
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("getPathIdsForReadMappingIds: error getting path_ids: " + se.getMessage());
} catch (Exception exc) {
throw new IllegalStateException("getPathIdsForReadMappingIds: failed to get path_ids for provided read_mapping_id list "
+exc.getMessage());
}
return pathIds;
}
@Override
public List<ReadMappingDBRecord> getReadMappingsForMethod(String methodName) {
StringBuilder querySB = new StringBuilder();
querySB.append("SELECT name, description, line_name,file_group_name,read_mapping_id, mapping_data " +
"FROM read_mapping INNER JOIN methods ON methods.method_id = read_mapping.method_id " +
"INNER JOIN genotypes ON genotypes.genoid = read_mapping.genoid " +
"WHERE methods.name = '");
querySB.append(methodName);
querySB.append("';");
List<ReadMappingDBRecord> mappingRecords = new ArrayList<>();
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
while(rs.next()) {
String dbMethodName = rs.getString(1);
String dbMethodDesc = rs.getString(2);
String lineName = rs.getString(3);
String fileGroupName = rs.getString(4);
int readMappingId = rs.getInt(5);
byte[] mappings = rs.getBytes(6);
mappingRecords.add(new ReadMappingDBRecord(dbMethodName,dbMethodDesc,readMappingId,mappings,lineName,fileGroupName));
}
}
catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:getReadMappingIdsForMethod: cannot query read_mapping using methodName "
+ methodName+" "+exc.getMessage());
}
return mappingRecords;
}
@Override
public boolean isFileGroupNew(String taxon, String fileGroupName, String methodName) {
StringBuilder querySB = new StringBuilder();
querySB.append("SELECT count(*) FROM read_mapping INNER JOIN genotypes ON genotypes.genoid = read_mapping.genoid " +
"INNER JOIN methods ON methods.method_id = read_mapping.method_id ");
querySB.append("WHERE line_name='");
querySB.append(taxon);
querySB.append("' AND file_group_name='");
querySB.append(fileGroupName);
querySB.append("' AND name='");
querySB.append(methodName);
querySB.append("';");
boolean isNew = false;
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
int counter = 0;
while(rs.next()) {
counter++;
if(rs.getInt(1)== 0) {
isNew = true;
}
}
if(counter != 1) {
throw new IllegalStateException("Query returned ${counter} rows. isFileGroupNew should only return one row with a number either 0 or 1.");
}
}
catch(Exception exc) {
throw new IllegalStateException("PHGdbAccess:isFileGroupNew cannot query read_mapping "+exc.getMessage());
}
return isNew;
}
@Override
public List<Integer> getRefRangesForMethod(String methodName) {
Integer id = methodMethodIDMap.get(methodName);
if (id == null || id < 1) {
throw new IllegalArgumentException("getRefRangesForMethod: methodid not found in db for method : " + methodName);
}
StringBuilder sb = new StringBuilder();
sb.append( "select ref_range_id from ref_range_ref_range_method where method_id = ");
sb.append(id);
List<Integer> refRangeIds = new ArrayList<>();
try {
ResultSet rs = connection.createStatement().executeQuery(sb.toString());
while (rs.next()) {
int refRangeId = rs.getInt("ref_range_id");
refRangeIds.add(refRangeId);
}
} catch (SQLException sqle) {
throw new IllegalStateException("PHGdbAccess:getHapCountsIDAndPathsForMethod: failed to get hapCounts: " + sqle.getMessage());
}
return refRangeIds;
}
@Override
public Map<String, List<List<Integer>>> getPathsForTaxonMethod(List<String> taxon, String method_name) {
Map<String, List<List<Integer>>> taxonToPathsMap = new HashMap<>();
Map<String, Integer> taxonToGenoid = new HashMap<>();
int method_id= 0;
if (method_name != null) {
method_id = getMethodIdFromName(method_name);
}
// get genoids for taxon
for (String name: taxon) {
int genoid = getGenoidFromLine(name);
taxonToGenoid.put(name,genoid);
}
// get the paths based on method and genoid
for (Map.Entry<String, Integer> entry : taxonToGenoid.entrySet()) {
StringBuilder querySB = new StringBuilder();
querySB.append("select paths_data from paths where genoid=").append(entry.getValue());
if (method_id > 0) {
querySB.append(" and method_id=").append(method_id);
}
try {
ResultSet rs = connection.createStatement().executeQuery(querySB.toString());
while (rs.next()) {
byte[] paths = rs.getBytes(1);
List<List<Integer>> hapids = DBLoadingUtils.decodePathsForMultipleLists(paths);
taxonToPathsMap.put(entry.getKey(),hapids);
}
rs.close();
} catch (SQLException sqle) {
throw new IllegalStateException("PHGdbAccess:getPathsForTaxonMethod: failed to get paths for taxon: " + entry.getKey() + ":" + sqle.getMessage());
}
}
return taxonToPathsMap;
}
@Override
public List<String> getDbTaxaNames() {
List<String> taxaNames = new ArrayList<>();
String query = "select line_name from genotypes;";
try(ResultSet rs = connection.createStatement().executeQuery(query)) {
while(rs.next()) {
taxaNames.add(rs.getString(1));
}
} catch (SQLException sqle) {
throw new IllegalStateException("PHGdbAccess:getDbTaxaNames: failed to get taxa from genotypes table");
}
return taxaNames;
}
@Override
public int getTaxaGroupIDFromName(String group_name) {
String query = "SELECT taxa_grp_id from taxa_groups where taxa_Grp_name='" + group_name + "'";
int groupID = 0;
try {
ResultSet rs = connection.createStatement().executeQuery(query);
while (rs.next()) { // should only be 1 due to unique constraint on name
groupID = rs.getInt("taxa_grp_id");
}
} catch (Exception exc) {
throw new IllegalArgumentException("getTaxaGroupIDFromName: error attempting DB access for taxa_groups table:" + exc.getMessage());
}
return groupID;
}
@Override
public List<String> getTaxaForTaxaGroup(String group_name) {
List<String> taxaNames = new ArrayList<>();
int grpID = getTaxaGroupIDFromName(group_name);
if (grpID == 0) {
return taxaNames; // this will return empty list, user should assume either no taxa in group, or name isn't valid
}
StringBuilder querySB = new StringBuilder();
querySB.append("select line_name,genotypes.genoid from genotypes ");
querySB.append("JOIN taxa_groups_genoid on taxa_groups_genoid.genoid = genotypes.genoid ");
querySB.append("AND taxa_groups_genoid.taxa_grp_id=");
querySB.append(grpID);
myLogger.info("getTaxaForTaxaGroup query: " + querySB.toString());
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
while(rs.next()) {
taxaNames.add(rs.getString("line_name"));
}
} catch (SQLException sqle) {
throw new IllegalStateException("PHGdbAccess:getTaxaForTaxaGroup: failed to get taxa names from taxa_groups_genoid table");
}
return taxaNames;
}
@Override
public List<String> getAllTaxaGroupNames() {
List<String> taxaNames = new ArrayList<>();
String query = "select taxa_grp_name from taxa_groups";
myLogger.info("getAllTaxaGroupNames query: " + query);
try(ResultSet rs = connection.createStatement().executeQuery(query)) {
while(rs.next()) {
taxaNames.add(rs.getString(1));
}
} catch (SQLException sqle) {
throw new IllegalStateException("PHGdbAccess:getAllTaxaGroupNames: failed to get group names from taxa_groups table");
}
return taxaNames;
}
// Begin PUT methods
@Override
public int putMethod(String name, DBLoadingUtils.MethodType type, Map<String, String> descriptionMap) {
// Load anchor method
Integer id = methodMethodIDMap.get(name);
int methodType = type.value;
if(id==null) {
try {
connection.setAutoCommit(true);
String description = DBLoadingUtils.formatMethodParamsToJSON(descriptionMap);
String query = "insert into methods (name, method_type, description) values ('" + name + "', '" + methodType + "','" + description + "')";
connection.createStatement().executeUpdate(query);
} catch (SQLException exc) {
throw new IllegalStateException("PHGdbAccess:putMethod: failed to add method " + name + " " + exc.getMessage());
}
myLogger.info("PHGdbAccess:putMethod: added method " + name + " to methods table");
loadMethodHash();
id = methodMethodIDMap.get(name);
}
return id;
}
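// Usage sketch: the description map is serialized to JSON and stored in methods.description.
// The method name, parameter values, and MethodType constant here are hypothetical:
//
//   Map<String, String> params = new HashMap<>();
//   params.put("notes", "aligned with mummer4");
//   int methodId = putMethod("mummer4_assemblies", DBLoadingUtils.MethodType.ASSEMBLY_HAPLOTYPES, params);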
private int putGameteGroup(String gameteListSha1) {
if (listHashGameteGrpIDMap != null && listHashGameteGrpIDMap.get(gameteListSha1) != null)
return listHashGameteGrpIDMap.get(gameteListSha1);
try {
String query = "INSERT into gamete_groups (list_hash) values ('" + gameteListSha1 + "')";
connection.createStatement().executeUpdate(query);
} catch (SQLException sqle) {
throw new IllegalStateException("Failed to insert gamete group: " + sqle.getMessage());
}
// reload the hash
loadGameteGrpHash();
return listHashGameteGrpIDMap.get(gameteListSha1);
}
private boolean putGameteGroupList(List<String> hashCodes) {
// load the gamete_groups table
try {
PreparedStatement gameteGrpPS=connection.prepareStatement(
"INSERT into gamete_groups (list_hash)" + " values(?)");
connection.setAutoCommit(false);
int totalCount=0;
int batchCount = 0;
// Should be no more than a few hundred at max - not expecting a need for batching
for (String hash : hashCodes) {
gameteGrpPS.setString(1, hash);
gameteGrpPS.addBatch();
batchCount++;
totalCount++;
if (batchCount > 10000) {
gameteGrpPS.executeBatch();
// connection.commit();
batchCount=0;
}
}
// last batch
gameteGrpPS.executeBatch();
myLogger.info("putGameteGroupList: total loaded to gamete_groups: " + totalCount);
} catch (Exception exc) {
throw new IllegalStateException("error loading gamete_groups table: " + exc.getMessage());
}
return true;
}
private void putGameteHaplotypesFromList(Map<String, List<Integer>> hashcodeGameteidMap) {
// load to gamete_haplotype table in batches
try {
PreparedStatement gameteHaplotypesPS=connection.prepareStatement(
"INSERT into gamete_haplotypes (gameteid, gamete_grp_id)" + " values(?,?)");
connection.setAutoCommit(false);
int totalCount=0;
int batchCount = 0;
// Should be no more than a few hundred at max - not expecting a need for batching
for (String hashCode : hashcodeGameteidMap.keySet()) {
int gameteGroupId = listHashGameteGrpIDMap.get(hashCode);
List<Integer> gameteids = hashcodeGameteidMap.get(hashCode);
for (int gameteid : gameteids) {
gameteHaplotypesPS.setInt(1, gameteid);
gameteHaplotypesPS.setInt(2, gameteGroupId);
gameteHaplotypesPS.addBatch();
batchCount++;
totalCount++;
if (batchCount > 10000) {
gameteHaplotypesPS.executeBatch();
connection.commit();
batchCount=0;
}
}
}
// connection.commit();
gameteHaplotypesPS.executeBatch();
connection.setAutoCommit(true);
myLogger.info("putGameteHaplotypesFromList: committed " + totalCount + " to gamete_haplotypes table");
} catch (Exception exc) {
throw new IllegalStateException("putGameteHaplotypesFromList: error adding to gamete_haplotypes " + exc.getMessage());
}
}
@Override
public boolean putGameteGroupAndHaplotypes(List<String> gametes) {
// Each name on the list should be of the form taxaName_hapNumber
Map<String, Integer> taxaToHapidMap = getHapidMapFromLinenameHapNumber();
List<Integer> gameteIds = new ArrayList<>();
gametes.stream().forEach(gamete -> {
Integer gameteid = taxaToHapidMap.get(gamete);
if (gameteid == null) {
myLogger.error("Gameteid not found for taxa name, hapnumber : " + gamete);
throw new IllegalArgumentException("Gameteid not found for " + gamete);
}
gameteIds.add(gameteid);
});
Collections.sort(gameteIds);
String gameteListSha1 =
AnchorDataPHG.getChecksumForString(gameteIds.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5");
// if group already exists, we're done
if (listHashGameteGrpIDMap != null && listHashGameteGrpIDMap.get(gameteListSha1) != null) return true;
// load the gamete_groups table
int gameteGroupId = putGameteGroup(gameteListSha1);
// get the group id (hash was reloaded in putGameteGroup() call)
//int gameteGroupId = listHashGameteGrpIDMap.get(gameteListSha1);
// load the gamete_haplotypes table
try {
PreparedStatement gameteHaplotypesPS=connection.prepareStatement(
"INSERT into gamete_haplotypes (gameteid, gamete_grp_id)" + " values(?,?)");
connection.setAutoCommit(false);
// Should be no more than a few hundred at max - not expecting a need for batching
for (int gameteid : gameteIds) {
gameteHaplotypesPS.setInt(1, gameteid);
gameteHaplotypesPS.setInt(2, gameteGroupId);
gameteHaplotypesPS.addBatch();
}
// connection.commit();
gameteHaplotypesPS.executeBatch();
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:putGameteGroupAndHaplotypes: error loading gamete_haplotypes table: " + exc.getMessage());
}
return true;
}
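// Usage sketch (taxa hypothetical): each entry must already exist in the gametes table
// (loaded via putGenoAndHaploTypeData) and is named lineName_hapNumber:
//
//   putGameteGroupAndHaplotypes(Arrays.asList("B73_0", "CML247_0"));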
@Override
public boolean putAllAnchors(List<AnchorDataPHG> adata, int refGroupMethodID) {
// Add all reference ranges to the reference_ranges table.
myLogger.info("putAllAnchors: " + "size of adata list:" + adata.size());
int batchCount=0, totalCount=0;
// This list should be empty when initially populating the DB with anchor-focus regions
// When calling this for the non-focus regions, it should contain data for
// all focus ranges.
List<Integer> refRangeExistingIDsList = new ArrayList<>();
refRangeRefRangeIDMap.inverse().keySet().stream().forEach(refRangeID -> {
refRangeExistingIDsList.add(refRangeID);
});
if (!refRangeExistingIDsList.isEmpty()) {
Collections.sort(refRangeExistingIDsList);
}
try {
connection.setAutoCommit(false);
for (AnchorDataPHG data : adata) {
// create anchor name as chrom:startpos
// want this so can have refRangeRefRangeIDMap map for quicker access
String chrom = data.intervalCoordinates().upperEndpoint().getChromosome().getName();
String anchorName = chrom +
":" + Integer.toString(data.intervalCoordinates().lowerEndpoint().getPosition());
if(refRangeRefRangeIDMap != null && refRangeRefRangeIDMap.containsKey(anchorName)){
myLogger.info("putAllAnchors: duplicate range : " + anchorName);
continue; //it is already in the DB skip
}
referenceRangesInsertPS.setString(1, chrom);
referenceRangesInsertPS.setInt(2, data.intervalCoordinates().lowerEndpoint().getPosition());
referenceRangesInsertPS.setInt(3, data.intervalCoordinates().upperEndpoint().getPosition());
referenceRangesInsertPS.addBatch();
batchCount++;
totalCount++;
if(batchCount>10000) {
myLogger.info("referenceRangesInsertPS.executeBatch() "+totalCount);
referenceRangesInsertPS.executeBatch();
//connection.commit();
batchCount=0;
}
}
referenceRangesInsertPS.executeBatch();
connection.setAutoCommit(true);
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("PHGdbAccess:putAllAnchors: error adding to reference_ranges table ");
}
if(totalCount>0) {
myLogger.info("putAllAnchors - total count loaded : " + totalCount);
loadRefRangeHash(); // this loads reference range id map
}
ResultSet rs;
try {
rs = connection.createStatement().executeQuery("select count(*) from reference_ranges");
int size=0;
if (rs.next()) size = rs.getInt(1);
myLogger.info("after putAllAnchors size of all ranges in reference ranges table=" + size
+ ", size of refRangeRefRangeIDMap=" + refRangeRefRangeIDMap.keySet().size());
} catch (SQLException exc) {
exc.printStackTrace();
return false;
}
// reference range data has been loaded.
return true;
}
@Override
public void putRefRangeRefRangeMethod(int method_id, List<Integer> refRangeIDList) {
int batchCount = 0;
int totalCount = 0;
try {
connection.setAutoCommit(false);
for (int refRangeID : refRangeIDList) {
//TODO: do we need to check if this is already in the table?
refRangeRefRangeMethodInsertPS.setInt(1, method_id);
refRangeRefRangeMethodInsertPS.setInt(2, refRangeID);
refRangeRefRangeMethodInsertPS.addBatch();
batchCount++;
totalCount++;
if(batchCount>10000) {
myLogger.info("refRangeRefRangeMethodInsertPS.executeBatch() "+totalCount);
refRangeRefRangeMethodInsertPS.executeBatch();
batchCount=0;
}
}
refRangeRefRangeMethodInsertPS.executeBatch();
connection.setAutoCommit(true);
} catch (SQLException exc) {
throw new IllegalStateException("PHGdbAccess:putRefRangeRefRangeMethod: error adding to ref_range_ref_range_method table " + exc.getMessage());
}
if(totalCount>0) {
myLogger.info("putRefRangeRefRangeMethod: method_id " + method_id + ", total count loaded : " + totalCount);
}
}
private int addTaxontoGenotypes(String taxonName) {
// Add data to genotypes. This method adds just the name. All other fields
// are defaulted (ploidy=1, everything else is false)
// Load genotype table first
Integer id = genoGenoIDMap.get(taxonName);
if(id==null) {
try {
String query = "insert into genotypes (ploidy, is_reference, line_name, line_data, isPhasedAcrossGenes, isPhasedAcrossChromosomes) values ("
+ 1 + ", '" + false + "', '" + taxonName + "', 'none','" + false + "', '" + false + "')";
myLogger.info("addTaxontoGenotypes query: " + query);
connection.setAutoCommit(true);
connection.createStatement().executeUpdate(query);
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("PHGdbAccess:addTaxontoGenotypes: cannot add entry for taxonname:" + taxonName + " error: " + se.getMessage());
}
loadGenotypeHash();
}
return genoGenoIDMap.get(taxonName); // should be loaded from above
}
@Override
public boolean putGenoAndHaploTypeData(GenoHaploData ghData) {
// Add data to both genotypes and gametes tables
// Load genotype table first
Integer id = genoGenoIDMap.get(ghData.line_name());
if(id==null) {
int isRef = ghData.is_reference() ? 1 : 0;
int phasedGenes = ghData.isPhasedAcrossGenes() ? 1 : 0;
int phasedChroms = ghData.isPhasedAcrossChromosomes() ? 1 : 0;
try {
String query = "insert into genotypes (ploidy, is_reference, line_name, line_data, isPhasedAcrossGenes, isPhasedAcrossChromosomes) values ('"
+ ghData.ploidy() + "', '" + isRef + "', '" + ghData.line_name() + "', '" + ghData.line_data() + "', '" + phasedGenes
+ "', '" + phasedChroms + "')";
connection.setAutoCommit(true);
connection.createStatement().executeUpdate(query);
} catch (SQLException exc) {
exc.printStackTrace();
throw new IllegalStateException("PHGdbAccess:putGenoAndHaploTypeData: failed to add genotypes table data for line " + ghData.line_name() + ", error: " + exc.getMessage());
}
loadGenotypeHash();
}
id = genoGenoIDMap.get(ghData.line_name()); // should be loaded from above
// load gamete entry if it doesn't exist. Postgres does not have INSERT OR IGNORE
String name = ghData.line_name() + "_" + ghData.hapNumber();
Integer gameteid = gameteGameteIDMap.get(name);
if (gameteid == null) {
myLogger.info("putGenoAndHaploTypeData: adding gamete for genoid " + id + ", hapNumber " + ghData.hapNumber());
try {
String query = "insert into gametes (genoid, hapNumber, phasingConfidence) values ('"
+ id + "', '" + ghData.hapNumber() + "', '" + ghData.phasingConfidence() + "')";
connection.createStatement().executeUpdate(query);
} catch (SQLException exc) {
throw new IllegalStateException("PHGdbAccess:putGenoAndHaploTypeData: failed! " + exc.getMessage());
}
loadGameteGameteIDHash(); // load bimap of lineName_HapNumber/gameteId
}
return true;
}
// Reference data gets unique call because there is only 1 set of "reference_ranges".
// This needs to be called directly after loading the reference_ranges.
// Both are called from LoadAllIntervalsToPHGdbPlugin or LoadGenomeIntervalsToPHGdbPlugin
@Override
public boolean putRefAnchorData(String line_name, int hapnumber, List<AnchorDataPHG> anchorData, int hapMethod,
Set<String> refGrpMethods, String gvcf, String variant_list, int genomeFileId, int gvcfFileId) {
// Need to get the ref_range_id from the db, match the anchor chrom/start to the
// sequence data.
// 1. Load the sequence data, method_id and gamete group id to the haplotypes table.
// 2. find method_id from method_name parameter, get the list of ref_range_ids associated with that
// method. These are associated via chrom/range_start with the anchorData that was passed in.
// Create map - chrom and start is enough to identify the
// reference_range. Grab ref_range_ids based on method_name/ref_range_groups id
Map<Position, Integer> chromgenome_interval_idData = new HashMap<>();
List<Integer> methodIds = new ArrayList<>();
// use methodId to get ref_range_group_id, from that get the list of reference range ids for the intervals
for (String methodName : refGrpMethods) {
int refGrpMethodID = getMethodIdFromName(methodName);
if (refGrpMethodID == 0) {
throw new IllegalArgumentException("PHGdbAccess:putRefAnchorData -no method in methods table for method_name " + methodName);
}
methodIds.add(refGrpMethodID);
}
int anchorDBTotal = 0;
int anchorMatchedTotal = 0;
// Get list of reference_ranges associated with this method
List<Integer> refRangeIdList = new ArrayList<>();
StringBuilder querySB = new StringBuilder();
querySB.append("select ref_range_id from ref_range_ref_range_method where method_id IN (");
String ids = methodIds.stream().map(id -> Integer.toString(id)).collect(Collectors.joining(","));
querySB.append(ids);
querySB.append(")");
try {
connection.setAutoCommit(false);
ResultSet rs = connection.createStatement().executeQuery(querySB.toString());
while(rs.next()) {
refRangeIdList.add(rs.getInt("ref_range_id"));
}
// We have a list of reference ranges. Grab the chrom and start site for each from refRangeRefRangeIDMap
for (int refRangeId : refRangeIdList) {
String rangeChromStart = refRangeRefRangeIDMap.inverse().get(refRangeId);
if (rangeChromStart == null) {
// all values SHOULD be on this hash map
throw new IllegalStateException("PHGdbAccess:putRefAnchorData: could not get data for refRangeid " + refRangeId);
}
int firstColon = rangeChromStart.indexOf(":");
String chrom = rangeChromStart.substring(0, firstColon);
int range_start = Integer.parseInt(rangeChromStart.substring(firstColon+1));
Position rangePos= Position.of(Chromosome.instance(chrom),range_start);
chromgenome_interval_idData.put(rangePos, refRangeId);
}
anchorDBTotal = chromgenome_interval_idData.size();
myLogger.info("PHGdbAccess: putRefAnchorData: size of chromgenome_interval_idData: " + anchorDBTotal);
// match the chromgenome_interval_idData to sequence from adata
// Map of ref_range_id -> AnchorDataPHG
Map<Integer, AnchorDataPHG> refRangeRefRangeDataMap = new HashMap<>();
for (AnchorDataPHG adata : anchorData) {
Position startPos = adata.intervalCoordinates().lowerEndpoint();
Integer refRangeId = chromgenome_interval_idData.get(startPos);
if (refRangeId == null) {
throw new IllegalStateException("PHGdbAccess:putRefAnchorData: could not get refRangeId for position " + startPos);
}
refRangeRefRangeDataMap.put(refRangeId, adata);
}
anchorMatchedTotal = refRangeRefRangeDataMap.size();
myLogger.info("putRefAnchorData: anchorMatchedTotal: " + anchorMatchedTotal + " number input: "
+ anchorData.size());
List<String> taxaList = new ArrayList<>();
taxaList.add(line_name + "_" + hapnumber);
int gamete_grp_id = getGameteGroupIDFromTaxaList(taxaList);
putHaplotypesForGamete(gamete_grp_id, hapMethod, refRangeRefRangeDataMap, genomeFileId, gvcfFileId);
} catch (Exception exc) {
myLogger.debug(exc.getMessage(),exc);
throw new IllegalStateException("putRefAnchorData: failed getting reference range list for method " + hapMethod
+ ", " +exc.getMessage());
}
return true;
}
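// Illustration of the "chrom:start" composite key convention used by refRangeRefRangeIDMap above.
// This helper is a hedged sketch, not part of the original class; it simply mirrors the
// parsing done inside putRefAnchorData so the key format is documented in one place.
private static Position chromStartKeyToPosition(String rangeChromStart) {
// key format is "<chromosome>:<range_start>", e.g. "1:1024"
int firstColon = rangeChromStart.indexOf(":");
String chrom = rangeChromStart.substring(0, firstColon);
int rangeStart = Integer.parseInt(rangeChromStart.substring(firstColon + 1));
return Position.of(Chromosome.instance(chrom), rangeStart);
}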
@Override
public void putHaplotypesForMultipleGroups(Multimap<String, Tuple<AnchorDataPHG, String>> mapWithGroupHash, int method_id) {
// This method is used for loading consensus sequence data. The gamete group id for each
// consensus must be determined from the hash code passed in tuple.y.
//
// Using -1 as the gvcf file id as we don't have gvcf files for consensus
myLogger.info("Begin putHaplotypesForMultipleGroups, number interval sequences to load: " + mapWithGroupHash.keySet().size());
int totalAnchorsProcessed = 0;
String chrom = "none";
int debuggenome_interval_id = 0;
int debugStartPos = 0;
String debugHash = "0";
int debugSeqLen = 0;
// for each item in the anchorSequences map, add data to haplotypes map
try {
connection.setAutoCommit(false);
int batchCount=0, totalCount=0;
// load the haplotypes table in batches:
for (Collection<Tuple<AnchorDataPHG, String>> dataSet : mapWithGroupHash.asMap().values()) {
for (Tuple<AnchorDataPHG, String> data : dataSet) {
totalAnchorsProcessed++;
AnchorDataPHG aData = data.getX();
chrom = aData.intervalCoordinates().lowerEndpoint().getChromosome().getName();
int startPos = aData.intervalCoordinates().lowerEndpoint().getPosition();
String genomeName = chrom + ":" + startPos;
Integer ref_range_id = refRangeRefRangeIDMap.get(genomeName);
if (ref_range_id == null) {
myLogger.info("ref_range_id null for " + genomeName);
continue; // skip this one, keep processing others in file
//throw new IllegalStateException("putConsensusSequences: error processing anchorName " + anchorName);
}
debugSeqLen = aData.seqLen();
debugHash = aData.seqHash();
debuggenome_interval_id = ref_range_id; // for debugging!
debugStartPos = startPos;
int gamete_grp_id = listHashGameteGrpIDMap.get(data.getY());
String asmChrom = aData.asmContig(); // will be same as ref as alignment is per-chrom
int asmStart = aData.asmStart();
int asmEnd = aData.asmEnd();
haplotypesInsertPS.setInt(1, gamete_grp_id);
haplotypesInsertPS.setInt(2, ref_range_id);
haplotypesInsertPS.setInt(3, aData.asmFileId());
haplotypesInsertPS.setString(4, asmChrom);
haplotypesInsertPS.setInt(5, asmStart);
haplotypesInsertPS.setInt(6, asmEnd);
haplotypesInsertPS.setString(7, aData.asmStrand());
haplotypesInsertPS.setBytes(8, compress(aData.sequence()));
haplotypesInsertPS.setInt(9, aData.seqLen());
haplotypesInsertPS.setString(10, aData.seqHash());
haplotypesInsertPS.setInt(11, method_id);
haplotypesInsertPS.setInt(12, aData.gvcfFileId()); // this is populated when clusteringMode=CLUSTERING_MODE.upgma_assembly
haplotypesInsertPS.addBatch();
batchCount++;
totalCount++;
if(batchCount>10000) {
myLogger.info("haplotypeInsertPS.executeBatch() "+totalCount);
haplotypesInsertPS.executeBatch();
batchCount=0;
}
}
}
haplotypesInsertPS.executeBatch();
connection.setAutoCommit(true);
myLogger.info("putHaplotypesForMultipleGroups - total count loaded to haplotypes table: " + totalCount);
} catch (SQLException se) {
// This helps debug when queries have a problem
String firstMessage = se.getMessage(); // capture before walking the exception chain
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
myLogger.error("genome_interval_id: " + debuggenome_interval_id + ", chrom: " + chrom + ", chromStart: "
+ debugStartPos + ", debugSeqLen: " + debugSeqLen + ", debugHash: " + debugHash);
// se is null once the loop exits, so use the message captured above
throw new IllegalStateException("PHGdbAccess:putHaplotypesForMultipleGroups: failed " + firstMessage);
}
}
@Override
public void putHaplotypesForGamete(int gamete_grp_id, int method_id, Map<Integer, AnchorDataPHG> anchorSequences,
int genomeFileId, int gvcfFileId) {
// This method is used when loading reference intervals or data for a single gamete
myLogger.info("Begin putHaplotypesForGamete, number anchorSequences to load: " + anchorSequences.keySet().size());
// for each item in the anchorSequences map, add data to haplotypes map
try {
connection.setAutoCommit(false);
int batchCount=0, totalCount=0;
myLogger.info("putHaplotypes: starting to commit haplotypes" );
// load the haplotypes table in batches:
for (Map.Entry<Integer, AnchorDataPHG> entry : anchorSequences.entrySet()) {
AnchorDataPHG aData = entry.getValue();
int ref_range_id = entry.getKey();
String asmChrom = aData.asmContig();
int asmStart = aData.asmStart();
int asmEnd = aData.asmEnd();
haplotypesInsertPS.setInt(1, gamete_grp_id);
haplotypesInsertPS.setInt(2, ref_range_id);
haplotypesInsertPS.setInt(3, genomeFileId);
haplotypesInsertPS.setString(4, asmChrom);
haplotypesInsertPS.setInt(5, asmStart);
haplotypesInsertPS.setInt(6, asmEnd);
haplotypesInsertPS.setString(7, aData.asmStrand());
haplotypesInsertPS.setBytes(8, compress(aData.sequence()));
haplotypesInsertPS.setInt(9, aData.seqLen());
haplotypesInsertPS.setString(10, aData.seqHash());
haplotypesInsertPS.setInt(11, method_id);
// This is the gvcfId, was previously the variants_list stuff
haplotypesInsertPS.setInt(12, gvcfFileId);
haplotypesInsertPS.addBatch();
batchCount++;
totalCount++;
if(batchCount>10000) {
myLogger.info("haplotypeInsertPS.executeBatch() "+totalCount);
haplotypesInsertPS.executeBatch();
batchCount=0;
}
}
haplotypesInsertPS.executeBatch();
connection.setAutoCommit(true);
myLogger.info("putHaplotypesForGamete - total count loaded to haplotypes table: " + totalCount);
} catch (SQLException se) {
// This helps debug when queries have a problem
String firstMessage = se.getMessage(); // capture before walking the exception chain
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("PHGdbAccess:putHaplotypesForGamete: failed " + firstMessage);
}
}
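// The load above follows the batch-insert idiom used throughout this class: stage rows with
// addBatch(), flush with executeBatch() every ~10000 rows, and run with autocommit off.
// Below is a minimal, self-contained sketch of that same idiom; the table "example_table"
// and its two integer columns are hypothetical, not part of the PHG schema.
private void batchInsertSketch(List<int[]> rows, int batchSize) throws SQLException {
connection.setAutoCommit(false);
try (PreparedStatement ps = connection.prepareStatement("insert into example_table (col_a, col_b) values (?,?)")) {
int batchCount = 0;
for (int[] row : rows) {
ps.setInt(1, row[0]);
ps.setInt(2, row[1]);
ps.addBatch();
if (++batchCount >= batchSize) { // flush a full batch
ps.executeBatch();
batchCount = 0;
}
}
ps.executeBatch(); // flush the remainder
}
connection.setAutoCommit(true);
}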
@Override
public void putHaplotypesData(int gamete_grp_id, int method, Map<Integer, AnchorDataPHG> anchorSequences, String chromosome, int genomeFileId, int gvcfFileId) {
putHaplotypesData(gamete_grp_id, method, anchorSequences, chromosome, genomeFileId, gvcfFileId, 10000);
}
@Override
public void putHaplotypesData(int gamete_grp_id, int method, Map<Integer, AnchorDataPHG> anchorSequences, String chromosome, int genomeFileId, int gvcfFileId, int maxEntries) {
long time = System.nanoTime();
Set<Integer> mapIdList = anchorSequences.keySet();
// Handles the memory issues occurring when users create a massive number of reference ranges,
// basically populating the entire genome into the PHG.
if (mapIdList.size() <= maxEntries) {
processHaplotypesData(gamete_grp_id, method, anchorSequences, chromosome, genomeFileId, gvcfFileId);
myLogger.info("putHaplotypesData: Finished batch, total processed = " + mapIdList.size());
} else {
int mapCount = 0;
Map<Integer, AnchorDataPHG> tempAnchorSequences = new HashMap<>();
int totalProcessed = 0;
for (int key : mapIdList) {
tempAnchorSequences.put(key,anchorSequences.get(key));
mapCount++;
totalProcessed++;
if (mapCount == maxEntries) {
processHaplotypesData(gamete_grp_id, method, tempAnchorSequences, chromosome, genomeFileId, gvcfFileId);
mapCount = 0;
tempAnchorSequences.clear();
myLogger.info("putHaplotypesData: Finished batch, total processed = " + totalProcessed);
}
}
if (tempAnchorSequences.size() > 0) {
processHaplotypesData(gamete_grp_id, method, tempAnchorSequences, chromosome, genomeFileId, gvcfFileId);
myLogger.info("putHaplotypesData: Finished batch, total processed = " + totalProcessed);
}
}
}
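// Hedged generic sketch of the chunking done above: split a large map into sub-maps of at
// most maxEntries entries so each call to processHaplotypesData stays within memory limits.
// This generic helper is for illustration only and is not called by the original code.
private static <K, V> List<Map<K, V>> chunkMap(Map<K, V> source, int maxEntries) {
List<Map<K, V>> chunks = new ArrayList<>();
Map<K, V> current = new HashMap<>();
for (Map.Entry<K, V> entry : source.entrySet()) {
current.put(entry.getKey(), entry.getValue());
if (current.size() == maxEntries) {
chunks.add(current);
current = new HashMap<>();
}
}
if (!current.isEmpty()) {
chunks.add(current); // partial final chunk
}
return chunks;
}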
// This method may no longer be needed - we can call putHaplotypesForGamete directly.
// It was created previously to 1. store the alleles data, 2. store the variants data, 3. store the haplotypes.
// Now it only does step 3.
public void processHaplotypesData(int gamete_grp_id, int method, Map<Integer, AnchorDataPHG> anchorSequences, String chromosome, int genomeFileId, int gvcfFileId) {
// anchorSequences is a map of ref_range_id -> AnchorDataPHG
long time = System.nanoTime();
myLogger.info("processHaplotypesData calling putHaplotypesForGamete");
putHaplotypesForGamete( gamete_grp_id, method, anchorSequences, genomeFileId, gvcfFileId);
myLogger.info("PHGdbAccess:processHaplotypesData: time to load haplotypes : " + (System.nanoTime()-time)/1e9 + " seconds");
}
@Override
public void putConsensusSequences(Multimap<String, Tuple<AnchorDataPHG, List<String>>> consensusMap, int method_id) {
Long time = System.nanoTime();
//loadGameteGrpHash(); // Should be loaded prior to calling this method
// loadGameteGameteIDHash(); will be loaded prior to calling this method
// This map contains the hashcode for the group, and a list of gameteids for it.
// It will be used to load the gamete_haplotypes and gamete_groups tables
Map<String, List<Integer>> hashToGameteList = new HashMap<>();
// The string on this map is the grouphash. The group hash is needed for finding
// the gamete_grp_id for each entry when loading to the haplotypes table.
Multimap<String, Tuple<AnchorDataPHG, String>> mapWithGroupHash = HashMultimap.create();
consensusMap.asMap().entrySet().stream().forEach(entry -> {
Collection<Tuple<AnchorDataPHG, List<String>>> data = entry.getValue();
for (Tuple<AnchorDataPHG, List<String>> consensusItem : data) {
// turn the List<String> of taxa into a hash of the corresponding gamete ids
// First get the gamete ids for each taxon string
List<Integer> gameteIds = getGameteIdsFromTaxonStrings(consensusItem.y);
String gameteListHash =
AnchorDataPHG.getChecksumForString(gameteIds.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5");
mapWithGroupHash.put(entry.getKey(), new Tuple<>(consensusItem.x, gameteListHash));
// Create list to load to consensus group tables.
if (listHashGameteGrpIDMap == null || listHashGameteGrpIDMap.get(gameteListHash) == null){
// not yet stored in gamete_groups or gamete_haplotypes, add to list
// Idea is to store all entries to gamete_groups and gamete_haplotypes in batches for efficiency
hashToGameteList.put(gameteListHash, gameteIds);
}
}
});
// 1. Add to gamete_groups and gamete_haplotypes data from hashToGameteList,
// update gameteGroup hash
putGameteGroupList(hashToGameteList.keySet().stream()
.collect(Collectors.toList()));
loadGameteGrpHash();
// 2. add to gamete_haplotypes table
putGameteHaplotypesFromList(hashToGameteList);
// 3. Add to haplotypes from mapWithGroupHash. Will need to get grpid from
// the listHashGameteGrpIDMap
putHaplotypesForMultipleGroups(mapWithGroupHash, method_id);
myLogger.info("Finished processing putConsensusSequences in " + (System.nanoTime() - time)/1e9 + " seconds");
}
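// The group identity convention above: a gamete group is keyed by the MD5 of its comma-joined
// gamete id list (putHalotypeListData uses the same convention for sorted hapids). A hedged
// sketch of that digest using only java.security; the real AnchorDataPHG.getChecksumForString
// may differ in detail (e.g. hex casing).
private static String md5OfIdList(List<Integer> ids) throws java.security.NoSuchAlgorithmException {
String joined = ids.stream().map(Object::toString).collect(Collectors.joining(","));
java.security.MessageDigest md = java.security.MessageDigest.getInstance("MD5");
byte[] digest = md.digest(joined.getBytes(java.nio.charset.StandardCharsets.UTF_8));
StringBuilder sb = new StringBuilder();
for (byte b : digest) {
sb.append(String.format("%02x", b)); // lower-case hex encoding
}
return sb.toString();
}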
@Override
public void putHaplotypeCountsData(String method, Map<String, String> methodDetails, String taxonName, String fastqFile,
byte[] counts) {
// MethodType is temporarily renamed "READ_MAPPING" to facilitate compilation.
// The "HAPLOTYPE_COUNT" option in the enum MethodType has been replaced by "READ_MAPPING".
// This entire method is slated for removal as soon as the plugins using it are removed.
// No need to check for test method type as method will be removed
int methodId = putMethod( method, DBLoadingUtils.MethodType.READ_MAPPING, methodDetails);
if (methodId < 1) {
throw new IllegalStateException("PHGdbAccess:putHaplotypeCountsData: could not store method in db, method= " + method);
}
try {
int genoid = addTaxontoGenotypes(taxonName);
myLogger.info("adding to haplotype counts:" );
haplotypesCountInsertPS.setInt(1, genoid);
haplotypesCountInsertPS.setInt(2, methodId);
haplotypesCountInsertPS.setString(3, fastqFile);
haplotypesCountInsertPS.setBytes(4, counts);
haplotypesCountInsertPS.executeUpdate();
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("PHGdbAccess:putHaplotypeCountsData: failed to add haplotype_counts for taxon " + taxonName );
}
myLogger.info("Finished processing putHaplotypeCountsData");
}
// This method creates a paths table entry, then, with that paths table entry's ID, creates entries in the
// read_mapping_paths table for each readMappingId/pathId pair.
@Override
public int putPathsData(String method, Map<String, String> methodDetails, String taxon, List<Integer> readMappingIds, byte[] pathBytes, boolean isTestMethod) {
DBLoadingUtils.MethodType method_type = DBLoadingUtils.MethodType.PATHS;
if (isTestMethod) {
method_type = DBLoadingUtils.MethodType.TEST_PATHS;
}
int methodId = putMethod( method, method_type, methodDetails);
Integer genoid = getGenoidFromLine(taxon);
int batchCount = 0;
int totalCount = 0;
int pathID = -1;
try {
connection.setAutoCommit(false);
// Check if we found a genoid
if (genoid == null || genoid < 1) {
// need to add entry to genotypes table
myLogger.info("putPathsData: creating genotypes data for taxon " + taxon);
StringBuilder sb = new StringBuilder();
sb.append("INSERT INTO genotypes (ploidy, is_reference, line_name, line_data,isPhasedAcrossGenes,isPhasedAcrossChromosomes) ");
sb.append("values (2,0,'");
sb.append(taxon);
sb.append("','genotype for adding the path',0,0);");
myLogger.info("putPathsData: genotypes insert: " + sb.toString());
connection.createStatement().executeUpdate(sb.toString());
genoid = getGenoidFromLine(taxon);
}
// Add paths table entry
pathsInsertPS.setInt(1, genoid);
pathsInsertPS.setInt(2, methodId);
pathsInsertPS.setBytes(3, pathBytes);
int status = pathsInsertPS.executeUpdate();
if (status == 0) {
throw new SQLException ("Failed to create paths entry");
}
ResultSet generatedKeys = pathsInsertPS.getGeneratedKeys();
if (generatedKeys.next()){
pathID = (int)generatedKeys.getLong(1);
//myLogger.info("found paths generated key: " + pathID);
}
// add entry to read_mapping_paths table if readMappingIds aren't null
// These will be NULL if we are creating paths for initial assembly or
// WGS loads.
if (readMappingIds != null) {
for (Integer readMappingId : readMappingIds) {
readMappingPathsPS.setInt(1, readMappingId);
readMappingPathsPS.setInt(2, pathID);
readMappingPathsPS.addBatch();
batchCount++;
totalCount++;
if(batchCount>10000) {
myLogger.info("readMappingPathsPS.executeBatch() "+totalCount);
readMappingPathsPS.executeBatch();
batchCount=0;
}
}
readMappingPathsPS.executeBatch();
myLogger.info("putPathsData - total count loaded to read_mapping_paths table: " + totalCount);
}
connection.setAutoCommit(true);
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("PHGdbAccess:putPathsData: SQLException: failed when adding paths for method: " + method + ", taxon: " + taxon);
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:putPathsData: General Exception: failed when adding paths for method: " + method + ", taxon: " + taxon + ", error: " + exc.getMessage());
}
return pathID;
}
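// Hedged sketch of the generated-key idiom used above (and in putReadMappingData): insert
// with Statement.RETURN_GENERATED_KEYS, then read the new row id back from getGeneratedKeys().
// The table and column names here are hypothetical, not part of the PHG schema.
private int insertAndGetIdSketch(String name) throws SQLException {
try (PreparedStatement ps = connection.prepareStatement(
"insert into example_table (name) values (?)", Statement.RETURN_GENERATED_KEYS)) {
ps.setString(1, name);
if (ps.executeUpdate() == 0) {
throw new SQLException("insert affected no rows");
}
try (ResultSet keys = ps.getGeneratedKeys()) {
return keys.next() ? (int) keys.getLong(1) : -1; // -1 if no key was generated
}
}
}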
@Override
public int putReadMappingData(String method, Map<String, String> methodDetails, String taxon, String file_group_name, byte[] mapping_data, boolean isTestMethod, int haplotypeListId) {
// NOTE: this method does not call loadReadMappingHash(). The hash will initially be loaded when
// a PHGdbAccess object is created. Afterwards, the user may call updateReadMappingHash() to
// update the hash.
DBLoadingUtils.MethodType method_type = DBLoadingUtils.MethodType.READ_MAPPING;
if (isTestMethod) {
method_type = DBLoadingUtils.MethodType.TEST_READ_MAPPING;
}
int methodId = putMethod( method, method_type, methodDetails);
Integer genoid = getGenoidFromLine(taxon);
int readMappingID = -1;
try {
// Check if we found a genoid
if (genoid == null || genoid < 1) {
// need to add entry to genotypes table
myLogger.info("putReadMappingData: creating genotypes data for taxon " + taxon);
StringBuilder sb = new StringBuilder();
sb.append("INSERT INTO genotypes (ploidy, is_reference, line_name, line_data,isPhasedAcrossGenes,isPhasedAcrossChromosomes) ");
sb.append("values (2,0,'");
sb.append(taxon);
sb.append("','genotype for creating the path',0,0);");
myLogger.info("putReadMappingData: genotypes insert: " + sb.toString());
connection.createStatement().executeUpdate(sb.toString());
genoid = getGenoidFromLine(taxon);
}
connection.setAutoCommit(true);
// add the read_mapping data - used preparedStatement so can get generated keys
PreparedStatement readMappingPS=connection.prepareStatement(
"INSERT into read_mapping (genoid, method_id, file_group_name, mapping_data, haplotype_list_id)" +
" values(?,?,?,?,?)", Statement.RETURN_GENERATED_KEYS);
readMappingPS.setInt(1,genoid);
readMappingPS.setInt(2,methodId);
readMappingPS.setString(3,file_group_name);
readMappingPS.setBytes(4,mapping_data);
readMappingPS.setInt(5,haplotypeListId);
int status = readMappingPS.executeUpdate();
if (status == 0) {
throw new SQLException ("Failed to create read_mapping entry");
}
ResultSet generatedKeys = readMappingPS.getGeneratedKeys();
if (generatedKeys.next()){
readMappingID = (int)generatedKeys.getLong(1);
//myLogger.info("found read_mapping generated key: " + readMappingID);
}
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("PHGdbAccess:putReadMappingData: error loading read_mapping data");
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:putReadMappingData: failed when adding read mappings for method " + method + ", taxon " + taxon);
}
// returns the id for the read mapped data
return readMappingID;
}
@Override
public int putHalotypeListData(List<Integer> hapids) {
// The prepared statement uses INSERT OR IGNORE (SQLite) or ON CONFLICT DO NOTHING (Postgres).
// This relieves the user from worrying about loading duplicate data.
// But ... getGeneratedKeys may not return an id for ignored rows
Collections.sort(hapids);
String hashString = hapids.stream().map(String::valueOf).collect(Collectors.joining(","));
String hashMD5 = AnchorDataPHG.getChecksumForString(hashString,"MD5");
byte[] hapListBytes = encodeHapidListToByteArray(hapids);
int haplotypeListID = -1;
int updateStatus = 0;
try {
connection.setAutoCommit(true);
haplotypeListPS.setString(1,hashMD5);
haplotypeListPS.setBytes(2,hapListBytes);
// Status will be 0 if the key already exists - because nothing was generated
// But generatedKeys will return the last of the keys as the value - so we need
// to keep track of that status
updateStatus = haplotypeListPS.executeUpdate();
//System.out.println("putHaplotypeListData - after executeUpdate, status=" + updateStatus);
ResultSet generatedKeys = haplotypeListPS.getGeneratedKeys();
if (generatedKeys.next()){
haplotypeListID = (int)generatedKeys.getLong(1);
//myLogger.info("found haplotype_list generated key: " + haplotypeListID);
}
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("error loading haplotype_list data ");
} catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:putHalotypeListData: failed when adding haplotype_list data " + exc.getMessage());
}
if (updateStatus == 0) {
// this means an entry with this hash value already exists - so grab the id
haplotypeListID = getHaplotypeListIDfromHash(hashMD5);
}
myLogger.info("putHaplotypeListData - at end, haplotypeListId = " + haplotypeListID);
return haplotypeListID;
}
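// The SQLite/Postgres dialect split mentioned above, sketched: SQLite spells idempotent
// inserts "INSERT OR IGNORE", while PostgreSQL appends "ON CONFLICT DO NOTHING". The isSqlite
// flag set in the constructor would select between the two; this helper is illustrative only
// and the example insert body (table and columns) is hypothetical.
private String insertOrIgnoreSql(String insertBody) {
// insertBody is e.g. "into haplotype_list (list_hash, hapid_list) values (?,?)"
return isSqlite
? "insert or ignore " + insertBody
: "insert " + insertBody + " on conflict do nothing";
}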
@Override
public int putGenomeFileData(String genome_server_path, String genome_file, int genoid, int type) {
// "genome_file" is a full path plus file name where this genome file can be accessed for processing its checksum
File genomeFile = new File(genome_file);
String fileName = genomeFile.getName();
// Check if entry already exists for this genoid
int genomeFileID = getGenomeFileIdFromGenoidAndFile(genoid,fileName);
if (genomeFileID > 0) {
myLogger.info("putGenomeFileData - entry already exists for genoid and file: " + genoid + ", " + genome_file + ", returning genomeFileID: " + genomeFileID);
return genomeFileID;
}
// Create the file checksum, add an entry to the genome_file_data table,
// and return the newly created table id.
// Note the "genome_server_path" is the path on some server where this file
// resides for outside users. It is not necessarily the path this code needs
// to access the file.
// Originally, the sequence checksums were created with MD5 and this genome data file checksum
// was created using SHA-256. I believe we switched to SHA-256 as it was thought to be more secure.
// Because these are not passwords we're encrypting, and BrAPI uses MD5 for its Reference checksums, I
// am changing this back to use MD5.
String checksum = DBLoadingUtils.getChecksumForFile(genomeFile,"MD5");
try {
connection.setAutoCommit(true);
genomeFileDataPS.setString(1,genome_server_path);
genomeFileDataPS.setString(2,fileName);
genomeFileDataPS.setString(3,checksum);
genomeFileDataPS.setInt(4,genoid);
genomeFileDataPS.setInt(5,type);
genomeFileDataPS.executeUpdate();
ResultSet generatedKeys = genomeFileDataPS.getGeneratedKeys();
if (generatedKeys.next()){
genomeFileID = (int)generatedKeys.getLong(1);
}
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("error loading genome_file_data for file: " + genome_file);
} catch (Exception exc) {
throw new IllegalStateException("putGenomeFileData: error adding genome file " + genome_file + " for genoid " + genoid + " " + exc.getMessage());
}
return genomeFileID;
}
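// Hedged sketch of a streaming file checksum like the one assumed for
// DBLoadingUtils.getChecksumForFile: digest the file in fixed-size chunks, then hex-encode.
// The real utility's behavior may differ in detail.
private static String fileChecksumSketch(File file, String algorithm) throws Exception {
java.security.MessageDigest md = java.security.MessageDigest.getInstance(algorithm);
try (java.io.InputStream in = new java.io.FileInputStream(file)) {
byte[] buffer = new byte[8192];
int bytesRead;
while ((bytesRead = in.read(buffer)) > 0) {
md.update(buffer, 0, bytesRead); // feed the digest chunk by chunk
}
}
StringBuilder sb = new StringBuilder();
for (byte b : md.digest()) {
sb.append(String.format("%02x", b));
}
return sb.toString();
}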
@Override
public int putTaxaGroupName(String group_name) {
// Load taxa group name
Integer id = taxaGroupTaxaGroupIDMap.get(group_name);
if(id==null) {
try {
connection.setAutoCommit(true);
String query = "insert into taxa_groups (taxa_grp_name) values ('" + group_name + "')";
connection.createStatement().executeUpdate(query);
} catch (SQLException exc) {
throw new IllegalStateException("PHGdbAccess:putTaxaGroupName: failed to add taxa group " + group_name + " " + exc.getMessage());
}
myLogger.info("PHGdbAccess:putTaxaGroupName: added group name " + group_name + " to taxa_groups table");
loadTaxaGroupHash();
id = taxaGroupTaxaGroupIDMap.get(group_name);
}
return id;
}
@Override
public void putTaxaTaxaGroups(String group_name, List<String> taxaList) {
int batchCount = 0;
int totalCount = 0;
Integer id = taxaGroupTaxaGroupIDMap.get(group_name);
if (id == null) {
id = putTaxaGroupName(group_name);
}
// get genoids for taxa
List<Integer> genoids = new ArrayList<>();
for (String taxon : taxaList) {
Integer genoid = getGenoidFromLine(taxon);
if (genoid == null || genoid <= 0) {
// this taxon isn't in the db
throw new IllegalArgumentException("putTaxaTaxaGroups: taxon on list not found in the database: " + taxon);
}
genoids.add(genoid);
}
// load the db in batches
try {
connection.setAutoCommit(false);
for (int genoid : genoids) {
taxaGroupGenoidPS.setInt(1,id);
taxaGroupGenoidPS.setInt(2,genoid);
taxaGroupGenoidPS.addBatch();
batchCount++;
totalCount++;
if (batchCount> 1000 ) {
myLogger.info("taxaGroupGenoidPS.executeBatch count: " + totalCount);
taxaGroupGenoidPS.executeBatch();
batchCount = 0;
}
}
taxaGroupGenoidPS.executeBatch(); // execute remaining
connection.setAutoCommit(true);
} catch (SQLException se) {
// This helps debug when queries have a problem
int count = 1;
while (se != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + se.getErrorCode());
myLogger.error("SqlState: " + se.getSQLState());
myLogger.error("Error Message: " + se.getMessage());
se = se.getNextException();
count++;
}
throw new IllegalStateException("SQL error loading taxaGroup with genoids for group name: " + group_name);
} catch (Exception exc) {
throw new IllegalStateException("putTaxaTaxaGroups: error adding taxa groups for group " + group_name + " " + exc.getMessage());
}
if (totalCount > 0) {
myLogger.info("putTaxaTaxaGroups: group name:" + group_name + " total count added " + totalCount);
}
}
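// Hedged usage sketch: create the group (if needed) and attach taxa that already exist in
// the genotypes table. The group and taxa names below are hypothetical.
// phgDb.putTaxaTaxaGroups("nam_founders", Arrays.asList("B73", "B97"));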
@Override
public boolean putAssemblyInterAnchorSequences(String line_name, int hapNumber, String method, Multimap anchorSequences) {
return false;
}
// begin delete methods
@Override
public void deletePaths(String method, List<String> taxa) {
if (method == null && (taxa == null || taxa.size() == 0)) {
throw new IllegalArgumentException("deletePaths: both method and taxa are null/empty, one must contain data");
}
int methodId = -1;
if (method != null) {
methodId = getMethodIdFromName(method);
if (methodId < 1 ) {
// method not found.
throw new IllegalArgumentException("deletePaths: Method not found in methods table: " + method);
}
}
String genoidString = null;
if (taxa != null && taxa.size() > 0) {
List<Integer> genoids = getGenoidsForTaxa(taxa);
genoidString = genoids.stream().map(id->id.toString()).collect(Collectors.joining(","));
}
StringBuilder genoidQuery = new StringBuilder();
if (genoidString != null) {
genoidQuery.append(" genoid IN (");
genoidQuery.append(genoidString);
genoidQuery.append(")");
}
// Get the pathids that are associated with this method/taxaList.
// With these path_ids, we delete from the read_mapping_paths table, then the paths table
List<Integer> pathIds = new ArrayList<>();
StringBuilder querySB = new StringBuilder();
querySB.append("select path_id from paths ");
// Add method if one was specified
if (methodId > 0) {
querySB.append(" where method_id = ");
querySB.append(methodId);
// add taxa if they were specified
if (genoidQuery.length() > 0) {
querySB.append(" and ");
querySB.append(genoidQuery.toString());
}
} else { // only taxa was specified
querySB.append(" where ");
querySB.append(genoidQuery.toString());
}
querySB.append(";");
myLogger.info("deletePaths: get paths id query: " + querySB.toString());
try(ResultSet rs = connection.createStatement().executeQuery(querySB.toString())) {
while(rs.next()) {
int pathId = rs.getInt(1);
pathIds.add(pathId);
}
}
catch (Exception exc) {
throw new IllegalStateException("PHGdbAccess:deletePaths: error querying paths using methodID "
+ methodId + " " + exc.getMessage());
}
if (pathIds.size() == 0) {
String taxaString = (taxa == null) ? "(none)" : taxa.stream().collect(Collectors.joining(","));
myLogger.warn("No path entries found for method " + method + " and taxa: " + taxaString);
return;
}
String pathidString = pathIds.stream().map(id->id.toString()).collect(Collectors.joining(","));
// Delete from the read_mapping_paths table
querySB.setLength(0);
querySB.append("delete from read_mapping_paths where path_id IN (");
querySB.append(pathidString);
querySB.append(");");
myLogger.info("deletePaths: query statement to delete from read_Mapping_paths table: " + querySB.toString());
try {
// Result should return the number that were deleted
int result = connection.createStatement().executeUpdate(querySB.toString());
// This value can be 0, as some paths are created without read_mappings, e.g.
// for assemblies and WGS haplotypes.
myLogger.info("deletePaths: deleted " + result + " entries from read_mapping_paths for path_ids having method " + method);
} catch (SQLException sqle) {
int count = 1;
while (sqle != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + sqle.getErrorCode());
myLogger.error("SqlState: " + sqle.getSQLState());
myLogger.error("Error Message: " + sqle.getMessage());
sqle = sqle.getNextException();
count++;
}
throw new IllegalStateException("error deleting from read_mapping_paths table");
}
// Delete from the paths table
querySB.setLength(0);
querySB.append("delete from paths where path_id IN (");
querySB.append(pathidString);
querySB.append(");");
myLogger.info("deletePaths: query statement to delete from paths table: " + querySB.toString());
try {
// Result should return the number that were deleted
int result = connection.createStatement().executeUpdate(querySB.toString());
myLogger.info("deletePaths: deleted " + result + " entries from paths with method " + method);
} catch (SQLException sqle) {
int count = 1;
while (sqle != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + sqle.getErrorCode());
myLogger.error("SqlState: " + sqle.getSQLState());
myLogger.error("Error Message: " + sqle.getMessage());
sqle = sqle.getNextException();
count++;
}
throw new IllegalStateException("error deleting from paths table with method " + method);
}
}
@Override
public boolean deleteReadMappingsCascade(List<Integer> readMappingIds) {
// Must first delete from the read_mapping_paths table
// then delete from the read_mapping table
// then delete the paths associated with these read_mappings from the paths table
if (readMappingIds.size() > 0) {
List<Integer> pIds = getPathIdsForReadMappingIds(readMappingIds);
deleteReadMappingPathsById( readMappingIds); // this is the read_mapping_paths table
deleteReadMappingsById(readMappingIds); // this is the read_mapping table
if (pIds != null && pIds.size() > 0) {
deletePathsById(pIds); // delete from paths table
}
}
return true;
}
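// Hedged usage sketch: remove two read mappings and any paths derived from them.
// The ids below are hypothetical read_mapping_id values.
// phgDb.deleteReadMappingsCascade(Arrays.asList(12, 13));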
@Override
public void deleteReadMappingPathsById(List<Integer> readMappingIds) {
String rmString = readMappingIds.stream().map(id->id.toString()).collect(Collectors.joining(","));
StringBuilder querySB = new StringBuilder();
querySB.append("delete from read_mapping_paths where read_mapping_id IN (");
querySB.append(rmString);
querySB.append(");");
try {
// Result should return the number that were deleted
int result = connection.createStatement().executeUpdate(querySB.toString());
myLogger.info("deleteReadMappingPathsById: deleted " + result + " entries from read_mapping_paths based on id");
} catch (SQLException sqle) {
int count = 1;
while (sqle != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + sqle.getErrorCode());
myLogger.error("SqlState: " + sqle.getSQLState());
myLogger.error("Error Message: " + sqle.getMessage());
sqle = sqle.getNextException();
count++;
}
throw new IllegalStateException("error deleting from read_mapping_paths table");
}
}
@Override
public void deleteReadMappingsById(List<Integer> readMappingIds) {
String rmString = readMappingIds.stream().map(id->id.toString()).collect(Collectors.joining(","));
StringBuilder querySB = new StringBuilder();
querySB.append("delete from read_mapping where read_mapping_id IN (");
querySB.append(rmString);
querySB.append(");");
try {
// Result should return the number that were deleted
int result = connection.createStatement().executeUpdate(querySB.toString());
myLogger.info("deleteReadMappingsById: deleted " + result + " entries from read_mapping table based on id");
} catch (SQLException sqle) {
int count = 1;
while (sqle != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + sqle.getErrorCode());
myLogger.error("SqlState: " + sqle.getSQLState());
myLogger.error("Error Message: " + sqle.getMessage());
sqle = sqle.getNextException();
count++;
}
throw new IllegalStateException("error deleting from read_mapping table ");
}
}
public void deletePathsById(List<Integer> pathsIds) {
String pathString = pathsIds.stream().map(id->id.toString()).collect(Collectors.joining(","));
StringBuilder querySB = new StringBuilder();
querySB.append("delete from paths where path_id IN (");
querySB.append(pathString);
querySB.append(");");
try {
// Result should return the number that were deleted
int result = connection.createStatement().executeUpdate(querySB.toString());
myLogger.info("deletePathsById: deleted " + result + " entries from paths table based on id");
} catch (SQLException sqle) {
int count = 1;
while (sqle != null) {
myLogger.error("SQLException " + count);
myLogger.error("Code: " + sqle.getErrorCode());
myLogger.error("SqlState: " + sqle.getSQLState());
myLogger.error("Error Message: " + sqle.getMessage());
sqle = sqle.getNextException();
count++;
}
throw new IllegalStateException("error deleting from paths table ");
}
}
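// The three delete-by-id methods above share one idiom: join the ids into a comma-separated
// list and issue a single "DELETE ... WHERE <id column> IN (...)". A hedged generic sketch;
// the table and column names are supplied by the caller and are assumed to be trusted input.
private int deleteByIdsSketch(String table, String idColumn, List<Integer> ids) throws SQLException {
String inList = ids.stream().map(String::valueOf).collect(Collectors.joining(","));
String sql = "delete from " + table + " where " + idColumn + " IN (" + inList + ");";
return connection.createStatement().executeUpdate(sql); // number of rows deleted
}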
@Override
public int deleteMethodByName(String method) {
StringBuilder querySB = new StringBuilder();
querySB.append("delete from methods where name='");
querySB.append(method);
querySB.append("';");
myLogger.info("deleteMethodByName query: " + querySB.toString());
int result = 0;
try {
result = connection.createStatement().executeUpdate(querySB.toString());
} catch (Exception exc) {
// query threw an error
throw new IllegalArgumentException("deleteMethodByName: error deleting method " + method +
" from the methods table. Please verify this method is not still in use in other table entries. " + exc.getMessage());
}
return result;
}
@Override
public int getHaplotypeListIDfromHash(String hash) {
// This method is written for use by putHalotypeListData()
// When an entry already exists, no generated_key is returned.
// This method must then be called to obtain the key for an existing
// entry with the specified hash value.
StringBuilder querySB = new StringBuilder();
querySB.append("SELECT haplotype_list_id from haplotype_list where list_hash='");
querySB.append(hash);
querySB.append("';");
try {
int id = 0;
ResultSet rs = connection.createStatement().executeQuery(querySB.toString());
if (rs.next()) {
id = rs.getInt(1);
} else {
myLogger.warn("getHaplotypeListIDfromHash returning 0");
}
return id;
} catch (SQLException se) {
throw new IllegalArgumentException("getHaplotypeListIDfromHash - error getting ID for hash " + hash);
}
}
@Override
public void close() throws Exception {
myLogger.info("Closing DB");
connection.close();
}
}