net.maizegenetics.pangenome.api.CreateGraphUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.api;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSortedSet;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Tuple;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
import static net.maizegenetics.pangenome.api.VariantUtils.*;
/**
* @author Terry Casstevens Created August 21, 2017
*/
public class CreateGraphUtils {
private static final Logger myLogger = Logger.getLogger(CreateGraphUtils.class);
public static final String NO_CONSENSUS_METHOD = "Haplotype_caller";
private CreateGraphUtils() {
// utility class: static methods only, never instantiated
}
/**
* Creates a database connection given a properties file.
*
* @param propertiesFile properties file
*
* @return database connection, or null if the database does not exist
*/
public static Connection connection(String propertiesFile) {
//False indicates don't create db - if db doesn't exist, returns null
return DBLoadingUtils.connection(propertiesFile, false);
}
/**
* Creates a SQLite database connection.
*
* @param host hostname (currently unused: the SQLite JDBC URL is built from dbName only)
* @param user user id
* @param password password
* @param dbName database name
*
* @return SQLite database connection
*
* @throws IllegalStateException if the SQLite JDBC driver cannot be loaded
*         or the connection attempt fails
*/
public static Connection connection(String host, String user, String password, String dbName) {

    String url = "jdbc:sqlite:" + dbName;
    myLogger.info("Database URL: " + url);

    Connection connection = null;
    try {
        Class.forName("org.sqlite.JDBC");
        connection = DriverManager.getConnection(url, user, password);
    } catch (ClassNotFoundException e) {
        myLogger.error(e.getMessage(), e);
        // Chain the cause so the driver-loading failure is preserved in the stack trace.
        throw new IllegalStateException("CreateGraph: connection: org.sqlite.JDBC can't be found", e);
    } catch (SQLException e) {
        myLogger.error(e.getMessage(), e);
        // Chain the cause so the underlying SQL error is preserved in the stack trace.
        throw new IllegalStateException("CreateGraph: connection: problem connecting to database: " + e.getMessage(), e);
    }

    myLogger.info("Connected to database: " + url + "\n");
    return connection;
}
/**
* Retrieves all ReferenceRange instances
*
* @param database database connection
*
* @return map of ReferenceRanges, key is reference_ranges.ref_range_id
*/
public static Map<Integer, ReferenceRange> referenceRangeMap(Connection database) {

    if (database == null) {
        throw new IllegalArgumentException("CreateGraphUtils: referenceRangesAsMap: Must specify database connection.");
    }

    long time = System.nanoTime();

    // Reference line (assembly) name labels every ReferenceRange created below.
    String refLine = getRefLineName(database);

    // Join reference_ranges -> ref_range_ref_range_method -> methods so each range row
    // carries the name of every REF_RANGE_GROUP method attached to it. Ordering by
    // ref_range_id makes rows for the same range contiguous, so method names can be
    // aggregated per range in a single pass.
    StringBuilder querySB = new StringBuilder();
    querySB.append("select reference_ranges.ref_range_id, chrom, range_start, range_end, methods.name from reference_ranges ");
    querySB.append(" INNER JOIN ref_range_ref_range_method on ref_range_ref_range_method.ref_range_id=reference_ranges.ref_range_id ");
    querySB.append(" INNER JOIN methods on ref_range_ref_range_method.method_id = methods.method_id ");
    querySB.append(" AND methods.method_type = ");
    querySB.append(DBLoadingUtils.MethodType.REF_RANGE_GROUP.getValue());
    querySB.append(" ORDER BY reference_ranges.ref_range_id");
    String query = querySB.toString();
    myLogger.info("referenceRangesAsMap: query statement: " + query);

    ImmutableMap.Builder<Integer, ReferenceRange> builder = ImmutableMap.builder();
    try (ResultSet rs = database.createStatement().executeQuery(query)) {

        String currentChromosome = null;
        int currentStart = -1;
        int currentEnd = -1;
        int currentRefRangeId = -1; // -1 marks "no row seen yet"
        ImmutableSet.Builder<String> methodNameSet = ImmutableSet.builder();

        while (rs.next()) {

            int id = rs.getInt("ref_range_id");
            String chromosome = rs.getString("chrom");
            int start = rs.getInt("range_start");
            int end = rs.getInt("range_end");
            String methodName = rs.getString("name");

            if (currentRefRangeId == -1) {
                // First row: start accumulating the first range.
                currentRefRangeId = id;
                currentChromosome = chromosome;
                currentStart = start;
                currentEnd = end;
                methodNameSet.add(methodName);
            } else if (currentRefRangeId == id) {
                // Same range as previous row: just collect another method name.
                methodNameSet.add(methodName);
            } else {
                // Range id changed: emit the completed range and start the next one.
                builder.put(currentRefRangeId, new ReferenceRange(refLine, Chromosome.instance(currentChromosome), currentStart, currentEnd, currentRefRangeId, methodNameSet.build()));
                methodNameSet = ImmutableSet.builder();
                currentRefRangeId = id;
                currentChromosome = chromosome;
                currentStart = start;
                currentEnd = end;
                methodNameSet.add(methodName);
            }

        }

        // Emit the final range; the loop above only emits when the next id appears.
        ImmutableSet<String> methods = methodNameSet.build();
        myLogger.debug("referenceRangesAsMap: last range method count: " + methods.size());
        if (!methods.isEmpty()) {
            builder.put(currentRefRangeId, new ReferenceRange(refLine, Chromosome.instance(currentChromosome), currentStart, currentEnd, currentRefRangeId, methods));
        }

    } catch (Exception se) {
        myLogger.debug(se.getMessage(), se);
        // Chain the cause so the original stack trace is preserved.
        throw new IllegalStateException("CreateGraphUtils: referenceRanges: Problem querying the database: " + se.getMessage(), se);
    }

    Map<Integer, ReferenceRange> result = builder.build();
    myLogger.info("referenceRangesAsMap: number of reference ranges: " + result.size());
    myLogger.info("referenceRangesAsMap: time: " + ((double) (System.nanoTime() - time) / 1_000_000_000.0) + " secs.");
    return result;
}
/**
* Retrieves all ReferenceRange instances as a sorted set.
*
* @param database database connection
*
* @return ReferenceRanges
*/
public static SortedSet<ReferenceRange> referenceRanges(Connection database) {

    if (database == null) {
        throw new IllegalArgumentException("CreateGraphUtils: referenceRanges: Must specify database connection.");
    }

    long time = System.nanoTime();

    // Reference line (assembly) name labels every ReferenceRange created below.
    String refLine = getRefLineName(database);

    // Join reference_ranges -> ref_range_ref_range_method -> methods so each range row
    // carries the name of every REF_RANGE_GROUP method attached to it. Ordering by
    // ref_range_id makes rows for the same range contiguous, so method names can be
    // aggregated per range in a single pass.
    StringBuilder querySB = new StringBuilder();
    querySB.append("select reference_ranges.ref_range_id, chrom, range_start, range_end, methods.name from reference_ranges ");
    querySB.append(" INNER JOIN ref_range_ref_range_method on ref_range_ref_range_method.ref_range_id=reference_ranges.ref_range_id ");
    querySB.append(" INNER JOIN methods on ref_range_ref_range_method.method_id = methods.method_id ");
    querySB.append(" AND methods.method_type = ");
    querySB.append(DBLoadingUtils.MethodType.REF_RANGE_GROUP.getValue());
    querySB.append(" ORDER BY reference_ranges.ref_range_id");
    String query = querySB.toString();
    myLogger.info("referenceRanges: query statement: " + query);

    ImmutableSortedSet.Builder<ReferenceRange> builder = ImmutableSortedSet.naturalOrder();
    try (ResultSet rs = database.createStatement().executeQuery(query)) {

        String currentChromosome = null;
        int currentStart = -1;
        int currentEnd = -1;
        int currentRefRangeId = -1; // -1 marks "no row seen yet"
        ImmutableSet.Builder<String> methodNameSet = ImmutableSet.builder();

        while (rs.next()) {

            int id = rs.getInt("ref_range_id");
            String chromosome = rs.getString("chrom");
            int start = rs.getInt("range_start");
            int end = rs.getInt("range_end");
            String methodName = rs.getString("name");

            if (currentRefRangeId == -1) {
                // First row: start accumulating the first range.
                currentRefRangeId = id;
                currentChromosome = chromosome;
                currentStart = start;
                currentEnd = end;
                methodNameSet.add(methodName);
            } else if (currentRefRangeId == id) {
                // Same range as previous row: just collect another method name.
                methodNameSet.add(methodName);
            } else {
                // Range id changed: emit the completed range and start the next one.
                builder.add(new ReferenceRange(refLine, Chromosome.instance(currentChromosome), currentStart, currentEnd, currentRefRangeId, methodNameSet.build()));
                methodNameSet = ImmutableSet.builder();
                currentRefRangeId = id;
                currentChromosome = chromosome;
                currentStart = start;
                currentEnd = end;
                methodNameSet.add(methodName);
            }

        }

        // Emit the final range; the loop above only emits when the next id appears.
        ImmutableSet<String> methods = methodNameSet.build();
        myLogger.debug("referenceRanges: last range method count: " + methods.size());
        if (!methods.isEmpty()) {
            builder.add(new ReferenceRange(refLine, Chromosome.instance(currentChromosome), currentStart, currentEnd, currentRefRangeId, methods));
        }

    } catch (Exception se) {
        myLogger.debug(se.getMessage(), se);
        // Chain the cause so the original stack trace is preserved.
        throw new IllegalStateException("CreateGraphUtils: referenceRanges: Problem querying the database: " + se.getMessage(), se);
    }

    SortedSet<ReferenceRange> result = builder.build();
    myLogger.info("referenceRanges: number of reference ranges: " + result.size());
    myLogger.info("referenceRanges: time: " + ((double) (System.nanoTime() - time) / 1_000_000_000.0) + " secs.");
    return result;
}
/**
* Retrieves all groups of taxa.
*
* @param database database connection
*
* @return map of TaxaList, key is gamete_groups.gamete_grp_id
*/
public static Map<Integer, TaxaList> taxaListMap(Connection database) {

    if (database == null) {
        throw new IllegalArgumentException("CreateGraphUtils: taxaListMap: Must specify database connection.");
    }

    long time = System.nanoTime();

    // Rows come back ordered by gamete_grp_id, so all line names belonging to one
    // group are contiguous and can be gathered into a TaxaList in a single pass.
    StringBuilder builder = new StringBuilder();
    builder.append("SELECT gamete_haplotypes.gamete_grp_id, genotypes.line_name ");
    builder.append("FROM gamete_haplotypes ");
    builder.append("INNER JOIN gametes ON gamete_haplotypes.gameteid = gametes.gameteid ");
    builder.append("INNER JOIN genotypes on gametes.genoid = genotypes.genoid ");
    builder.append("ORDER BY gamete_haplotypes.gamete_grp_id;");
    String query = builder.toString();
    myLogger.info("taxaListMap: query statement: " + query);

    try (ResultSet rs = database.createStatement().executeQuery(query)) {

        // Cache Taxon instances so the same line name shares one object across groups.
        Map<String, Taxon> taxaCache = new HashMap<>();
        ImmutableMap.Builder<Integer, TaxaList> resultBuilder = ImmutableMap.builder();

        int currentGroupId = -1; // -1 marks "no row seen yet"
        List<Taxon> currentTaxaList = new ArrayList<>();

        if (rs.next()) {
            currentGroupId = rs.getInt("gamete_grp_id");
            currentTaxaList.add(taxon(rs.getString("line_name"), taxaCache));
        }

        while (rs.next()) {
            int groupId = rs.getInt("gamete_grp_id");
            if (groupId == currentGroupId) {
                currentTaxaList.add(taxon(rs.getString("line_name"), taxaCache));
            } else {
                // Group id changed: close out the previous group.
                TaxaListBuilder taxaBuilder = new TaxaListBuilder();
                taxaBuilder.addAll(currentTaxaList);
                resultBuilder.put(currentGroupId, taxaBuilder.build());
                currentGroupId = groupId;
                currentTaxaList = new ArrayList<>();
                currentTaxaList.add(taxon(rs.getString("line_name"), taxaCache));
            }
        }

        // Close out the final group, but only if at least one row was read.
        // Previously an empty result set produced a bogus (-1, empty TaxaList) entry.
        if (currentGroupId != -1) {
            TaxaListBuilder taxaBuilder = new TaxaListBuilder();
            taxaBuilder.addAll(currentTaxaList);
            resultBuilder.put(currentGroupId, taxaBuilder.build());
        }

        Map<Integer, TaxaList> result = resultBuilder.build();
        myLogger.info("taxaListMap: number of taxa lists: " + result.size());
        myLogger.info("taxaListMap: time: " + ((double) (System.nanoTime() - time) / 1_000_000_000.0) + " secs.");
        return result;

    } catch (Exception e) {
        myLogger.debug(e.getMessage(), e);
        // Chain the cause so the original stack trace is preserved.
        throw new IllegalStateException("CreateGraphUtils: taxaListMap: Problem querying the database: " + e.getMessage(), e);
    }

}
/**
* Gets Taxon from name. Reuses Taxon instance if already created.
*
* @param name taxon name
* @param taxaCache taxa cache, keyed by taxon name
*
* @return Taxon instance
*/
private static Taxon taxon(String name, Map<String, Taxon> taxaCache) {
    // computeIfAbsent both looks up and caches the instance in one map operation.
    return taxaCache.computeIfAbsent(name, Taxon::new);
}
/**
* Creates lists of HaplotypeNodes organized by reference Range based on the given method.
*
* @param database database connection
* @param referenceRangeMap ReferenceRange map ({@link #referenceRangeMap(Connection)}
* @param taxaListMap TaxaList map {@link #taxaListMap(Connection)}
* @param methods
* @param includeVariantContext whether to include variant contexts in haplotype nodes
* @param includeHapids includes specified hapids. include everything if null
*
* @return Map of HaplotypeNode Lists (keys are ReferenceRange)
*/
private static final String ALL_CHROMOSOMES = "ALL_CHROMOSOMES";
public static TreeMap> createHaplotypeNodes(Connection database, Map referenceRangeMap,
Map taxaListMap, List> methods,
boolean includeSequences, boolean includeVariantContext,
SortedSet includeHapids, String localGVCFFolder,
List chromosomes, TaxaList taxaToKeep) {
if (database == null) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodes: Must specify database connection.");
}
if (includeHapids != null && !includeHapids.isEmpty() && taxaToKeep != null && !taxaToKeep.isEmpty()) {
throw new IllegalStateException("CreateGraphUtils: createHaplotypeNodes: can't specify both hapids and taxa");
}
// used in addNodes() if we need to download gvcf files.
Map gvcfIdToRemotePath = VariantUtils.gvcfIdsToGvcfFileMap(database);
if (methods == null && includeHapids != null && !includeHapids.isEmpty()) {
return createHaplotypeNodes(database, referenceRangeMap, taxaListMap, includeSequences, includeVariantContext,
includeHapids, chromosomes, gvcfIdToRemotePath, localGVCFFolder);
} else if (methods == null) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodes: either methods or haplotypeIds must be specified.");
}
long time = System.nanoTime();
TreeMap> result = new TreeMap<>();
for (Tuple methodPair : methods) {
String haplotypeMethod = methodPair.x;
if (haplotypeMethod == null || haplotypeMethod.isEmpty()) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodes: haplotype method must be specified.");
}
String rangeGroupMethod = methodPair.y;
if (rangeGroupMethod == null || rangeGroupMethod.isEmpty()) {
rangeGroupMethod = null;
}
myLogger.info("createHaplotypeNodes: haplotype method: " + haplotypeMethod + " range group method: " + rangeGroupMethod);
int methodId = methodId(database, haplotypeMethod);
//
// select gamete_grp_id, ref_range_id, sequence, seq_hash, variant_list
// from haplotype where method_id = method_id
// AND haplotypes_id in (11945906, 11945907, 11945909)
//
StringBuilder builder = new StringBuilder();
builder.append("SELECT haplotypes_id, gamete_grp_id, haplotypes.ref_range_id, asm_contig, asm_start_coordinate," +
" asm_end_coordinate, asm_strand, genome_file_id");
if (includeSequences) {
builder.append(", sequence");
}
builder.append(", seq_hash, seq_len");
if (includeVariantContext) {
builder.append(", gvcf_file_id");
}
builder.append(" FROM haplotypes ");
// If getting subset by chromosomes, join with reference_ranges table
// because that's where chromosome is defined.
if (chromosomes != null && !chromosomes.isEmpty()) {
builder.append("inner join reference_ranges on haplotypes.ref_range_id = reference_ranges.ref_range_id ");
}
builder.append("WHERE method_id = ");
builder.append(methodId);
// Add clause to query only chromosomes specified
if (chromosomes != null && !chromosomes.isEmpty()) {
StringJoiner joiner = new StringJoiner(",");
chromosomes.stream().map(s -> "'" + s + "'").forEach(s -> joiner.add(s));
builder.append(" AND chrom in (");
builder.append(joiner.toString());
builder.append(")");
}
if (includeHapids != null && !includeHapids.isEmpty()) {
builder.append(" AND haplotypes_id in (");
boolean notFirst = false;
for (int id : includeHapids) {
if (notFirst) {
builder.append(", ");
} else {
notFirst = true;
}
builder.append(id);
}
builder.append(")");
}
builder.append(";");
String query = builder.toString();
String msg = "createHaplotypeNodes: query statement: " + query;
if (msg.length() > 200) msg = msg.substring(0,200) + "...";
myLogger.info(msg);
addNodes(result, database, query, referenceRangeMap, taxaListMap, includeSequences, includeVariantContext,
rangeGroupMethod, gvcfIdToRemotePath, taxaToKeep, localGVCFFolder);
}
myLogger.info("createHaplotypeNodes: time: " + ((double) (System.nanoTime() - time) / 1_000_000_000.0) + " secs.");
if (includeHapids != null && !includeHapids.isEmpty()) {
warnIfMissingHapids(includeHapids, result);
}
return result;
}
private static TreeMap> createHaplotypeNodes(Connection database, Map referenceRangeMap, Map taxaListMap, boolean includeSequences,
boolean includeVariantContext,
SortedSet includeHapids,
List chromosomes,
Map gvcfIdToRemoteServerFile,
String localGVCFFolder) {
if (includeHapids == null || includeHapids.isEmpty()) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodes: haplotypeIds must be specified.");
}
long time = System.nanoTime();
TreeMap> result = new TreeMap<>();
//
// select gamete_grp_id, ref_range_id, sequence, seq_hash, variant_list
// from haplotype where haplotypes_id in (11945906, 11945907, 11945909)
//
StringBuilder builder = new StringBuilder();
builder.append("SELECT haplotypes_id, gamete_grp_id, haplotypes.ref_range_id, asm_contig, asm_start_coordinate, " +
"asm_end_coordinate, asm_strand, genome_file_id");
if (includeSequences) {
builder.append(", sequence");
}
builder.append(", seq_hash, seq_len");
if (includeVariantContext) {
builder.append(", gvcf_file_id");
}
builder.append(" FROM haplotypes ");
// If getting subset by chromosomes, join with reference_ranges table
// because that's where chromosome is defined.
if (chromosomes != null && !chromosomes.isEmpty()) {
builder.append("inner join reference_ranges on haplotypes.ref_range_id = reference_ranges.ref_range_id ");
}
builder.append("WHERE haplotypes_id in (");
boolean notFirst = false;
for (int id : includeHapids) {
if (notFirst) {
builder.append(", ");
} else {
notFirst = true;
}
builder.append(id);
}
builder.append(")");
// Add clause to query only chromosomes specified
if (chromosomes != null && !chromosomes.isEmpty()) {
StringJoiner joiner = new StringJoiner(",");
chromosomes.stream().map(s -> "'" + s + "'").forEach(s -> joiner.add(s));
builder.append(" AND chrom in (");
builder.append(joiner.toString());
builder.append(")");
}
builder.append(";");
String query = builder.toString();
String msg = "createHaplotypeNodes: query statement: " + query;
if (msg.length() > 200) msg = msg.substring(0,200) + "...";
myLogger.info(msg);
addNodes(result, database, query, referenceRangeMap, taxaListMap, includeSequences, includeVariantContext, null,
gvcfIdToRemoteServerFile, null, localGVCFFolder);
myLogger.info("createHaplotypeNodes: time: " + ((double) (System.nanoTime() - time) / 1_000_000_000.0) + " secs.");
warnIfMissingHapids(includeHapids, result);
return result;
}
private static void warnIfMissingHapids(SortedSet includeHapids, TreeMap> result) {
HashSet resultIDs = result.entrySet().stream()
.map(entry -> entry.getValue())
.flatMap(List::stream)
.map(node -> node.id())
.collect(Collectors.toCollection(HashSet::new));
long numMissing = includeHapids.stream()
.filter(id -> !resultIDs.contains(id))
.count();
if (numMissing != 0) {
myLogger.warn("warnIfMissingHapids: the graph is missing this number of specified hapids : " + numMissing);
}
}
/**
* Creates lists of HaplotypeNodes with variant contexts corresponding to the specified nodes organized by reference
* Range.
* LCJ June 16, 2022- this function is only called from MergeGVCFPlugin:extractNodesWithVariants()
* MergeGVCFPlugin has been deprecated, so I am not making changes here to pull gvcf
* files for the variants.
*
* If this function is called at some point from a non-deprecated method, we
* can re-look at what changes are needed.
*
* @param database database connection
* @param includeHapNodes includes specified hapids
*
* @return Map of HaplotypeNode Lists (keys are ReferenceRange)
*/
public static TreeMap> createHaplotypeNodesWithVariants(Connection database,
Set includeHapNodes) {
if (database == null) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodesWithVariants: Must specify database connection.");
}
if (includeHapNodes == null || includeHapNodes.isEmpty()) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodesWithVariants: Must specify at least one haplotype node to include.");
}
long time = System.nanoTime();
Map nodeMap = new HashMap<>();
for (HaplotypeNode node : includeHapNodes) {
nodeMap.put(node.id(), node);
}
TreeMap> result = getNodesWithVariants(database, nodeMap);
myLogger.info("createHaplotypeNodesWithVariants: number of reference ranges: " + result.size());
myLogger.info("createHaplotypeNodesWithVariants: time: " + ((double) (System.nanoTime() - time) / 1_000_000_000.0) + " secs.");
return result;
}
/**
* Creates HaplotypeGraph with variant contexts corresponding to the given HaplotypeGraph.
*
* @param database database connection
* @param graph graph without variant contexts
*
* @return graph with variant contexts
*/
public static HaplotypeGraph createHaplotypeNodesWithVariants(Connection database, HaplotypeGraph graph) {
if (database == null) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodesWithVariants: Must specify database connection.");
}
if (graph == null) {
throw new IllegalArgumentException("CreateGraphUtils: createHaplotypeNodesWithVariants: Must specify haplotype graph.");
}
long time = System.nanoTime();
Map nodeMap = graph.nodeStream().parallel()
.collect(() -> new HashMap<>(),
(nodeMap1, node) -> nodeMap1.put(node.id(), node),
(BiConsumer