net.maizegenetics.pangenome.hapCalling.HapCountBestPathToTextPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapCalling;

import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;

import htsjdk.variant.variantcontext.VariantContext;

import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.db_loading.PHGDataWriter;
import net.maizegenetics.pangenome.db_loading.PHGdbAccess;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

import org.apache.log4j.Logger;

import javax.swing.*;

import java.awt.*;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Supplier;
import java.util.stream.Collector;
/**
* This plugin processes haplotype node data either from an inclusion file, or from PHG database
* tables. The HapCountBestPathPlugin is called to find the best path through the provided
* haplotype graph. The output will be either stored to the PHG db, printed to text files or both depending
* on the input type.
*
* Assumptions made by this software:
* 1. This method can determine paths from haplotype counts (perfect hits/exclusion) that are stored in the db's haplotype_counts
* table, or from haplotype counts (perfect hits/exclusion) pulled from an inclusion file. If an inclusion
* file is present, its data is used. Otherwise the DB table is used.
* 2. When the DB is used to pull haplotype_counts data, any paths created will be stored in the DB.
* If an inclusion file is used, paths are NOT stored to the DB. This is because the paths table
* needs the haplotype_counts_id. When an inclusion file is used, the assumption is that these
* haplotype counts are NOT stored in the DB.
* 3. The exported inclusion files (that are input to this Plugin) and the exported path files (created by
* this plugin) each contain "header" data needed to load their contents into the DB. These are the first
* lines in the file, and each begin with #. A separate plugin should be written to load these files to db
* if desired.
*
*
* @author Terry Casstevens, Zack Miller, Lynn Johnson
*/
@Deprecated
public class HapCountBestPathToTextPlugin extends AbstractPlugin {
// Class-level logger used for progress, warning, and error reporting.
private static final Logger myLogger = Logger.getLogger(HapCountBestPathToTextPlugin.class);

/**
 * Standard TASSEL plugin constructor.
 *
 * @param parentFrame   parent GUI frame, or null when run headless
 * @param isInteractive true when launched interactively from the GUI
 */
public HapCountBestPathToTextPlugin(Frame parentFrame, boolean isInteractive) {
    super(parentFrame, isInteractive);
}
// ---------------------------------------------------------------------
// Plugin parameters. Each has a matching accessor pair further down.
// All are typed PluginParameter<String>; the raw PluginParameter type
// previously used here caused unchecked-conversion warnings.
// ---------------------------------------------------------------------

/** Path to the database configuration (properties) file; required. */
private PluginParameter<String> configFile = new PluginParameter.Builder<>("configFile", null, String.class)
        .description("Database configuration file")
        .guiName("Config File")
        .required(true)
        .inFile()
        .build();

/** Optional comma-delimited taxa filter applied when building the graph. */
private PluginParameter<String> taxaFilterString = new PluginParameter.Builder<>("taxa", null, String.class)
        .description("A comma delimited list of taxa (no spaces allowed) to include in graph. Only nodes containing these taxa will be included in the graph."
                + " If no taxa list is supplied, then all taxa in the full graph will be used.")
        .build();

/** Directory holding read inclusion/exclusion count files (.txt or .txt.gz). */
private PluginParameter<String> inclusionFilenameDir = new PluginParameter.Builder<>("inclusionFileDir", null, String.class)
        .description("The name of the file containing read inclusion and exclusion counts for hapids.")
        .inDir()
        .build();

/** Taxon used to evaluate the node list returned. */
private PluginParameter<String> targetTaxon = new PluginParameter.Builder<>("target", null, String.class)
        .description("The taxon that will be used to evaluate the node list returned.")
        .build();

/** File listing the reference ranges to keep. */
private PluginParameter<String> refRangeFile = new PluginParameter.Builder<>("refRangeFile", null, String.class)
        .description("The name of the file containing the reference ranges to keep.")
        .inFile()
        .build();

/** Optional reference file, used when indexing on the fly. */
private PluginParameter<String> myReferenceFileName = new PluginParameter.Builder<>("refFileName", null, String.class)
        .required(false)
        .inFile()
        .guiName("Reference File Name")
        .description("Reference file name in case you want to index on the fly")
        .build();

/** Directory where path output files are written; required. */
private PluginParameter<String> myOutputDir = new PluginParameter.Builder<>("outputDir", null, String.class)
        .required(true)
        .outDir()
        .description("Output Directory")
        .build();

// Method details are programmatically generated.
/** Name of the haplotype-count method used by FastqToHapCountPlugin. */
private PluginParameter<String> myHapCountMethod = new PluginParameter.Builder<>("hapCountMethod", null, String.class)
        .required(true)
        .description("Name of method used to creates inclusion/exclusion counts in FastqToHapCountPLugin")
        .build();

/** Name of the method under which created paths are recorded. */
private PluginParameter<String> myPathMethod = new PluginParameter.Builder<>("pMethod", null, String.class)
        .guiName("Path Method")
        .required(true)
        .description("Name of method to be used to create paths through the graph.")
        .build();
/**
 * Entry point. Loads the configuration, pulls the single HaplotypeGraph from
 * the input DataSet, and processes inclusion files from the configured
 * directory. Since the DB route is currently disabled (plugin is deprecated),
 * missing inclusion files are an error.
 *
 * @param input pipeline DataSet; must contain exactly one HaplotypeGraph
 * @return always null (output is written to files by the called plugins)
 * @throws IllegalArgumentException on unreadable config or missing graph
 * @throws IllegalStateException    when no inclusion files are found
 */
@Override
public DataSet processData(DataSet input) {
    // Load the configuration once. (Previously the config file was read
    // twice: once here with the result discarded, and again below via
    // loadProperties().) try-with-resources closes the reader.
    Properties dbProperties = new Properties();
    try (BufferedReader reader = Utils.getBufferedReader(configFile())) {
        dbProperties.load(reader);
    } catch (Exception e) {
        myLogger.debug(e.getMessage(), e);
        throw new IllegalArgumentException("HapCountBestPathTextPlugin:buildHaplotypeGraph: connection: problem reading properties file: " + configFile());
    }

    // The TASSEL pipeline must pass in exactly one HaplotypeGraph.
    List<Datum> temp = input.getDataOfType(HaplotypeGraph.class);
    if (temp.size() != 1) {
        throw new IllegalArgumentException("HapCountBestPathToTextPlugin: processData: must input one HaplotypeGraph: " + temp.size());
    }
    HaplotypeGraph graph = (HaplotypeGraph) temp.get(0).getData();
    myLogger.info("Number of Nodes: " + graph.numberOfNodes());

    // Collect candidate inclusion files (.txt / .txt.gz) if a directory was given.
    List<Path> inclusionFiles = null;
    if (inclusionFilenameDir() != null) {
        inclusionFiles = DirectoryCrawler.listPaths("glob:*{.txt,.txt.gz}", Paths.get(inclusionFilenameDir()));
    }

    // Most tuning parameters come from the config file.
    HapCountBestPathPlugin bestPathPlugin = setupHapCountBestPathPlugin(dbProperties);

    // If inclusion files are present, process from the files.
    if (inclusionFiles != null && !inclusionFiles.isEmpty()) {
        processInclusionFiles(input, dbProperties, bestPathPlugin);
    } else {
        throw new IllegalStateException("Error processing inclusion files. No inclusion files are found in:" + inclusionFilenameDir() + "\n" +
                "Check to see if folder exists(is mounted correctly) and check if index file(.mmi) exists.");
        //processHapCountsFromDB(input,dbProperties, bestPathPlugin); TODO Undeprecate this call when DB changes are in place.
    }
    return null;
}
/**
 * Builds a configured HapCountBestPathPlugin from the config-file properties,
 * falling back to the documented defaults for any key that is absent.
 *
 * @param dbProperties configuration properties loaded from the config file
 * @return a headless HapCountBestPathPlugin with all tuning values applied
 */
private HapCountBestPathPlugin setupHapCountBestPathPlugin(Properties dbProperties) {
    // Parse each tuning value up front (same order as the setter chain below).
    ReferenceRangeEmissionProbability.METHOD emission =
            ReferenceRangeEmissionProbability.METHOD.valueOf(dbProperties.getProperty("emissionMethod", "allCounts"));
    double probMappedCorrectly = Double.parseDouble(dbProperties.getProperty("probReadMappedCorrectly", "0.99"));
    int minTaxa = Integer.parseInt(dbProperties.getProperty("minTaxaPerRange", "20"));
    int minReadCount = Integer.parseInt(dbProperties.getProperty("minReads", "1"));
    boolean removeEqualCounts = Boolean.parseBoolean(dbProperties.getProperty("removeEqual", "true"));
    int maxNodes = Integer.parseInt(dbProperties.getProperty("maxNodesPerRange", "30"));
    int maxReadsKB = Integer.parseInt(dbProperties.getProperty("maxReadsPerKB", "100"));
    boolean splitTaxaFlag = Boolean.parseBoolean(dbProperties.getProperty("splitTaxa", "false"));
    double minTransition = Double.parseDouble(dbProperties.getProperty("minTransitionProb", "0.001"));

    // NOTE: setting minReads to 0 overrides removeEqual, forcing it to false
    // inside HapCountBestPathPlugin.
    HapCountBestPathPlugin plugin = new HapCountBestPathPlugin(null, false);
    plugin.emissionMethod(emission)
            .probReadMappedCorrectly(probMappedCorrectly)
            .minTaxaPerRange(minTaxa)
            .refRangeFile(refRangeFile())
            .minReads(minReadCount)
            .removeRangesWithEqualCounts(removeEqualCounts)
            .maxNodesPerRange(maxNodes)
            .maxReadsPerKB(maxReadsKB)
            .splitTaxa(splitTaxaFlag)
            .minTransitionProb(minTransition)
            .taxaFilterString(taxaFilterString());
    return plugin;
}
/**
 * Loads the plugin's configuration from the configFile parameter.
 *
 * @return the loaded Properties
 * @throws IllegalStateException if the file cannot be read or parsed
 */
private Properties loadProperties() {
    Properties configProperties = new Properties();
    // try-with-resources: the original left the reader open (resource leak).
    try (BufferedReader reader = Utils.getBufferedReader(configFile())) {
        configProperties.load(reader);
    } catch (Exception exc) {
        myLogger.error("HapCountBestPathToTextPlugin: loadProperties Failed to Load Properties: " + exc.getMessage());
        throw new IllegalStateException("HapCountBestPathToTextPlugin: loadProperties Failed to Load Properties.", exc);
    }
    return configProperties;
}
/**
 * Builds the human-readable "method details" string recorded with a path
 * method: each tuning parameter label followed by the value that will be used
 * (config value or default), plus any taxa filter entries.
 *
 * @param dbProperties configuration properties loaded from the config file
 * @return space-separated label:value pairs (no trailing newline)
 */
private String createPathMethodDetails(Properties dbProperties) {
    StringBuilder methodDetailsSB = new StringBuilder();
    methodDetailsSB.append("minTaxaPerRange:").append(dbProperties.getProperty("minTaxaPerRange", "20"));
    methodDetailsSB.append(" minReads:").append(dbProperties.getProperty("minReads", "1"));
    methodDetailsSB.append(" removeEqual:").append(dbProperties.getProperty("removeEqual", "true"));
    methodDetailsSB.append(" maxReadsPerKB:").append(dbProperties.getProperty("maxReadsPerKB", "100"));
    // BUG FIX: this line previously looked up the "splitTaxa" property
    // (default "false") under the maxNodesPerRange label — a copy-paste error.
    methodDetailsSB.append(" maxNodesPerRange:").append(dbProperties.getProperty("maxNodesPerRange", "30"));
    methodDetailsSB.append(" minTransitionProb:").append(dbProperties.getProperty("minTransitionProb", "0.001"));
    methodDetailsSB.append(" probReadMappedCorrectly:").append(dbProperties.getProperty("probReadMappedCorrectly", "0.99"));
    // "emmissionMethod" spelling kept as-is: existing DB records and exported
    // file headers may contain this label.
    methodDetailsSB.append(" emmissionMethod:").append(dbProperties.getProperty("emissionMethod", "allCounts"));
    methodDetailsSB.append(" splitTaxa:").append(dbProperties.getProperty("splitTaxa", "false"));
    if (taxaFilterString() != null) {
        String[] taxa = taxaFilterString().split(",");
        for (String taxon : taxa) {
            methodDetailsSB.append(taxon).append(" ");
        }
    }
    // New line at end not needed; ExportHaplotypePathToFilePlugin adds one when writing.
    return methodDetailsSB.toString();
}
private List processHapCountsFromDB(DataSet graph,Properties dbProperties,HapCountBestPathPlugin bestPathPlugin) {
Connection conn = DBLoadingUtils.connection(configFile(), false);
// Grab the haplotype_counts_id and data from the db
PHGDataWriter phg = new PHGdbAccess(conn);
String methodDetails = createPathMethodDetails(dbProperties);
// Store the path method. This returns new (if created) or existing path method ID
// Deprecated, so no changes added to allow for test method type
int pathMethodID = phg.putMethod(pathMethod(), DBLoadingUtils.MethodType.PATHS, pluginParameters());
// data from the haplotype_counts table
// Map of haplotype_counts_id to Tuple
Map> hapCountsIDtoDataMap = phg.getHapCountsIDAndDataForVersionMethod( hapCountMethod());
// See which of the haplotype_counts entries we have already stored in the paths table for this method
Map hapCountsIDToPathMethod = phg.getHapCountsIDAndPathsForMethod(pathMethod());
// Find which hapids in the hapCountsIDtoDataMap are NOT in the hapCountsIDToPathMethod.
// These to be run through HapCountBestPathPlugin !!
Map> hapCountsNotStored = new HashMap>();
for (Integer hapCountsId : hapCountsIDtoDataMap.keySet()) {
if (!hapCountsIDToPathMethod.containsKey(hapCountsId)) {
// not in paths table - add to list
hapCountsNotStored.put(hapCountsId, hapCountsIDtoDataMap.get(hapCountsId));
}
}
// Combine the datasets - this is input to HapCountBestPathPlugin called in stream below
DataSet hapCountsDS = new DataSet(new Datum("HapCountsMap",hapCountsNotStored,null),this);
List combinedDS = new ArrayList<>();
combinedDS.add(graph);
combinedDS.add(hapCountsDS);
// The Multimap stores paths = hapCountsNotStored.keySet().stream()
.map(path -> {
String taxonName = hapCountsNotStored.get(path).getX();
//Get the Best Path given db inclusion/exclusion data and graph
DataSet processedDataSet = bestPathPlugin.haplotypeCountsId(path)
.performFunction(DataSet.getDataSet(combinedDS, this));
List nodes = (List) processedDataSet.getData(0).getData();
// Code below is error checking to verify the returned path contains no more
// than a single node in any given reference range.
SortedSet temp = new TreeSet<>();
for (HaplotypeNode node : nodes) {
if (!temp.add(node.referenceRange())) {
throw new IllegalStateException("HaplotypeCountBestPathToVCFPlugin: processInclusionFiles: HapCountBestPathPlugin returned path has more than one node in the same reference range: " + node.referenceRange());
}
}
myLogger.info("processHapCountsFromDB: number of nodes for taxon/id: " + taxonName + "/" + path + " " + nodes.size());
return new Tuple<>(path, nodes);
})
// Every node on the list returned from HapCountBestPathPlugin gets
// its own entry on the Multimap created below.
.collect((Supplier>) () -> TreeMultimap.create(),
(map, entries) -> {
for (HaplotypeNode entry : entries.y) {
if (entry.id() != -1) {
map.put(entries.x, entry);
}
}
},
(map1, map2) -> map1.putAll(map2));
// phg.putPathsData(pathMethod, methodDetails, paths); // Write to DB
try {
((PHGdbAccess)phg).close();
} catch (Exception exc) {
myLogger.warn("Error closing phg db connection");
}
// Export the path data to files, on a per taxon/haplotype_counts_id basis
// To get the taxon name, index the haplotype_counts_id into the hapCountsNotStored map from above
exportHaplotypePaths(paths, hapCountsNotStored, outputDir(), hapCountMethod(),
pathMethod(), methodDetails);
List outputs = new ArrayList<>();
outputs.add(graph.getData(0));
outputs.add(new Datum("Paths", paths, null));
return null;
}
private List processInclusionFiles(DataSet graph, Properties dbProperties,HapCountBestPathPlugin bestPathPlugin) {
List inclusionFiles = DirectoryCrawler.listPaths("glob:*{.txt,.txt.gz}", Paths.get(inclusionFilenameDir()));
// parallelStream() causes failures running RunSmallSeqTestsDocker - December 15, 2017
// Run with "stream" until we figure out the parallelization problem.
//Multimap paths = inclusionFiles.parallelStream()
Multimap paths = inclusionFiles.stream()
.map(path -> {
String taxonName = parseTaxonName(path.toString());
//Get the Best Path given inclusion file and graph
DataSet processedDataSet = bestPathPlugin.inclusionFilename(path.toString())
.performFunction(graph);
List nodes = (List) processedDataSet.getData(0).getData();
// Code below is error checking to verify the returned path contains no more
// than a single node in any given reference range.
SortedSet temp = new TreeSet<>();
for (HaplotypeNode node : nodes) {
if (!temp.add(node.referenceRange())) {
throw new IllegalStateException("HaplotypeCountBestPathToVCFPlugin: processInclusionFiles: HapCountBestPathPlugin returned path has more than one node in the same reference range: " + node.referenceRange());
}
}
myLogger.info("processInclusionFiles: number of nodes for taxon: " + taxonName + " path: " + nodes.size());
return new Tuple<>(taxonName, nodes);
})
.collect((Supplier>) () -> TreeMultimap.create(),
(map, nodes) -> {
for (HaplotypeNode node : nodes.y) {
if (node.id() != -1) {
map.put(nodes.x, node); // add taxonName, nodes to map
}
}
},
(map1, map2) -> map1.putAll(map2));
// NOTE - Path data is only written to the db when processing haplotype counts from
// the PHG db haplotype_counts table. When processing from an inclusion file, the
// inclusion data may not exist in the db, so the required haplotype_counts_id for the
// paths table is not present.
List outputs = new ArrayList<>();
outputs.add(graph.getData(0));
outputs.add(new Datum("Paths", paths, null));
// The original ExportHaplotypePathToFilePlugin is called to create the path output files
// The version and method data are included to become header lines in the exported file.
// This provides data that will allow the path file to be imported to the DB later if desired.
ExportHaplotypePathToFilePlugin export = new ExportHaplotypePathToFilePlugin(getParentFrame(), isInteractive())
.refVersion("REF_VERSION")
.outputFileDirectory(outputDir())
.hapCountMethod(hapCountMethod())
.pathMethod(pathMethod())
.pathMethodDetails(createPathMethodDetails(dbProperties));
export.performFunction(new DataSet(outputs, this));
return null;
}
/**
 * Derives a taxon name from an inclusion-file path: the final path segment
 * with its last extension removed.
 *
 * NOTE(review): for "sample.txt.gz" only ".gz" is stripped, leaving
 * "sample.txt" as the taxon name — confirm that is intended.
 *
 * @param currentPath slash-delimited path to an inclusion file
 * @return the file name without its final extension
 */
private static String parseTaxonName(String currentPath) {
    int lastSlash = currentPath.lastIndexOf('/');
    String fileName = currentPath.substring(lastSlash + 1);
    return fileName.substring(0, fileName.lastIndexOf("."));
}
// Exports the paths to a file with the name format of outputDir/_path.txt
// This method is called when processing data from the DB.
private static void exportHaplotypePaths (Multimap paths,Map> hapCountsNotStored, String outputDir,
String hcMethod, String pMethod, String pMethodDetails) {
//loop through each haplotype counts id for which we have a path
for (Integer hapCountId : paths.keySet()) {
String taxonName = hapCountsNotStored.get(hapCountId).getX();
myLogger.debug("HapCountBestPathTextPlugin:exportHaplotypePaths:Processing Taxon:" + taxonName);
try (BufferedWriter writer = Utils.getBufferedWriter(outputDir + "/" + taxonName + "_path.txt")) {
// Write header data - necessary if this file is used to load the DB paths table.
writer.write("#hapCountMethod=" + hcMethod + "\n");
writer.write("#pathMethod=" + pMethod + "\n");
if (pMethodDetails != null) {
writer.write("#pathMethodDetails=" + pMethodDetails + "\n");
} else {
writer.write("#pathMethodDetails=none\n");
}
writer.write("#haplotype_counts_id=" + hapCountId + "\n");
myLogger.debug("Extracting the haplotypeIds");
SortedSet sortedIdSet = paths.get(hapCountId).stream()
.map(haplotypeNode -> haplotypeNode.id())
.filter(hapId -> hapId != -1)
.collect(Collector.of(TreeSet::new,
(set, hapId) -> set.add(hapId),
(leftSet, rightSet) -> {
leftSet.addAll(rightSet);
return leftSet;
}));
myLogger.debug("Writing out each hapId to the textFile");
for (Integer hapId : sortedIdSet) {
writer.write("" + hapId);
writer.write("\n");
}
} catch (Exception e) {
throw new IllegalStateException("ExportHaplotypePathToFilePlugin: exportPathsToTextFiles: error opening up the file");
}
}
}
@Override
public ImageIcon getIcon() {
    // No GUI icon is provided for this plugin.
    return null;
}

@Override
public String getButtonName() {
    // Label shown on this plugin's button in the TASSEL GUI.
    return "Find Best Path from HapCounts";
}

@Override
public String getToolTipText() {
    // Tooltip shown for this plugin in the TASSEL GUI.
    return "Find Best Path from HapCounts";
}
// ---------------------------------------------------------------------
// Generated-style accessor pairs for the plugin parameters above.
// Each getter returns the parameter's current value; each setter wraps
// the existing PluginParameter with the new value and returns this
// plugin for call chaining.
// ---------------------------------------------------------------------

/**
 * Database configuration file
 *
 * @return Database Config File
 */
public String configFile() {
    return configFile.value();
}

/**
 * Set Database Config File. Database configuration file
 *
 * @param value Database Config File
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin configFile(String value) {
    configFile = new PluginParameter<>(configFile, value);
    return this;
}

/**
 * A comma delimited list of taxa (no spaces allowed) to include in graph. Only nodes containing these taxa will be
 * included in the graph. If no taxa list is supplied, then all taxa in the full graph will be used.
 *
 * @return Taxa
 */
public String taxaFilterString() {
    return taxaFilterString.value();
}

/**
 * Set Taxa. A comma delimited list of taxa (no spaces allowed) to include in graph. Only nodes containing these
 * taxa will be included in the graph. If no taxa list is supplied, then all taxa in the full graph will be used.
 *
 * @param value Taxa
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin taxaFilterString(String value) {
    taxaFilterString = new PluginParameter<>(taxaFilterString, value);
    return this;
}

/**
 * The name of the directory containing read inclusion and exclusion counts for hapids.
 *
 * @return Inclusion File Dir
 */
public String inclusionFilenameDir() {
    return inclusionFilenameDir.value();
}

/**
 * Set Inclusion File Dir. The name of the directory containing read inclusion and exclusion counts for hapids.
 *
 * @param value Inclusion File Dir
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin inclusionFilenameDir(String value) {
    inclusionFilenameDir = new PluginParameter<>(inclusionFilenameDir, value);
    return this;
}

/**
 * The taxon that will be used to evaluate the node list returned.
 *
 * @return Target
 */
public String targetTaxon() {
    return targetTaxon.value();
}

/**
 * Set Target. The taxon that will be used to evaluate the node list returned.
 *
 * @param value Target
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin targetTaxon(String value) {
    targetTaxon = new PluginParameter<>(targetTaxon, value);
    return this;
}

/**
 * The name of the file containing the reference ranges to keep.
 *
 * @return Ref Range File
 */
public String refRangeFile() {
    return refRangeFile.value();
}

/**
 * Set Ref Range File. The name of the file containing the reference ranges to keep.
 *
 * @param value Ref Range File
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin refRangeFile(String value) {
    refRangeFile = new PluginParameter<>(refRangeFile, value);
    return this;
}

/**
 * Reference file name in case you want to index on the fly
 *
 * @return Ref File Name
 */
public String referenceFileName() {
    return myReferenceFileName.value();
}

/**
 * Set Ref File Name. Reference file name in case you want to index on the fly
 *
 * @param value Ref File Name
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin referenceFileName(String value) {
    myReferenceFileName = new PluginParameter<>(myReferenceFileName, value);
    return this;
}

/**
 * Output Directory
 *
 * @return Output Dir
 */
public String outputDir() {
    return myOutputDir.value();
}

/**
 * Set Output Dir. Output Directory
 *
 * @param value Output Dir
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin outputDir(String value) {
    myOutputDir = new PluginParameter<>(myOutputDir, value);
    return this;
}

/**
 * Name of method used to create inclusion/exclusion
 * counts in FastqToHapCountPlugin
 *
 * @return Hap Count Method
 */
public String hapCountMethod() {
    return myHapCountMethod.value();
}

/**
 * Set Hap Count Method. Name of method used to create inclusion/exclusion
 * counts in FastqToHapCountPlugin
 *
 * @param value Hap Count Method
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin hapCountMethod(String value) {
    myHapCountMethod = new PluginParameter<>(myHapCountMethod, value);
    return this;
}

/**
 * Name of method to be used to create paths through the
 * graph.
 *
 * @return Path Method
 */
public String pathMethod() {
    return myPathMethod.value();
}

/**
 * Set Path Method. Name of method to be used to create paths
 * through the graph.
 *
 * @param value Path Method
 *
 * @return this plugin
 */
public HapCountBestPathToTextPlugin pathMethod(String value) {
    myPathMethod = new PluginParameter<>(myPathMethod, value);
    return this;
}
}