// PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapCalling;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Deprecated
public class HapCountBestPathPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(HapCountBestPathPlugin.class);
private PluginParameter<Integer> minTaxaPerRange = new PluginParameter.Builder<>("minTaxa", 20, Integer.class)
.description("minimum number of taxa per anchor reference range. Ranges with fewer taxa will not be included in the output node list.")
.build();
private PluginParameter<Integer> minReads = new PluginParameter.Builder<>("minReads", 1, Integer.class)
.description("minimum number of reads per anchor reference range. Ranges with fewer reads will not be included in the output node list.")
.build();
private PluginParameter<Integer> maxReadsPerKB = new PluginParameter.Builder<>("maxReads", 10000, Integer.class)
.description("maximum number of read inclusion counts per kilobase of anchor reference range. Ranges with more reads will not be included in the output node list.")
.build();
private PluginParameter<Integer> maxNodesPerRange = new PluginParameter.Builder<>("maxNodes", 10000, Integer.class)
.description("maximum number of nodes per reference range. Ranges with more nodes will not be included in the output node list.")
.build();
private PluginParameter<String> taxaFilterString = new PluginParameter.Builder<>("taxa", null, String.class)
.description("A comma-delimited list of taxa (no spaces allowed) to include in the graph. Only nodes containing these taxa will be included in the graph."
+ " If no taxa list is supplied, then all taxa in the full graph will be used.")
.build();
private PluginParameter<Double> minTransitionProb = new PluginParameter.Builder<>("minTransitionProb", 0.001, Double.class)
.description("minimum probability of a transition between nodes at adjacent reference ranges.")
.build();
private PluginParameter<Double> probReadMappedCorrectly = new PluginParameter.Builder<>("probCorrect", 0.99, Double.class)
.description("probability that a read has been mapped to the correct haplotypes.")
.build();
private PluginParameter<String> inclusionFilename = new PluginParameter.Builder<>("inclusionFile", null, String.class)
.description("The name of the file containing read inclusion and exclusion counts for hapids.")
.inFile()
.build();
private PluginParameter<String> targetTaxon = new PluginParameter.Builder<>("target", null, String.class)
.description("The taxon that will be used to evaluate the node list returned.")
.build();
private PluginParameter<ReferenceRangeEmissionProbability.METHOD> emissionMethod = new PluginParameter.Builder<>("emission",
ReferenceRangeEmissionProbability.METHOD.allCounts, ReferenceRangeEmissionProbability.METHOD.class)
.guiName("Emission Method")
.range(ReferenceRangeEmissionProbability.METHOD.values())
.build();
private PluginParameter<String> refRangeFile = new PluginParameter.Builder<>("refRangeFile", null, String.class)
.description("The name of the file containing the reference ranges to keep.")
.inFile()
.build();
private PluginParameter<Boolean> splitTaxa = new PluginParameter.Builder<>("splitTaxa", false, Boolean.class)
.description("split consensus nodes into one node per taxon.")
.build();
private PluginParameter<Double> splitTransitionProb = new PluginParameter.Builder<>("splitProb", 0.99, Double.class)
.description("When the consensus nodes are split by taxa, this is the transition probability for moving from a node to the next node of the same taxon. It equals 1 minus the probability of a recombination between adjacent nodes.")
.build();
private PluginParameter<Boolean> useBackwardForward = new PluginParameter.Builder<>("usebf", false, Boolean.class)
.description("Use the Backward-Forward algorithm instead of the Viterbi algorithm for the HMM.")
.build();
private PluginParameter<Double> minProbBF = new PluginParameter.Builder<>("minP", 0.8, Double.class)
.description("Only nodes with minP or greater probability will be kept in the path when using the Backward-Forward algorithm.")
.build();
private PluginParameter<String> bfInfoFilename = new PluginParameter.Builder<>("bfInfoFile", null, String.class)
.description("The name of the file to which node probabilities from the backward-forward algorithm will be written. If no name is supplied, the diagnostic information will not be reported. The target taxon name should be supplied as well.")
.outFile()
.dependentOnParameter(useBackwardForward)
.build();
private PluginParameter<Integer> haplotypeCountsId = new PluginParameter.Builder<>("hapCountsId", null, Integer.class)
.description("DB-assigned haplotype_counts_id from the haplotype_counts table. This is assigned programmatically and only present when the plugin is called from an internal method.")
.build();
private PluginParameter<String> pathInfoFilename = new PluginParameter.Builder<>("pathInfoFile", null, String.class)
.description("The name of the file to which detailed path diagnostic information will be written. If no name is supplied, the diagnostic information will not be reported. The target taxon name must be supplied as well.")
.outFile()
.build();
private PluginParameter<String> graphExportBasename = new PluginParameter.Builder<>("graphExport", null, String.class)
.description("The base name for R-igraph export files. If a value is supplied, vertices, edges, and a layout will be exported.")
.build();
private PluginParameter<Boolean> removeRangesWithEqualCounts = new PluginParameter.Builder<>("removeEqual", true, Boolean.class)
.description("Ranges with equal read counts for all haplotypes should be removed from the graph. Defaults to true but will always be false if minReads = 0.")
.build();
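/*
 * Usage sketch: one way this plugin might be invoked programmatically. The
 * file name and the taxon name "B73" are hypothetical, chosen only for
 * illustration; setParameter and performFunction come from the standard
 * TASSEL plugin framework (Plugin/AbstractPlugin).
 *
 *   HaplotypeGraph graph = ...; // built upstream, e.g. from the PHG database
 *   DataSet input = new DataSet(new Datum("graph", graph, null), null);
 *   HapCountBestPathPlugin plugin = new HapCountBestPathPlugin(null, false);
 *   plugin.setParameter("inclusionFile", "/path/to/readCounts.txt");
 *   plugin.setParameter("target", "B73");
 *   DataSet result = plugin.performFunction(input);
 */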
public HapCountBestPathPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
protected void preProcessParameters(DataSet input) {
if (pathInfoFilename.value() != null && targetTaxon.value() == null) {
myLogger.info(String.format("Warning: detailed path information will not be written to %s because there is no target taxon.", pathInfoFilename.value()));
}
}
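/*
 * Processing overview: processData expects exactly one HaplotypeGraph Datum
 * in its input. The graph is filtered using the taxa and reference-range
 * parameters, read mapping hits are loaded from the inclusion file, and the
 * best path through the graph is found with either the Viterbi algorithm
 * (the default) or the backward-forward algorithm (usebf). Optional
 * diagnostics (path information, R-igraph exports) are written before the
 * node list is returned.
 */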
@Override
public DataSet processData(DataSet input) {
List<Datum> temp = input.getDataOfType(Multiset.class);
Multiset<HaplotypeNode> nodeMultiset = null; //optional input; not referenced again in this method
if (!temp.isEmpty()) {
nodeMultiset = (Multiset<HaplotypeNode>) temp.get(0).getData();
}
temp = input.getDataOfType(HaplotypeGraph.class);
if (temp.size() != 1) {
throw new IllegalArgumentException("HapCountBestPathPlugin: processData: must input one HaplotypeGraph: " + temp.size());
}
HaplotypeGraph hapgraph = (HaplotypeGraph) temp.get(0).getData();
List<ReferenceRange> rangesToKeep = readRefRangeFile();
//TODO convert this to use the new read mapping code. Will also need to rewrite writePathInformation.
Multimap<ReferenceRange, HapIdSetCount> readMap = ReadMappingUtils.readInMultimapHits(inclusionFilename(), hapgraph);
ConvertReadsToPathUsingHMM converter = new ConvertReadsToPathUsingHMM()
// .minTaxaPerRange(minTaxaPerRange.value())
.minReadsPerRange(minReads.value())
.removeRangesWithEqualCounts(removeRangesWithEqualCounts.value())
.maxReadsPerRangeKB(maxReadsPerKB.value())
// .maxNodesPerRange(maxNodesPerRange.value())
.taxaFilterList(taxaFilterString.value())
.minTransitionProbability(minTransitionProb.value())
// .splitTaxa(splitTaxa.value())
.transitionProbabilitySameTaxon(splitTransitionProb.value())
.probabilityReadMappingCorrect(probReadMappedCorrectly.value())
.targetTaxon(targetTaxon.value());
// .readMap(readMap);
if (minReads.value() == 0) converter.removeRangesWithEqualCounts(false);
if (rangesToKeep == null) converter.filterHaplotypeGraph(hapgraph);
else converter.filterHaplotypeGraph(hapgraph, rangesToKeep);
List<HaplotypeNode> result;
if (useBackwardForward.value()) {
converter.haplotypeCountsToPathProbability();
result = converter.nodeListFromProbabilities(minProbBF.value(), bfInfoFilename());
} else {
result = converter.haplotypeCountsToPath();
}
HaplotypeGraph filteredGraph = converter.filteredGraph();
evaluateNodes(result, filteredGraph);
if (pathInfoFilename.value() != null) writePathInformation(pathInfoFilename.value(), result, filteredGraph, readMap);
//output R-igraph export files
if (graphExportBasename.value() != null) {
for (Chromosome chr : filteredGraph.chromosomes())
writeGraphInfoFilesForChr(graphExportBasename.value(), result, filteredGraph, readMap, chr, targetTaxon.value());
}
String inputName = "";
if (input != null && input.getData(0) != null && input.getData(0).getName() != null) {
inputName = input.getData(0).getName();
}
if (inclusionFilename.value() != null) {
inputName = inclusionFilename.value();
}
String name = "NodeList";
String comment = "List of HaplotypeNode representing the best path through a PHG based on sequence reads. From " + inputName + ".";
return new DataSet(new Datum(name, result, comment), this);
}
private void evaluateNodes(List<HaplotypeNode> nodeList, HaplotypeGraph filteredGraph) {
if (targetTaxon.value() == null) {
myLogger.info("No target taxon in HapcountBestPathPlugin.");
} else {
long numberContainingTarget = nodeList.stream().filter(node -> node.taxaList().indexOf(targetTaxon.value()) > -1).count();
long numberOfTotalNodesContainingTarget = filteredGraph.nodeStream().filter(node -> node.taxaList().indexOf(targetTaxon.value()) > -1).count();
long numberOfReferenceRanges = filteredGraph.numberOfRanges();
myLogger.info(String.format("%d nodes returned by HapCountBestPathPlugin%n", nodeList.size()));
myLogger.info(String.format("%d (%1.4f) of those nodes contain the target taxon, %s%n", numberContainingTarget,
(double) numberContainingTarget / nodeList.size(), targetTaxon.value()));
myLogger.info(String.format("%d total number of nodes containing target taxon in filtered graph.\n%1.3f of those nodes were chosen as part of Path, %s%n", numberOfTotalNodesContainingTarget,
(double) numberContainingTarget / numberOfTotalNodesContainingTarget, targetTaxon.value()));
myLogger.info(String.format("Nodes containing target cover %1.3f of the reference ranges.%n",
(double) numberOfTotalNodesContainingTarget / numberOfReferenceRanges));
}
}
/**
 * Writes diagnostic information about a path to a file.
 * Information about each node on the path is written to a text file.
 * Where a node in the path does not contain the target taxon, information is written for both that node and
 * the node containing the target taxon from the same range.
 *
 * @param outFilename the name of the file to which the path information will be written
 * @param nodeList the list of HaplotypeNodes making up the path
 * @param filteredGraph the filtered HaplotypeGraph from which the path was chosen
 * @param readMap the read mapping counts, keyed by ReferenceRange
 */
public void writePathInformation(String outFilename, List<HaplotypeNode> nodeList, HaplotypeGraph filteredGraph, Multimap<ReferenceRange, HapIdSetCount> readMap) {
myLogger.info(String.format("In writePathInformation nodeList has size = %d", nodeList.size()));
try (PrintWriter pw = new PrintWriter(outFilename)) {
pw.println("inPath\thasTarget\thapid\tchr\tstart\tnTaxa\tpropN\tinclusionCount\texclusionCount");
nodeList.stream().flatMap(hn -> getPathNodeInformation(hn, filteredGraph, readMap).stream())
.map(strlist -> strlist.stream().collect(Collectors.joining("\t"))).forEach(pw::println);
} catch (IOException e) {
myLogger.error("Unable to write path information");
e.printStackTrace();
}
}
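/*
 * Hypothetical example of the file written above, showing the two-row case
 * for a path node that does not contain the target taxon (all values are
 * invented for illustration):
 *
 *   inPath  hasTarget  hapid  chr  start   nTaxa  propN  inclusionCount  exclusionCount
 *   T       F          1234   1    150000  12     0.01   45              3
 *   F       T          1250   1    150000  8      0.02   2               46
 */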
private List<List<String>> getPathNodeInformation(HaplotypeNode hn, HaplotypeGraph filteredGraph, Multimap<ReferenceRange, HapIdSetCount> readMap) {
//Each inner List<String> holds:
//inPath, hasTarget, hapid, refRange.chrname, refRange.start, numberOfTaxa, propN, inclusionCount, exclusionCount
//if the path node contains the target, then a list of one row (a string list) is returned
//if the path node does not contain the target, then two rows are returned:
//one for the path node and one for the node containing the target in the same reference range
String target = targetTaxon.value();
if (target == null) return Collections.emptyList(); //an empty list rather than null, so callers can safely stream the result
List<List<String>> nodeInfoList = new ArrayList<>();
List<String> nodeInfo = new ArrayList<>();
nodeInfo.add("T"); //inPath
boolean hasTarget; //hasTarget
if (hn.taxaList().indexOf(target) >= 0) {
hasTarget = true;
nodeInfo.add("T");
} else {
hasTarget = false;
nodeInfo.add("F");
}
Collection<HapIdSetCount> hapMappingCollection = readMap.get(hn.referenceRange());
int readCount = hapMappingCollection.size();
nodeInfo.add(Integer.toString(hn.id())); //hapid
nodeInfo.add(hn.referenceRange().chromosome().getName()); //refRange.chrname
nodeInfo.add(Integer.toString(hn.referenceRange().start())); //refRange.start
nodeInfo.add(Integer.toString(hn.taxaList().numberOfTaxa())); //numberOfTaxa
nodeInfo.add(Double.toString(proportionN(hn.haplotypeSequence().sequence()))); //propN
if (hn.id() == -1) {
nodeInfo.add("0"); //inclusionCount
nodeInfo.add("0"); //exclusionCount
} else {
int inclusionCount = (int) hapMappingCollection.stream().filter(mapping -> mapping.getHapIdSet().contains(hn.id())).count();
nodeInfo.add(Integer.toString(inclusionCount)); //inclusionCount
nodeInfo.add(Integer.toString(readCount - inclusionCount)); //exclusionCount
}
nodeInfoList.add(nodeInfo);
if (!hasTarget) {
Optional<HaplotypeNode> optTargetNode = filteredGraph.nodes(hn.referenceRange())
.stream()
.filter(node -> node.taxaList().indexOf(target) >= 0)
.findFirst();
if (optTargetNode.isPresent()) {
HaplotypeNode targetNode = optTargetNode.get();
nodeInfo = new ArrayList<>();
nodeInfo.add("F"); //inPath
nodeInfo.add("T"); //hasTarget
nodeInfo.add(Integer.toString(targetNode.id())); //hapid
nodeInfo.add(hn.referenceRange().chromosome().getName()); //refRange.chrname
nodeInfo.add(Integer.toString(hn.referenceRange().start())); //refRange.start
nodeInfo.add(Integer.toString(targetNode.taxaList().numberOfTaxa())); //numberOfTaxa
nodeInfo.add(Double.toString(proportionN(targetNode.haplotypeSequence().sequence()))); //propN
if (targetNode.id() == -1) {
nodeInfo.add("0"); //inclusionCount
nodeInfo.add("0"); //exclusionCount
} else {
int inclusionCount = (int) hapMappingCollection.stream().filter(mapping -> mapping.getHapIdSet().contains(targetNode.id())).count();
nodeInfo.add(Integer.toString(inclusionCount)); //inclusionCount
nodeInfo.add(Integer.toString(readCount - inclusionCount)); //exclusionCount
}
nodeInfoList.add(nodeInfo);
}
}
return nodeInfoList;
}
public void writeGraphInfoFilesForChr(String outbase, List<HaplotypeNode> path,
HaplotypeGraph graph, Multimap<ReferenceRange, HapIdSetCount> readMap,
Chromosome chr, String target) {
//write edge data, vertex data with inclusion counts, path (list of haplotype id's)
String edgefilename = outbase + "_chr" + chr.getName() + "_edges.txt";
String vertexfilename = outbase + "_chr" + chr.getName() + "_vertices.txt";
//set up data structures
NavigableMap<ReferenceRange, List<HaplotypeNode>> tree = graph.tree(chr);
Map<HaplotypeNode, int[]> graphidMap = new HashMap<>();
int rangeCount = 0;
int nodeCount = 0;
for (List<HaplotypeNode> nodeList : tree.values()) {
for (HaplotypeNode node : nodeList) {
graphidMap.put(node, new int[] {rangeCount, nodeCount++});
}
rangeCount++;
}
//write the edges
try(PrintWriter pw = new PrintWriter(edgefilename)) {
pw.println("from\tto\tprob");
tree.values().stream().flatMap(List::stream)
.flatMap(hn -> graph.rightEdges(hn).stream())
.forEach(he -> {
int sourceid = graphidMap.get(he.leftHapNode())[1];
int targetid = graphidMap.get(he.rightHapNode())[1];
pw.printf("%d\t%d\t%1.3e%n", sourceid, targetid, he.edgeProbability());
});
} catch (FileNotFoundException e) {
myLogger.error("PrintWriter unable to write to " + edgefilename);
}
//write the vertices
try(PrintWriter pw = new PrintWriter(vertexfilename)) {
pw.println("id\trange\thapid\tname\ttaxa\tispath\tistarget\tinclusion\texclusion\tweightedExc\tchr\tstart\tend\tseqlength\tpN");
for (HaplotypeNode node : graphidMap.keySet()) {
//generate inclusion/exclusion counts
int[] ids = graphidMap.get(node);
kotlin.Triple<Integer, Integer, Double> weightedCounts = HaplotypeEmissionProbabilityKt.calculateWeightedExclusionCount(graph, node, readMap);
pw.print(Integer.toString(ids[1])); //id
pw.print("\t");
pw.print(Integer.toString(ids[0])); //range
pw.print("\t");
pw.print(Integer.toString(node.id())); //hapid (placeholder)
pw.print("\t");
pw.print(Integer.toString(node.numTaxa())); //name (placeholder: number of taxa)
pw.print("\t");
pw.print(node.taxaList().stream().map(Taxon::getName).collect(Collectors.joining(","))); //taxa
pw.print("\t");
pw.print(path.contains(node) ? "T" : "F");//ispath
pw.print("\t");
if (target == null) pw.print("F"); //istarget
else pw.print(node.taxaList().indexOf(target) >= 0 ? "T" : "F");
pw.print("\t");
pw.print(Integer.toString(weightedCounts.component1())); //inclusion
pw.print("\t");
pw.print(Integer.toString(weightedCounts.component2())); //exclusion
pw.print("\t");
pw.print(Double.toString(weightedCounts.component3())); //weighted exclusion
pw.print("\t");
pw.print(node.referenceRange().chromosome().getName()); //chr
pw.print("\t");
pw.print(Integer.toString(node.referenceRange().start())); //start
pw.print("\t");
pw.print(Integer.toString(node.referenceRange().end())); //end
pw.print("\t");
pw.print(Integer.toString(node.haplotypeSequence().length())); //sequence length
//calculate proportion of N's in the sequence
String seq = node.haplotypeSequence().sequence();
if (seq == null) pw.print("\tNA");
else {
pw.print("\t");
pw.print(Double.toString(proportionN(seq)));
}
pw.println();
}
} catch (FileNotFoundException e) {
myLogger.error("PrintWriter unable to write to " + vertexfilename);
}
}
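/*
 * Hypothetical first lines of the edge export file (values invented):
 *
 *   from    to      prob
 *   0       3       1.250e-01
 *
 * The integer ids in "from" and "to" match the "id" column of the vertex
 * file, so the two tables can be read into R and combined with igraph's
 * graph_from_data_frame (standard igraph usage; not part of this codebase).
 */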
private double proportionN(String seqString) {
int n = seqString.length();
if (n < 1) return 1.0;
int Ncount = (int) seqString.chars().filter(c -> c == 'N').count();
return (double) Ncount / n;
}
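/*
 * Worked example: proportionN("ACGTNN") counts 2 'N' characters out of 6 and
 * returns 2.0 / 6 ≈ 0.333. An empty sequence returns 1.0, so a zero-length
 * haplotype is treated as entirely ambiguous. Note that only uppercase 'N'
 * is counted.
 */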
private List