net.maizegenetics.pangenome.hapCalling.ScoreRangesByInclusionCountsPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapCalling;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import net.maizegenetics.analysis.clustering.Haplotype;
import net.maizegenetics.pangenome.api.GraphUtils;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.stream.Collector;
import java.util.stream.Collectors;
/**
* Created by zrm22 on 10/11/17.
*
* This plugin will take in any number of inclusion file and will score each reference range by counting how many taxa hit this ref range at least once
*/
@Deprecated
public class ScoreRangesByInclusionCountsPlugin extends AbstractPlugin {
private PluginParameter inclusionFilenameDir = new PluginParameter.Builder<>("inclusionFileDir", null, String.class)
.description("The name of the file containing read inclusion and exclusion counts for hapids.")
.inDir()
.required(true)
.build();
private PluginParameter outputFileName = new PluginParameter.Builder<>("outputFile",null, String.class)
.description("Name of output file")
.outFile()
.required(true)
.build();
private PluginParameter filterRefRanges = new PluginParameter.Builder<>("filterRefRangeFile",null,String.class)
.description("Name of ref range filter file")
.inFile()
.required(false)
.build();
public ScoreRangesByInclusionCountsPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
HaplotypeGraph graph = (HaplotypeGraph)input.getDataOfType(HaplotypeGraph.class).get(0).getData();
Optional> setOfRefRangesToFilter = parseFilterRefFile(filterRefRanges());
// Multiset inclusionCounts = countInclusionFiles(graph,inclusionFilenameDir(), setOfRefRangesToFilter);
Tuple, Map> inclusionCounts = countInclusionFiles(graph,inclusionFilenameDir(), setOfRefRangesToFilter);
writeCountsToTSV(outputFileName(), inclusionCounts);
return null;
}
/**
* Method to parse the Ref file. Only needs to have the referenceRange id as the first entry on each line to work properly
* @param filterRefFileName
* @return
*/
private Optional> parseFilterRefFile(String filterRefFileName) {
if(filterRefFileName == null) {
return Optional.empty();
}
Set refIdsToKeep = new HashSet<>();
try(BufferedReader reader = Utils.getBufferedReader(filterRefFileName)) {
String currentLine = reader.readLine();
while((currentLine = reader.readLine())!=null) {
int tabIndex = currentLine.indexOf("\t");
refIdsToKeep.add(Integer.parseInt(currentLine.substring(0,tabIndex)));
}
}
catch(Exception e) {
throw new IllegalStateException("Unable to read in refrange id file:",e);
}
return Optional.of(refIdsToKeep);
}
/**
* Method to count up the Inclusion files and will return a tuple containing two datasets
* @param graph graph object
* @param inclusionFileDir inclusion file directory
* @param setOfRefRangesToFilter Set of reference range ids for filtering
* @return Return a tuple containing two datasets,
* one a Multiset of Reference ranges will count how many taxon hit each Reference Range,
* the other is a Map which counts how many nodes are in each reference range viewed in the Inclusion file
*/
private Tuple, Map> countInclusionFiles(HaplotypeGraph graph, String inclusionFileDir, Optional> setOfRefRangesToFilter) {
Multiset refRangeTaxaCounts = TreeMultiset.create();
Map> refRangeToSetOfNodeMap = new HashMap<>();
java.util.List inclusionFiles = DirectoryCrawler.listPaths("glob:*.txt", Paths.get(inclusionFileDir));
for (Path currentInclusionFilePath : inclusionFiles) {
System.out.println("Counting the following file:"+currentInclusionFilePath.getFileName().toString());
try {
//Get a sorted list of all the nodes we need from the graph
SortedSet nodeHitSet = Files.lines(currentInclusionFilePath).map(line -> extractId(line)).collect(Collector.of(TreeSet::new, (set, nodeId) -> set.add(nodeId), (leftSet, rightSet) -> {leftSet.addAll(rightSet); return leftSet;}));
//Extract the nodes we hit from the graph
List nodeList = GraphUtils.nodes(graph,nodeHitSet);
Multiset refRangeHits = nodeList.stream()
//.map(node -> node.referenceRange().id()) //convert the node to a reference range and id
.filter(node -> isRefRangeInListToFilter(node.referenceRange().id(),setOfRefRangesToFilter))
.map(node -> node.referenceRange())
.distinct() //Filter out any ids that are the same as we are just looking for unique ranges
.collect(Collector.of(TreeMultiset::create, (multiSet, refRange) -> multiSet.add(refRange), (leftSet, rightSet) -> {leftSet.addAll(rightSet); return leftSet;}));
refRangeTaxaCounts.addAll(refRangeHits);
//Count the number of nodes per reference range
for(HaplotypeNode node : nodeList) {
if(!refRangeToSetOfNodeMap.containsKey(node.referenceRange())) {
refRangeToSetOfNodeMap.put(node.referenceRange(),new HashSet<>());
}
refRangeToSetOfNodeMap.get(node.referenceRange()).add(node);
}
}
catch(Exception e) {
e.printStackTrace();
throw new IllegalStateException("Error reading inclusion file:"+currentInclusionFilePath,e);
}
}
Map refRangeNodeCount = refRangeToSetOfNodeMap.keySet().stream().collect(Collectors.toMap(key -> key, key -> refRangeToSetOfNodeMap.get(key).size()));
//Wrap the counts into a Tuple
return new Tuple(refRangeTaxaCounts, refRangeNodeCount);
}
/**
* Method to check to see if the reference range is included in the filter. If we do not have the setOfRefRangesToFilter, we include all reference ranges
* @param rangeId
* @param setOfRefRangesToFilter
* @return
*/
private boolean isRefRangeInListToFilter(int rangeId, Optional> setOfRefRangesToFilter) {
if(!setOfRefRangesToFilter.isPresent()) {
return true;
}
if(setOfRefRangesToFilter.get().contains(rangeId)) {
return true;
}
else {
return false;
}
}
/**
* Method to extract the reference id from the text file's line
* @param line
* @return
*/
private int extractId(String line) {
int tabIndex = line.indexOf("\t");
return Integer.parseInt(line.substring(0,tabIndex));
}
/**
* Method to write out the counts to a tab delimited file.
* @param outputFileName
* @param dataToExport
*/
private void writeCountsToTSV(String outputFileName, Tuple, Map> dataToExport) {
Multiset counts = dataToExport.getX();
Map nodeCounts = dataToExport.getY();
try(BufferedWriter writer = Utils.getBufferedWriter(outputFileName)) {
writer.write("RefRangeId\tChr\tstPos\tendPos\tCount\tnodeCount\n");
for(ReferenceRange refRange : counts.elementSet()) {
writer.write(refRange.id() + "\t" + refRange.chromosome().getName() + "\t" + refRange.start() + "\t" + refRange.end() + "\t" + counts.count(refRange)+"\t"+nodeCounts.get(refRange)+"\n");
}
}
catch(Exception e) {
throw new IllegalStateException("Error writing out count file:",e);
}
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "VerifyInclusion";
}
@Override
public String getToolTipText() {
return "Verify Inclusion Files";
}
@Override
public String pluginDescription() {
return "Plugin to check the number of nodes and reference ranges filtered out by the HMM processing";
}
//TODO implement documentation
@Override
public String pluginUserManualURL() {
return "https://bitbucket.org/tasseladmin/tassel5source/wiki/UserManual";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(ScoreRangesByInclusionCountsPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace with specific type.
// public runPlugin(DataSet input) {
// return () performFunction(input).getData(0).getData();
// }
/**
* The name of the file containing read inclusion and
* exclusion counts for hapids.
*
* @return Inclusion File Dir
*/
public String inclusionFilenameDir() {
return inclusionFilenameDir.value();
}
/**
* Set Inclusion File Dir. The name of the file containing
* read inclusion and exclusion counts for hapids.
*
* @param value Inclusion File Dir
*
* @return this plugin
*/
public ScoreRangesByInclusionCountsPlugin inclusionFilenameDir(String value) {
inclusionFilenameDir = new PluginParameter<>(inclusionFilenameDir, value);
return this;
}
/**
* Name of output file
*
* @return Output File
*/
public String outputFileName() {
return outputFileName.value();
}
/**
* Set Output File. Name of output file
*
* @param value Output File
*
* @return this plugin
*/
public ScoreRangesByInclusionCountsPlugin outputFileName(String value) {
outputFileName = new PluginParameter<>(outputFileName, value);
return this;
}
/**
* Name of ref range filter file
*
* @return Filter Ref Range File
*/
public String filterRefRanges() {
return filterRefRanges.value();
}
/**
* Set Filter Ref Range File. Name of ref range filter
* file
*
* @param value Filter Ref Range File
*
* @return this plugin
*/
public ScoreRangesByInclusionCountsPlugin filterRefRanges(String value) {
filterRefRanges = new PluginParameter<>(filterRefRanges, value);
return this;
}
}