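/*
 * NOTE: the lines below are navigation text captured from the artifact-repository web page this source
 * listing was copied from. They are kept for provenance only and are not part of the program.
 *
 * net.maizegenetics.pangenome.hapCalling.ScoreRangesByInclusionCountsPlugin Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group. Show more artifacts with this name.
 * Show all versions of phg. Show documentation.
 * PHG - Practical Haplotype Graph
 * There is a newer version: 1.10
 */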
package net.maizegenetics.pangenome.hapCalling;

import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import net.maizegenetics.analysis.clustering.Haplotype;
import net.maizegenetics.pangenome.api.GraphUtils;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.List;
import java.util.stream.Collector;
import java.util.stream.Collectors;

/**
 * Created by zrm22 on 10/11/17.
 *
 * This plugin will take in any number of inclusion file and will score each reference range by counting how many taxa hit this ref range at least once
 */

@Deprecated
public class ScoreRangesByInclusionCountsPlugin extends AbstractPlugin {

    private PluginParameter inclusionFilenameDir = new PluginParameter.Builder<>("inclusionFileDir", null, String.class)
            .description("The name of the file containing read inclusion and exclusion counts for hapids.")
            .inDir()
            .required(true)
            .build();

    private PluginParameter outputFileName = new PluginParameter.Builder<>("outputFile",null, String.class)
            .description("Name of output file")
            .outFile()
            .required(true)
            .build();

    private PluginParameter filterRefRanges = new PluginParameter.Builder<>("filterRefRangeFile",null,String.class)
            .description("Name of ref range filter file")
            .inFile()
            .required(false)
            .build();


    public ScoreRangesByInclusionCountsPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    @Override
    public DataSet processData(DataSet input) {
        HaplotypeGraph graph = (HaplotypeGraph)input.getDataOfType(HaplotypeGraph.class).get(0).getData();

        Optional> setOfRefRangesToFilter = parseFilterRefFile(filterRefRanges());
//        Multiset inclusionCounts = countInclusionFiles(graph,inclusionFilenameDir(), setOfRefRangesToFilter);
        Tuple, Map> inclusionCounts = countInclusionFiles(graph,inclusionFilenameDir(), setOfRefRangesToFilter);

        writeCountsToTSV(outputFileName(), inclusionCounts);

        return null;
    }

    /**
     * Method to parse the Ref file.  Only needs to have the referenceRange id as the first entry on each line to work properly
     * @param filterRefFileName
     * @return
     */
    private Optional> parseFilterRefFile(String filterRefFileName) {
        if(filterRefFileName == null) {
            return Optional.empty();
        }

        Set refIdsToKeep = new HashSet<>();

        try(BufferedReader reader = Utils.getBufferedReader(filterRefFileName)) {
            String currentLine = reader.readLine();
            while((currentLine = reader.readLine())!=null) {
                int tabIndex = currentLine.indexOf("\t");

                refIdsToKeep.add(Integer.parseInt(currentLine.substring(0,tabIndex)));
            }
        }
        catch(Exception e) {
            throw new IllegalStateException("Unable to read in refrange id file:",e);
        }

        return Optional.of(refIdsToKeep);
    }

    /**
     * Method to count up the Inclusion files and will return a tuple containing two datasets
     * @param graph graph object
     * @param inclusionFileDir inclusion file directory
     * @param setOfRefRangesToFilter Set of reference range ids for filtering
     * @return Return a tuple containing two datasets,
     *          one a Multiset of Reference ranges will count how many taxon hit each Reference Range,
     *          the other is a Map which counts how many nodes are in each reference range viewed in the Inclusion file
     */
    private Tuple, Map> countInclusionFiles(HaplotypeGraph graph, String inclusionFileDir, Optional> setOfRefRangesToFilter) {
        Multiset refRangeTaxaCounts = TreeMultiset.create();
        Map> refRangeToSetOfNodeMap = new HashMap<>();

        java.util.List inclusionFiles = DirectoryCrawler.listPaths("glob:*.txt", Paths.get(inclusionFileDir));
        for (Path currentInclusionFilePath : inclusionFiles) {
            System.out.println("Counting the following file:"+currentInclusionFilePath.getFileName().toString());
            try {
                //Get a sorted list of all the nodes we need from the graph
                SortedSet nodeHitSet = Files.lines(currentInclusionFilePath).map(line -> extractId(line)).collect(Collector.of(TreeSet::new, (set, nodeId) -> set.add(nodeId), (leftSet, rightSet) -> {leftSet.addAll(rightSet); return leftSet;}));

                //Extract the nodes we hit from the graph
                List nodeList = GraphUtils.nodes(graph,nodeHitSet);
                Multiset refRangeHits = nodeList.stream()
                        //.map(node -> node.referenceRange().id()) //convert the node to a reference range and id
                        .filter(node -> isRefRangeInListToFilter(node.referenceRange().id(),setOfRefRangesToFilter))
                        .map(node -> node.referenceRange())
                        .distinct() //Filter out any ids that are the same as we are just looking for unique ranges
                        .collect(Collector.of(TreeMultiset::create, (multiSet, refRange) ->  multiSet.add(refRange), (leftSet, rightSet) -> {leftSet.addAll(rightSet); return leftSet;}));

                refRangeTaxaCounts.addAll(refRangeHits);

                //Count the number of nodes per reference range
                for(HaplotypeNode node : nodeList) {
                    if(!refRangeToSetOfNodeMap.containsKey(node.referenceRange())) {
                        refRangeToSetOfNodeMap.put(node.referenceRange(),new HashSet<>());
                    }
                    refRangeToSetOfNodeMap.get(node.referenceRange()).add(node);
                }

            }
            catch(Exception e) {
                e.printStackTrace();
                throw new IllegalStateException("Error reading inclusion file:"+currentInclusionFilePath,e);
            }
        }

        Map refRangeNodeCount = refRangeToSetOfNodeMap.keySet().stream().collect(Collectors.toMap(key -> key, key -> refRangeToSetOfNodeMap.get(key).size()));
        //Wrap the counts into a Tuple
        return new Tuple(refRangeTaxaCounts, refRangeNodeCount);
    }

    /**
     * Method to check to see if the reference range is included in the filter.  If we do not have the setOfRefRangesToFilter, we include all reference ranges
     * @param rangeId
     * @param setOfRefRangesToFilter
     * @return
     */
    private boolean isRefRangeInListToFilter(int rangeId, Optional> setOfRefRangesToFilter) {
        if(!setOfRefRangesToFilter.isPresent()) {
            return true;
        }

        if(setOfRefRangesToFilter.get().contains(rangeId)) {
            return true;
        }
        else {
            return false;
        }
    }

    /**
     * Method to extract the reference id from the text file's line
     * @param line
     * @return
     */
    private int extractId(String line) {
        int tabIndex = line.indexOf("\t");
        return Integer.parseInt(line.substring(0,tabIndex));
    }

    /**
     * Method to write out the counts to a tab delimited file.
     * @param outputFileName
     * @param dataToExport
     */
    private void writeCountsToTSV(String outputFileName, Tuple, Map> dataToExport) {
        Multiset counts = dataToExport.getX();
        Map nodeCounts = dataToExport.getY();
        try(BufferedWriter writer = Utils.getBufferedWriter(outputFileName)) {
            writer.write("RefRangeId\tChr\tstPos\tendPos\tCount\tnodeCount\n");
            for(ReferenceRange refRange : counts.elementSet()) {
                writer.write(refRange.id() + "\t" + refRange.chromosome().getName() + "\t" + refRange.start() + "\t" + refRange.end() + "\t" + counts.count(refRange)+"\t"+nodeCounts.get(refRange)+"\n");
            }
        }
        catch(Exception e) {
            throw new IllegalStateException("Error writing out count file:",e);
        }
    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "VerifyInclusion";
    }

    @Override
    public String getToolTipText() {
        return "Verify Inclusion Files";
    }

    @Override
    public String pluginDescription() {
        return "Plugin to check the number of nodes and reference ranges filtered out by the HMM processing";
    }

    //TODO implement documentation
    @Override
    public String pluginUserManualURL() {
        return "https://bitbucket.org/tasseladmin/tassel5source/wiki/UserManual";
    }


    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
//     public static void main(String[] args) {
//         GeneratePluginCode.generate(ScoreRangesByInclusionCountsPlugin.class);
//     }

    /**
     * Convenience method to run plugin with one return object.
     */
    // TODO: Replace  with specific type.
//    public  runPlugin(DataSet input) {
//        return () performFunction(input).getData(0).getData();
//    }

    /**
     * The name of the file containing read inclusion and
     * exclusion counts for hapids.
     *
     * @return Inclusion File Dir
     */
    public String inclusionFilenameDir() {
        return inclusionFilenameDir.value();
    }

    /**
     * Set Inclusion File Dir. The name of the file containing
     * read inclusion and exclusion counts for hapids.
     *
     * @param value Inclusion File Dir
     *
     * @return this plugin
     */
    public ScoreRangesByInclusionCountsPlugin inclusionFilenameDir(String value) {
        inclusionFilenameDir = new PluginParameter<>(inclusionFilenameDir, value);
        return this;
    }

    /**
     * Name of output file
     *
     * @return Output File
     */
    public String outputFileName() {
        return outputFileName.value();
    }

    /**
     * Set Output File. Name of output file
     *
     * @param value Output File
     *
     * @return this plugin
     */
    public ScoreRangesByInclusionCountsPlugin outputFileName(String value) {
        outputFileName = new PluginParameter<>(outputFileName, value);
        return this;
    }

    /**
     * Name of ref range filter file
     *
     * @return Filter Ref Range File
     */
    public String filterRefRanges() {
        return filterRefRanges.value();
    }

    /**
     * Set Filter Ref Range File. Name of ref range filter
     * file
     *
     * @param value Filter Ref Range File
     *
     * @return this plugin
     */
    public ScoreRangesByInclusionCountsPlugin filterRefRanges(String value) {
        filterRefRanges = new PluginParameter<>(filterRefRanges, value);
        return this;
    }
}