All downloads are free. Search and download functionality uses the official Maven repository.

net.maizegenetics.pangenome.hapCalling.HapCountBestPathToTextPlugin Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.hapCalling;

import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.pangenome.api.*;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.pangenome.db_loading.PHGDataWriter;
import net.maizegenetics.pangenome.db_loading.PHGdbAccess;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Supplier;
import java.util.stream.Collector;

/**
 * This plugin processes haplotype node data either from an inclusion file or from PHG database
 * tables.  The HapCountBestPathPlugin is called to find the best path through the provided
 * haplotype graph.  The output will be either stored to the PHG db, printed to text files or both depending
 * on the input type.
 *   
 * Assumptions made by this software:
 * 1.  This method can determine paths from haplotype counts (perfect hits/exclusion) that are stored in the db's haplotype_counts
 *     table, or from haplotype counts (perfect hits/exclusion) pulled from an inclusion file.  If an inclusion
 *     file is present, its data is used.  Otherwise the DB table is used.
 * 2.  When the DB is used to pull haplotype_counts data, any paths created will be stored in the DB.
 *     If an inclusion file is used, paths are NOT stored to the DB.  This is because the paths table
 *     needs the haplotype_counts_id.  When an inclusion file is used, the assumption is that these
 *     haplotype counts are NOT stored in the DB.
 * 3.  The exported inclusion files (that are input to this Plugin) and the exported path files (created by
 *     this plugin) each contain "header" data needed to load their contents into the DB.  These are the first
 *     lines in the file, and each begin with #.  A separate plugin should be written to load these files to db
 *     if desired.
 *      
 * 
 * @author Terry Casstevens, Zack Miller, Lynn Johnson
 */
@Deprecated
public class HapCountBestPathToTextPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(HapCountBestPathToTextPlugin.class);

    /**
     * Standard TASSEL plugin constructor.
     *
     * @param parentFrame   parent GUI frame (null when run from the command line)
     * @param isInteractive true when the plugin is run interactively from the GUI
     */
    public HapCountBestPathToTextPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    // NOTE(review): the <String> type arguments below were stripped during
    // extraction and have been restored; every parameter here is a
    // String-valued PluginParameter (each is built with String.class).

    private PluginParameter<String> configFile = new PluginParameter.Builder<>("configFile", null, String.class)
            .description("Database configuration file")
            .guiName("Config File")
            .required(true)
            .inFile()
            .build();

    private PluginParameter<String> taxaFilterString = new PluginParameter.Builder<>("taxa", null, String.class)
            .description("A comma delimited list of taxa (no spaces allowed) to include in graph. Only nodes containing these taxa will be included in the graph."
                    + " If no taxa list is supplied, then all taxa in the full graph will be used.")
            .build();

    private PluginParameter<String> inclusionFilenameDir = new PluginParameter.Builder<>("inclusionFileDir", null, String.class)
            .description("The name of the file containing read inclusion and exclusion counts for hapids.")
            .inDir()
            .build();

    private PluginParameter<String> targetTaxon = new PluginParameter.Builder<>("target", null, String.class)
            .description("The taxon that will be used to evaluate the node list returned.")
            .build();

    private PluginParameter<String> refRangeFile = new PluginParameter.Builder<>("refRangeFile", null, String.class)
            .description("The name of the file containing the reference ranges to keep.")
            .inFile()
            .build();

    private PluginParameter<String> myReferenceFileName = new PluginParameter.Builder<>("refFileName", null, String.class)
            .required(false)
            .inFile()
            .guiName("Reference File Name")
            .description("Reference file name in case you want to index on the fly")
            .build();

    private PluginParameter<String> myOutputDir = new PluginParameter.Builder<>("outputDir", null, String.class)
            .required(true)
            .outDir()
            .description("Output Directory")
            .build();

    // Method details are programmatically generated.
    private PluginParameter<String> myHapCountMethod = new PluginParameter.Builder<>("hapCountMethod", null, String.class)
            .required(true)
            .description("Name of method used to creates inclusion/exclusion counts in FastqToHapCountPLugin")
            .build();

    private PluginParameter<String> myPathMethod = new PluginParameter.Builder<>("pMethod", null, String.class)
            .guiName("Path Method")
            .required(true)
            .description("Name of method to be used to create paths through the graph.")
            .build();

    /**
     * Finds best paths through the supplied HaplotypeGraph from inclusion-file
     * haplotype counts and exports them as text files.
     *
     * @param input DataSet that must contain exactly one {@link HaplotypeGraph}
     * @return null (results are written to the output directory, not returned)
     * @throws IllegalArgumentException if the config file cannot be read or the
     *                                  input does not contain exactly one graph
     * @throws IllegalStateException    if no inclusion files are found
     */
    @Override
    public DataSet processData(DataSet input) {

        // Load the config file once.  It doubles as the source of the tuning
        // parameters for HapCountBestPathPlugin (the original code parsed the
        // same file twice).
        Properties dbProperties = loadProperties();

        // The TASSEL pipeline must pass in exactly one HaplotypeGraph.
        List<Datum> temp = input.getDataOfType(HaplotypeGraph.class);
        if (temp.size() != 1) {
            throw new IllegalArgumentException("HapCountBestPathToTextPlugin: processData: must input one HaplotypeGraph: " + temp.size());
        }
        HaplotypeGraph graph = (HaplotypeGraph) temp.get(0).getData();
        myLogger.info("Number of Nodes: " + graph.numberOfNodes());

        // Collect the inclusion files (plain or gzipped text) if a directory was supplied.
        List<Path> inclusionFiles = null;
        if (inclusionFilenameDir() != null) {
            inclusionFiles = DirectoryCrawler.listPaths("glob:*{.txt,.txt.gz}", Paths.get(inclusionFilenameDir()));
        }

        HapCountBestPathPlugin bestPathPlugin = setupHapCountBestPathPlugin(dbProperties);

        // If inclusion files are present, process from the files.  Otherwise the
        // DB route would be used, but it is disabled while this plugin is deprecated.
        if (inclusionFiles != null && !inclusionFiles.isEmpty()) {
            processInclusionFiles(input, dbProperties, bestPathPlugin);
        } else {
            throw new IllegalStateException("Error processing inclusion files.  No inclusion files are found in:" + inclusionFilenameDir() + "\n" +
                    "Check to see if folder exists(is mounted correctly) and check if index file(.mmi) exists.");
            //variants = processHapCountsFromDB(input,dbProperties, bestPathPlugin); TODO Undeprecate this call when DB changes are in place.
        }

        return null;
    }
    
    /**
     * Builds a configured HapCountBestPathPlugin from the config-file properties.
     * Defaults here appear to mirror those recorded by createPathMethodDetails —
     * TODO confirm they stay in sync if either changes.
     *
     * @param dbProperties properties parsed from the database config file
     * @return a fully configured (but not yet run) HapCountBestPathPlugin
     */
    private HapCountBestPathPlugin setupHapCountBestPathPlugin(Properties dbProperties) {
        //note that setting minReads to 0 overrides removeEqual, setting it to false in HapCountBestPathPlugin
        return new HapCountBestPathPlugin(null, false)                
                .emissionMethod(ReferenceRangeEmissionProbability.METHOD.valueOf(dbProperties.getProperty("emissionMethod","allCounts")))             
                .probReadMappedCorrectly(Double.parseDouble(dbProperties.getProperty("probReadMappedCorrectly","0.99")))
                .minTaxaPerRange(Integer.parseInt(dbProperties.getProperty("minTaxaPerRange","20")))
                .refRangeFile(refRangeFile())
                .minReads(Integer.parseInt(dbProperties.getProperty("minReads","1")))
                .removeRangesWithEqualCounts(Boolean.parseBoolean(dbProperties.getProperty("removeEqual","true")))
                .maxNodesPerRange(Integer.parseInt(dbProperties.getProperty("maxNodesPerRange","30")))
                .maxReadsPerKB(Integer.parseInt(dbProperties.getProperty("maxReadsPerKB","100")))
                .splitTaxa(Boolean.parseBoolean(dbProperties.getProperty("splitTaxa","false")))
                .minTransitionProb(Double.parseDouble(dbProperties.getProperty("minTransitionProb","0.001")))
                .taxaFilterString(taxaFilterString());

     }

    /**
     * Reads the database configuration file into a {@link Properties} object.
     *
     * @return the parsed configuration properties
     * @throws IllegalStateException if the file cannot be opened or parsed
     */
    private Properties loadProperties() {
        final Properties config = new Properties();
        try {
            config.load(Utils.getBufferedReader(configFile()));
            return config;
        } catch (Exception exc) {
            myLogger.error("HapCountBestPathToTextPlugin: loadProperties Failed to Load Properties: " + exc.getMessage());
            throw new IllegalStateException("HapCountBestPathToTextPlugin: loadProperties Failed to Load Properties.", exc);
        }
    }


    /**
     * Builds the free-text "method details" string that is recorded alongside a
     * path method (and written as a header line by ExportHaplotypePathToFilePlugin).
     *
     * @param dbProperties properties parsed from the database config file
     * @return a single-line summary of the path-finding parameters (no trailing newline)
     */
    private String createPathMethodDetails(Properties dbProperties) {
        StringBuilder details = new StringBuilder();
        details.append("minTaxaPerRange:").append(dbProperties.getProperty("minTaxaPerRange", "20"));
        details.append(" minReads:").append(dbProperties.getProperty("minReads", "1"));
        details.append(" removeEqual:").append(dbProperties.getProperty("removeEqual", "true"));
        details.append(" maxReadsPerKB:").append(dbProperties.getProperty("maxReadsPerKB", "100"));
        // BUG FIX: this label previously appended the "splitTaxa" property value
        // instead of "maxNodesPerRange".
        details.append(" maxNodesPerRange:").append(dbProperties.getProperty("maxNodesPerRange", "30"));
        details.append(" minTransitionProb:").append(dbProperties.getProperty("minTransitionProb", "0.001"));
        details.append(" probReadMappedCorrectly:").append(dbProperties.getProperty("probReadMappedCorrectly", "0.99"));
        // NOTE(review): "emmissionMethod" is misspelled but kept as-is in case
        // downstream consumers parse this exact label.
        details.append(" emmissionMethod:").append(dbProperties.getProperty("emissionMethod", "allCounts"));
        details.append(" splitTaxa:").append(dbProperties.getProperty("splitTaxa", "false"));
        if (taxaFilterString() != null) {
            // BUG FIX: a separator now precedes each taxon; previously the first
            // taxon ran directly into the preceding value and a trailing space
            // was left at the end of the string.
            for (String taxon : taxaFilterString().split(",")) {
                details.append(' ').append(taxon);
            }
        }
        // New line at end not needed, ExportHaplotypePathToFilePlugin will add one when writing to a file.
        return details.toString();
    }
    private List processHapCountsFromDB(DataSet graph,Properties dbProperties,HapCountBestPathPlugin bestPathPlugin) {

        Connection conn = DBLoadingUtils.connection(configFile(), false);
        
        // Grab the haplotype_counts_id and data from the db
        PHGDataWriter phg = new PHGdbAccess(conn);
        
        String methodDetails = createPathMethodDetails(dbProperties);
        // Store the path method. This returns new (if created) or existing path method ID
        // Deprecated, so no changes added to allow for test method type
        int pathMethodID = phg.putMethod(pathMethod(), DBLoadingUtils.MethodType.PATHS, pluginParameters());
        
        // data from the haplotype_counts table
        // Map of haplotype_counts_id to Tuple
        Map> hapCountsIDtoDataMap = phg.getHapCountsIDAndDataForVersionMethod( hapCountMethod());
        
        // See which of the haplotype_counts entries we have already stored in the paths table for this method
        Map hapCountsIDToPathMethod = phg.getHapCountsIDAndPathsForMethod(pathMethod());
               
        // Find which hapids in the hapCountsIDtoDataMap are NOT in the hapCountsIDToPathMethod.
        // These to be run through HapCountBestPathPlugin !!
        Map> hapCountsNotStored = new HashMap>();
        for (Integer hapCountsId : hapCountsIDtoDataMap.keySet()) {
            if (!hapCountsIDToPathMethod.containsKey(hapCountsId)) {
                // not in paths table - add to list
                hapCountsNotStored.put(hapCountsId, hapCountsIDtoDataMap.get(hapCountsId));
            }
        }

        // Combine the datasets - this is input to HapCountBestPathPlugin called in stream below
        DataSet hapCountsDS = new DataSet(new Datum("HapCountsMap",hapCountsNotStored,null),this);       
        List combinedDS = new ArrayList<>();
        combinedDS.add(graph);
        combinedDS.add(hapCountsDS);        

        // The Multimap stores  paths = hapCountsNotStored.keySet().stream()
                .map(path -> {
 
                    String taxonName = hapCountsNotStored.get(path).getX();

                    //Get the Best Path given db inclusion/exclusion data and graph
                    DataSet processedDataSet = bestPathPlugin.haplotypeCountsId(path)
                                               .performFunction(DataSet.getDataSet(combinedDS, this));
                    List nodes = (List) processedDataSet.getData(0).getData();
                    // Code below is error checking to verify the returned path contains no more
                    // than a single node in any given reference range.
                    SortedSet temp = new TreeSet<>();
                    for (HaplotypeNode node : nodes) {
                        if (!temp.add(node.referenceRange())) {
                            throw new IllegalStateException("HaplotypeCountBestPathToVCFPlugin: processInclusionFiles: HapCountBestPathPlugin returned path has more than one node in the same reference range: " + node.referenceRange());
                        }
                    }
                    myLogger.info("processHapCountsFromDB: number of nodes for taxon/id: " + taxonName + "/" + path + " " + nodes.size());
                    return new Tuple<>(path, nodes);
                })
                // Every node on the list returned from HapCountBestPathPlugin gets
                // its own entry on the Multimap created below.                              
                .collect((Supplier>) () -> TreeMultimap.create(),
                        (map, entries) -> {
                            for (HaplotypeNode entry : entries.y) {
                                if (entry.id() != -1) {
                                    map.put(entries.x, entry);
                                }
                            }
                        },
                        (map1, map2) -> map1.putAll(map2));  
 
//        phg.putPathsData(pathMethod, methodDetails, paths); // Write to DB
        
        try {
            ((PHGdbAccess)phg).close();
        } catch (Exception exc) {
            myLogger.warn("Error closing phg db connection");
        }

        // Export the path data to files, on a per taxon/haplotype_counts_id basis
        // To get the taxon name, index the haplotype_counts_id into the hapCountsNotStored map from above

        exportHaplotypePaths(paths, hapCountsNotStored, outputDir(),  hapCountMethod(), 
                pathMethod(), methodDetails);

        List outputs = new ArrayList<>();
        outputs.add(graph.getData(0));
        outputs.add(new Datum("Paths", paths, null));

        return null;

    }

    private List processInclusionFiles(DataSet graph, Properties dbProperties,HapCountBestPathPlugin bestPathPlugin) {

        List inclusionFiles = DirectoryCrawler.listPaths("glob:*{.txt,.txt.gz}", Paths.get(inclusionFilenameDir()));

        // parallelStream() causes failures running RunSmallSeqTestsDocker - December 15, 2017
        // Run with "stream" until we figure out the parallelization problem.
        //Multimap paths = inclusionFiles.parallelStream()
        Multimap paths = inclusionFiles.stream()
                .map(path -> {
                    String taxonName = parseTaxonName(path.toString());
                    //Get the Best Path given inclusion file and graph
                    DataSet processedDataSet = bestPathPlugin.inclusionFilename(path.toString())
                            .performFunction(graph);

                    List nodes = (List) processedDataSet.getData(0).getData();
                    // Code below is error checking to verify the returned path contains no more
                    // than a single node in any given reference range.
                    SortedSet temp = new TreeSet<>();
                    for (HaplotypeNode node : nodes) {
                        if (!temp.add(node.referenceRange())) {
                            throw new IllegalStateException("HaplotypeCountBestPathToVCFPlugin: processInclusionFiles: HapCountBestPathPlugin returned path has more than one node in the same reference range: " + node.referenceRange());
                        }
                    }
                    myLogger.info("processInclusionFiles: number of nodes for taxon: " + taxonName + " path: " + nodes.size());
                    return new Tuple<>(taxonName, nodes);
                })
                .collect((Supplier>) () -> TreeMultimap.create(),
                        (map, nodes) -> {
                            for (HaplotypeNode node : nodes.y) {
                                if (node.id() != -1) {
                                    map.put(nodes.x, node); // add taxonName, nodes to map
                                }
                            }
                        },
                        (map1, map2) -> map1.putAll(map2));

        
        // NOTE - Path data is only written to the db when processing haplotype counts from
        // the PHG db haplotype_counts table.  When processing from an inclusion file, the
        // inclusion data may not exist in the db, so the required haplotype_counts_id for the
        // paths table is not present. 
        List outputs = new ArrayList<>();
        outputs.add(graph.getData(0));
        outputs.add(new Datum("Paths", paths, null));

        // The original ExportHaplotypePathToFilePlugin is called to create the path output files
        // The version and method data are included to become header lines in the exported file.
        // This provides data that will allow the path file to be imported to the DB later if desired.
 
        ExportHaplotypePathToFilePlugin export = new ExportHaplotypePathToFilePlugin(getParentFrame(), isInteractive())
                .refVersion("REF_VERSION")
                .outputFileDirectory(outputDir())
                .hapCountMethod(hapCountMethod())
                .pathMethod(pathMethod())
                .pathMethodDetails(createPathMethodDetails(dbProperties));
        export.performFunction(new DataSet(outputs, this));
        return null;

    }
    /**
     * Derives a taxon name from an inclusion-file path: the file name with its
     * last extension removed (e.g. "/dir/B73.txt" -> "B73", "B73.txt.gz" -> "B73.txt").
     *
     * @param currentPath path to the inclusion file (separated by '/')
     * @return the file name without its final extension; the whole file name if
     *         it contains no '.'
     */
    private static String parseTaxonName(String currentPath) {
        String[] pathParts = currentPath.split("/");
        String fileName = pathParts[pathParts.length - 1];

        // BUG FIX: a file name with no '.' previously caused
        // StringIndexOutOfBoundsException from substring(0, -1).
        int lastDot = fileName.lastIndexOf('.');
        return (lastDot < 0) ? fileName : fileName.substring(0, lastDot);
    }

    // Exports the paths to a file with the name format of outputDir/_path.txt
    // This method is called when processing data from the DB.  
    private static void exportHaplotypePaths (Multimap paths,Map> hapCountsNotStored, String outputDir, 
            String hcMethod, String pMethod, String pMethodDetails) {

        //loop through each haplotype counts id for which we have a path
        for (Integer hapCountId : paths.keySet()) {
            String taxonName = hapCountsNotStored.get(hapCountId).getX();
            myLogger.debug("HapCountBestPathTextPlugin:exportHaplotypePaths:Processing Taxon:" + taxonName);
            try (BufferedWriter writer = Utils.getBufferedWriter(outputDir + "/" + taxonName + "_path.txt")) {
                // Write header data - necessary if this file is used to load the DB paths table.
                writer.write("#hapCountMethod=" + hcMethod + "\n");
                writer.write("#pathMethod=" + pMethod + "\n");
                
                if (pMethodDetails != null) {
                    writer.write("#pathMethodDetails=" + pMethodDetails + "\n");
                } else {
                    writer.write("#pathMethodDetails=none\n");
                }
                writer.write("#haplotype_counts_id=" + hapCountId + "\n");

                myLogger.debug("Extracting the haplotypeIds");
                SortedSet sortedIdSet = paths.get(hapCountId).stream()
                        .map(haplotypeNode -> haplotypeNode.id())
                        .filter(hapId -> hapId != -1)
                        .collect(Collector.of(TreeSet::new,
                                (set, hapId) -> set.add(hapId),
                                (leftSet, rightSet) -> {
                                    leftSet.addAll(rightSet);
                                    return leftSet;
                                }));
                myLogger.debug("Writing out each hapId to the textFile");
                for (Integer hapId : sortedIdSet) {
                    writer.write("" + hapId);
                    writer.write("\n");
                }

            } catch (Exception e) {
                throw new IllegalStateException("ExportHaplotypePathToFilePlugin: exportPathsToTextFiles: error opening up the file");
            }
        }
    }
    
    /** No toolbar icon is provided for this plugin. */
    @Override
    public ImageIcon getIcon() {
        return null;
    }

    /** GUI button label for this plugin. */
    @Override
    public String getButtonName() {
        return "Find Best Path from HapCounts";
    }

    /** GUI tooltip text for this plugin. */
    @Override
    public String getToolTipText() {
        return "Find Best Path from HapCounts";
    }

    /**
     * Database configuration file
     *
     * @return Database Config File
     */
    public String configFile() {
        return configFile.value();
    }

    /**
     * Set Database Config File. Database configuration file
     *
     * @param value Database Config File
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin configFile(String value) {
        configFile = new PluginParameter<>(configFile, value);
        return this;
    }

    /**
     * A comma delimited list of taxa (no spaces allowed) to include in graph. Only nodes containing these taxa will be
     * included in the graph. If no taxa list is supplied, then all taxa in the full graph will be used.
     *
     * @return Taxa
     */
    public String taxaFilterString() {
        return taxaFilterString.value();
    }

    /**
     * Set Taxa. A comma delimited list of taxa (no spaces allowed) to include in graph. Only nodes containing these
     * taxa will be included in the graph. If no taxa list is supplied, then all taxa in the full graph will be used.
     *
     * @param value Taxa
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin taxaFilterString(String value) {
        taxaFilterString = new PluginParameter<>(taxaFilterString, value);
        return this;
    }

    /**
     * The directory containing files of read inclusion and exclusion counts for hapids.
     *
     * @return Inclusion File Dir
     */
    public String inclusionFilenameDir() {
        return inclusionFilenameDir.value();
    }

    /**
     * Set Inclusion File Dir. The directory containing files of read inclusion and exclusion counts for hapids.
     *
     * @param value Inclusion File Dir
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin inclusionFilenameDir(String value) {
        inclusionFilenameDir = new PluginParameter<>(inclusionFilenameDir, value);
        return this;
    }

    /**
     * The taxon that will be used to evaluate the node list returned.
     *
     * @return Target
     */
    public String targetTaxon() {
        return targetTaxon.value();
    }

    /**
     * Set Target. The taxon that will be used to evaluate the node list returned.
     *
     * @param value Target
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin targetTaxon(String value) {
        targetTaxon = new PluginParameter<>(targetTaxon, value);
        return this;
    }

    /**
     * The name of the file containing the reference ranges to keep.
     *
     * @return Ref Range File
     */
    public String refRangeFile() {
        return refRangeFile.value();
    }

    /**
     * Set Ref Range File. The name of the file containing the reference ranges to keep.
     *
     * @param value Ref Range File
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin refRangeFile(String value) {
        refRangeFile = new PluginParameter<>(refRangeFile, value);
        return this;
    }

    /**
     * Reference file name in case you want to index on the fly
     *
     * @return Ref File Name
     */
    public String referenceFileName() {
        return myReferenceFileName.value();
    }

    /**
     * Set Ref File Name. Reference file name in case you want to index on the fly
     *
     * @param value Ref File Name
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin referenceFileName(String value) {
        myReferenceFileName = new PluginParameter<>(myReferenceFileName, value);
        return this;
    }

    /**
     * Output Directory
     *
     * @return Output Dir
     */
    public String outputDir() {
        return myOutputDir.value();
    }

    /**
     * Set Output Dir. Output Directory
     *
     * @param value Output Dir
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin outputDir(String value) {
        myOutputDir = new PluginParameter<>(myOutputDir, value);
        return this;
    }

    /**
     * Name of the method used to create inclusion/exclusion
     * counts in FastqToHapCountPlugin
     *
     * @return Hap Count Method
     */
    public String hapCountMethod() {
        return myHapCountMethod.value();
    }

    /**
     * Set Hap Count Method. Name of the method used to create inclusion/exclusion
     * counts in FastqToHapCountPlugin
     *
     * @param value Hap Count Method
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin hapCountMethod(String value) {
        myHapCountMethod = new PluginParameter<>(myHapCountMethod, value);
        return this;
    }

    /**
     * Name of method to be used to create paths through the
     * graph.
     *
     * @return Path Method
     */
    public String pathMethod() {
        return myPathMethod.value();
    }

    /**
     * Set Path Method. Name of method to be used to create paths
     * through the graph.
     *
     * @param value Path Method
     *
     * @return this plugin
     */
    public HapCountBestPathToTextPlugin pathMethod(String value) {
        myPathMethod = new PluginParameter<>(myPathMethod, value);
        return this;
    }
    
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy