All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.processAssemblyGenomes.Mummer4DoonerBZStats Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 * 
 */
package net.maizegenetics.pangenome.processAssemblyGenomes;

import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeMap;
import com.google.common.collect.TreeRangeSet;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * This method takes a coords file, the genome fastas, ranges to be covered
 * Prints out tab-delimited file of metrics related to the region.
 * 
 * THe output is tab-delimited, rows are anchors.
 *  columns are 
 *    anchor-coordinates, 
 *    covered coordinates (list, semicolon separated)
 *    percent covered.
 *    
 * @author lcj34
 *
 */
public class Mummer4DoonerBZStats extends AbstractPlugin{
    private static final Logger myLogger = Logger.getLogger(Mummer4DoonerBZStats.class);
    private static final Pattern tab = Pattern.compile("\t");

    private PluginParameter coordsFile = new PluginParameter.Builder("ref", null, String.class).guiName("Mummer Coords File").required(true).inFile()
            .description("Output file created by Mummer show-coords ").build(); 
    private PluginParameter intervalsFile = new PluginParameter.Builder("intervalsFile", null, String.class).guiName("Anchor Intervals File").required(false).inFile()
            .description("Anchor Intervals file to be used when intervals are different than DB, e.g. when using just a small region ").build(); 
    
    private PluginParameter mummerParams = new PluginParameter.Builder("mummerParams", null, String.class).guiName("Mummer Parameters").required(true)
            .description("Mummer parameters used").build();
    
    private PluginParameter prefix = new PluginParameter.Builder("prefix", null, String.class).guiName("Output File refix").required(true)
            .description("Name to  prefix to output results file").build();
    
    private PluginParameter outputDir = new PluginParameter.Builder("outputDir", null, String.class).guiName("Output Directory").required(true).outDir()
            .description("Output directory including trailing / for writing files").build();
    
    private PluginParameter configFile = new PluginParameter.Builder("configFile", null, String.class).guiName("DB Config File").required(true).inFile()
            .description("File containing lines with data for host=, user=, password= and DB=, DBtype= used for db connection").build();
    
    private PluginParameter refChrom = new PluginParameter.Builder("refChrom", null, String.class).guiName("Reference Chromosome Name").required(true)
            .description("Name of reference chromsome as stored in the database.  This is the chromosome whose anchors will be pulled.").build();
    
    public Mummer4DoonerBZStats() {
        super(null, false);
    }

    public Mummer4DoonerBZStats(Frame parentFrame) {
        super(parentFrame, false);
    }

    public Mummer4DoonerBZStats(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
    /**
     * @param args
     */
    public static void main(String[] args) {
        //GeneratePluginCode.generate(Mummer4DoonerBZStats.class);
        
        String baseDir = "/Users/lcj34/notes_files/phg_2018/assembly_testing/mummer/ed_deltaGvsdelta_metrics/";
        
        
        String intervalsFile = null;
        String prefix = "b73ph207_chr9_c250mum_delta_anchorDetail";
        String coordsFile = baseDir + "ref_PH207_c250mum.coords";
//        String intervalsFile = "/Users/lcj34/notes_files/phg_2018/assembly_testing/mummer/metrics_junit/intervals_bzRegion.txt";                
//        String prefix = "b73ph207_bzRegions_c250mum";
//        String coordsFile = baseDir + "b73ph207_bzRegion_c250mum.coords"; 
                
        String mummerParams = "c 250 mum ";
        String config = "/Users/lcj34/notes_files/phg_2018/assembly_testing/mummer/configSQLite.txt";
        String refChrom = "9";
        String outputDir = baseDir;
//        if (args.length != 6) {
//            System.out.println("Expecting 6 parameters in this order: ");
//            System.out.println("  cordsFile name");
//            System.out.println("  String of Mummer parameters used");
//            System.out.println("  Prefix to append to output file as name");
//            System.out.println("  Path to config file for connecting to DB");
//            System.out.println("  Name of reference chromosome to pull anchors from DB");
//            System.out.println("  Path to output Directory for metrics file");
//            System.out.println("  Please fix parameters and try again.");
//            return;
//        }
//        
//        String coordsFile = args[0];
//        String mummerParams = args[1];
//        String prefix = args[2];
//        String config = args[3];
//        String refChrom = args[4];
//        String outputDir = args[5];
        
        System.out.println("Begin - call Mummer4DoonerBZStats");
        new Mummer4DoonerBZStats()
        .coordsFile(coordsFile)
        .mummerParams(mummerParams)
        .prefix(prefix)
        .configFile(config)
        .intervalsFile(intervalsFile)
        .refChrom(refChrom)
        .outputDir(outputDir)
        .performFunction(null);
        System.out.println("FINished junit");
    }
    
    @Override
    public DataSet processData(DataSet input) {
 
        // Get db connection to grab anchor info            
        System.out.println("Get db connection ...");
        Connection connection = DBLoadingUtils.connection(configFile(),false);

        if (connection == null) {
            System.out.println("\ntestConnectToPOstgres: COuld not get connection for config file db");
            throw new IllegalStateException ("MummerAnalysisMetricsPlugin: could not get db connection from configFile " + configFile() );
        }

        // Get anchor regions for chrom.  using Chromosome object as the DB will have stripped
        // off any leading chr/chromosome.  need this name to match what is stored in the db
        Chromosome refChromosome = Chromosome.instance(refChrom());

        System.out.println("Get refRangesForChrom");
        // get anchorID, ReferenceRanges for this chromosome.  This gets translated into a RangeSet 
        Map idToRefRangeMap;
        if (intervalsFile() == null) {
            idToRefRangeMap = refRangesForChrom( connection,  refChromosome.getName());
        } else {
            idToRefRangeMap = getAnchorFromIntervalsFile( intervalsFile(),  refChromosome.getName());
        }
        
        System.out.println("Number of all anchors: " + idToRefRangeMap.keySet().size());
        
        // Split RefRangeMap above into a RangeSet of anchor coordinates
        Map> refAnchorIDRangeSetMap = getRangesForChrom(idToRefRangeMap,  refChromosome, DBLoadingUtils.AnchorType.ANCHOR); 
        
        int numAnchors = refAnchorIDRangeSetMap.keySet().size();
        int numAnchorsRepresented = 0;
        
        // Read the coordinates file.  Need to store the rangeSet for ref, but also associate it with chrom start/end.
        System.out.println("Begin coords file processing");
        
        RangeMap coordsFileRefToDataMap = TreeRangeMap.create();
        String fullChromSummary = outputDir() + "/" + prefix() + "_metrics.txt";
        try (BufferedReader br = Utils.getBufferedReader(coordsFile());
             BufferedWriter bw = Utils.getBufferedWriter(fullChromSummary)) {
            
            int headerCount = 0;
            int linesProcessed = 0;
            String line = null;
 
            //TODO:  assume file is sorted and has no header, then remove the header related lines
            while ((line = br.readLine()) != null) {
//                headerCount++;
//                if (headerCount < 5) {
//                    continue; // skipping 4 lines of header
//                }
                linesProcessed++;
                addToCoordsMap(coordsFileRefToDataMap,line);             
            }
            
            // Now I have a map of the coordinates.  I have the anchors from above.
            // create lines to write to outputfile
            
            String outputHeaderLine = "DB_anchorID\tdb-anchor-coordinates\tMummer-Ref_overlapping-Coordinates\tMummer-asm-coordinates\tPercent_IDs\tanchor-len\tbps-aligned\tPercent_anchor-covered\tMummer-params\n";
            bw.write(outputHeaderLine);
            for (Map.Entry> entry : refAnchorIDRangeSetMap.entrySet()) {
                // This list contains the Entries overlapping the anchor 
                RangeMap anchorOverlapEntries = getCoordsEntriesForAnchor(entry.getValue(), coordsFileRefToDataMap);
                
                if (anchorOverlapEntries.asMapOfRanges().isEmpty()) continue; // no portion of anchor was aligned
                numAnchorsRepresented++;
                
                int anchorID = entry.getKey();
                Range dbAnchorRange = entry.getValue();
                // compute overlap.  Write to range set. call getRegionCoverage?
                RangeSet overlappingAnchorRanges = getAnchorRangeSet(anchorOverlapEntries);
                Tuple anchorCoverage = AssemblyProcessingUtils.getRegionCoverage(overlappingAnchorRanges, entry.getValue());
                
                // Write the entries!
                StringBuilder fileSB = new StringBuilder();
                fileSB.append(anchorID).append("\t"); // DB anchor_id
                fileSB.append(entry.getValue().lowerEndpoint()).append("-").append(entry.getValue().upperEndpoint()).append("\t"); // db anchor coords
                
                StringBuilder refSB = new StringBuilder();
                StringBuilder asmSB = new StringBuilder();
                StringBuilder idSB = new StringBuilder();
                for (Map.Entry, String> overlapEntry : anchorOverlapEntries.asMapOfRanges().entrySet()) {
                    Range refCoords = overlapEntry.getKey();
                    String refData = overlapEntry.getValue();
                    refSB.append(refCoords.lowerEndpoint()).append("-").append(refCoords.upperEndpoint()).append(";");
                    
                    // refData has querystart\tqueryend\t%id: created in addToCoordsMap()
                    String[] refDataTokens = tab.split(refData);
                    asmSB.append(refDataTokens[0]).append("-").append(refDataTokens[1]).append(";");
                    idSB.append(refDataTokens[2]).append(";");                   
                }
                fileSB.append(refSB.toString()).append("\t"); // ref coords from mummer
                fileSB.append(asmSB.toString()).append("\t"); // assembly coords from mummer
                fileSB.append(idSB.toString()).append("\t"); // id lines from represented regions in file (for this anchor)
                
                int anchorLen = (dbAnchorRange.upperEndpoint() - dbAnchorRange.lowerEndpoint()) +1;
                fileSB.append(anchorLen).append("\t");
                fileSB.append(anchorCoverage.x).append("\t"); // number of bps represented
                fileSB.append(anchorCoverage.y).append("\t"); // percent of bps covered in this anchor
                fileSB.append(mummerParams()).append("\n"); // mummer params used (same for all anchors)
                bw.write(fileSB.toString());
                               
            }
            
            System.out.println("Finished:  Total lines processed: " + linesProcessed 
                    + ", totoal number of anchors for chrom: " + refAnchorIDRangeSetMap.keySet().size() 
                    + ", number of anchors represented: " + numAnchorsRepresented);
        } catch (Exception exc) {
            exc.printStackTrace();
            throw new IllegalStateException("Mummer4DoonerBZStats - error processing data");
        }
         
        return null;
    }
    
    public Map getAnchorFromIntervalsFile( String intervalsFile,  String chrom){
        ImmutableMap.Builder builder = ImmutableMap.builder();
        
        try (BufferedReader br = Utils.getBufferedReader(intervalsFile)) {
            String line = br.readLine(); // skip header
            while ((line = br.readLine())!= null) {
                String[] tokens = tab.split(line);
                int id = Integer.parseInt(tokens[0]);
                String chromosome = tokens[1];
                int start = Integer.parseInt(tokens[2]);
                int end = Integer.parseInt(tokens[3]);
                builder.put(id, new ReferenceRange("B73Ref", Chromosome.instance(chromosome), start, end, id));
            }
            
        } catch (Exception ioe) {
            ioe.printStackTrace(); // numberFormat or IO exception
            throw new IllegalStateException("getAnchorFromIntervalsFIle - error processing file " + intervalsFile);
        }
        Map result = builder.build();
        return result;
    }
    
    public static Map refRangesForChrom(Connection dbConn, String chrom) {
        
        // Create method name for querying initial ref region and inter-region ref_range_group method ids
        String refLine = CreateGraphUtils.getRefLineName( dbConn);

        String refMethodName = DBLoadingUtils.REGION_REFERENCE_RANGE_GROUP;
        String refInterMethodName = DBLoadingUtils.INTER_REGION_REFERENCE_RANGE_GROUP;
        
        List refGrpMethodsList = new ArrayList();
        int refGrpMethodID = CreateGraphUtils.methodId(dbConn, refMethodName);       
        int refInterGrpMethodID = CreateGraphUtils.methodId(dbConn, refInterMethodName);
        
        // Add method Ids to list if valid.  There are not always inter-group methods in the db
        if (refGrpMethodID > 0) refGrpMethodsList.add(Integer.toString(refGrpMethodID));
        if (refInterGrpMethodID > 0) refGrpMethodsList.add(Integer.toString(refInterGrpMethodID));
               
        String refGrpMethodString = refGrpMethodsList.stream().collect(Collectors.joining(","));
        // query anchors, filter on chrom/version.  This gets focus AND non-focus as the latter will be
        // needed later.
        
        // Query based on methodIds created above

        String query = "select reference_ranges.ref_range_id, chrom,range_start,range_end, methods.name " +
                " from reference_ranges, ref_range_ref_range_group,ref_range_groups, methods " +
                " where reference_ranges.ref_range_id=ref_range_ref_range_group.ref_range_id " +
                " AND ref_range_groups.ref_range_group_id = ref_range_ref_range_group.ref_range_group_id " +
                " AND ref_range_groups.group_method_id = methods.method_id " +
                " AND methods.method_type = " + DBLoadingUtils.MethodType.REF_RANGE_GROUP.getValue();

        myLogger.info("refRangesForChrom: query statement: " + query);

        ImmutableMap.Builder builder = ImmutableMap.builder();
        try (ResultSet rs = dbConn.createStatement().executeQuery(query)) {

            while (rs.next()) {
                int id = rs.getInt("ref_range_id");
                String chromosome = rs.getString("chrom");
                int start = rs.getInt("range_start");
                int end = rs.getInt("range_end");
                String methodName = rs.getString("name");
                
                builder.put(id, new ReferenceRange("B73Ref", Chromosome.instance(chromosome), start, end, id, methodName));
            }

        } catch (Exception se) {
            myLogger.debug(se.getMessage(), se);
            throw new IllegalStateException("CreateGraphUtils: referenceRanges: Problem querying the database: " + se.getMessage());
        }

        Map result = builder.build();
        return result;
    }
    
    // Returns a range set from the referenceRangeMap.  If "anchor" is true, only
    // return the anchor.  If "false", return interanchors.
    // Return value is a Map of 
    public static Map> getRangesForChrom(Map refRangeMap, Chromosome chrom, DBLoadingUtils.AnchorType anchorType) {
        Map> anchorIdRangeSet = new HashMap>();
        
        for (Map.Entry entry : refRangeMap.entrySet()) {
            int gid = entry.getKey();
            ReferenceRange refRange = entry.getValue();
            if (!(refRange.chromosome().equals(chrom))) continue; // skip anchors for other chroms
            if (anchorType == DBLoadingUtils.AnchorType.ANCHOR || anchorType == DBLoadingUtils.AnchorType.BOTH)  {
                //if (refRange.isAnchor()) {
                //    anchorIdRangeSet.put(gid,Range.closed(refRange.start(), refRange.end()));
                //}
            } else if (anchorType == DBLoadingUtils.AnchorType.INTER_ANCHOR || anchorType == DBLoadingUtils.AnchorType.BOTH) {
                //if (!refRange.isAnchor()) {
                //    anchorIdRangeSet.put(gid,Range.closed(refRange.start(), refRange.end()));
                //}
            }
        }    
        System.out.println("getRangesForChrom: anchorType : " + anchorType + ", anchorRangeSet size: " + anchorIdRangeSet.keySet().size());
        return anchorIdRangeSet;
    }
    
    public static void addToCoordsMap(RangeMap coordsFileRefToDataMap,String line) {
        String[] tokens = tab.split(line);       
        // mummer coords file has 4 header lines when tab-delimited, then these columns:
        // S1  E1  S2 E2  [LEN 1] {LEN 2]  [%id]  chr1  chr2
        
        
        // RangeSet coalesce the ranges, which is what I want
        int refStart = Integer.parseInt(tokens[0]);
        int refEnd = Integer.parseInt(tokens[1]);
        // If the end is < start, it mapped on reverse strand.
        // These are not reverse strand coordinates with the reverse starting at 0
        // The coordinates are relative to the positive strand, so just flip them
        if (refStart > refEnd) {
            int tempStart = refEnd;
            refEnd = refStart;
            refStart = tempStart;
        }       
        Range refCoords = Range.closed(refStart,refEnd);       
        
        int queryStart = Integer.parseInt(tokens[2]);
        int queryEnd = Integer.parseInt(tokens[3]);
        
        if (queryStart > queryEnd) {                    
            int tempStart = queryEnd;
            queryEnd = queryStart;
            queryStart = tempStart;
        }       
        
        // add %ID to use in creating average
        double id = Double.parseDouble(tokens[6]);
        String refData = queryStart + "\t" + queryEnd + "\t" + id;
        
        coordsFileRefToDataMap.put(refCoords,refData);
    }
    
    //This returns a list of entries in the coordsFileRefToDataMap that overlap the anchorRange
    public RangeMap  getCoordsEntriesForAnchor(Range anchorRange, RangeMap coordsFileRefToDataMap) {
        RangeMap  matchingRanges = TreeRangeMap.create();;
        
        List, String>> overlaps = new ArrayList<>(
                                coordsFileRefToDataMap.subRangeMap(anchorRange).asMapOfRanges().entrySet());
        if (overlaps.size() > 0 ) {
            // find the original entries and add them
            for (int idx = 0; idx < overlaps.size(); idx++) {
                Map.Entry, String> overlappingEntry = coordsFileRefToDataMap.getEntry(overlaps.get(idx).getKey().lowerEndpoint());
                matchingRanges.put(overlappingEntry.getKey(), overlappingEntry.getValue());
            }            
        }
        return matchingRanges;
    }
    
    // Create a RangeSet from a map of ranges.
    public RangeSet  getAnchorRangeSet(RangeMap anchorOverlapEntries) {
        RangeSet ranges = TreeRangeSet.create();
        for (Range range : anchorOverlapEntries.asMapOfRanges().keySet()) {
            ranges.add(range);
        }
        return ranges;
    }
    
    @Override
    public ImageIcon getIcon() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getButtonName() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getToolTipText() {
        // TODO Auto-generated method stub
        return null;
    }
    
    /**
     * Output of Mummer coords file 
     *
     * @return Mummer Coords File
     */
    public String coordsFile() {
        return coordsFile.value();
    }

    /**
     * Set Mummer Coords File. Output of Mummer coords file
     * 
     *
     * @param value Mummer Coords File
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats coordsFile(String value) {
        coordsFile = new PluginParameter<>(coordsFile, value);
        return this;
    }

    /**
     * Anchor Intervals file to be used when intervals are
     * different than DB, e.g. when using just a small region
     * 
     *
     * @return Anchor Intervals File
     */
    public String intervalsFile() {
        return intervalsFile.value();
    }

    /**
     * Set Anchor Intervals File. Anchor Intervals file to
     * be used when intervals are different than DB, e.g.
     * when using just a small region 
     *
     * @param value Anchor Intervals File
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats intervalsFile(String value) {
        intervalsFile = new PluginParameter<>(intervalsFile, value);
        return this;
    }

    /**
     * Mummer parameters used
     *
     * @return Mummer Parameters
     */
    public String mummerParams() {
        return mummerParams.value();
    }

    /**
     * Set Mummer Parameters. Mummer parameters used
     *
     * @param value Mummer Parameters
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats mummerParams(String value) {
        mummerParams = new PluginParameter<>(mummerParams, value);
        return this;
    }

    /**
     * Name to  prefix to output results file
     *
     * @return Output File refix
     */
    public String prefix() {
        return prefix.value();
    }

    /**
     * Set Output File refix. Name to  prefix to output results
     * file
     *
     * @param value Output File refix
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats prefix(String value) {
        prefix = new PluginParameter<>(prefix, value);
        return this;
    }

    /**
     * Output directory including trailing / for writing files
     *
     * @return Output Directory
     */
    public String outputDir() {
        return outputDir.value();
    }

    /**
     * Set Output Directory. Output directory including trailing
     * / for writing files
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats outputDir(String value) {
        outputDir = new PluginParameter<>(outputDir, value);
        return this;
    }

    /**
     * File containing lines with data for host=, user=, password=
     * and DB=, DBtype= used for db connection
     *
     * @return DB Config File
     */
    public String configFile() {
        return configFile.value();
    }

    /**
     * Set DB Config File. File containing lines with data
     * for host=, user=, password= and DB=, DBtype= used for
     * db connection
     *
     * @param value DB Config File
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats configFile(String value) {
        configFile = new PluginParameter<>(configFile, value);
        return this;
    }
    
    /**
     * Name of reference chromsome as stored in the database.
     *  This is the chromosome whose anchors will be pulled.
     *
     * @return Reference Chromosome Name
     */
    public String refChrom() {
        return refChrom.value();
    }

    /**
     * Set Reference Chromosome Name. Name of reference chromsome
     * as stored in the database.  This is the chromosome
     * whose anchors will be pulled.
     *
     * @param value Reference Chromosome Name
     *
     * @return this plugin
     */
    public Mummer4DoonerBZStats refChrom(String value) {
        refChrom = new PluginParameter<>(refChrom, value);
        return this;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy