net.maizegenetics.pangenome.db_loading.FindTrimmedAnchorCoordinates Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * NOTE:  This version works on the REF only. The assemblies
 * do not have their coordinates in the fasta files - they only have
 * the start, not the end.  OH - but I could get it from the length.
 * Due to time, I'm not going to mess this one up.  I"ll write another
 * one for assemblies and merge them later.
 * 
 * Method takes 
 * - a refAnchor fasta file, (I dumped reference anchors from the pre-trimmed db)
 * - a list of files from a directory that contain the trimmed versions
 * on a per-anchor basis.  Find the start and end of the trimmed sequence
 * in the ref sequence.
 * 
 * The start is where it matches.  The end is the length of the trimmed sequence
 * added to the start.
 * 
 * Output:  The important output is the csv file that will be used to load
 * the new reference anchors to the "trimmed" database.
 * 
 * @author lcj34
 *
 */
public class FindTrimmedAnchorCoordinates {
    public static void processMain(String refFasta, Map trimmedFastas, String outputDir, String lineName) {
        BufferedReader refbr  = Utils.getBufferedReader(refFasta);
        BufferedReader trimmedbr = null;
        String anchorCSVFile = outputDir + lineName + "_anchor_coordinates.csv";
        String anchorFastaFile = outputDir + lineName + "_trimmed_anchors.fa";
        BufferedWriter csvwr = Utils.getBufferedWriter(anchorCSVFile);
        BufferedWriter fastawr = Utils.getBufferedWriter(anchorFastaFile);
        try {
            String refline;
            int anchorid = 0;
            
            String chrom = "-1";
//            int origAnchorStart = 0;
//            int trimAnchorStart = 0;
//            int origAnchorEnd = 0;
//            int trimAnchorEnd = 0;
            
            String csvHeader = "anchorid,Chr,regexStart,regexEnd,regexSeqLength\n";
            csvwr.write(csvHeader);
            
            //  THe outer loop runs through each anchor in the reffasta file 
            //  (which has all anchors, all chroms. Not concerned with chrom 
            // changing as trimmedAnchor files are on a per-anchor basis,
            // not per-Line or per-chrom. 
            while ((refline=refbr.readLine()) != null) {
                // idline has anchor-chrom:start:end, e.g. >1-1:44289:49837
                
                if (refline.startsWith(">")) {
                    String lineTokens[] = refline.split("-");                 
                    chrom = lineTokens[1].split(":")[0];
                     anchorid = Integer.parseInt(lineTokens[0].replace(">", ""));
//                    origAnchorStart = Integer.parseInt(lineTokens[1].split(":")[1]);
//                    origAnchorEnd = Integer.parseInt(lineTokens[1].split(":")[2]);
                    continue; // skipping idlines
                }
               // anchorid++;
                // the count is the anchor ID - use that to index map to get trimmed file
                if (trimmedbr != null) trimmedbr.close();
                // read next trimmedFasta from Zack - these are on per-anchor basis.
                trimmedbr = Utils.getBufferedReader(trimmedFastas.get(anchorid));
                String anchorLine = trimmedbr.readLine(); // skip idline - first shoudl be B73
                anchorLine = trimmedbr.readLine();
                
                // Write data to fasta file, write also to csv
                Map,String> regexDataMap = findTrimmedSeq(refline,anchorLine);
                if (regexDataMap.size() > 1) {
                    System.out.println( anchorid + ": regex returned " + regexDataMap.size() + "  entries");
                }
                
                if (regexDataMap.size() == 0) {
                    System.out.println( anchorid + ": regex returned " + regexDataMap.size() + "  entries");
                }    
                for (Map.Entry, String> entry : regexDataMap.entrySet()) {
                    // only want to process the first one
                    int regexStart = entry.getKey().x;
                    int regexEnd = entry.getKey().y;
                    int length = entry.getValue().length();
                    csvwr.write(anchorid + "," + chrom + "," + regexStart + "," + regexEnd + "," + length + "\n");
                    
                    fastawr.write(">" + anchorid + ":" + chrom + ":" + regexStart + ":" + regexEnd + "\n");
                    fastawr.write(entry.getValue());
                    fastawr.write("\n");
                    break;
                }
                
                //if (anchorid > 5) break; // for TESTING _ only try with 5 anchors.
                if (anchorid %1000 == 0) {
                    System.out.println("Finished anchor: " + anchorid);
                }
                
            }
            csvwr.close();
            fastawr.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }
    }
    private static Map,String> findTrimmedSeq(String refSeq, String trimmedSeq) {
 
        // regex is 0-based.  Return the actual ref sequence with all the N's
        String trimmedSeqRegex = trimmedSeq.replace("NN","[N]+"); // trimmedSeqRegex is now an expression - the brackets allow as many as exist

        Pattern pattern = Pattern.compile(trimmedSeqRegex);
        Matcher matcher = pattern.matcher(refSeq); // look for the trimmed Sequence  in the reference sequence.

        // should only be 1 item on the map, using Tuple and map to prevent going back and forth string-integer
        Map,String> regexDataMap = new HashMap,String>();
        while(matcher.find()) {
           Tuple startEnd = new Tuple(matcher.start(),matcher.end());
           // matcher.group() returns the input sequence matched by the previous pattern.
           regexDataMap.put(startEnd, matcher.group());
        }
        return regexDataMap;
    }
   
    /**
     * @param args
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub

        String trimmedDirectory = null;
        String refFasta = null;
        String outputDir = null;
        String lineName = null;
        if (args.length != 4) {
            System.out.println(" Expecting 4 input arguments in this order: " );            
            System.out.println("     Full path name to ref fasta File of pre-trimed anchors,");
            System.out.println("     name of line, e.g. B73Ref or W22,");
            System.out.println("     Directory of trimmed anchors,");
            System.out.println("     Full directory path name with trailing / for output files to be written");
                    
            return;
        }
        Map trimmedFileMap = new HashMap();
        try {
            refFasta = args[0];
            lineName = args[1];
            trimmedDirectory = args[2];
            outputDir = args[3];
            
            for (int idx = 1; idx < 37805; idx++) {
                String filename = trimmedDirectory + "exportedFastaFromV2DB_anchorId" + idx + "_LongNsRemoved_MAFFTAligned_Trimmed.fa";
                trimmedFileMap.put(idx,filename);
            }
           
        } catch (Exception exc){
            exc.printStackTrace();
            return;
        }
        processMain(refFasta,trimmedFileMap, outputDir, lineName);
        
    }

}