net.maizegenetics.pangenome.db_loading.FindTrimmedAssemblyCoordinates Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
package net.maizegenetics.pangenome.db_loading;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * THis file is similar to FindTrimmedAnchorCoordinates used for the reference. The
 * difference is the ref has all anchors.  Assemblies do not.  We need to read
 * the assembly fasta, determine which anchor is represented
 * 
 * The anchor is there in the idLine - the last value after the space, e.g.
 * >1:44289:49837;chr1:1 1
 * 
 * THis is creating a fasta, but it isn't useful because the IDline is not what
 * is needed when loading into the DB.  
 * Idline must have >B73RefChrom:B73startpos:B73EndPos;assemblyChrom:assemblyStart:asemblyEnd anchorid
 * 
 * @author lcj34
 *
 */
public class FindTrimmedAssemblyCoordinates {
    public static void processMain(String assemblyFasta, Map trimmedFastas, String outputDir, String lineName) {
        BufferedReader refbr  = Utils.getBufferedReader(assemblyFasta);
        BufferedReader trimmedbr = null;
        String anchorCSVFile = outputDir + lineName + "_anchor_coordinates.csv";
        String anchorFastaFile = outputDir + lineName + "_trimmed_anchors.fa";
        BufferedWriter csvwr = Utils.getBufferedWriter(anchorCSVFile);
        BufferedWriter fastawr = Utils.getBufferedWriter(anchorFastaFile);
      
        try {
            String assemblyLine;
            int anchorid = 0;
            
            String chrom = "-1";
            
            String csvHeader = "anchorid,Chr,regexStart,regexEnd,regexSeqLength\n";
            csvwr.write(csvHeader);
            
            // Not concerned with chrom changing as trimmedAnchor files are on a per-anchor basis,
            // not per-Line or per-chrom.
            while ((assemblyLine=refbr.readLine()) != null) {
                // idline has >1:50877:55716;chr1:3630 2 
                // i.e
                // >refChrom:refStart:refEnd;assemblyChrom:assemblyStart anchorID
                
                if (assemblyLine.startsWith(">")) {
                    anchorid = Integer.parseInt(assemblyLine.substring(assemblyLine.indexOf(" ")+1 ));
                    String lineTokens[] = assemblyLine.split(";");                    
                    chrom = lineTokens[1];
                    chrom = chrom.substring(0,chrom.indexOf(":")); // this gets ASSEMBLY chrom - needed when pull from assembly fasta later
                    continue; // done with ID line, continue to get the sequence line
                }
                
                // use anchorid to index map to get trimmed file
                if (trimmedbr != null) trimmedbr.close();
                trimmedbr = Utils.getBufferedReader(trimmedFastas.get(anchorid));
                // Find our Assembly in trimmedFasta file
                String anchorLine;
                String anchorSeq = null;
                while ((anchorLine = trimmedbr.readLine()) !=null) {
                    if (anchorLine.startsWith(">")) {
                        if (anchorLine.contains(lineName)) {
                            // trimmed anchors file has all haplotype sequences for the particular anchor.
                            // THis finds the idline for user specified line, now get the sequence
                            anchorSeq = trimmedbr.readLine();
                            break;  
                        }
                    }
                }
                if (anchorSeq == null) {
                    System.out.println("ERROR - could not find sequence for " + lineName + " for anchorid " + anchorid);
                    return;
                }
                
                // Write data to fasta file, write also to csv
                Map,String> regexDataMap = findTrimmedSeq(assemblyLine,anchorSeq);
                if (regexDataMap.size() > 1) {
                    System.out.println( anchorid + ": regex returned " + regexDataMap.size() + "  entries");
                }
                
                if (regexDataMap.size() == 0) {
                    System.out.println( anchorid + ": regex returned " + regexDataMap.size() + "  entries");
                }    
                for (Map.Entry, String> entry : regexDataMap.entrySet()) {
                    // only want to process the first one
                    int regexStart = entry.getKey().x;
                    int regexEnd = entry.getKey().y;
                    int length = entry.getValue().length();
                    csvwr.write(anchorid + "," + chrom + "," + regexStart + "," + regexEnd + "," + length + "\n");
                    
                    fastawr.write(">" + anchorid + ":" + chrom + ":" + regexStart + ":" + regexEnd + "\n");
                    fastawr.write(entry.getValue());
                    fastawr.write("\n");
                    break;
                }
                
                //if (anchorid > 5) break; // for TESTING _ only try with 5 anchors.
                if (anchorid %1000 == 0) {
                    System.out.println("Finished anchor: " + anchorid);
                }
                
            }
            refbr.close();
            csvwr.close();
            fastawr.close();
            trimmedbr.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }
        
    }
    private static Map,String> findTrimmedSeq(String refSeq, String trimmedSeq) {
 
        // regex is 0-based.  Return the 
        String trimmedSeqRegex = trimmedSeq.replace("NN","[N]+"); // trimmedSeqRegex is now an expression - the brackets allow as many N's as exist

        Pattern pattern = Pattern.compile(trimmedSeqRegex);
        Matcher matcher = pattern.matcher(refSeq);

        // should only be 1 item on the map, using TUple and map to prevent going back and forth string-integer
        Map,String> regexDataMap = new HashMap,String>();
        while(matcher.find()) {
           Tuple startEnd = new Tuple(matcher.start(),matcher.end());
           regexDataMap.put(startEnd, matcher.group());
        }
        return regexDataMap;
    }
   
    /**
     * @param args
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub

        long time = System.nanoTime();
        String trimmedDirectory = null;
        String refFasta = null;
        String outputDir = null;
        String lineName = null;
        if (args.length != 4) {
            System.out.println(" Expecting 4 input arguments in this order: " );            
            System.out.println("     Full path name to assembly fasta File of pre-trimmed anchors,");
            System.out.println("     name of line, e.g. B73Ref or W22,");
            System.out.println("     Directory of trimmed anchors,");
            System.out.println("     Full directory path name with trailing / for output files to be written");
                    
            return;
        }
        Map trimmedFileMap = new HashMap();
        try {
            refFasta = args[0];
            lineName = args[1];
            trimmedDirectory = args[2];
            outputDir = args[3];
            
            for (int idx = 1; idx < 37805; idx++) {
                String filename = trimmedDirectory + "exportedFastaFromV2DB_anchorId" + idx + "_LongNsRemoved_MAFFTAligned_Trimmed.fa";
                trimmedFileMap.put(idx,filename);
            }
           
        } catch (Exception exc){
            exc.printStackTrace();
            return;
        }
        System.out.println("Begin ...");
        processMain(refFasta,trimmedFileMap, outputDir, lineName);
        System.out.println("Finished - totalTime " + (System.nanoTime()-time)/1e9 + " seconds");
        
    }
}