net.maizegenetics.pangenome.processAssemblyGenomes.FindRampSeqContigsInAssemblies Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10

package net.maizegenetics.pangenome.processAssemblyGenomes;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * 
 * This method takes a fasta of ramp seq short sequences, and looks for them
 * in an assembly genome.  
 * 
 * This is for Dan.  Looking for exact matches of the 9000 across all entries
 * in the fasta file.  Look for both orig sequence, and reverse complement of sequence.
 * This one works well - it runs each assembly in sequence.  When processing
 * the assemblies, it parallelizes over every rampSeq contig in the rampSeq
 * map (file read into map).  This speeds things up considerably from parallel
 * processing just over the assemblies.
 * 
 * Using indexOf(seq,startPos) still seems quicker than knuth-morris-pratt
 * method, perhaps because of overhead of the latter.
 * 
 * INPUT:
 *   - fasta of rampSeq short contigs
 *   - directory path, including trailing /  where assembly genome fasta files live
 *   - directory path, including trailing / to which output files will be written
 * 
 * OUTPUT:
 *   - tab-delimited files without headers, but the columns are BED file positions
 *     (0-based, inclusive/exclusive).  
 *     
 *     ContigName  AssemblyIDLine  startPos  endPos  Strand
 *     
 *     In the above, Strand is whether the forward (as presented in file) or
 *     reverse-compliment of the strand matched in the assembly file.
 *     
 *     THe start/end positions are 0-based, inclusive/exclusive as for bedfiles.
 *     
 *     There is 1 tab-delimited file generated for each assembly.  The file name
 *     reflects the assembly name.
 * 
 * @author lcj34
 *
 */
public class FindRampSeqContigsInAssemblies {
    public static void processData(String shortSeqs, String assemblyDir, String outputDir) {

        Map shortSeqIdToStringMap = new HashMap();
        try (BufferedReader ninekbr = Utils.getBufferedReader(shortSeqs)) {
            // read short seqs into  a mpa of String,String for processing against all assembly fastas
            StringBuilder sb = new StringBuilder();
            String line = null;
            String idLine = null;
            while ((line = ninekbr.readLine()) != null) {
                if (line.startsWith(">")) {                   
                    if (sb.length() > 0) {
                        // add to map
                        shortSeqIdToStringMap.put(idLine, sb.toString());
                    }
                    idLine = line.replace(">", "");
                    sb.setLength(0);
                } else {
                    sb.append(line);
                }
            }
            // process last one
            if (sb.length() > 0) {                
                shortSeqIdToStringMap.put(idLine, sb.toString());
            }

        } catch (Exception exc) {
            exc.printStackTrace();
        }

        System.out.println("Size of shortSeqIdToStringMap after reading " + shortSeqIdToStringMap.keySet().size());
               
        // Short seq file has been read into map
        // Process the reads through the list of assemblyPaths
        List assemblyPaths = DirectoryCrawler.listPaths("glob:*.fasta", Paths.get(assemblyDir).toAbsolutePath());
        assemblyPaths.stream() // parallelization occurs in searchSeqsInFasta
        .forEach(inputSeqFile -> {
            // Processing each assembly separately.  Results of rampSeq short contigs matches
            // will be written to separate files for each assembly
            searchSeqsInFasta(shortSeqIdToStringMap,inputSeqFile.toString(),outputDir);
        });

    }

    // THis method runs through each line in the assembly file, using a parallele stream.
    // to call a method that checks for matches against each of the shortSeqs in the passed map
    public static void searchSeqsInFasta(Map shortSeqs, String fastaFile, String outputDir) {
        System.out.println("Processing  fasta " + fastaFile);
        Long time = System.nanoTime();
        int lastSlash = fastaFile.lastIndexOf("/");
        String fileName = fastaFile.substring(lastSlash+1);
        fileName = fileName.substring(0, fileName.indexOf(".fasta"));
        // Output file reflects name of assembly
        String outputFile = outputDir + fileName + "_shortSeqPositions.txt";
        
        Object myKey = shortSeqs.keySet().toArray()[0];
        int seqLen = shortSeqs.get(myKey).length(); // this assumes all are the same length!
        try (BufferedReader br = Utils.getBufferedReader(fastaFile);
             BufferedWriter writer = Utils.getBufferedWriter(outputFile)) {
            
            String idLine = null;
            String line = null;
            StringBuilder seqSB = new StringBuilder();
            while ((line = br.readLine()) != null) {
                if (line.startsWith(">")) {                   
                    if (seqSB.length() > 0) {
                        // call method to compare all short sequences against this assembly sequence
                        // Tuple X = RampSeqIDLine, Tuple Y = begin:end:strand.  
                        List> rampSeqMatches = shortSeqs.entrySet().parallelStream().flatMap(entry -> {
                            String rampSeqID = entry.getKey();
                            String rampSeqSeq = entry.getValue();
                            
                            return checkForSeqMatch(rampSeqID, rampSeqSeq,seqSB.toString()).stream();
                        }).collect(Collectors.toList());
                        writeDataToFile(rampSeqMatches,idLine,seqLen,writer);
                    }
                    idLine = line.replace(">", "");
                    seqSB.setLength(0);
                } else {
                    seqSB.append(line);
                }
            }
            // process last one
            if (seqSB.length() > 0) {
                // call method to compare all short sequences against this one, in parallel
                // Collect all results to a list to be written
                // key=idline, value=begin:end:strand.  
                List> rampSeqMatches = shortSeqs.entrySet().parallelStream().flatMap(entry -> {
                    String rampSeqID = entry.getKey();
                    String rampSeqSeq = entry.getValue();
                    return checkForSeqMatch(rampSeqID, rampSeqSeq,seqSB.toString()).stream();
                }).collect(Collectors.toList());
                writeDataToFile(rampSeqMatches,idLine,seqLen,writer);
            }
            
        } catch (Exception exc) {
            exc.printStackTrace();
        }
        
        System.out.println("Done with fasta " + fastaFile + ", processing took " + (System.nanoTime() - time)/1e9 + " seconds.");
    }
    public static List> checkForSeqMatch(String idLine, String shortSeq,String assemblySequence) {
 
        List> matchResults = new ArrayList>();
        try {
            
            int lastIndex = 0;
            int shortSeqLen = shortSeq.length();

            // Find matches of original sequence           
            while (lastIndex != -1) {
                lastIndex = assemblySequence.indexOf(shortSeq,lastIndex); // start search at lastIndex spot
                if (lastIndex != -1) {
                    int end = lastIndex + shortSeqLen; 
                    StringBuilder beginEndStrandSB = new StringBuilder();
                    beginEndStrandSB.append(lastIndex).append(":").append(end).append(":").append("+");
                    
                    matchResults.add(new Tuple(idLine,beginEndStrandSB.toString()));
                    lastIndex +=2;
                    if (lastIndex > assemblySequence.length()-shortSeqLen) lastIndex=-1;
                }
            }

            // look for reverse complement of this sequence in the assembly chrom/scaffold            
            String shortSeqReverse =  BaseEncoder.getReverseComplement(shortSeq); 
            lastIndex = 0;
            while (lastIndex != -1) {
                lastIndex = assemblySequence.indexOf(shortSeqReverse,lastIndex); // start search at lastIndex spot
                if (lastIndex != -1) {
                    int end = lastIndex + shortSeqLen; 
                    StringBuilder beginEndStrandSB = new StringBuilder();
                    beginEndStrandSB.append(lastIndex).append(":").append(end).append(":").append("-");
                    matchResults.add(new Tuple(idLine,beginEndStrandSB.toString()));                   
                    lastIndex +=2;
                    if (lastIndex > assemblySequence.length()-shortSeqLen) lastIndex=-1;
                }
            }    

        } catch (Exception exc) {
            exc.printStackTrace();
            return null;
        } 
        return matchResults;
    }

    public static void writeDataToFile(List> results, String assemblyIDLine, int seqLen, BufferedWriter writer) {
        
        // Collect to stringbuilder, write to file:
        StringBuilder sb = new StringBuilder();
        for (Tuple item: results) {
            
            String[]tokens = item.getY().split(":");
            String begin = tokens[0];
            String end = tokens[1];
            String strand = tokens[2];
            sb.append(item.getX()).append("\t").append(assemblyIDLine).append("\t");
            sb.append(begin).append("\t").append(end).append("\t").append(strand).append("\n");
        }
        
        try {
            if (sb.length() > 0) {
                System.out.println("  writing to file for assemblyIDLine " + assemblyIDLine);
                writer.write(sb.toString());
            }
            
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
        
    }

    /**
     * @param args
     */
    public static void main(String[] args) {

        String rampSeqSeqs = "/Volumes/Samsung_T1/rampSeq/PHG_306-combined_tags.fasta";
        String assemblyDir = "/Volumes/Samsung_T1/rampSeq/rampSeqAssemblies/";
        String outputDir = "/Volumes/Samsung_T1/rampSeq/rampSeqOutput/";
        if (args.length != 3) {
            System.out.println("MIssing arguments - need 9000 sequence file, assemblyFasta directory, outputDirectory in that order");
            return;
        }
        rampSeqSeqs = args[0];
        assemblyDir = args[1];
        outputDir = args[2];

        Long totalTime = System.nanoTime();
        processData(rampSeqSeqs, assemblyDir,outputDir);
        System.out.println("DONE!! total took " + (System.nanoTime()-totalTime)/1e9 + " seconds");
    }

}