net.maizegenetics.pangenome.processAssemblyGenomes.FindRampSeqContigsInAssemblies Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
*
* This method takes a fasta of ramp seq short sequences, and looks for them
* in an assembly genome.
*
* This is for Dan. Looking for exact matches of the 9000 across all entries
* in the fasta file. Look for both orig sequence, and reverse complement of sequence.
* This one works well - it runs each assembly in sequence. When processing
* the assemblies, it parallelizes over every rampSeq contig in the rampSeq
* map (file read into map). This speeds things up considerably from parallel
* processing just over the assemblies.
*
* Using indexOf(seq,startPos) still seems quicker than knuth-morris-pratt
* method, perhaps because of overhead of the latter.
*
* INPUT:
* - fasta of rampSeq short contigs
* - directory path, including trailing / where assembly genome fasta files live
* - directory path, including trailing / to which output files will be written
*
* OUTPUT:
* - tab-delimited files without headers, but the columns are BED file positions
* (0-based, inclusive/exclusive).
*
* ContigName AssemblyIDLine startPos endPos Strand
*
* In the above, Strand is whether the forward (as presented in file) or
* reverse-compliment of the strand matched in the assembly file.
*
* THe start/end positions are 0-based, inclusive/exclusive as for bedfiles.
*
* There is 1 tab-delimited file generated for each assembly. The file name
* reflects the assembly name.
*
* @author lcj34
*
*/
public class FindRampSeqContigsInAssemblies {
public static void processData(String shortSeqs, String assemblyDir, String outputDir) {
Map shortSeqIdToStringMap = new HashMap();
try (BufferedReader ninekbr = Utils.getBufferedReader(shortSeqs)) {
// read short seqs into a mpa of String,String for processing against all assembly fastas
StringBuilder sb = new StringBuilder();
String line = null;
String idLine = null;
while ((line = ninekbr.readLine()) != null) {
if (line.startsWith(">")) {
if (sb.length() > 0) {
// add to map
shortSeqIdToStringMap.put(idLine, sb.toString());
}
idLine = line.replace(">", "");
sb.setLength(0);
} else {
sb.append(line);
}
}
// process last one
if (sb.length() > 0) {
shortSeqIdToStringMap.put(idLine, sb.toString());
}
} catch (Exception exc) {
exc.printStackTrace();
}
System.out.println("Size of shortSeqIdToStringMap after reading " + shortSeqIdToStringMap.keySet().size());
// Short seq file has been read into map
// Process the reads through the list of assemblyPaths
List assemblyPaths = DirectoryCrawler.listPaths("glob:*.fasta", Paths.get(assemblyDir).toAbsolutePath());
assemblyPaths.stream() // parallelization occurs in searchSeqsInFasta
.forEach(inputSeqFile -> {
// Processing each assembly separately. Results of rampSeq short contigs matches
// will be written to separate files for each assembly
searchSeqsInFasta(shortSeqIdToStringMap,inputSeqFile.toString(),outputDir);
});
}
// THis method runs through each line in the assembly file, using a parallele stream.
// to call a method that checks for matches against each of the shortSeqs in the passed map
public static void searchSeqsInFasta(Map shortSeqs, String fastaFile, String outputDir) {
System.out.println("Processing fasta " + fastaFile);
Long time = System.nanoTime();
int lastSlash = fastaFile.lastIndexOf("/");
String fileName = fastaFile.substring(lastSlash+1);
fileName = fileName.substring(0, fileName.indexOf(".fasta"));
// Output file reflects name of assembly
String outputFile = outputDir + fileName + "_shortSeqPositions.txt";
Object myKey = shortSeqs.keySet().toArray()[0];
int seqLen = shortSeqs.get(myKey).length(); // this assumes all are the same length!
try (BufferedReader br = Utils.getBufferedReader(fastaFile);
BufferedWriter writer = Utils.getBufferedWriter(outputFile)) {
String idLine = null;
String line = null;
StringBuilder seqSB = new StringBuilder();
while ((line = br.readLine()) != null) {
if (line.startsWith(">")) {
if (seqSB.length() > 0) {
// call method to compare all short sequences against this assembly sequence
// Tuple X = RampSeqIDLine, Tuple Y = begin:end:strand.
List> rampSeqMatches = shortSeqs.entrySet().parallelStream().flatMap(entry -> {
String rampSeqID = entry.getKey();
String rampSeqSeq = entry.getValue();
return checkForSeqMatch(rampSeqID, rampSeqSeq,seqSB.toString()).stream();
}).collect(Collectors.toList());
writeDataToFile(rampSeqMatches,idLine,seqLen,writer);
}
idLine = line.replace(">", "");
seqSB.setLength(0);
} else {
seqSB.append(line);
}
}
// process last one
if (seqSB.length() > 0) {
// call method to compare all short sequences against this one, in parallel
// Collect all results to a list to be written
// key=idline, value=begin:end:strand.
List> rampSeqMatches = shortSeqs.entrySet().parallelStream().flatMap(entry -> {
String rampSeqID = entry.getKey();
String rampSeqSeq = entry.getValue();
return checkForSeqMatch(rampSeqID, rampSeqSeq,seqSB.toString()).stream();
}).collect(Collectors.toList());
writeDataToFile(rampSeqMatches,idLine,seqLen,writer);
}
} catch (Exception exc) {
exc.printStackTrace();
}
System.out.println("Done with fasta " + fastaFile + ", processing took " + (System.nanoTime() - time)/1e9 + " seconds.");
}
public static List> checkForSeqMatch(String idLine, String shortSeq,String assemblySequence) {
List> matchResults = new ArrayList>();
try {
int lastIndex = 0;
int shortSeqLen = shortSeq.length();
// Find matches of original sequence
while (lastIndex != -1) {
lastIndex = assemblySequence.indexOf(shortSeq,lastIndex); // start search at lastIndex spot
if (lastIndex != -1) {
int end = lastIndex + shortSeqLen;
StringBuilder beginEndStrandSB = new StringBuilder();
beginEndStrandSB.append(lastIndex).append(":").append(end).append(":").append("+");
matchResults.add(new Tuple(idLine,beginEndStrandSB.toString()));
lastIndex +=2;
if (lastIndex > assemblySequence.length()-shortSeqLen) lastIndex=-1;
}
}
// look for reverse complement of this sequence in the assembly chrom/scaffold
String shortSeqReverse = BaseEncoder.getReverseComplement(shortSeq);
lastIndex = 0;
while (lastIndex != -1) {
lastIndex = assemblySequence.indexOf(shortSeqReverse,lastIndex); // start search at lastIndex spot
if (lastIndex != -1) {
int end = lastIndex + shortSeqLen;
StringBuilder beginEndStrandSB = new StringBuilder();
beginEndStrandSB.append(lastIndex).append(":").append(end).append(":").append("-");
matchResults.add(new Tuple(idLine,beginEndStrandSB.toString()));
lastIndex +=2;
if (lastIndex > assemblySequence.length()-shortSeqLen) lastIndex=-1;
}
}
} catch (Exception exc) {
exc.printStackTrace();
return null;
}
return matchResults;
}
public static void writeDataToFile(List> results, String assemblyIDLine, int seqLen, BufferedWriter writer) {
// Collect to stringbuilder, write to file:
StringBuilder sb = new StringBuilder();
for (Tuple item: results) {
String[]tokens = item.getY().split(":");
String begin = tokens[0];
String end = tokens[1];
String strand = tokens[2];
sb.append(item.getX()).append("\t").append(assemblyIDLine).append("\t");
sb.append(begin).append("\t").append(end).append("\t").append(strand).append("\n");
}
try {
if (sb.length() > 0) {
System.out.println(" writing to file for assemblyIDLine " + assemblyIDLine);
writer.write(sb.toString());
}
} catch (IOException ioe) {
ioe.printStackTrace();
}
}
/**
* @param args
*/
public static void main(String[] args) {
String rampSeqSeqs = "/Volumes/Samsung_T1/rampSeq/PHG_306-combined_tags.fasta";
String assemblyDir = "/Volumes/Samsung_T1/rampSeq/rampSeqAssemblies/";
String outputDir = "/Volumes/Samsung_T1/rampSeq/rampSeqOutput/";
if (args.length != 3) {
System.out.println("MIssing arguments - need 9000 sequence file, assemblyFasta directory, outputDirectory in that order");
return;
}
rampSeqSeqs = args[0];
assemblyDir = args[1];
outputDir = args[2];
Long totalTime = System.nanoTime();
processData(rampSeqSeqs, assemblyDir,outputDir);
System.out.println("DONE!! total took " + (System.nanoTime()-totalTime)/1e9 + " seconds");
}
}