net.maizegenetics.pangenome.db_loading.FindTrimmedAssemblyCoordinates Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.db_loading;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* THis file is similar to FindTrimmedAnchorCoordinates used for the reference. The
* difference is the ref has all anchors. Assemblies do not. We need to read
* the assembly fasta, determine which anchor is represented
*
* The anchor is there in the idLine - the last value after the space, e.g.
* >1:44289:49837;chr1:1 1
*
* THis is creating a fasta, but it isn't useful because the IDline is not what
* is needed when loading into the DB.
* Idline must have >B73RefChrom:B73startpos:B73EndPos;assemblyChrom:assemblyStart:asemblyEnd anchorid
*
* @author lcj34
*
*/
public class FindTrimmedAssemblyCoordinates {
public static void processMain(String assemblyFasta, Map trimmedFastas, String outputDir, String lineName) {
BufferedReader refbr = Utils.getBufferedReader(assemblyFasta);
BufferedReader trimmedbr = null;
String anchorCSVFile = outputDir + lineName + "_anchor_coordinates.csv";
String anchorFastaFile = outputDir + lineName + "_trimmed_anchors.fa";
BufferedWriter csvwr = Utils.getBufferedWriter(anchorCSVFile);
BufferedWriter fastawr = Utils.getBufferedWriter(anchorFastaFile);
try {
String assemblyLine;
int anchorid = 0;
String chrom = "-1";
String csvHeader = "anchorid,Chr,regexStart,regexEnd,regexSeqLength\n";
csvwr.write(csvHeader);
// Not concerned with chrom changing as trimmedAnchor files are on a per-anchor basis,
// not per-Line or per-chrom.
while ((assemblyLine=refbr.readLine()) != null) {
// idline has >1:50877:55716;chr1:3630 2
// i.e
// >refChrom:refStart:refEnd;assemblyChrom:assemblyStart anchorID
if (assemblyLine.startsWith(">")) {
anchorid = Integer.parseInt(assemblyLine.substring(assemblyLine.indexOf(" ")+1 ));
String lineTokens[] = assemblyLine.split(";");
chrom = lineTokens[1];
chrom = chrom.substring(0,chrom.indexOf(":")); // this gets ASSEMBLY chrom - needed when pull from assembly fasta later
continue; // done with ID line, continue to get the sequence line
}
// use anchorid to index map to get trimmed file
if (trimmedbr != null) trimmedbr.close();
trimmedbr = Utils.getBufferedReader(trimmedFastas.get(anchorid));
// Find our Assembly in trimmedFasta file
String anchorLine;
String anchorSeq = null;
while ((anchorLine = trimmedbr.readLine()) !=null) {
if (anchorLine.startsWith(">")) {
if (anchorLine.contains(lineName)) {
// trimmed anchors file has all haplotype sequences for the particular anchor.
// THis finds the idline for user specified line, now get the sequence
anchorSeq = trimmedbr.readLine();
break;
}
}
}
if (anchorSeq == null) {
System.out.println("ERROR - could not find sequence for " + lineName + " for anchorid " + anchorid);
return;
}
// Write data to fasta file, write also to csv
Map,String> regexDataMap = findTrimmedSeq(assemblyLine,anchorSeq);
if (regexDataMap.size() > 1) {
System.out.println( anchorid + ": regex returned " + regexDataMap.size() + " entries");
}
if (regexDataMap.size() == 0) {
System.out.println( anchorid + ": regex returned " + regexDataMap.size() + " entries");
}
for (Map.Entry, String> entry : regexDataMap.entrySet()) {
// only want to process the first one
int regexStart = entry.getKey().x;
int regexEnd = entry.getKey().y;
int length = entry.getValue().length();
csvwr.write(anchorid + "," + chrom + "," + regexStart + "," + regexEnd + "," + length + "\n");
fastawr.write(">" + anchorid + ":" + chrom + ":" + regexStart + ":" + regexEnd + "\n");
fastawr.write(entry.getValue());
fastawr.write("\n");
break;
}
//if (anchorid > 5) break; // for TESTING _ only try with 5 anchors.
if (anchorid %1000 == 0) {
System.out.println("Finished anchor: " + anchorid);
}
}
refbr.close();
csvwr.close();
fastawr.close();
trimmedbr.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
private static Map,String> findTrimmedSeq(String refSeq, String trimmedSeq) {
// regex is 0-based. Return the
String trimmedSeqRegex = trimmedSeq.replace("NN","[N]+"); // trimmedSeqRegex is now an expression - the brackets allow as many N's as exist
Pattern pattern = Pattern.compile(trimmedSeqRegex);
Matcher matcher = pattern.matcher(refSeq);
// should only be 1 item on the map, using TUple and map to prevent going back and forth string-integer
Map,String> regexDataMap = new HashMap,String>();
while(matcher.find()) {
Tuple startEnd = new Tuple(matcher.start(),matcher.end());
regexDataMap.put(startEnd, matcher.group());
}
return regexDataMap;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
long time = System.nanoTime();
String trimmedDirectory = null;
String refFasta = null;
String outputDir = null;
String lineName = null;
if (args.length != 4) {
System.out.println(" Expecting 4 input arguments in this order: " );
System.out.println(" Full path name to assembly fasta File of pre-trimmed anchors,");
System.out.println(" name of line, e.g. B73Ref or W22,");
System.out.println(" Directory of trimmed anchors,");
System.out.println(" Full directory path name with trailing / for output files to be written");
return;
}
Map trimmedFileMap = new HashMap();
try {
refFasta = args[0];
lineName = args[1];
trimmedDirectory = args[2];
outputDir = args[3];
for (int idx = 1; idx < 37805; idx++) {
String filename = trimmedDirectory + "exportedFastaFromV2DB_anchorId" + idx + "_LongNsRemoved_MAFFTAligned_Trimmed.fa";
trimmedFileMap.put(idx,filename);
}
} catch (Exception exc){
exc.printStackTrace();
return;
}
System.out.println("Begin ...");
processMain(refFasta,trimmedFileMap, outputDir, lineName);
System.out.println("Finished - totalTime " + (System.nanoTime()-time)/1e9 + " seconds");
}
}