net.maizegenetics.pangenome.db_loading.FindTrimmedAnchorCoordinates Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* NOTE: This version works on the REF only. The assemblies
* do not have their coordinates in the fasta files - they only have
* the start, not the end. OH - but I could get it from the length.
* Due to time, I'm not going to mess this one up. I"ll write another
* one for assemblies and merge them later.
*
* Method takes
* - a refAnchor fasta file, (I dumped reference anchors from the pre-trimmed db)
* - a list of files from a directory that contain the trimmed versions
* on a per-anchor basis. Find the start and end of the trimmed sequence
* in the ref sequence.
*
* The start is where it matches. The end is the length of the trimmed sequence
* added to the start.
*
* Output: The important output is the csv file that will be used to load
* the new reference anchors to the "trimmed" database.
*
* @author lcj34
*
*/
public class FindTrimmedAnchorCoordinates {
public static void processMain(String refFasta, Map trimmedFastas, String outputDir, String lineName) {
BufferedReader refbr = Utils.getBufferedReader(refFasta);
BufferedReader trimmedbr = null;
String anchorCSVFile = outputDir + lineName + "_anchor_coordinates.csv";
String anchorFastaFile = outputDir + lineName + "_trimmed_anchors.fa";
BufferedWriter csvwr = Utils.getBufferedWriter(anchorCSVFile);
BufferedWriter fastawr = Utils.getBufferedWriter(anchorFastaFile);
try {
String refline;
int anchorid = 0;
String chrom = "-1";
// int origAnchorStart = 0;
// int trimAnchorStart = 0;
// int origAnchorEnd = 0;
// int trimAnchorEnd = 0;
String csvHeader = "anchorid,Chr,regexStart,regexEnd,regexSeqLength\n";
csvwr.write(csvHeader);
// THe outer loop runs through each anchor in the reffasta file
// (which has all anchors, all chroms. Not concerned with chrom
// changing as trimmedAnchor files are on a per-anchor basis,
// not per-Line or per-chrom.
while ((refline=refbr.readLine()) != null) {
// idline has anchor-chrom:start:end, e.g. >1-1:44289:49837
if (refline.startsWith(">")) {
String lineTokens[] = refline.split("-");
chrom = lineTokens[1].split(":")[0];
anchorid = Integer.parseInt(lineTokens[0].replace(">", ""));
// origAnchorStart = Integer.parseInt(lineTokens[1].split(":")[1]);
// origAnchorEnd = Integer.parseInt(lineTokens[1].split(":")[2]);
continue; // skipping idlines
}
// anchorid++;
// the count is the anchor ID - use that to index map to get trimmed file
if (trimmedbr != null) trimmedbr.close();
// read next trimmedFasta from Zack - these are on per-anchor basis.
trimmedbr = Utils.getBufferedReader(trimmedFastas.get(anchorid));
String anchorLine = trimmedbr.readLine(); // skip idline - first shoudl be B73
anchorLine = trimmedbr.readLine();
// Write data to fasta file, write also to csv
Map,String> regexDataMap = findTrimmedSeq(refline,anchorLine);
if (regexDataMap.size() > 1) {
System.out.println( anchorid + ": regex returned " + regexDataMap.size() + " entries");
}
if (regexDataMap.size() == 0) {
System.out.println( anchorid + ": regex returned " + regexDataMap.size() + " entries");
}
for (Map.Entry, String> entry : regexDataMap.entrySet()) {
// only want to process the first one
int regexStart = entry.getKey().x;
int regexEnd = entry.getKey().y;
int length = entry.getValue().length();
csvwr.write(anchorid + "," + chrom + "," + regexStart + "," + regexEnd + "," + length + "\n");
fastawr.write(">" + anchorid + ":" + chrom + ":" + regexStart + ":" + regexEnd + "\n");
fastawr.write(entry.getValue());
fastawr.write("\n");
break;
}
//if (anchorid > 5) break; // for TESTING _ only try with 5 anchors.
if (anchorid %1000 == 0) {
System.out.println("Finished anchor: " + anchorid);
}
}
csvwr.close();
fastawr.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
private static Map,String> findTrimmedSeq(String refSeq, String trimmedSeq) {
// regex is 0-based. Return the actual ref sequence with all the N's
String trimmedSeqRegex = trimmedSeq.replace("NN","[N]+"); // trimmedSeqRegex is now an expression - the brackets allow as many as exist
Pattern pattern = Pattern.compile(trimmedSeqRegex);
Matcher matcher = pattern.matcher(refSeq); // look for the trimmed Sequence in the reference sequence.
// should only be 1 item on the map, using Tuple and map to prevent going back and forth string-integer
Map,String> regexDataMap = new HashMap,String>();
while(matcher.find()) {
Tuple startEnd = new Tuple(matcher.start(),matcher.end());
// matcher.group() returns the input sequence matched by the previous pattern.
regexDataMap.put(startEnd, matcher.group());
}
return regexDataMap;
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
String trimmedDirectory = null;
String refFasta = null;
String outputDir = null;
String lineName = null;
if (args.length != 4) {
System.out.println(" Expecting 4 input arguments in this order: " );
System.out.println(" Full path name to ref fasta File of pre-trimed anchors,");
System.out.println(" name of line, e.g. B73Ref or W22,");
System.out.println(" Directory of trimmed anchors,");
System.out.println(" Full directory path name with trailing / for output files to be written");
return;
}
Map trimmedFileMap = new HashMap();
try {
refFasta = args[0];
lineName = args[1];
trimmedDirectory = args[2];
outputDir = args[3];
for (int idx = 1; idx < 37805; idx++) {
String filename = trimmedDirectory + "exportedFastaFromV2DB_anchorId" + idx + "_LongNsRemoved_MAFFTAligned_Trimmed.fa";
trimmedFileMap.put(idx,filename);
}
} catch (Exception exc){
exc.printStackTrace();
return;
}
processMain(refFasta,trimmedFileMap, outputDir, lineName);
}
}