net.maizegenetics.pangenome.db_loading.CreateCSV_TrimmedAnchorLoading Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.HashMap;
import java.util.Map;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* This method takes as input 2 csv files
* It takes the data from FindTrimmedAnchorCoordinates.jar,
* then calculates new coordinates using the old ref coordinates CSV file
* used for loading.
*
* The data is written to another csv file that will contain:
* chr,trimmedStart,trimmedEnd,GeneStart,GeneEnd
*
* The genes come from the old ref coordinates CSV file.
*
* Here are the calculations for this method:
*
* Original anchor 2:
* start/end: 50358-56716 = len 6358
*
* Regex:
* start/end: 387-5359 = len 4972
*
* New anchor 2:
* start = oldStart + regexStart = 50358+387 = 50745
* end = start + regexLen -1 = 50745 + 4972 -1 = 55716
* number trimmed at start = 387
* number trimmed at end = oldEnd - newEnd = 56716-55716 = 1000
*
* Output:
* Method creates the csv file needed for loading ref anchors to new trimmed DB.
*
* @author lcj34
*
*/
public class CreateCSV_TrimmedAnchorLoading {
public static void processMain(String regexDataFile, String origRefAnchors, String outputFile) {
BufferedReader regexRD = Utils.getBufferedReader(regexDataFile);
BufferedReader origRefRD = Utils.getBufferedReader(origRefAnchors);
BufferedWriter bw = Utils.getBufferedWriter(outputFile);
try {
// Read the data into maps, then process it.
//
Map regexMap = new HashMap();
String regexLine = regexRD.readLine(); // skip header
while ((regexLine = regexRD.readLine()) != null) {
String[] tokens = regexLine.split(",");
String chromStartEndLength = tokens[1] + ":" + tokens[2] + ":" + tokens[3] + ":" + tokens[4];
// add to map
regexMap.put(Integer.parseInt(tokens[0]), chromStartEndLength);
}
regexRD.close();
// hold per anchor gene data: Map
Map> geneMap = new HashMap>();
// This file does not contain anchorid, but it is ordered by chrom/startPos
// and was used to create the anchorids. SO use a count to get anchorid
Map anchorMap = new HashMap();
String anchorLine = origRefRD.readLine(); // skip header
int count = 1;
while ((anchorLine = origRefRD.readLine()) != null) {
String[] tokens = anchorLine.split(",");
// add to map!
String chromStartEnd = tokens[0] + ":" + tokens[1] + ":" + tokens[2];
anchorMap.put(count, chromStartEnd);
Tuple geneStartEnd = new Tuple(tokens[3],tokens[4]);
geneMap.put(count,geneStartEnd);
count++;
}
origRefRD.close();
if (anchorMap.keySet().size() != regexMap.keySet().size()) {
System.out.println("Map sizes differ - regexMap " + regexMap.size() + ", anchorMap " + anchorMap.size());
return;
}
// process the data per the algorithm at the top.
// bw.write("anchorid,chr,trimmedStart,trimmedEnd,leftCount,rightCount\n"); old
bw.write("chr,trimmedStart,trimmedEnd,GeneStart,GeneEnd\n");
for (Map.Entry regexEntry : regexMap.entrySet()) {
int anchorid = regexEntry.getKey();
String[] regexData = regexMap.get(anchorid).split(":");
String[] anchorData = anchorMap.get(anchorid).split(":");
if (!regexData[0].equals(anchorData[0])) {
System.out.println("ERROR - chromosomes don't match for anchorid " + anchorid +
" regexChrom: " + regexData[0] + ", oldANchorChrom: " + anchorData[0]);
return;
}
int regexStart = Integer.parseInt(regexData[1]);
int regexLen = Integer.parseInt(regexData[3]);
int oldStart = Integer.parseInt(anchorData[1]);
int trimmedStart = oldStart + regexStart;
int trimmedEnd = trimmedStart + regexLen -1;
int leftTrim = regexStart;
int oldEnd = Integer.parseInt(anchorData[2]);
int rightTrim = oldEnd - trimmedEnd;
// get gene info
Tuple geneStartEnd = geneMap.get(anchorid);
bw.write(regexData[0] + "," + trimmedStart + "," + trimmedEnd + "," + geneStartEnd.x + "," + geneStartEnd.y + "\n");
// bw.write(anchorid + "," + regexData[0] + "," + trimmedStart + "," + trimmedEnd + "," + leftTrim + "," + rightTrim + "\n");
}
bw.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
/**
* @param args
*/
public static void main(String[] args) {
// THis method can be run on the laptop.
String regexData = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/B73Ref_regex_coordinates.csv";
String origRefAnchors = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/anchorsFile_MergedPlus1000orGapDiffcoordinate_allchrsMay26.csv";
// THis is the old one, before the gene file was integrated into this method. This old file was not used
// for anything other than later created the real file to load. Skip this step - create the file to lod
// in here.
//String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/trimmedAnchorsToLoad_B73Ref.txt";
String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/trimmedAnchorsToLoad_B73Ref_withGenes.txt";
System.out.println("Begin processing");
processMain(regexData, origRefAnchors,outputFile);
System.out.println("FInished!!");
}
}