// net.maizegenetics.pangenome.db_loading.CreateCSV_TrimmedAnchorLoading
// From the phg artifact (PHG - Practical Haplotype Graph); a newer version of the artifact exists: 1.10.
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.HashMap;
import java.util.Map;

import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * This class takes 2 CSV files as input.
 * It takes the data from FindTrimmedAnchorCoordinates.jar,
 * then calculates new coordinates using the old ref coordinates CSV file
 * used for loading.
 * 
 * The data is written to another csv file that will contain:
 *   chr,trimmedStart,trimmedEnd,GeneStart,GeneEnd
 *   
 * The genes come from the old ref coordinates CSV file.
 *
 * Here are the calculations for this method:
 * 
 * Original anchor 2:
 *   start/end: 50358-56716 = len 6358
 *
 * Regex:
 *   start/end: 387-5359 = len 4972
 *
 * New anchor 2:  
 *   start = oldStart + regexStart = 50358+387 = 50745
 *   end = start + regexLen -1 = 50745 + 4972 -1 = 55716
 *   number trimmed at start = 387
 *   number trimmed at end = oldEnd - newEnd  = 56716-55716  = 1000
 *   
 * Output:
 *   Method creates the csv file needed for loading ref anchors to new trimmed DB.
 *  
 * @author lcj34
 *
 */
public class CreateCSV_TrimmedAnchorLoading {

    public static void processMain(String regexDataFile, String origRefAnchors, String outputFile) {
        BufferedReader regexRD = Utils.getBufferedReader(regexDataFile);
        BufferedReader origRefRD = Utils.getBufferedReader(origRefAnchors);
        
        
        BufferedWriter bw = Utils.getBufferedWriter(outputFile);
        
        try {
            
            // Read the data into maps, then process it.
            //
            Map regexMap = new HashMap();
           
            String regexLine = regexRD.readLine(); // skip header
            while ((regexLine = regexRD.readLine()) != null) {
                String[] tokens = regexLine.split(",");
                String chromStartEndLength = tokens[1] + ":" + tokens[2] + ":" + tokens[3] + ":" + tokens[4];
                // add to map
                regexMap.put(Integer.parseInt(tokens[0]), chromStartEndLength);
            }                       
            regexRD.close();
            
            // hold per anchor gene data:  Map
            Map> geneMap = new HashMap>();
            // This file does not contain anchorid, but it is ordered by chrom/startPos
            // and was used to create the anchorids.  SO use a count to get anchorid
            Map anchorMap = new HashMap();
            String anchorLine = origRefRD.readLine(); // skip header
            int count = 1;
            
            while ((anchorLine = origRefRD.readLine()) != null) {
                String[] tokens = anchorLine.split(",");
                // add to map!
                String chromStartEnd = tokens[0] + ":" + tokens[1] + ":" + tokens[2];
                anchorMap.put(count, chromStartEnd);
                Tuple geneStartEnd = new Tuple(tokens[3],tokens[4]);
                geneMap.put(count,geneStartEnd);
                count++;
            }
            origRefRD.close();
            
            if (anchorMap.keySet().size() != regexMap.keySet().size()) {
                System.out.println("Map sizes differ - regexMap " + regexMap.size() + ", anchorMap " + anchorMap.size());
                return;
            }
            // process the data per the algorithm at the top.
           // bw.write("anchorid,chr,trimmedStart,trimmedEnd,leftCount,rightCount\n"); old
            bw.write("chr,trimmedStart,trimmedEnd,GeneStart,GeneEnd\n");
            for (Map.Entry regexEntry : regexMap.entrySet()) {
                int anchorid = regexEntry.getKey();
                String[] regexData = regexMap.get(anchorid).split(":");
                String[] anchorData = anchorMap.get(anchorid).split(":");
                
                if (!regexData[0].equals(anchorData[0])) {
                    System.out.println("ERROR - chromosomes don't match for anchorid " + anchorid + 
                            " regexChrom: " + regexData[0] + ", oldANchorChrom: " + anchorData[0]);
                    return;
                }
                int regexStart = Integer.parseInt(regexData[1]);
                int regexLen = Integer.parseInt(regexData[3]);
                int oldStart = Integer.parseInt(anchorData[1]);
                int trimmedStart = oldStart + regexStart;
                int trimmedEnd = trimmedStart + regexLen -1;
                int leftTrim = regexStart;
                int oldEnd = Integer.parseInt(anchorData[2]);
                int rightTrim = oldEnd - trimmedEnd;
                
                // get gene info
                Tuple geneStartEnd = geneMap.get(anchorid);
                bw.write(regexData[0] + "," + trimmedStart + "," + trimmedEnd + "," + geneStartEnd.x + "," + geneStartEnd.y + "\n");
               // bw.write(anchorid + "," + regexData[0] + "," + trimmedStart + "," + trimmedEnd + "," + leftTrim + "," + rightTrim + "\n");
            }
            bw.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }
    }
    /**
     * @param args
     */
    public static void main(String[] args) {
        // THis method can be run on the laptop.
        
        String regexData = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/B73Ref_regex_coordinates.csv";
        String origRefAnchors = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/anchorsFile_MergedPlus1000orGapDiffcoordinate_allchrsMay26.csv";
        // THis is the old one, before the gene file was integrated into this method.  This old file was not used
        // for anything other than later created the real file to load.  Skip this step - create the file to lod
        // in here.
        //String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/trimmedAnchorsToLoad_B73Ref.txt";
        String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/interAnchors_June/trimmedAnchorsToLoad_B73Ref_withGenes.txt";
        System.out.println("Begin processing");
                
        processMain(regexData, origRefAnchors,outputFile);

        System.out.println("FInished!!");
    }

}