All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.db_loading.TestPHGStuff Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;

import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.AnchorType;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.MethodType;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.CheckSum;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * @author lcj34
 *
 */
public class TestPHGStuff {

    public static void checkChrom2DupAnchor (String refGenome){
        GenomeSequence refSequence = GenomeSequenceBuilder.instance(refGenome);
        
     // Get reference string
        Chromosome chrom2 = new Chromosome("2");
        byte[] refBytes = refSequence.chromosomeSequence(chrom2,8244392,8247762);
        String anchor1String = NucleotideAlignmentConstants.nucleotideBytetoString(refBytes);
        
        byte[] ref2Bytes = 
                refSequence.chromosomeSequence(chrom2,8394514,8397884);
        String anchor2String = NucleotideAlignmentConstants.nucleotideBytetoString(refBytes);
        if (anchor1String.equals(anchor2String)) System.out.println("STrings equal");
        else System.out.println("Strings NOT equal");
    }
    
    public static int calcLevenshtein(String seq1, String seq2, int maxEditDistance) {
        long time = System.nanoTime();
        seq1 = seq1.toUpperCase();
        seq2 = seq2.toUpperCase();
        int editDistance = -1;
        // i == 0
        int [] costs = new int [seq2.length() + 1];
        for (int jdx = 0; jdx < costs.length; jdx++)
            costs[jdx] = jdx;
        for (int idx = 1; idx <= seq1.length(); idx++) {
            // jdx == 0; nw = lev(idx - 1, jdx)
            costs[0] = idx;
            int nw = idx - 1;
//            for (int jdx = 1; jdx <= seq2.length(); jdx++) { // original - before N computation
//                
//                int cj = Math.min(1 + Math.min(costs[jdx], costs[jdx - 1]), seq1.charAt(idx - 1) == seq2.charAt(jdx - 1) ? nw : nw + 1);
//                nw = costs[jdx];
//                costs[jdx] = cj;
//            }

            // The edit distance is on the diagonal of the matrix.  If this value is
            // greater than our max edit distance, bail returning -1
            // If diagonal is when idx=jdx
            int minDistance = 100000;
            boolean keepProcessing = true;;
            for (int jdx = 1; jdx <= seq2.length(); jdx++) { // seq was set to all lower case 
                int considerN = (seq1.charAt(idx-1) == 'N' || seq2.charAt(jdx-1) == 'N' || seq1.charAt(idx-1) == seq2.charAt(jdx-1)) ?
                        nw : nw + 1;
                int cj = Math.min(1 + Math.min(costs[jdx], costs[jdx - 1]), considerN);
                if (cj < minDistance) minDistance = cj;
                nw = costs[jdx];
                costs[jdx] = cj;
                System.out.println("costs["+jdx+"]="+cj+", idx=" + idx);
//                if (jdx == idx && cj > maxEditDistance ) {
//                    keepProcessing = false;
//                    break;
//                }                    
            }
            System.out.println("idx=" + idx + ",min distance after J loop: " + minDistance);
            if (minDistance > maxEditDistance) return editDistance;
           // if (!keepProcessing) return editDistance;
            
        }
        System.out.println("Time to calculate: " + (System.nanoTime() - time) / 1e9 + " seconds");
        return costs[seq2.length()];
    }
    
    public static void testPythonLD(String seq1, String seq2, String mode, String task) {
        // BLAST the fasta file of genes
//        String cmd = "blastn -num_threads " + threads + " -db " + blastDB + " -query " + fastaFile + " -evalue " + eval + " -max_target_seqs " 
//                + maxTarget + " -max_hsps " + maxHSPS + " -outfmt 6"; // to stdout
        
        System.out.println("testPythonLD - begin, calling python script:");
        String cmd = "/Users/lcj34/development/edlib-master/bindings/python/levenshtienDistance_edlib.py " + seq1 + " " + seq2;

        StringBuilder cmdSB = new StringBuilder();
        cmdSB.append("/Users/lcj34/development/edlib-master/bindings/python/levenshtienDistance_edlib.py ").append(seq1).append(" ").append(seq2);
        
        // NOTE - if script does not allow for mode to be NULL but task to have value.
        // It assumes the 3rd parameter (after seq1, seq2) is the 
        if (mode != null) cmdSB.append(" ").append(mode);
        if (task != null) cmdSB.append(" ").append(task);
        try {

            Process start = Runtime.getRuntime().exec(cmd);
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(start.getInputStream()));
            String line = null;
            
            while ((line = br.readLine()) != null)
            {
                System.out.println(line);
            }
        } catch (Exception exc) {
            exc.printStackTrace();
        }
        System.out.println("testPythonLD - finished!");
 
    }
    public static void printTrimTables(String origData, String trimData, String outputFile) {
        BufferedReader origbr = Utils.getBufferedReader(origData);
        BufferedReader trimbr = Utils.getBufferedReader(trimData);
        
        BufferedWriter outbw = Utils.getBufferedWriter(outputFile);
        
        String header = "origStart\torigEnd\tnewStart\tnewEnd\torigLen\ttrimLength\n";
        
        try {
            outbw.write(header);
            String oline;
            String tline;
            while ((oline = origbr.readLine())!= null && (tline = trimbr.readLine()) != null) {
                String[] oTokens = oline.split(":");
                String[] tTokens = tline.split(":");
                int ostart = Integer.parseInt(oTokens[1].substring(0, oTokens[1].indexOf("-")));
                int oend = Integer.parseInt(oTokens[1].substring(oTokens[1].indexOf("-")+1));
                
                int tstart = Integer.parseInt(tTokens[1].substring(0, tTokens[1].indexOf("-")));
                int tend = Integer.parseInt(tTokens[1].substring(tTokens[1].indexOf("-")+1));
                
                int oldLength = oend-ostart + 1;
                int trimLength = tend-tstart + 1;
                
                StringBuilder sb = new StringBuilder();
                sb.append(Integer.toString(ostart)).append("\t");
                sb.append(Integer.toString(oend)).append("\t");
                sb.append(Integer.toString(tstart)).append("\t");
                sb.append(Integer.toString(tend)).append("\t");
                sb.append(Integer.toString(oldLength)).append("\t");
                sb.append(Integer.toString(trimLength)).append("\n");
                outbw.write(sb.toString());
                        
            } 
            origbr.close();
            trimbr.close();
            outbw.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }

    }
    
    // trying with multisets per Zack - need to explore this more, it wasn't clicking for me
    public static void freqChartHaplotypeCollapse (String inputFile, String outputDir) {
        // this should take tab-delimited file from collapsed anchors with
        // headers anchorid/origANchorSeqID/origLen/callpasedAnchorSeqID/collapsedLen/
        // It counts number of sequences that are stand-alone, number where 2 are
        // mapped together, etc.
        
        //BufferedReader br = Utils.getBufferedReader(inputFile);
        String outfilecsv = outputDir + "anchorSeqCollapseHits.csv";
        
        try (BufferedReader br = Utils.getBufferedReader(inputFile)) {
                        
            // SeqID, sequences mapped to it
            Multiset countSeqID = HashMultiset.create();
            String line = br.readLine(); // skip header
            while ((line=br.readLine()) != null) {
                String[] lineTokens = line.split("\\t");
                countSeqID.add(lineTokens[3]); 
            }
            
            Multiset countOfCount = HashMultiset.create();
            for (String id : countSeqID.elementSet()) {
                countOfCount.add(countSeqID.count(id));
            }
            
            StringBuilder dataSB = new StringBuilder();
//            for (Integer count : countOfCount.elementSet()) {
//                dataSB.append(Integer.toString(count)).append("\n");
//            }
            
//            for (int idx = 0; idx < countOfCount.elementSet().size()) {
//               // String
//            }
            // put it all in a csv file for R
            BufferedWriter bw = Utils.getBufferedWriter(outfilecsv);
            bw.write(dataSB.toString());
            bw.close();
            System.out.println("Data written to: " + outfilecsv);
            
            //bw = Utils.getBufferedWriter(file)
             
        } catch (Exception exc) {
            exc.printStackTrace();
        }
    }
    
    // This one works.
    public static void freqHaplotypeCollapse (String inputFile, String outputDir) {
        // this should take tab-delimited file from collapsed anchors with
        // headers anchorid/origANchorSeqID/origLen/callpasedAnchorSeqID/collapsedLen/
        // It counts number of sequences that are stand-alone, number where 2 are
        // mapped together, etc.
        
        //BufferedReader br = Utils.getBufferedReader(inputFile);
        String outfile = outputDir + "anchorSeqCollapseHits.txt";
        
        try (BufferedReader br = Utils.getBufferedReader(inputFile)) {
                        
            // Need the number of times there were X sequences mapped to another sequence
            // SeqID, sequences mapped to it
            Multimap collapsedToOrigMap = HashMultimap.create();
            String line = br.readLine(); // skip header
            while ((line=br.readLine()) != null) {
                String[] lineTokens = line.split("\\t");
                collapsedToOrigMap.put(Integer.parseInt(lineTokens[3]),Integer.parseInt(lineTokens[1])); 
            }
            
            //This one takes the size (number of ids) as the key, and puts the map
            Multimap sizeToIds = HashMultimap.create();          
            for (Integer key: collapsedToOrigMap.keySet()) {
                Collection values = collapsedToOrigMap.get(key);
                int valueCount = values.size();
                sizeToIds.put(valueCount, key);
            }
            
            // THis one finally has the size and number of collapsed anchor sequence
            // with that
         
            Map countOfCount = new HashMap<>();
            for (Integer key: sizeToIds.keySet()) {
                Collection values = sizeToIds.get(key);
                int valueCount = values.size();
                countOfCount.put(key, valueCount);
            }
            // put it all in a csv file for R
            BufferedWriter bw = Utils.getBufferedWriter(outfile);
            bw.write("Number_Sequences_Collapsed_In_Group\tNumber_Of_Groups_With_This_Sequence_Count\n");
            StringBuilder sb = new StringBuilder();
            for (Integer key : countOfCount.keySet()) {
                sb.append(Integer.toString(key)).append("\t")
                  .append(Integer.toString(countOfCount.get(key))).append("\n");
            }
            bw.write(sb.toString());
            bw.close();
            System.out.println("Data written to: " + outfile);
            
            //bw = Utils.getBufferedWriter(file)
             
        } catch (Exception exc) {
            exc.printStackTrace();
        }
    }
    
    public static void testMafftfromJava() {
        String fastaFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/test_fastas_kalign/265SequencesFull_anchor2000.fa";
        String cmd = "/usr/local/bin/mafft " + fastaFile; // to stdout

        System.out.println("testMafftFromjava: begin ");
        // The fasta file itself identifies the anchor. perhaps that would be passed in
        // when running for real (the anchor plus the fasta file.)
        Map mafftIdSequenceMap = new HashMap<>();
        try {
            int count = 0;
            int totalCount = 0;
            Process start = Runtime.getRuntime().exec(cmd);
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(start.getInputStream()));
            String line = null;
            String prevIdLine = null;
            StringBuilder seqSB = new StringBuilder();
            while ((line = br.readLine()) != null)
            {
                if (line.startsWith(">")) {
                    count++;
                    totalCount++;
                    if (seqSB.length() > 0) {
                        seqSB.append("\n");
                        mafftIdSequenceMap.put(prevIdLine, seqSB.toString());
                        seqSB.setLength(0);                        
                    }
                    prevIdLine = line;
                    continue;
                } else {
                    count++;
                    seqSB.append(line);
                }
                
                // anchor fasta has just the assembly name, e.g. ">B73Ref"
                if (count > 1000) { // MAFFT results : equals gene count if maxTarget = 1
                    System.out.println(totalCount + " MAFFT lines processed");
                    count = 0;
                }
            }
            if (seqSB.length() > 0) { // add last one to map
                seqSB.append("\n");
                mafftIdSequenceMap.put(prevIdLine, seqSB.toString());
            }
            System.out.println("Finished MAFFT processing, total number of sequences processed: " + totalCount);
            String b73ref = mafftIdSequenceMap.get(">B73Ref");
            if (b73ref != null) System.out.println(b73ref.substring(0, 100));
            else System.out.println("b73ref not found in map !!");
            br.close();
            System.out.println("Total entries in mafftIdSequenceMap: "  + mafftIdSequenceMap.keySet().size());
        } catch (Exception exc) {
            exc.printStackTrace();
        } 
    }
    
    public static void testObjectAssignment() {
        GeneGFFData data2 = new GeneGFFData(1,2,"first");
        GeneGFFData data1 = null;
        
        data1 = data2;
        data2 = new GeneGFFData(3,4,"third");
        
        System.out.println("data1.start:" + data1.start() + ", data2.start:" + data2.start());
        
    }
    
    public static void testChrMatch() {       
        String chrS = "UNMAPPED";
        chrS = chrS.toUpperCase();
        chrS = chrS.replace("CHROMOSOME", ""); 
        chrS = chrS.replace("CHR", "");
        try {
            chrS = Integer.toString(Integer.parseInt(chrS));
        } catch (NumberFormatException nfe){
            // leave it as is!
        }
        
        String chrR = "1";
        chrR = chrR.toUpperCase();
        chrR = chrR.replace("CHROMOSOME", ""); 
        chrR = chrR.replace("CHR", "");
        try {
            chrR = Integer.toString(Integer.parseInt(chrR));
        } catch (NumberFormatException nfe){
            // leave it as is!
        }

        if (!chrS.equals(chrR)) {
            System.out.println("Chromosomes do not match!: chrS: " + chrS + ", chrR: " + chrR);
        } else {
            System.out.println("YEAH! - chromosomes match! chrS: " + chrS + ", chrR: " + chrR);
        }
        
        // funky EP1 fasta id line:
        // >CM007694.1 Zea mays subsp. mays cultivar EP1 chromosome 1, whole genome shotgun sequence
        // >CM007695.1 Zea mays subsp. mays cultivar EP1 chromosome 2, whole genome shotgun sequence        
    }
    
 

    public static void getDataFromAssembly(String assemblyFile, String outputFile) {
        GenomeSequence myRefSequence = GenomeSequenceBuilder.instance(assemblyFile);
        Set chromSet = myRefSequence.chromosomes();
        
        BufferedWriter bw = Utils.getBufferedWriter(outputFile);
        try {
            for (Chromosome chrom : chromSet) {
                bw.write(chrom.getName());
                //System.out.println(chrom.getName());
            }
            bw.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
        
        List chromList = new ArrayList<>(chromSet);
        // get range of positionsuence.chromosomeSequence(chromList.get(1),5,8);
        byte[] genotypes = myRefSequence.chromosomeSequence(chromList.get(1),5,8);
        System.out.print("Values on chrom " + chromList.get(1) + " at positions 5-8:");
        for (int idx = 0; idx < genotypes.length ; idx++) {
            System.out.print(" " + genotypes[idx]);
        }
        System.out.println();
        
        // get just the first one in that range
        byte allele = myRefSequence.genotype(chromList.get(1),5);        
        System.out.println("Single Allele at first position: " + allele);
        allele = myRefSequence.genotype(chromList.get(1),6);        
        System.out.println("Single Allele at second position: " + allele);
        allele = myRefSequence.genotype(chromList.get(1),7);        
        System.out.println("Single Allele at third position: " + allele);
        allele = myRefSequence.genotype(chromList.get(1),8);        
        System.out.println("Single Allele at fourth position: " + allele);
        
    }

    public static void testChromLength(String genome) {
        //String assemblyGenome = "/Volumes/Samsung_T1/sorghum_hackathonJune2017/Sbicolor_313_v3.0.fa.gz";
        GenomeSequence mySequence = GenomeSequenceBuilder.instance(genome);
        Set myChroms = mySequence.chromosomes();
        System.out.println("testChromLength, values for " + genome);
        for (Chromosome chrom : myChroms) {
            System.out.println(chrom.getName() + " length: " + chrom.getLength());
        }
    }
    
    public static void getGenomeSequenceValues() {
        String refGenome = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
        
    }

    
    // Method to compare reference fasta to fasta created from DB>
    // It is testing the outcome of ReAssembleGenome_fromAnchorSequences.java
    public static void compareRefToReAssembledChrom(String refGenome, String reAssembledChromFasta, String chrom,
            String coordinatesFile, String outputFile) {
        System.out.println("read reference genome:");
        GenomeSequence myRefGenome = GenomeSequenceBuilder.instance(refGenome);
        
        System.out.println("read chrom fasta ");
        GenomeSequence chromGenome = GenomeSequenceBuilder.instance(reAssembledChromFasta);
        
        Chromosome testChrom = new Chromosome(chrom);
 
        int refChromSize = myRefGenome.chromosomeSize(testChrom);
        int assembledChromSize = chromGenome.chromosomeSize(testChrom);
        
        BufferedReader br = Utils.getBufferedReader(coordinatesFile);
        BufferedWriter bw = Utils.getBufferedWriter(outputFile);
        int goodCount = 0;
        int badCount = 0;
        try {
            String line = br.readLine(); // first header
            line = br.readLine(); // second header (when using merged file)
            while ((line = br.readLine()) != null) {
                String[] tokens = line.split("\\t");
                int start = Integer.parseInt(tokens[0]);
                int end = Integer.parseInt(tokens[1]);
                
                String refSeq = myRefGenome.genotypeAsString(testChrom, start, end);
                String chromSeq = chromGenome.genotypeAsString(testChrom, start, end);
                
                if (!refSeq.equals(chromSeq)) {
                    badCount++;
                    bw.write(line + "\n");
                } else goodCount++;
            }
            br.close();
            bw.close();
        } catch (Exception exc) {
            exc.printStackTrace();
        }
        System.out.println("Number matching sequences: " + goodCount + ", number unmatched " + badCount);
        
        System.out.println("refChromSize " + testChrom + ": " + refChromSize 
                + ", assembledChromSize " + testChrom + ": " + assembledChromSize);
        String assembledChromString = chromGenome.genotypeAsString(testChrom,1,assembledChromSize);
        String refChromString = myRefGenome.genotypeAsString(testChrom, 1,refChromSize);        
             
        
        if (assembledChromString.equals(refChromString)) System.out.println("\nWhole chromosomes are equivalent for chrom " + chrom + " !!");
        else System.out.println("\nWhole Chromosome are NOT equivalent for chrom " + chrom + " !!");       
        System.out.println("ref " + chrom + " size: " + refChromSize + ", reAssembled " + chrom + " size: " + assembledChromSize);

    }
    
   
    public static void createFindProteomeFile() {
        // THis method gets the anchor start/end from the db and writes to a tab-delimited file
        // it will be appended to the MergedSarahProteome_allAnchors.txt file
        
        // Sarah's files can be seen in slack, and in junit_official_test_cases, 
        // see files all10Chrom_anchors_in_proteome_final.txt
//        String db = "/Users/lcj34/notes_files/repgen/wgs_pipeline/agpv4_phg_dbs/hackathon_trimmedAnchors_refAndAssemblies.db";
//        PHGSQLite phg = new PHGSQLite(db);
//        
//        String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/junit_official_test_cases/all10CHroms_chromAnchorStartAnchorEnd.txt";
//        BufferedWriter anchorBW = Utils.getBufferedWriter(outputFile);
//        try {
//            // Map
//            Map anchorCoordinateMap = phg.getHaplotypeAnchorCoordinates( "B73Ref",  0,  1);
//            String header = "anchorid\tChrom\tAnchor_start\tAnchor_end\n";
//            anchorBW.write(header);
//            for (int anchorid : anchorCoordinateMap.keySet()) {
//                String tokens[] = anchorCoordinateMap.get(anchorid).split(":");
//                String data = anchorid + "\t" + tokens[0] + "\t" + tokens[1] + "\t" + tokens[2] + "\n";
//                anchorBW.write(data);
//            }
//            anchorBW.close();
//        } catch (Exception exc) {
//            exc.printStackTrace();
//        }
    }
    
    public static void sortListCreatehash() {
        List hapids = new ArrayList<>();
        hapids.add(1);
        hapids.add(32);
        hapids.add(56);
        hapids.add(45);
        hapids.add(3);
        hapids.add(77);
        hapids.add(107);
        Collections.sort(hapids);
        
        String hapidsString = hapids.stream().map(Object::toString).collect(Collectors.joining(","));
        String hapidsHash1 = AnchorDataPHG.getChecksumForString(hapidsString,"MD5");
        System.out.println("hapidsHash1: " + hapidsHash1);
        
        hapids.clear();
        hapids.add(1);
        hapids.add(32);
        hapids.add(56);
        hapids.add(45);
        hapids.add(3);
        hapids.add(77);
        hapids.add(107);
        
         hapidsString = hapids.stream().map(Object::toString).collect(Collectors.joining(","));
         hapidsHash1 = AnchorDataPHG.getChecksumForString(hapidsString,"MD5");
         hapidsHash1 = AnchorDataPHG.getChecksumForString(hapids.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5");
        System.out.println("hapidsHash1: " + hapidsHash1 + " - unsorted!");
        
        hapids.clear();
        hapids.add(3);
        System.out.println("\njust 3 hash: " + AnchorDataPHG.getChecksumForString(Integer.toString(hapids.get(0)),"MD5"));
        System.out.println("with stream: " +
                AnchorDataPHG.getChecksumForString(hapids.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5"));
    }
    
    public static void testSortedRange() {
        // example from here: http://www.java2novice.com/java-collections-and-util/treemap/comparator-user-object/
       //Map,Integer> intervalData = new TreeMap,Integer>(new RangeComparator());
        RangeMap intervalData =  TreeRangeMap.create();
        String chr1 = "1";
        String chr3 = "3";
        String chr5 = "5";
        Range second = Range.closed(Position.of(chr1, 6), Position.of(chr1,8));
        Range first = Range.closed(Position.of(chr1, 3), Position.of(chr1,5));
        Range fourth = Range.closed(Position.of(chr1, 20), Position.of(chr1,30));
        Range third = Range.closed(Position.of(chr1, 15), Position.of(chr1,18));
        Range fifth = Range.closed(Position.of(chr3, 2), Position.of(chr3,10));
        Range sixth = Range.closed(Position.of(chr3, 18), Position.of(chr3,25));
        intervalData.put(fifth,5);
        intervalData.put(second, 2);
        intervalData.put(sixth, 6);
        intervalData.put(first, 1);
        intervalData.put(third, 3);
        intervalData.put(fourth, 4);
        Map,Integer> orderedMap = intervalData.asMapOfRanges();
        for(Range pos:orderedMap.keySet()) {
            System.out.println(pos.lowerEndpoint().getChromosome().getName() + " " + pos.lowerEndpoint().getPosition());
        }

//        List> orderedList = new ArrayList>(intervalData.keySet());
//        for (Range pos : orderedList) {
//            System.out.println(pos.lowerEndpoint().getChromosome().getName() + " " + pos.lowerEndpoint().getPosition());
//        }
        System.out.println("Finished !!");

    }

    // THis created to correctly pull just chrom 10 for Ed.  It is rather
    // messy, the problem was the reference had multiple lines for each sequence,
    // so I couldn't just "tail" the last 2.
    public static void splitW22GenomeGetChrom10 () {
        //String w22_assembly = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/W22__Ver12.genome.fasta";
        String outputFile = "/Users/lcj34/notes_files/hackathons/august2017_hackathon/assembly_contig_fastas/B73_chr10.fa";
        
        String b73_ref = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
        BufferedReader br = Utils.getBufferedReader(b73_ref);
        BufferedWriter bw = Utils.getBufferedWriter(outputFile);
        
        System.out.println("Got readers, begin processing");
        String line;
        boolean chr10_found = false;
        try {
            StringBuilder sb = new StringBuilder();
            while((line = br.readLine()) != null) {
                if (line.startsWith(">")) { // any idline
                    if (chr10_found) {
                        System.out.println("breaking - done with chr10");
                        break;
                    }
                    if (line.startsWith(">10")) { // found chr10 idline
                        //System.out.println("FOUnd chrom10 - ");
                        chr10_found = true;
                        sb.append(line).append("\n");
                        continue;
                    } 
                }
                if (chr10_found) {
                    System.out.println("writing chrom10 data");
                    sb.append(line);                    
                }
            }
            bw.write(sb.toString());
            bw.write("\n");
            bw.close();
            br.close();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
        System.out.println("FInished!! - file written to " + outputFile);
 
    }
    
    public static void countNsInFasta(){
       // String inputFasta = "/Users/lcj34/notes_files/hackathons/august2017_hackathon/assembly_contig_fastas/W22_chr10b.fa.gz";
        String outputFile = "/Users/lcj34/notes_files/hackathons/august2017_hackathon/assembly_contig_fastas/chrFastaNCount_fromBoxFile";
        
        String inputFasta = "/Users/lcj34/box sync/MaizePHG/W22_alignment/W22_chr10.fa.gz";
        BufferedReader br = Utils.getBufferedReader(inputFasta);
        BufferedWriter bw = Utils.getBufferedWriter(outputFile);
        System.out.println("Begin countNsInFasta");
        
        int beginNPos = 0;
        int endNPos = 0;
        int nCount = 0;
        int lessThan10 = 0;
        int shortestNString = 1000;
        String header = "beginN\tendN\tlength\n";
        
        String line;
        try {
            bw.write(header);
            while ((line=br.readLine()) != null) {
                if (line.startsWith(">")) {
                    System.out.println("FOUnd idline: " + line);
                    continue;
                }
                boolean inN = false;
                for (int idx = 0; idx < line.length(); idx++) {
                    if (line.charAt(idx) == 'N' || line.charAt(idx) == 'n') {
                        nCount++;
                        if (!inN) {
                            beginNPos = idx;                           
                        }
                        inN = true;
                    } else {
                        if (inN) {
                            endNPos = idx;
                            inN = false;
                            if (nCount < 10 ) {
                                lessThan10++;
                            }
                            if (nCount < shortestNString ) shortestNString = nCount;
                            bw.write(beginNPos + "\t" + endNPos + "\t" + nCount + "\n");
                            nCount = 0;
                        }
                    }
                }
            }
            bw.close();
            br.close();
            System.out.println("end of file - number N runs lessThan10: "+ lessThan10 + ",shortest N: " + shortestNString);
        } catch (Exception exc){
            exc.printStackTrace();
        }
        
    }
    public static void main(String[] args) {
        String refGenome = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
        //checkChrom2DupAnchor(refGenome);
        
//        String seq1 = "/Users/lcj34/notes_files/repgen/wgs_pipeline/levenshtein_test/firstRefAnchor_LevenshteinTest.txt";
//        String seq2 = "/Users/lcj34/notes_files/repgen/wgs_pipeline/levenshtein_test/lastRefAnchor_LevenshteinTest.txt";
//        
//        BufferedReader br = Utils.getBufferedReader(seq1);
//        String seq1Line = null;
//        String seq2Line = null;
//        String line;
//        try {
//            // Only 1 line in each of these files, which is a long sequence
//            while ((line = br.readLine()) != null) {
//                seq1Line = line;
//            }
//            br.close();
//            br = Utils.getBufferedReader(seq2);
//            while ((line = br.readLine()) != null) {
//                seq2Line = line;
//            }
//            br.close();
//        } catch (Exception exc) {
//            exc.printStackTrace();
//        }
  
        long time = System.nanoTime();
//        seq1Line = "ACTGATGT"; // changed - to test with N's
//        seq2Line = "ACTNNTAT";
        
//        seq2Line = "GAMBOL";
//        seq1Line = "GUMBOL";
//        int distance = calcLevenshtein(seq1Line,seq2Line,0);
//        System.out.println("EditDistance from calcLevenshtein: " + distance);
        
        //testPythonLD(seq1Line,seq2Line,null,null);
        
        String origData = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testing_output/terry_trim_output/origFirst100.txt";
        String trimData = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testing_output/terry_trim_output/terryFirst100.txt";
        String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testing_output/terry_trim_output/origTrimCompare.txt";

        //printTrimTables( origData,  trimData,  outputFile);
        //String collapseDataFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/seqID_seqCollapseID_maxEditDist_5.txt";
        //String collapseDataFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/editDistance5_justANchor1.txt";
//        String collapseDataFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/seqID_seqCollapseID_maxEditDist_15_sorted4then2_justAnchor1.txt";
//        String outputDir = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/editDistance15_anchor1_";
//        freqHaplotypeCollapse ( collapseDataFile,  outputDir);
        
        //getAnchorsForRef();
        //testMafftfromJava();
       // testPHGSchema();
        //testObjectAssignment();
        //testChrMatch();
        
        String line_name = "EP1Assembly";
        int hap_number = 0;
        // NOTE - this db is old - will not work with latest SCHEMA
        String db = "/Users/lcj34/notes_Files/repgen/wgs_pipeline/agpv4_phg_dbs/all10Chroms_withflanking_MSATable.db";
        String outputDir = "/Users/lcj34/notes_Files/repgen/wgs_pipeline/anchorsFromV4gff/assembly_chrom_start_files/";
        //checkAssemblyAnchorOrder(line_name, hap_number, db, outputDir);
        
       //getAssemblyAnchorCoordinates(line_name,hap_number,db,outputDir);
        
        String genomeOutputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testGenomeSequence.txt";
        //getDataFromAssembly(refGenome,  genomeOutputFile) ;
        //testPHGSQLite_AnchorTypeA();
//        testPHGSQLite_AnchorTypeIA();
//        testPHGSQLite_AnchorTypeB();
        
        // RUn these 2 together to get results
//        String genome1 = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/B73Ref_all_genomeFromDB.fa";
//        testChromLength(genome1);
//        String genome2 = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
//        testChromLength(genome2);
        
        // THis test compares all chromsome sequence from ref fasta to a re-assembled fasta
//        String[] chromList = {"1","2","3","4","5","6","7","8","9","10"};
//        String reAssembledChromFasta = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/B73Ref_5_genomeFromDB.fa";
//        for (String chrom : chromList) {
//            System.out.println("Testing chrom: " + chrom);
//            reAssembledChromFasta = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/B73Ref_" + chrom + "_genomeFromDB.fa";
//            compareRefToReAssembledChrom (refGenome,  reAssembledChromFasta,  chrom);
//        }
        
        // test w22 chr1 to assembly
        String reAssembledChromFasta = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/W22Assembly_chr10_genomeFromDB.fa";
        //System.out.println("Testing W22 chr10");
        String w22Assembly = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/W22__Ver12.genome.fasta";
        //String coorindatesFile = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/W22Assembly_chr1_allchromPositionsList_forDebug.txt";
       String coorindatesFile = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/W22Assembly_chr10_mergedChromPositionsList_forDebug.txt";
        String output = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/mismatchAssemblySeqLines.txt";
        //compareRefToReAssembledChrom (w22Assembly,  reAssembledChromFasta,  "chr10", coorindatesFile,output);
        //testGetAnchorDataByChrom();
        //compareRefToDBAnchor();
        //createFindProteomeFile();
        
        //compareDuplicateAnchorSeqs( refGenome); 
        //sortListCreatehash();
        //testSortedRange();
        splitW22GenomeGetChrom10();
        //countNsInFasta();
        System.out.println("time to compute: " + (System.nanoTime() - time) / 1e9 + " seconds");
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy