net.maizegenetics.pangenome.db_loading.TestPHGStuff Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.AnchorType;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.MethodType;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.util.CheckSum;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* @author lcj34
*
*/
public class TestPHGStuff {
public static void checkChrom2DupAnchor (String refGenome){
GenomeSequence refSequence = GenomeSequenceBuilder.instance(refGenome);
// Get reference string
Chromosome chrom2 = new Chromosome("2");
byte[] refBytes = refSequence.chromosomeSequence(chrom2,8244392,8247762);
String anchor1String = NucleotideAlignmentConstants.nucleotideBytetoString(refBytes);
byte[] ref2Bytes =
refSequence.chromosomeSequence(chrom2,8394514,8397884);
String anchor2String = NucleotideAlignmentConstants.nucleotideBytetoString(refBytes);
if (anchor1String.equals(anchor2String)) System.out.println("STrings equal");
else System.out.println("Strings NOT equal");
}
public static int calcLevenshtein(String seq1, String seq2, int maxEditDistance) {
long time = System.nanoTime();
seq1 = seq1.toUpperCase();
seq2 = seq2.toUpperCase();
int editDistance = -1;
// i == 0
int [] costs = new int [seq2.length() + 1];
for (int jdx = 0; jdx < costs.length; jdx++)
costs[jdx] = jdx;
for (int idx = 1; idx <= seq1.length(); idx++) {
// jdx == 0; nw = lev(idx - 1, jdx)
costs[0] = idx;
int nw = idx - 1;
// for (int jdx = 1; jdx <= seq2.length(); jdx++) { // original - before N computation
//
// int cj = Math.min(1 + Math.min(costs[jdx], costs[jdx - 1]), seq1.charAt(idx - 1) == seq2.charAt(jdx - 1) ? nw : nw + 1);
// nw = costs[jdx];
// costs[jdx] = cj;
// }
// The edit distance is on the diagonal of the matrix. If this value is
// greater than our max edit distance, bail returning -1
// If diagonal is when idx=jdx
int minDistance = 100000;
boolean keepProcessing = true;;
for (int jdx = 1; jdx <= seq2.length(); jdx++) { // seq was set to all lower case
int considerN = (seq1.charAt(idx-1) == 'N' || seq2.charAt(jdx-1) == 'N' || seq1.charAt(idx-1) == seq2.charAt(jdx-1)) ?
nw : nw + 1;
int cj = Math.min(1 + Math.min(costs[jdx], costs[jdx - 1]), considerN);
if (cj < minDistance) minDistance = cj;
nw = costs[jdx];
costs[jdx] = cj;
System.out.println("costs["+jdx+"]="+cj+", idx=" + idx);
// if (jdx == idx && cj > maxEditDistance ) {
// keepProcessing = false;
// break;
// }
}
System.out.println("idx=" + idx + ",min distance after J loop: " + minDistance);
if (minDistance > maxEditDistance) return editDistance;
// if (!keepProcessing) return editDistance;
}
System.out.println("Time to calculate: " + (System.nanoTime() - time) / 1e9 + " seconds");
return costs[seq2.length()];
}
public static void testPythonLD(String seq1, String seq2, String mode, String task) {
// BLAST the fasta file of genes
// String cmd = "blastn -num_threads " + threads + " -db " + blastDB + " -query " + fastaFile + " -evalue " + eval + " -max_target_seqs "
// + maxTarget + " -max_hsps " + maxHSPS + " -outfmt 6"; // to stdout
System.out.println("testPythonLD - begin, calling python script:");
String cmd = "/Users/lcj34/development/edlib-master/bindings/python/levenshtienDistance_edlib.py " + seq1 + " " + seq2;
StringBuilder cmdSB = new StringBuilder();
cmdSB.append("/Users/lcj34/development/edlib-master/bindings/python/levenshtienDistance_edlib.py ").append(seq1).append(" ").append(seq2);
// NOTE - if script does not allow for mode to be NULL but task to have value.
// It assumes the 3rd parameter (after seq1, seq2) is the
if (mode != null) cmdSB.append(" ").append(mode);
if (task != null) cmdSB.append(" ").append(task);
try {
Process start = Runtime.getRuntime().exec(cmd);
BufferedReader br = new BufferedReader(
new InputStreamReader(start.getInputStream()));
String line = null;
while ((line = br.readLine()) != null)
{
System.out.println(line);
}
} catch (Exception exc) {
exc.printStackTrace();
}
System.out.println("testPythonLD - finished!");
}
public static void printTrimTables(String origData, String trimData, String outputFile) {
BufferedReader origbr = Utils.getBufferedReader(origData);
BufferedReader trimbr = Utils.getBufferedReader(trimData);
BufferedWriter outbw = Utils.getBufferedWriter(outputFile);
String header = "origStart\torigEnd\tnewStart\tnewEnd\torigLen\ttrimLength\n";
try {
outbw.write(header);
String oline;
String tline;
while ((oline = origbr.readLine())!= null && (tline = trimbr.readLine()) != null) {
String[] oTokens = oline.split(":");
String[] tTokens = tline.split(":");
int ostart = Integer.parseInt(oTokens[1].substring(0, oTokens[1].indexOf("-")));
int oend = Integer.parseInt(oTokens[1].substring(oTokens[1].indexOf("-")+1));
int tstart = Integer.parseInt(tTokens[1].substring(0, tTokens[1].indexOf("-")));
int tend = Integer.parseInt(tTokens[1].substring(tTokens[1].indexOf("-")+1));
int oldLength = oend-ostart + 1;
int trimLength = tend-tstart + 1;
StringBuilder sb = new StringBuilder();
sb.append(Integer.toString(ostart)).append("\t");
sb.append(Integer.toString(oend)).append("\t");
sb.append(Integer.toString(tstart)).append("\t");
sb.append(Integer.toString(tend)).append("\t");
sb.append(Integer.toString(oldLength)).append("\t");
sb.append(Integer.toString(trimLength)).append("\n");
outbw.write(sb.toString());
}
origbr.close();
trimbr.close();
outbw.close();
} catch (Exception exc) {
exc.printStackTrace();
}
}
// trying with multisets per Zack - need to explore this more, it wasn't clicking for me
public static void freqChartHaplotypeCollapse (String inputFile, String outputDir) {
// this should take tab-delimited file from collapsed anchors with
// headers anchorid/origANchorSeqID/origLen/callpasedAnchorSeqID/collapsedLen/
// It counts number of sequences that are stand-alone, number where 2 are
// mapped together, etc.
//BufferedReader br = Utils.getBufferedReader(inputFile);
String outfilecsv = outputDir + "anchorSeqCollapseHits.csv";
try (BufferedReader br = Utils.getBufferedReader(inputFile)) {
// SeqID, sequences mapped to it
Multiset countSeqID = HashMultiset.create();
String line = br.readLine(); // skip header
while ((line=br.readLine()) != null) {
String[] lineTokens = line.split("\\t");
countSeqID.add(lineTokens[3]);
}
Multiset countOfCount = HashMultiset.create();
for (String id : countSeqID.elementSet()) {
countOfCount.add(countSeqID.count(id));
}
StringBuilder dataSB = new StringBuilder();
// for (Integer count : countOfCount.elementSet()) {
// dataSB.append(Integer.toString(count)).append("\n");
// }
// for (int idx = 0; idx < countOfCount.elementSet().size()) {
// // String
// }
// put it all in a csv file for R
BufferedWriter bw = Utils.getBufferedWriter(outfilecsv);
bw.write(dataSB.toString());
bw.close();
System.out.println("Data written to: " + outfilecsv);
//bw = Utils.getBufferedWriter(file)
} catch (Exception exc) {
exc.printStackTrace();
}
}
// This one works.
public static void freqHaplotypeCollapse (String inputFile, String outputDir) {
// this should take tab-delimited file from collapsed anchors with
// headers anchorid/origANchorSeqID/origLen/callpasedAnchorSeqID/collapsedLen/
// It counts number of sequences that are stand-alone, number where 2 are
// mapped together, etc.
//BufferedReader br = Utils.getBufferedReader(inputFile);
String outfile = outputDir + "anchorSeqCollapseHits.txt";
try (BufferedReader br = Utils.getBufferedReader(inputFile)) {
// Need the number of times there were X sequences mapped to another sequence
// SeqID, sequences mapped to it
Multimap collapsedToOrigMap = HashMultimap.create();
String line = br.readLine(); // skip header
while ((line=br.readLine()) != null) {
String[] lineTokens = line.split("\\t");
collapsedToOrigMap.put(Integer.parseInt(lineTokens[3]),Integer.parseInt(lineTokens[1]));
}
//This one takes the size (number of ids) as the key, and puts the map
Multimap sizeToIds = HashMultimap.create();
for (Integer key: collapsedToOrigMap.keySet()) {
Collection values = collapsedToOrigMap.get(key);
int valueCount = values.size();
sizeToIds.put(valueCount, key);
}
// THis one finally has the size and number of collapsed anchor sequence
// with that
Map countOfCount = new HashMap<>();
for (Integer key: sizeToIds.keySet()) {
Collection values = sizeToIds.get(key);
int valueCount = values.size();
countOfCount.put(key, valueCount);
}
// put it all in a csv file for R
BufferedWriter bw = Utils.getBufferedWriter(outfile);
bw.write("Number_Sequences_Collapsed_In_Group\tNumber_Of_Groups_With_This_Sequence_Count\n");
StringBuilder sb = new StringBuilder();
for (Integer key : countOfCount.keySet()) {
sb.append(Integer.toString(key)).append("\t")
.append(Integer.toString(countOfCount.get(key))).append("\n");
}
bw.write(sb.toString());
bw.close();
System.out.println("Data written to: " + outfile);
//bw = Utils.getBufferedWriter(file)
} catch (Exception exc) {
exc.printStackTrace();
}
}
public static void testMafftfromJava() {
String fastaFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/test_fastas_kalign/265SequencesFull_anchor2000.fa";
String cmd = "/usr/local/bin/mafft " + fastaFile; // to stdout
System.out.println("testMafftFromjava: begin ");
// The fasta file itself identifies the anchor. perhaps that would be passed in
// when running for real (the anchor plus the fasta file.)
Map mafftIdSequenceMap = new HashMap<>();
try {
int count = 0;
int totalCount = 0;
Process start = Runtime.getRuntime().exec(cmd);
BufferedReader br = new BufferedReader(
new InputStreamReader(start.getInputStream()));
String line = null;
String prevIdLine = null;
StringBuilder seqSB = new StringBuilder();
while ((line = br.readLine()) != null)
{
if (line.startsWith(">")) {
count++;
totalCount++;
if (seqSB.length() > 0) {
seqSB.append("\n");
mafftIdSequenceMap.put(prevIdLine, seqSB.toString());
seqSB.setLength(0);
}
prevIdLine = line;
continue;
} else {
count++;
seqSB.append(line);
}
// anchor fasta has just the assembly name, e.g. ">B73Ref"
if (count > 1000) { // MAFFT results : equals gene count if maxTarget = 1
System.out.println(totalCount + " MAFFT lines processed");
count = 0;
}
}
if (seqSB.length() > 0) { // add last one to map
seqSB.append("\n");
mafftIdSequenceMap.put(prevIdLine, seqSB.toString());
}
System.out.println("Finished MAFFT processing, total number of sequences processed: " + totalCount);
String b73ref = mafftIdSequenceMap.get(">B73Ref");
if (b73ref != null) System.out.println(b73ref.substring(0, 100));
else System.out.println("b73ref not found in map !!");
br.close();
System.out.println("Total entries in mafftIdSequenceMap: " + mafftIdSequenceMap.keySet().size());
} catch (Exception exc) {
exc.printStackTrace();
}
}
public static void testObjectAssignment() {
GeneGFFData data2 = new GeneGFFData(1,2,"first");
GeneGFFData data1 = null;
data1 = data2;
data2 = new GeneGFFData(3,4,"third");
System.out.println("data1.start:" + data1.start() + ", data2.start:" + data2.start());
}
public static void testChrMatch() {
String chrS = "UNMAPPED";
chrS = chrS.toUpperCase();
chrS = chrS.replace("CHROMOSOME", "");
chrS = chrS.replace("CHR", "");
try {
chrS = Integer.toString(Integer.parseInt(chrS));
} catch (NumberFormatException nfe){
// leave it as is!
}
String chrR = "1";
chrR = chrR.toUpperCase();
chrR = chrR.replace("CHROMOSOME", "");
chrR = chrR.replace("CHR", "");
try {
chrR = Integer.toString(Integer.parseInt(chrR));
} catch (NumberFormatException nfe){
// leave it as is!
}
if (!chrS.equals(chrR)) {
System.out.println("Chromosomes do not match!: chrS: " + chrS + ", chrR: " + chrR);
} else {
System.out.println("YEAH! - chromosomes match! chrS: " + chrS + ", chrR: " + chrR);
}
// funky EP1 fasta id line:
// >CM007694.1 Zea mays subsp. mays cultivar EP1 chromosome 1, whole genome shotgun sequence
// >CM007695.1 Zea mays subsp. mays cultivar EP1 chromosome 2, whole genome shotgun sequence
}
public static void getDataFromAssembly(String assemblyFile, String outputFile) {
GenomeSequence myRefSequence = GenomeSequenceBuilder.instance(assemblyFile);
Set chromSet = myRefSequence.chromosomes();
BufferedWriter bw = Utils.getBufferedWriter(outputFile);
try {
for (Chromosome chrom : chromSet) {
bw.write(chrom.getName());
//System.out.println(chrom.getName());
}
bw.close();
} catch (IOException ioe) {
ioe.printStackTrace();
}
List chromList = new ArrayList<>(chromSet);
// get range of positionsuence.chromosomeSequence(chromList.get(1),5,8);
byte[] genotypes = myRefSequence.chromosomeSequence(chromList.get(1),5,8);
System.out.print("Values on chrom " + chromList.get(1) + " at positions 5-8:");
for (int idx = 0; idx < genotypes.length ; idx++) {
System.out.print(" " + genotypes[idx]);
}
System.out.println();
// get just the first one in that range
byte allele = myRefSequence.genotype(chromList.get(1),5);
System.out.println("Single Allele at first position: " + allele);
allele = myRefSequence.genotype(chromList.get(1),6);
System.out.println("Single Allele at second position: " + allele);
allele = myRefSequence.genotype(chromList.get(1),7);
System.out.println("Single Allele at third position: " + allele);
allele = myRefSequence.genotype(chromList.get(1),8);
System.out.println("Single Allele at fourth position: " + allele);
}
public static void testChromLength(String genome) {
//String assemblyGenome = "/Volumes/Samsung_T1/sorghum_hackathonJune2017/Sbicolor_313_v3.0.fa.gz";
GenomeSequence mySequence = GenomeSequenceBuilder.instance(genome);
Set myChroms = mySequence.chromosomes();
System.out.println("testChromLength, values for " + genome);
for (Chromosome chrom : myChroms) {
System.out.println(chrom.getName() + " length: " + chrom.getLength());
}
}
public static void getGenomeSequenceValues() {
String refGenome = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
}
// Method to compare reference fasta to fasta created from DB>
// It is testing the outcome of ReAssembleGenome_fromAnchorSequences.java
public static void compareRefToReAssembledChrom(String refGenome, String reAssembledChromFasta, String chrom,
String coordinatesFile, String outputFile) {
System.out.println("read reference genome:");
GenomeSequence myRefGenome = GenomeSequenceBuilder.instance(refGenome);
System.out.println("read chrom fasta ");
GenomeSequence chromGenome = GenomeSequenceBuilder.instance(reAssembledChromFasta);
Chromosome testChrom = new Chromosome(chrom);
int refChromSize = myRefGenome.chromosomeSize(testChrom);
int assembledChromSize = chromGenome.chromosomeSize(testChrom);
BufferedReader br = Utils.getBufferedReader(coordinatesFile);
BufferedWriter bw = Utils.getBufferedWriter(outputFile);
int goodCount = 0;
int badCount = 0;
try {
String line = br.readLine(); // first header
line = br.readLine(); // second header (when using merged file)
while ((line = br.readLine()) != null) {
String[] tokens = line.split("\\t");
int start = Integer.parseInt(tokens[0]);
int end = Integer.parseInt(tokens[1]);
String refSeq = myRefGenome.genotypeAsString(testChrom, start, end);
String chromSeq = chromGenome.genotypeAsString(testChrom, start, end);
if (!refSeq.equals(chromSeq)) {
badCount++;
bw.write(line + "\n");
} else goodCount++;
}
br.close();
bw.close();
} catch (Exception exc) {
exc.printStackTrace();
}
System.out.println("Number matching sequences: " + goodCount + ", number unmatched " + badCount);
System.out.println("refChromSize " + testChrom + ": " + refChromSize
+ ", assembledChromSize " + testChrom + ": " + assembledChromSize);
String assembledChromString = chromGenome.genotypeAsString(testChrom,1,assembledChromSize);
String refChromString = myRefGenome.genotypeAsString(testChrom, 1,refChromSize);
if (assembledChromString.equals(refChromString)) System.out.println("\nWhole chromosomes are equivalent for chrom " + chrom + " !!");
else System.out.println("\nWhole Chromosome are NOT equivalent for chrom " + chrom + " !!");
System.out.println("ref " + chrom + " size: " + refChromSize + ", reAssembled " + chrom + " size: " + assembledChromSize);
}
public static void createFindProteomeFile() {
// THis method gets the anchor start/end from the db and writes to a tab-delimited file
// it will be appended to the MergedSarahProteome_allAnchors.txt file
// Sarah's files can be seen in slack, and in junit_official_test_cases,
// see files all10Chrom_anchors_in_proteome_final.txt
// String db = "/Users/lcj34/notes_files/repgen/wgs_pipeline/agpv4_phg_dbs/hackathon_trimmedAnchors_refAndAssemblies.db";
// PHGSQLite phg = new PHGSQLite(db);
//
// String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/junit_official_test_cases/all10CHroms_chromAnchorStartAnchorEnd.txt";
// BufferedWriter anchorBW = Utils.getBufferedWriter(outputFile);
// try {
// // Map
// Map anchorCoordinateMap = phg.getHaplotypeAnchorCoordinates( "B73Ref", 0, 1);
// String header = "anchorid\tChrom\tAnchor_start\tAnchor_end\n";
// anchorBW.write(header);
// for (int anchorid : anchorCoordinateMap.keySet()) {
// String tokens[] = anchorCoordinateMap.get(anchorid).split(":");
// String data = anchorid + "\t" + tokens[0] + "\t" + tokens[1] + "\t" + tokens[2] + "\n";
// anchorBW.write(data);
// }
// anchorBW.close();
// } catch (Exception exc) {
// exc.printStackTrace();
// }
}
public static void sortListCreatehash() {
List hapids = new ArrayList<>();
hapids.add(1);
hapids.add(32);
hapids.add(56);
hapids.add(45);
hapids.add(3);
hapids.add(77);
hapids.add(107);
Collections.sort(hapids);
String hapidsString = hapids.stream().map(Object::toString).collect(Collectors.joining(","));
String hapidsHash1 = AnchorDataPHG.getChecksumForString(hapidsString,"MD5");
System.out.println("hapidsHash1: " + hapidsHash1);
hapids.clear();
hapids.add(1);
hapids.add(32);
hapids.add(56);
hapids.add(45);
hapids.add(3);
hapids.add(77);
hapids.add(107);
hapidsString = hapids.stream().map(Object::toString).collect(Collectors.joining(","));
hapidsHash1 = AnchorDataPHG.getChecksumForString(hapidsString,"MD5");
hapidsHash1 = AnchorDataPHG.getChecksumForString(hapids.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5");
System.out.println("hapidsHash1: " + hapidsHash1 + " - unsorted!");
hapids.clear();
hapids.add(3);
System.out.println("\njust 3 hash: " + AnchorDataPHG.getChecksumForString(Integer.toString(hapids.get(0)),"MD5"));
System.out.println("with stream: " +
AnchorDataPHG.getChecksumForString(hapids.stream().map(Object::toString).collect(Collectors.joining(",")),"MD5"));
}
public static void testSortedRange() {
// example from here: http://www.java2novice.com/java-collections-and-util/treemap/comparator-user-object/
//Map,Integer> intervalData = new TreeMap,Integer>(new RangeComparator());
RangeMap intervalData = TreeRangeMap.create();
String chr1 = "1";
String chr3 = "3";
String chr5 = "5";
Range second = Range.closed(Position.of(chr1, 6), Position.of(chr1,8));
Range first = Range.closed(Position.of(chr1, 3), Position.of(chr1,5));
Range fourth = Range.closed(Position.of(chr1, 20), Position.of(chr1,30));
Range third = Range.closed(Position.of(chr1, 15), Position.of(chr1,18));
Range fifth = Range.closed(Position.of(chr3, 2), Position.of(chr3,10));
Range sixth = Range.closed(Position.of(chr3, 18), Position.of(chr3,25));
intervalData.put(fifth,5);
intervalData.put(second, 2);
intervalData.put(sixth, 6);
intervalData.put(first, 1);
intervalData.put(third, 3);
intervalData.put(fourth, 4);
Map,Integer> orderedMap = intervalData.asMapOfRanges();
for(Range pos:orderedMap.keySet()) {
System.out.println(pos.lowerEndpoint().getChromosome().getName() + " " + pos.lowerEndpoint().getPosition());
}
// List> orderedList = new ArrayList>(intervalData.keySet());
// for (Range pos : orderedList) {
// System.out.println(pos.lowerEndpoint().getChromosome().getName() + " " + pos.lowerEndpoint().getPosition());
// }
System.out.println("Finished !!");
}
// THis created to correctly pull just chrom 10 for Ed. It is rather
// messy, the problem was the reference had multiple lines for each sequence,
// so I couldn't just "tail" the last 2.
public static void splitW22GenomeGetChrom10 () {
//String w22_assembly = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/W22__Ver12.genome.fasta";
String outputFile = "/Users/lcj34/notes_files/hackathons/august2017_hackathon/assembly_contig_fastas/B73_chr10.fa";
String b73_ref = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
BufferedReader br = Utils.getBufferedReader(b73_ref);
BufferedWriter bw = Utils.getBufferedWriter(outputFile);
System.out.println("Got readers, begin processing");
String line;
boolean chr10_found = false;
try {
StringBuilder sb = new StringBuilder();
while((line = br.readLine()) != null) {
if (line.startsWith(">")) { // any idline
if (chr10_found) {
System.out.println("breaking - done with chr10");
break;
}
if (line.startsWith(">10")) { // found chr10 idline
//System.out.println("FOUnd chrom10 - ");
chr10_found = true;
sb.append(line).append("\n");
continue;
}
}
if (chr10_found) {
System.out.println("writing chrom10 data");
sb.append(line);
}
}
bw.write(sb.toString());
bw.write("\n");
bw.close();
br.close();
} catch (IOException ioe) {
ioe.printStackTrace();
}
System.out.println("FInished!! - file written to " + outputFile);
}
public static void countNsInFasta(){
// String inputFasta = "/Users/lcj34/notes_files/hackathons/august2017_hackathon/assembly_contig_fastas/W22_chr10b.fa.gz";
String outputFile = "/Users/lcj34/notes_files/hackathons/august2017_hackathon/assembly_contig_fastas/chrFastaNCount_fromBoxFile";
String inputFasta = "/Users/lcj34/box sync/MaizePHG/W22_alignment/W22_chr10.fa.gz";
BufferedReader br = Utils.getBufferedReader(inputFasta);
BufferedWriter bw = Utils.getBufferedWriter(outputFile);
System.out.println("Begin countNsInFasta");
int beginNPos = 0;
int endNPos = 0;
int nCount = 0;
int lessThan10 = 0;
int shortestNString = 1000;
String header = "beginN\tendN\tlength\n";
String line;
try {
bw.write(header);
while ((line=br.readLine()) != null) {
if (line.startsWith(">")) {
System.out.println("FOUnd idline: " + line);
continue;
}
boolean inN = false;
for (int idx = 0; idx < line.length(); idx++) {
if (line.charAt(idx) == 'N' || line.charAt(idx) == 'n') {
nCount++;
if (!inN) {
beginNPos = idx;
}
inN = true;
} else {
if (inN) {
endNPos = idx;
inN = false;
if (nCount < 10 ) {
lessThan10++;
}
if (nCount < shortestNString ) shortestNString = nCount;
bw.write(beginNPos + "\t" + endNPos + "\t" + nCount + "\n");
nCount = 0;
}
}
}
}
bw.close();
br.close();
System.out.println("end of file - number N runs lessThan10: "+ lessThan10 + ",shortest N: " + shortestNString);
} catch (Exception exc){
exc.printStackTrace();
}
}
public static void main(String[] args) {
String refGenome = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
//checkChrom2DupAnchor(refGenome);
// String seq1 = "/Users/lcj34/notes_files/repgen/wgs_pipeline/levenshtein_test/firstRefAnchor_LevenshteinTest.txt";
// String seq2 = "/Users/lcj34/notes_files/repgen/wgs_pipeline/levenshtein_test/lastRefAnchor_LevenshteinTest.txt";
//
// BufferedReader br = Utils.getBufferedReader(seq1);
// String seq1Line = null;
// String seq2Line = null;
// String line;
// try {
// // Only 1 line in each of these files, which is a long sequence
// while ((line = br.readLine()) != null) {
// seq1Line = line;
// }
// br.close();
// br = Utils.getBufferedReader(seq2);
// while ((line = br.readLine()) != null) {
// seq2Line = line;
// }
// br.close();
// } catch (Exception exc) {
// exc.printStackTrace();
// }
long time = System.nanoTime();
// seq1Line = "ACTGATGT"; // changed - to test with N's
// seq2Line = "ACTNNTAT";
// seq2Line = "GAMBOL";
// seq1Line = "GUMBOL";
// int distance = calcLevenshtein(seq1Line,seq2Line,0);
// System.out.println("EditDistance from calcLevenshtein: " + distance);
//testPythonLD(seq1Line,seq2Line,null,null);
String origData = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testing_output/terry_trim_output/origFirst100.txt";
String trimData = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testing_output/terry_trim_output/terryFirst100.txt";
String outputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testing_output/terry_trim_output/origTrimCompare.txt";
//printTrimTables( origData, trimData, outputFile);
//String collapseDataFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/seqID_seqCollapseID_maxEditDist_5.txt";
//String collapseDataFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/editDistance5_justANchor1.txt";
// String collapseDataFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/seqID_seqCollapseID_maxEditDist_15_sorted4then2_justAnchor1.txt";
// String outputDir = "/Users/lcj34/notes_files/repgen/wgs_pipeline/collapse_data/editDistance15_anchor1_";
// freqHaplotypeCollapse ( collapseDataFile, outputDir);
//getAnchorsForRef();
//testMafftfromJava();
// testPHGSchema();
//testObjectAssignment();
//testChrMatch();
String line_name = "EP1Assembly";
int hap_number = 0;
// NOTE - this db is old - will not work with latest SCHEMA
String db = "/Users/lcj34/notes_Files/repgen/wgs_pipeline/agpv4_phg_dbs/all10Chroms_withflanking_MSATable.db";
String outputDir = "/Users/lcj34/notes_Files/repgen/wgs_pipeline/anchorsFromV4gff/assembly_chrom_start_files/";
//checkAssemblyAnchorOrder(line_name, hap_number, db, outputDir);
//getAssemblyAnchorCoordinates(line_name,hap_number,db,outputDir);
String genomeOutputFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/testGenomeSequence.txt";
//getDataFromAssembly(refGenome, genomeOutputFile) ;
//testPHGSQLite_AnchorTypeA();
// testPHGSQLite_AnchorTypeIA();
// testPHGSQLite_AnchorTypeB();
// RUn these 2 together to get results
// String genome1 = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/B73Ref_all_genomeFromDB.fa";
// testChromLength(genome1);
// String genome2 = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/Zea_mays.AGPv4.dna.toplevel.fa";
// testChromLength(genome2);
// THis test compares all chromsome sequence from ref fasta to a re-assembled fasta
// String[] chromList = {"1","2","3","4","5","6","7","8","9","10"};
// String reAssembledChromFasta = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/B73Ref_5_genomeFromDB.fa";
// for (String chrom : chromList) {
// System.out.println("Testing chrom: " + chrom);
// reAssembledChromFasta = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/B73Ref_" + chrom + "_genomeFromDB.fa";
// compareRefToReAssembledChrom (refGenome, reAssembledChromFasta, chrom);
// }
// test w22 chr1 to assembly
String reAssembledChromFasta = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/W22Assembly_chr10_genomeFromDB.fa";
//System.out.println("Testing W22 chr10");
String w22Assembly = "/Volumes/Samsung_T1/wgs_pipeline/refGenomeFiles/W22__Ver12.genome.fasta";
//String coorindatesFile = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/W22Assembly_chr1_allchromPositionsList_forDebug.txt";
String coorindatesFile = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/W22Assembly_chr10_mergedChromPositionsList_forDebug.txt";
String output = "/Volumes/Samsung_T1/wgs_pipeline/reAssembledFastas/mismatchAssemblySeqLines.txt";
//compareRefToReAssembledChrom (w22Assembly, reAssembledChromFasta, "chr10", coorindatesFile,output);
//testGetAnchorDataByChrom();
//compareRefToDBAnchor();
//createFindProteomeFile();
//compareDuplicateAnchorSeqs( refGenome);
//sortListCreatehash();
//testSortedRange();
splitW22GenomeGetChrom10();
//countNsInFasta();
System.out.println("time to compute: " + (System.nanoTime() - time) / 1e9 + " seconds");
}
}