net.maizegenetics.pangenome.multiSequenceAlignment.ComputeNDistribution Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.multiSequenceAlignment;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.sqlite.SQLiteConfig;
/**
* Simple One off Main class to compute how many Ns we have over all the basepairs of all the anchors
* TODO if needed in pipeline, Refractor into TASSEL code
* Created by zrm22 on 5/25/17.
*/
public class ComputeNDistribution {
public static void main(String args[]) {
ComputeNDistribution app = new ComputeNDistribution();
app.run2();
}
public void run() {
try {
SQLiteConfig config=new SQLiteConfig();
Connection connection = DriverManager.getConnection("jdbc:sqlite:/Volumes/ZackBackup/Temp/Pangenome/DBTests/v4anchors_allChroms_mergedPlus1000orGap_md5HashCIMMYTAndNAMAndRobertsLines.db",config.toProperties());
connection.setAutoCommit(true); //This has massive performance effects
String query = "SELECT anchorid, line_name, sequence from anchor_haplotypes" +
" INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
" INNER JOIN haplotypes on anchor_haplotypes.hapid = haplotypes.hapid" +
" INNER JOIN genotypes on haplotypes.genoid = genotypes.genoid " +
"where anchor_haplotypes.anchorId=?;";
PreparedStatement ps = connection.prepareStatement(query);
BufferedWriter writer = new BufferedWriter(new FileWriter("/Volumes/ZackBackup/Temp/Pangenome/DBTests/allAnchorNDistributionAfterFilteringNs.csv"));
for(int i = 1; i < 37805; i++) {
// for(int i = 1; i < 10; i++) {
ps.setInt(1, i);
ResultSet rs = ps.executeQuery();
int[] countOfNs = countOfNsAndTotal(rs);
System.out.println("Total: "+countOfNs[0]+", Ns: "+countOfNs[1]);
writer.write(i);
writer.write(",");
writer.write(""+countOfNs[0]);
writer.write(",");
writer.write(""+countOfNs[1]);
writer.write(",");
writer.write(""+(double)countOfNs[1]/countOfNs[0]);
writer.newLine();
}
writer.close();
}
catch(Exception e) {
e.printStackTrace();
}
}
public void run2() {
try {
System.out.println("Setting up connection");
SQLiteConfig config=new SQLiteConfig();
Connection connection = DriverManager.getConnection("jdbc:sqlite:/Volumes/ZackBackup/Temp/Pangenome/DBTests/v4anchors_allChroms_mergedPlus1000orGap_md5HashCIMMYTAndNAMAndRobertsLines.db",config.toProperties());
connection.setAutoCommit(false); //This has massive performance effects
String query = "SELECT anchorid, line_name, sequence from anchor_haplotypes" +
" INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
" INNER JOIN haplotypes on anchor_haplotypes.hapid = haplotypes.hapid" +
" INNER JOIN genotypes on haplotypes.genoid = genotypes.genoid" +
" WHERE anchorid < 20000;";
query = "SELECT anchorid, sequence from anchor_haplotypes" +
" INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
" WHERE anchorid < 4000;";
String query2 = "SELECT anchorid, line_name, sequence from anchor_haplotypes" +
" INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
" INNER JOIN haplotypes on anchor_haplotypes.hapid = haplotypes.hapid" +
" INNER JOIN genotypes on haplotypes.genoid = genotypes.genoid" +
" WHERE anchorid >= 20000;";
System.out.println("Executing query.");
PreparedStatement ps = connection.prepareStatement(query);
BufferedWriter writer = new BufferedWriter(new FileWriter("/Volumes/ZackBackup/Temp/Pangenome/DBTests/allAnchorNDistributionV2First2k.csv"));
ResultSet rs = ps.executeQuery();
System.out.println("Creating map.");
// Map> anchorIdToSequenceListMap = createIdToSequenceMap(rs);
Map> anchorIdToCountMap = createIdToCountMap(rs);
System.out.println("Done Creating map");
for(int i = 1; i < 4000; i++) {
// for(int i = 1; i < 37805; i++) {
ArrayList countList = anchorIdToCountMap.get(i);
System.out.println("Total: "+countList.get(0)+", Ns: "+countList.get(1));
writer.write(""+i);
writer.write(",");
writer.write(""+countList.get(0));
writer.write(",");
writer.write(""+countList.get(1));
writer.write(",");
writer.write(""+(double)countList.get(1)/countList.get(0));
writer.newLine();
// int[] countOfNs = countOfNsAndTotal(sequenceList);
// System.out.println("Total: "+countOfNs[0]+", Ns: "+countOfNs[1]);
//
// writer.write(i);
// writer.write(",");
// writer.write(""+countOfNs[0]);
// writer.write(",");
// writer.write(""+countOfNs[1]);
// writer.write(",");
// writer.write(""+(double)countOfNs[1]/countOfNs[0]);
// writer.newLine();
}
// for(int i = 1; i < 10; i++) {
// ps.setInt(1, i);
// ResultSet rs = ps.executeQuery();
// int[] countOfNs = countOfNsAndTotal(rs);
//
// System.out.println("Total: "+countOfNs[0]+", Ns: "+countOfNs[1]);
//
// writer.write(i);
// writer.write(",");
// writer.write(""+countOfNs[0]);
// writer.write(",");
// writer.write(""+countOfNs[1]);
// writer.write(",");
// writer.write(""+(double)countOfNs[1]/countOfNs[0]);
// writer.newLine();
// }
writer.close();
}
catch(Exception e) {
e.printStackTrace();
}
}
private Map> createIdToSequenceMap(ResultSet rs) {
Map> map = new HashMap<>();
try {
while(rs.next()) {
if(!map.containsKey(rs.getInt(1))) {
map.put(rs.getInt(1),new ArrayList());
}
map.get(rs.getInt(1)).add(rs.getString(3));
}
}
catch(Exception e) {
e.printStackTrace();
}
return map;
}
private Map> createIdToCountMap(ResultSet rs) {
Map> map = new HashMap<>();
for(int i = 0; i < 37805; i ++) {
ArrayList list = new ArrayList<>();
list.add(0);
list.add(0);
map.put(i,list);
}
try {
int counter = 0;
while(rs.next()) {
// if(!map.containsKey(rs.getInt(1))) {
// ArrayList list = new ArrayList<>();
// list.add(0);
// list.add(0);
//
// map.put(rs.getInt(1),list);
// }
// String sequence = rs.getString(3);
String sequence = rs.getString(2);
ArrayList counts = map.get(rs.getInt(1));
int totalSeqLength = counts.get(0);
int numberNs = counts.get(1);
counts.set(0,totalSeqLength+sequence.length());
counts.set(1,numberNs+countOfNs(sequence));
map.put(rs.getInt(1),counts);
counter++;
if(counter%1000==0) {
System.out.println(counter);
}
}
}
catch(Exception e) {
e.printStackTrace();
}
return map;
}
private int[] countOfNsAndTotal(ResultSet rs) {
try{
int[] counts = new int[2];
//loop through the result set and get the sequence
int counter = 0;
while(rs.next()) {
String sequence = rs.getString(3);
counts[0] += sequence.length();
counts[1] += countOfNs(sequence);
counter++;
}
System.out.println(counter);
return counts;
}
catch(Exception e) {
e.printStackTrace();
}
return new int[2];
}
private int[] countOfNsAndTotal(List sequenceList) {
try{
int[] counts = new int[2];
//loop through the result set and get the sequence
int counter = 0;
for(String sequence:sequenceList) {
counts[0] += sequence.length();
counts[1] += countOfNs(sequence);
counter++;
}
// System.out.println(counter);
return counts;
}
catch(Exception e) {
e.printStackTrace();
}
return new int[2];
}
private int[] countOfNsAndTotalMinusAnchors(ResultSet rs) {
try{
int[] counts = new int[2];
//loop through the result set and get the sequence
int counter = 0;
while(rs.next()) {
String lineName = rs.getString(2);
if(!lineName.endsWith("Assembly")) {
String sequence = rs.getString(3);
counts[0] += sequence.length();
counts[1] += countOfNs(sequence);
counter++;
}
}
System.out.println(counter);
return counts;
}
catch(Exception e) {
e.printStackTrace();
}
return new int[2];
}
public static String removeLongNs(String anchorSequence) {
StringBuilder longNRemovedBuilder = new StringBuilder();
int nCounter = 0;
for(int i = 0; i < anchorSequence.length(); i++) {
if(anchorSequence.charAt(i)=='N') {
if(nCounter<2) {
longNRemovedBuilder.append("N");
nCounter++;
}
}
else {
nCounter=0;
longNRemovedBuilder.append(anchorSequence.charAt(i));
}
}
return longNRemovedBuilder.toString();
}
private int countOfNs(String sequence) {
// sequence = removeLongNs(sequence);
int countOfN = 0;
for(int i = 0; i < sequence.length(); i++) {
if(sequence.charAt(i)=='N') {
countOfN++;
}
}
return countOfN;
}
}