All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.multiSequenceAlignment.ComputeNDistribution Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.multiSequenceAlignment;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.sqlite.SQLiteConfig;

/**
 * Simple One off Main class to compute how many Ns we have over all the basepairs of all the anchors
 * TODO if needed in pipeline, Refractor into TASSEL code
 * Created by zrm22 on 5/25/17.
 */
public class ComputeNDistribution {

    public static void main(String args[]) {
        ComputeNDistribution app = new ComputeNDistribution();
        app.run2();
    }

    public void run() {
        try {
            SQLiteConfig config=new SQLiteConfig();
            Connection connection =  DriverManager.getConnection("jdbc:sqlite:/Volumes/ZackBackup/Temp/Pangenome/DBTests/v4anchors_allChroms_mergedPlus1000orGap_md5HashCIMMYTAndNAMAndRobertsLines.db",config.toProperties());
            connection.setAutoCommit(true);  //This has massive performance effects

            String query = "SELECT anchorid, line_name, sequence from anchor_haplotypes" +
                    "  INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
                    "  INNER JOIN haplotypes on anchor_haplotypes.hapid = haplotypes.hapid" +
                    "  INNER JOIN genotypes on haplotypes.genoid = genotypes.genoid " +
                    "where anchor_haplotypes.anchorId=?;";

            PreparedStatement ps = connection.prepareStatement(query);
            BufferedWriter writer = new BufferedWriter(new FileWriter("/Volumes/ZackBackup/Temp/Pangenome/DBTests/allAnchorNDistributionAfterFilteringNs.csv"));
            for(int i = 1; i < 37805; i++) {
//            for(int i = 1; i < 10; i++) {
                ps.setInt(1, i);
                ResultSet rs = ps.executeQuery();
                int[] countOfNs = countOfNsAndTotal(rs);

                System.out.println("Total: "+countOfNs[0]+", Ns: "+countOfNs[1]);

                writer.write(i);
                writer.write(",");
                writer.write(""+countOfNs[0]);
                writer.write(",");
                writer.write(""+countOfNs[1]);
                writer.write(",");
                writer.write(""+(double)countOfNs[1]/countOfNs[0]);
                writer.newLine();
            }
            writer.close();
        }
        catch(Exception e) {
            e.printStackTrace();
        }
    }

    public void run2() {
        try {
            System.out.println("Setting up connection");
            SQLiteConfig config=new SQLiteConfig();
            Connection connection =  DriverManager.getConnection("jdbc:sqlite:/Volumes/ZackBackup/Temp/Pangenome/DBTests/v4anchors_allChroms_mergedPlus1000orGap_md5HashCIMMYTAndNAMAndRobertsLines.db",config.toProperties());
            connection.setAutoCommit(false);  //This has massive performance effects

            String query = "SELECT anchorid, line_name, sequence from anchor_haplotypes" +
                    "  INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
                    "  INNER JOIN haplotypes on anchor_haplotypes.hapid = haplotypes.hapid" +
                    "  INNER JOIN genotypes on haplotypes.genoid = genotypes.genoid" +
                    " WHERE anchorid < 20000;";

            query = "SELECT anchorid, sequence from anchor_haplotypes" +
                    "  INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
                    " WHERE anchorid < 4000;";

            String query2 = "SELECT anchorid, line_name, sequence from anchor_haplotypes" +
                    "  INNER JOIN anchor_sequences on anchor_haplotypes.anchor_sequence_id = anchor_sequences.anchor_sequence_id" +
                    "  INNER JOIN haplotypes on anchor_haplotypes.hapid = haplotypes.hapid" +
                    "  INNER JOIN genotypes on haplotypes.genoid = genotypes.genoid" +
                    " WHERE anchorid >= 20000;";
            System.out.println("Executing query.");
            PreparedStatement ps = connection.prepareStatement(query);
            BufferedWriter writer = new BufferedWriter(new FileWriter("/Volumes/ZackBackup/Temp/Pangenome/DBTests/allAnchorNDistributionV2First2k.csv"));

            ResultSet rs = ps.executeQuery();
            System.out.println("Creating map.");
//            Map> anchorIdToSequenceListMap = createIdToSequenceMap(rs);
            Map> anchorIdToCountMap = createIdToCountMap(rs);
            System.out.println("Done Creating map");
            for(int i = 1; i < 4000; i++) {
//            for(int i = 1; i < 37805; i++) {
                ArrayList countList = anchorIdToCountMap.get(i);
                System.out.println("Total: "+countList.get(0)+", Ns: "+countList.get(1));

                writer.write(""+i);
                writer.write(",");
                writer.write(""+countList.get(0));
                writer.write(",");
                writer.write(""+countList.get(1));
                writer.write(",");
                writer.write(""+(double)countList.get(1)/countList.get(0));
                writer.newLine();

//                int[] countOfNs = countOfNsAndTotal(sequenceList);
//                System.out.println("Total: "+countOfNs[0]+", Ns: "+countOfNs[1]);
//
//                writer.write(i);
//                writer.write(",");
//                writer.write(""+countOfNs[0]);
//                writer.write(",");
//                writer.write(""+countOfNs[1]);
//                writer.write(",");
//                writer.write(""+(double)countOfNs[1]/countOfNs[0]);
//                writer.newLine();
            }



//            for(int i = 1; i < 10; i++) {
//                ps.setInt(1, i);
//                ResultSet rs = ps.executeQuery();
//                int[] countOfNs = countOfNsAndTotal(rs);
//
//                System.out.println("Total: "+countOfNs[0]+", Ns: "+countOfNs[1]);
//
//                writer.write(i);
//                writer.write(",");
//                writer.write(""+countOfNs[0]);
//                writer.write(",");
//                writer.write(""+countOfNs[1]);
//                writer.write(",");
//                writer.write(""+(double)countOfNs[1]/countOfNs[0]);
//                writer.newLine();
//            }
            writer.close();
        }
        catch(Exception e) {
            e.printStackTrace();
        }
    }

    private Map> createIdToSequenceMap(ResultSet rs) {
        Map> map = new HashMap<>();
        try {
            while(rs.next()) {
                if(!map.containsKey(rs.getInt(1))) {
                    map.put(rs.getInt(1),new ArrayList());
                }

                map.get(rs.getInt(1)).add(rs.getString(3));
            }
        }
        catch(Exception e) {
            e.printStackTrace();
        }
        return map;
    }

    private Map> createIdToCountMap(ResultSet rs) {
        Map> map = new HashMap<>();
        for(int i = 0; i < 37805; i ++) {
            ArrayList list = new ArrayList<>();
            list.add(0);
            list.add(0);

            map.put(i,list);
        }
        try {
            int counter = 0;
            while(rs.next()) {
//                if(!map.containsKey(rs.getInt(1))) {
//                    ArrayList list = new ArrayList<>();
//                    list.add(0);
//                    list.add(0);
//
//                    map.put(rs.getInt(1),list);
//                }
//                String sequence = rs.getString(3);
                String sequence = rs.getString(2);
                ArrayList counts = map.get(rs.getInt(1));
                int totalSeqLength = counts.get(0);
                int numberNs = counts.get(1);
                counts.set(0,totalSeqLength+sequence.length());
                counts.set(1,numberNs+countOfNs(sequence));

                map.put(rs.getInt(1),counts);

                counter++;
                if(counter%1000==0) {
                    System.out.println(counter);
                }
            }
        }
        catch(Exception e) {
            e.printStackTrace();
        }
        return map;
    }



    private int[] countOfNsAndTotal(ResultSet rs) {
        try{
            int[] counts = new int[2];

            //loop through the result set and get the sequence
            int counter = 0;
            while(rs.next()) {
                String sequence = rs.getString(3);
                counts[0] += sequence.length();
                counts[1] += countOfNs(sequence);
                counter++;
            }
            System.out.println(counter);
            return counts;
        }
        catch(Exception e) {
            e.printStackTrace();
        }
        return new int[2];
    }
    private int[] countOfNsAndTotal(List sequenceList) {
        try{
            int[] counts = new int[2];

            //loop through the result set and get the sequence
            int counter = 0;
            for(String sequence:sequenceList) {
                counts[0] += sequence.length();
                counts[1] += countOfNs(sequence);
                counter++;
            }
//            System.out.println(counter);
            return counts;
        }
        catch(Exception e) {
            e.printStackTrace();
        }
        return new int[2];
    }

    private int[] countOfNsAndTotalMinusAnchors(ResultSet rs) {
        try{
            int[] counts = new int[2];

            //loop through the result set and get the sequence
            int counter = 0;
            while(rs.next()) {
                String lineName = rs.getString(2);
                if(!lineName.endsWith("Assembly")) {
                    String sequence = rs.getString(3);
                    counts[0] += sequence.length();
                    counts[1] += countOfNs(sequence);
                    counter++;
                }
            }
            System.out.println(counter);
            return counts;
        }
        catch(Exception e) {
            e.printStackTrace();
        }
        return new int[2];
    }

    public static String removeLongNs(String anchorSequence) {
        StringBuilder longNRemovedBuilder = new StringBuilder();

        int nCounter = 0;
        for(int i = 0; i < anchorSequence.length(); i++) {
            if(anchorSequence.charAt(i)=='N') {
                if(nCounter<2) {
                    longNRemovedBuilder.append("N");
                    nCounter++;
                }
            }
            else {
                nCounter=0;
                longNRemovedBuilder.append(anchorSequence.charAt(i));
            }
        }

        return longNRemovedBuilder.toString();
    }

    private int countOfNs(String sequence) {
//        sequence = removeLongNs(sequence);
        int countOfN = 0;

        for(int i = 0; i < sequence.length(); i++) {
            if(sequence.charAt(i)=='N') {
                countOfN++;
            }
        }

        return countOfN;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy