net.maizegenetics.pangenome.pipelineTests.FindProteomeGenesInAssembly Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
/**
 * 
 */
package net.maizegenetics.pangenome.pipelineTests;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.db_loading.AnchorDataPHG;
import net.maizegenetics.pangenome.db_loading.PHGDataWriter;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.AnchorType;
import net.maizegenetics.util.Utils;

/**
 * NOTE:  Needs to be re-worked with new db.  The PHG calls are obsolete, need
 *        calls for new schema.
 *        
 * Input:
 *   1.  the db from which to pull the assembly/haplotype data 
 *   2.  the reference genome for pulling the gene sequence (a subset of the anchor sequence)
 *   3.  the name of the reference line as stored in the db (e.g. B73Ref)
 *   4.  the name of the assembly as stored in the db, e.g. (W22Assembly) 
 *   5.  name of the line from haplotypeCaller as stored in the db (e.g. W22_Haplotype_Caller)
 *       or NONE if this line was not processed via haplotype caller.
 *   6.  Tab-delimited file containing list of anchorids, the gene start/end on which the
 *       anchor is based, and whether the gene is a proteome gene.
 *   7.  The output directory for writing files.
 * 
 * Output:
 *   1.  A tab-delimited file containing the headers:
 *      AnchorID\tChromosome\tGene\tGeneInProteome\tAssemblyHasAnchor\tGeneSeqIn_\tGeneSeqIn_\tGeneSeqInBoth\tGeneSeqInNeither
 *   2. Console output that may be re-directed to a file.  THis output contains counts and calculated
 *      percentages.
 *      
 *  Algorithm:
 *     - read reference genome into GenomeSequence object
 *     - get assembly and haplotype caller anchor infor from db
 *     - read geneFile, for each entry:
 *       - note if is proteome
 *       - if anchor is present in assembly, mark counts for proteome/non-proteome,
 *         and check if gene sequence is part of this line's anchor sequence for this id
 *       - if anchor is present in haplotype_caller, mark counts for proteom/non-proteome,
 *         and check if gene sequence is part of this line's anchor sequence for this id.
 *     - close files, 
 *     - calculate percentages
 * 
 *   
 * 
 * @author lcj34
 *
 */
public class FindProteomeGenesInAssembly {
    
//    public static void processMain(String db, String refGenome, String refLine, String assembly, 
//            String hapLine, String geneFile, String outputDir) {
//        
//        BufferedReader geneBR = Utils.getBufferedReader(geneFile);
//        try {
// 
//            // Create map of chrom with anchors that include proteome genes
//            // Chromosome is 2nd column, anchorid is first, gene data is 7th
//
//            System.out.println("\nReading reference genome ...");
//            // Ref sequence needed for pulling gene sequence
//            GenomeSequence refGenomeSeq = GenomeSequenceBuilder.instance(refGenome);
//            
//            PHGDataWriter phg = new PHGSQLite(db);
//
//            System.out.println("\nget assembly data from db");
//            // Returns Map         
//            Map assemblyAnchorData = phg.getAnchorDataForHaplotype( assembly, 0, AnchorType.ANCHOR);
//            
//            // Create lists for easy comparison
//            List assemblyAnchorList = new ArrayList(assemblyAnchorData.keySet());
//            System.out.println("Size of assemblyAnchorList: " + assemblyAnchorList.size());
//                      
//            System.out.println("Get haplotype data from db");
// 
//            // Not all assemblies had fastq/BAM files to process for haplotype caller
//            Map hapAnchorData = null;
//            List hapAnchorList = null;
//            if (!hapLine.toUpperCase().equals("NONE")) {
//                hapAnchorData = phg.getAnchorDataForHaplotype( hapLine, 0, AnchorType.ANCHOR);
//                hapAnchorList = new ArrayList(hapAnchorData.keySet());
//                System.out.println("Size of hapAnchorList: " + hapAnchorList.size());
//            }
//                       
//            // Find number of overlaps between assembly and the haplotype version of the assembly
//            // write data indicating gene found in:  assemblyNotHap, hapNotAssembly, foundInBoth, foundInNeither
//            String dataFile = outputDir + assembly + "_proteomePresentData.txt";
//            BufferedWriter proteomeDataWR = Utils.getBufferedWriter(dataFile);
//             
//            String header = "AnchorID\tChromosome\tGene\tGeneInProteome\tAssemblyHasAnchor\tGeneSeqIn_" + assembly + "\tGeneSeqIn_" + hapLine + "\tGeneSeqInBoth\tGeneSeqInNeither\n";
//            proteomeDataWR.write(header);
//            int proteomeCount = 0;
//            int nonProteomeCount = 0;
//            
//            int hapAnchorCount = 0;
//            int hapProteomeCount = 0;
//            int hapProteomeMatch = 0;
//            int hapNonProteomeCount = 0;
//            int hapNonProteomeMatch = 0;
//            
//            int assemblyAnchorCount = 0;
//            int assemblyProteomeCount = 0;
//            int assemblyProteomeMatch = 0;
//            int assemblyNonProteomeCount = 0;
//            int assemblyNonProteomeMatch = 0;
//            
//            String geneLine = geneBR.readLine(); // toss header
// 
//            int totalAnchors = 0;
//            System.out.println("Got data, processing  entries from geneFile " + geneFile);
//            while ((geneLine=geneBR.readLine()) != null) {
//            
//                totalAnchors++;
//                //headers in geneFile are: AnchorID/Chromosome/GeneStart/GeneEnd/Genes/isProteome
//                String[] tokens = geneLine.split("\\t"); 
//                int anchorid = Integer.parseInt(tokens[0]);
//                String chrom = tokens[1];
//                int geneStart = Integer.parseInt(tokens[2]);
//                int geneEnd = Integer.parseInt(tokens[3]);
//                String gene = tokens[4];
//                boolean inProteome = tokens[5].toLowerCase().equals("true") ? true : false;
//                if (inProteome){
//                    proteomeCount++;
//                } else {
//                    nonProteomeCount++;
//                }
//                        
//                byte[] geneBytes = refGenomeSeq.chromosomeSequence(new Chromosome(chrom),geneStart,geneEnd);
//                String geneSeq = NucleotideAlignmentConstants.nucleotideBytetoString(geneBytes);
//                boolean inHap = false;
//                boolean inAssembly = false;
//                boolean assemblyHasAnchor = false;
//                if (hapAnchorList != null && hapAnchorList.contains(anchorid)) {
//                    hapAnchorCount++;
//                    AnchorDataPHG hapPHG = hapAnchorData.get(anchorid); 
//                    
//                    inHap = containsSeq( hapPHG,  geneSeq);
//                    if (inProteome) {
//                        hapProteomeCount++; // haplotype caller version has the anchor
//                        if (inHap) hapProteomeMatch++; // the anchor sequence contains the exact gene sequence
//                    } else {
//                        hapNonProteomeCount++;
//                        if (inHap) hapNonProteomeMatch++;
//                    }
//                }
//                if (assemblyAnchorList.contains(anchorid)) {
//                    assemblyAnchorCount++;
//                    AnchorDataPHG aPHG = assemblyAnchorData.get(anchorid);
//                    inAssembly = containsSeq(aPHG,  geneSeq);
//                    assemblyHasAnchor = true;
//                    if (inProteome) {
//                        assemblyProteomeCount++; // assembly has the anchor
//                        if (inAssembly) assemblyProteomeMatch++; // the anchor sequence contains the exact gene sequence
//                    } else {
//                        assemblyNonProteomeCount++;
//                        if (inAssembly) assemblyNonProteomeMatch++;
//                    }                                   
//                }
//                boolean inBoth = (inHap && inAssembly) ? true : false;
//                boolean inNeither = (!inHap && !inAssembly) ? true : false;
// 
//                StringBuilder sb = new StringBuilder().append(anchorid + "\t" + chrom + "\t" + gene + "\t")
//                        .append(inProteome + "\t")
//                        .append(assemblyHasAnchor + "\t")
//                        .append(inAssembly + "\t")
//                        .append(inHap + "\t")
//                        .append(inBoth + "\t")
//                        .append(inNeither + "\n");
//                proteomeDataWR.write(sb.toString());
//            }
//            geneBR.close();
//            
//            System.out.println("\nTotal number of anchors: " + totalAnchors +
//                    
//                    "\nnumber of anchors with proteome genes: " + proteomeCount +
//                    "\nnumber of anchors with non-proteome genes: " + nonProteomeCount +
//                    "\nnumber of " + assembly + " anchors: " + assemblyAnchorCount +
//                    "\nnumber of proteome anchors in " + assembly + ":" + assemblyProteomeCount +
//                    "\nnumber of proteome anchors in " + assembly + " with gene match: " + assemblyProteomeMatch +
//                    "\nnumber of non-proteome anchors in " + assembly + ":" + assemblyNonProteomeCount +
//                    "\nnumber of non-proteome anchors in " + assembly + " with gene match: " + assemblyNonProteomeMatch +
//                    "\nnumber of " + hapLine + " anchors: " + hapAnchorCount +
//                    "\nnumber of proteome anchors in " + hapLine + ": " + hapProteomeCount +
//                    "\nnumber of proteome anchors in " + hapLine + " with gene match: " + hapProteomeMatch +
//                    "\nnumber of non-proteome anchors in " + hapLine + ": " + hapNonProteomeCount +
//                    "\nnumber of non-proteome anchors in " + hapLine + " with gene match: " + hapNonProteomeMatch );
//            
//            // percentage of gene sequence matching is calculated as a percentage of the
//            // number of proteome or non-proteome based anchors present for each haplotype (from assembly or from haplotype caller) 
//            
//            double percentAssemblyAnchors = ((double)(assemblyAnchorCount)/totalAnchors) * 100;
//            double percentAssemblyProteomeAnchors = ((double)(assemblyProteomeCount)/proteomeCount) * 100;
//            double percentAssemblyNonProteomeAnchors = ((double)(assemblyNonProteomeCount)/nonProteomeCount) * 100; 
//            double percentAssemblyProteomeGeneMatch = ((double)(assemblyProteomeMatch)/assemblyProteomeCount) * 100;
//            double percentAssemblyNonProteomGeneMatch = ((double)(assemblyNonProteomeMatch)/assemblyNonProteomeCount) * 100;
//            
//            double percentHapAnchors = ((double)(hapAnchorCount)/totalAnchors) * 100;
//            double percentHapProteomeAnchors = ((double)(hapProteomeCount)/proteomeCount) * 100;
//            double percentHapNonProteomeAnchors = ((double)(hapNonProteomeCount)/nonProteomeCount) * 100;
//            double percentHapProteomeGeneMatch = ((double)(hapProteomeMatch)/hapProteomeCount) * 100;
//            double percentHapNonProteomGeneMatch = ((double)(hapNonProteomeMatch)/hapNonProteomeCount) * 100;
//            
//            
//            System.out.println("\nAssembly Stats: total number of assembly anchors: " + assemblyAnchorCount +
//                    "\npercentAssemblyAnchor: " + percentAssemblyAnchors +
//                    "\ntotal number of Assembly Proteome Anchors: " + assemblyProteomeCount +
//                    "\npercentAssemblyProteomeAnchors: " + percentAssemblyProteomeAnchors +
//                    "\ntotal number of assembly proteome gene seq matches: " + assemblyProteomeMatch +
//                    "\npercentAssemblyProteomeGeneMatches: " + percentAssemblyProteomeGeneMatch +
//                    "\ntotal number of assembly non-proteome anchors: " + assemblyNonProteomeCount +
//                    "\npercentAssemblyNonProteomeAnchors: " + percentAssemblyNonProteomeAnchors +
//                    "\ntotal number of assembly non-proteome gene seq matches: " + assemblyNonProteomeMatch +
//                    "\npercentAssemblyNonProteomeGeneMatch: " + percentAssemblyNonProteomGeneMatch);
//            
//            System.out.println("\nHaplotype Caller Stats: total number of  haplotype anchors: " + hapAnchorCount +
//                    "\npercentHapAnchor: " + percentHapAnchors +
//                    "\ntotal number of haplotype caller  Proteome Anchors: " + hapProteomeCount +
//                    "\npercentHapProteomeAnchors: " + percentHapProteomeAnchors +
//                    "\ntotal number of haplotype caller proteome gene seq matches: " + hapProteomeMatch +
//                    "\npercentHapProteomeGeneMatches: " + percentHapProteomeGeneMatch +
//                    "\ntotal number of hap non-proteome anchors: " + hapNonProteomeCount +
//                    "\npercentHapNonProteomeAnchors: " + percentHapNonProteomeAnchors +
//                    "\ntotal number of hap non-proteome gene seq matches: " + hapNonProteomeMatch +
//                    "\npercentHapNonProteomeGeneMatch: " + percentHapNonProteomGeneMatch);
//                    
//            proteomeDataWR.close();
//            ((PHGSQLite)phg).close();
//        } catch (Exception exc) {
//            exc.printStackTrace();
//        }
//    }
//    private static boolean containsSeq(AnchorDataPHG aData, String geneSeq) {
//        String sequence = aData.sequence();
//        int index = sequence.indexOf(geneSeq);
//        return (index != -1) ? true : false;
//    }
//
//    /**
//     * @param args
//     */
//    public static void main(String[] args) {
//        // TODO Auto-generated method stub
//        long time = System.nanoTime();
//        String db = "/Users/lcj34/notes_files/repgen/wgs_pipeline/agpv4_phg_dbs/hackathon_trimmedAnchors_refAndAssemblies.db";
//        String refGenome = "Zea_mays.AGPv4.dna.toplevel.fa";
//        String refLine = "B73Ref";
//        String assemblyLine = "W22Assembly";
//        String hapLine = "W22_Haplotype_Caller";
//        String geneFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/junit_official_test_cases/all10Chrom_anchors_in_proteome_final.txt";
//        String outputDir = ""; // will be a workdir on cbsu
//        if (args.length != 7) {
//            System.out.println(" Expecting 7 input arguments in this order: " );            
//            System.out.println("     Full path name for database to be loaded,");
//            System.out.println("     Path name of reference fasta,");
//            System.out.println("     Name of Ref Line, e.g. B73Ref");
//            System.out.println("     Name of Assembly line, eg W22Assembly,");            
//            System.out.println("     Name of Assembly line from hapltoype_caller, e.g W22_Hapltype_Caller or NONE");
//            System.out.println("     Name of file containing gene proteome data ,");
//            System.out.println("     Name of directory to write output files, including trailing /");
//            System.out.println("     Please fix the arguments and try again - aborting the run");
//            return;
//        }
//        try {
//            db = args[0];
//            refGenome = args[1];
//            refLine = args[2];
//            assemblyLine=args[3];
//            hapLine = args[4];
//            geneFile = args[5];
//            outputDir = args[6];
//        } catch (Exception exc) {
//            exc.printStackTrace();
//        }
//        System.out.println("Begin FindProteomeGenes");
//        processMain(db, refGenome, refLine,assemblyLine,hapLine,geneFile,outputDir);
//        System.out.println("\nFinished !! Total Time: " + (System.nanoTime() - time)/1e9 + " seconds");
//
//    }
//    
}