net.maizegenetics.pangenome.pipelineTests.FindProteomeGenesInAssembly Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.pipelineTests;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.db_loading.AnchorDataPHG;
import net.maizegenetics.pangenome.db_loading.PHGDataWriter;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils.AnchorType;
import net.maizegenetics.util.Utils;
/**
* NOTE: Needs to be re-worked with new db. The PHG calls are obsolete, need
* calls for new schema.
*
* Input:
* 1. the db from which to pull the assembly/haplotype data
* 2. the reference genome for pulling the gene sequence (a subset of the anchor sequence)
* 3. the name of the reference line as stored in the db (e.g. B73Ref)
* 4. the name of the assembly as stored in the db, e.g. (W22Assembly)
* 5. name of the line from haplotypeCaller as stored in the db (e.g. W22_Haplotype_Caller)
* or NONE if this line was not processed via haplotype caller.
* 6. Tab-delimited file containing list of anchorids, the gene start/end on which the
* anchor is based, and whether the gene is a proteome gene.
* 7. The output directory for writing files.
*
* Output:
* 1. A tab-delimited file containing the headers:
* AnchorID\tChromosome\tGene\tGeneInProteome\tAssemblyHasAnchor\tGeneSeqIn_\tGeneSeqIn_\tGeneSeqInBoth\tGeneSeqInNeither
* 2. Console output that may be re-directed to a file. THis output contains counts and calculated
* percentages.
*
* Algorithm:
* - read reference genome into GenomeSequence object
* - get assembly and haplotype caller anchor infor from db
* - read geneFile, for each entry:
* - note if is proteome
* - if anchor is present in assembly, mark counts for proteome/non-proteome,
* and check if gene sequence is part of this line's anchor sequence for this id
* - if anchor is present in haplotype_caller, mark counts for proteom/non-proteome,
* and check if gene sequence is part of this line's anchor sequence for this id.
* - close files,
* - calculate percentages
*
*
*
* @author lcj34
*
*/
public class FindProteomeGenesInAssembly {
// public static void processMain(String db, String refGenome, String refLine, String assembly,
// String hapLine, String geneFile, String outputDir) {
//
// BufferedReader geneBR = Utils.getBufferedReader(geneFile);
// try {
//
// // Create map of chrom with anchors that include proteome genes
// // Chromosome is 2nd column, anchorid is first, gene data is 7th
//
// System.out.println("\nReading reference genome ...");
// // Ref sequence needed for pulling gene sequence
// GenomeSequence refGenomeSeq = GenomeSequenceBuilder.instance(refGenome);
//
// PHGDataWriter phg = new PHGSQLite(db);
//
// System.out.println("\nget assembly data from db");
// // Returns Map
// Map assemblyAnchorData = phg.getAnchorDataForHaplotype( assembly, 0, AnchorType.ANCHOR);
//
// // Create lists for easy comparison
// List assemblyAnchorList = new ArrayList(assemblyAnchorData.keySet());
// System.out.println("Size of assemblyAnchorList: " + assemblyAnchorList.size());
//
// System.out.println("Get haplotype data from db");
//
// // Not all assemblies had fastq/BAM files to process for haplotype caller
// Map hapAnchorData = null;
// List hapAnchorList = null;
// if (!hapLine.toUpperCase().equals("NONE")) {
// hapAnchorData = phg.getAnchorDataForHaplotype( hapLine, 0, AnchorType.ANCHOR);
// hapAnchorList = new ArrayList(hapAnchorData.keySet());
// System.out.println("Size of hapAnchorList: " + hapAnchorList.size());
// }
//
// // Find number of overlaps between assembly and the haplotype version of the assembly
// // write data indicating gene found in: assemblyNotHap, hapNotAssembly, foundInBoth, foundInNeither
// String dataFile = outputDir + assembly + "_proteomePresentData.txt";
// BufferedWriter proteomeDataWR = Utils.getBufferedWriter(dataFile);
//
// String header = "AnchorID\tChromosome\tGene\tGeneInProteome\tAssemblyHasAnchor\tGeneSeqIn_" + assembly + "\tGeneSeqIn_" + hapLine + "\tGeneSeqInBoth\tGeneSeqInNeither\n";
// proteomeDataWR.write(header);
// int proteomeCount = 0;
// int nonProteomeCount = 0;
//
// int hapAnchorCount = 0;
// int hapProteomeCount = 0;
// int hapProteomeMatch = 0;
// int hapNonProteomeCount = 0;
// int hapNonProteomeMatch = 0;
//
// int assemblyAnchorCount = 0;
// int assemblyProteomeCount = 0;
// int assemblyProteomeMatch = 0;
// int assemblyNonProteomeCount = 0;
// int assemblyNonProteomeMatch = 0;
//
// String geneLine = geneBR.readLine(); // toss header
//
// int totalAnchors = 0;
// System.out.println("Got data, processing entries from geneFile " + geneFile);
// while ((geneLine=geneBR.readLine()) != null) {
//
// totalAnchors++;
// //headers in geneFile are: AnchorID/Chromosome/GeneStart/GeneEnd/Genes/isProteome
// String[] tokens = geneLine.split("\\t");
// int anchorid = Integer.parseInt(tokens[0]);
// String chrom = tokens[1];
// int geneStart = Integer.parseInt(tokens[2]);
// int geneEnd = Integer.parseInt(tokens[3]);
// String gene = tokens[4];
// boolean inProteome = tokens[5].toLowerCase().equals("true") ? true : false;
// if (inProteome){
// proteomeCount++;
// } else {
// nonProteomeCount++;
// }
//
// byte[] geneBytes = refGenomeSeq.chromosomeSequence(new Chromosome(chrom),geneStart,geneEnd);
// String geneSeq = NucleotideAlignmentConstants.nucleotideBytetoString(geneBytes);
// boolean inHap = false;
// boolean inAssembly = false;
// boolean assemblyHasAnchor = false;
// if (hapAnchorList != null && hapAnchorList.contains(anchorid)) {
// hapAnchorCount++;
// AnchorDataPHG hapPHG = hapAnchorData.get(anchorid);
//
// inHap = containsSeq( hapPHG, geneSeq);
// if (inProteome) {
// hapProteomeCount++; // haplotype caller version has the anchor
// if (inHap) hapProteomeMatch++; // the anchor sequence contains the exact gene sequence
// } else {
// hapNonProteomeCount++;
// if (inHap) hapNonProteomeMatch++;
// }
// }
// if (assemblyAnchorList.contains(anchorid)) {
// assemblyAnchorCount++;
// AnchorDataPHG aPHG = assemblyAnchorData.get(anchorid);
// inAssembly = containsSeq(aPHG, geneSeq);
// assemblyHasAnchor = true;
// if (inProteome) {
// assemblyProteomeCount++; // assembly has the anchor
// if (inAssembly) assemblyProteomeMatch++; // the anchor sequence contains the exact gene sequence
// } else {
// assemblyNonProteomeCount++;
// if (inAssembly) assemblyNonProteomeMatch++;
// }
// }
// boolean inBoth = (inHap && inAssembly) ? true : false;
// boolean inNeither = (!inHap && !inAssembly) ? true : false;
//
// StringBuilder sb = new StringBuilder().append(anchorid + "\t" + chrom + "\t" + gene + "\t")
// .append(inProteome + "\t")
// .append(assemblyHasAnchor + "\t")
// .append(inAssembly + "\t")
// .append(inHap + "\t")
// .append(inBoth + "\t")
// .append(inNeither + "\n");
// proteomeDataWR.write(sb.toString());
// }
// geneBR.close();
//
// System.out.println("\nTotal number of anchors: " + totalAnchors +
//
// "\nnumber of anchors with proteome genes: " + proteomeCount +
// "\nnumber of anchors with non-proteome genes: " + nonProteomeCount +
// "\nnumber of " + assembly + " anchors: " + assemblyAnchorCount +
// "\nnumber of proteome anchors in " + assembly + ":" + assemblyProteomeCount +
// "\nnumber of proteome anchors in " + assembly + " with gene match: " + assemblyProteomeMatch +
// "\nnumber of non-proteome anchors in " + assembly + ":" + assemblyNonProteomeCount +
// "\nnumber of non-proteome anchors in " + assembly + " with gene match: " + assemblyNonProteomeMatch +
// "\nnumber of " + hapLine + " anchors: " + hapAnchorCount +
// "\nnumber of proteome anchors in " + hapLine + ": " + hapProteomeCount +
// "\nnumber of proteome anchors in " + hapLine + " with gene match: " + hapProteomeMatch +
// "\nnumber of non-proteome anchors in " + hapLine + ": " + hapNonProteomeCount +
// "\nnumber of non-proteome anchors in " + hapLine + " with gene match: " + hapNonProteomeMatch );
//
// // percentage of gene sequence matching is calculated as a percentage of the
// // number of proteome or non-proteome based anchors present for each haplotype (from assembly or from haplotype caller)
//
// double percentAssemblyAnchors = ((double)(assemblyAnchorCount)/totalAnchors) * 100;
// double percentAssemblyProteomeAnchors = ((double)(assemblyProteomeCount)/proteomeCount) * 100;
// double percentAssemblyNonProteomeAnchors = ((double)(assemblyNonProteomeCount)/nonProteomeCount) * 100;
// double percentAssemblyProteomeGeneMatch = ((double)(assemblyProteomeMatch)/assemblyProteomeCount) * 100;
// double percentAssemblyNonProteomGeneMatch = ((double)(assemblyNonProteomeMatch)/assemblyNonProteomeCount) * 100;
//
// double percentHapAnchors = ((double)(hapAnchorCount)/totalAnchors) * 100;
// double percentHapProteomeAnchors = ((double)(hapProteomeCount)/proteomeCount) * 100;
// double percentHapNonProteomeAnchors = ((double)(hapNonProteomeCount)/nonProteomeCount) * 100;
// double percentHapProteomeGeneMatch = ((double)(hapProteomeMatch)/hapProteomeCount) * 100;
// double percentHapNonProteomGeneMatch = ((double)(hapNonProteomeMatch)/hapNonProteomeCount) * 100;
//
//
// System.out.println("\nAssembly Stats: total number of assembly anchors: " + assemblyAnchorCount +
// "\npercentAssemblyAnchor: " + percentAssemblyAnchors +
// "\ntotal number of Assembly Proteome Anchors: " + assemblyProteomeCount +
// "\npercentAssemblyProteomeAnchors: " + percentAssemblyProteomeAnchors +
// "\ntotal number of assembly proteome gene seq matches: " + assemblyProteomeMatch +
// "\npercentAssemblyProteomeGeneMatches: " + percentAssemblyProteomeGeneMatch +
// "\ntotal number of assembly non-proteome anchors: " + assemblyNonProteomeCount +
// "\npercentAssemblyNonProteomeAnchors: " + percentAssemblyNonProteomeAnchors +
// "\ntotal number of assembly non-proteome gene seq matches: " + assemblyNonProteomeMatch +
// "\npercentAssemblyNonProteomeGeneMatch: " + percentAssemblyNonProteomGeneMatch);
//
// System.out.println("\nHaplotype Caller Stats: total number of haplotype anchors: " + hapAnchorCount +
// "\npercentHapAnchor: " + percentHapAnchors +
// "\ntotal number of haplotype caller Proteome Anchors: " + hapProteomeCount +
// "\npercentHapProteomeAnchors: " + percentHapProteomeAnchors +
// "\ntotal number of haplotype caller proteome gene seq matches: " + hapProteomeMatch +
// "\npercentHapProteomeGeneMatches: " + percentHapProteomeGeneMatch +
// "\ntotal number of hap non-proteome anchors: " + hapNonProteomeCount +
// "\npercentHapNonProteomeAnchors: " + percentHapNonProteomeAnchors +
// "\ntotal number of hap non-proteome gene seq matches: " + hapNonProteomeMatch +
// "\npercentHapNonProteomeGeneMatch: " + percentHapNonProteomGeneMatch);
//
// proteomeDataWR.close();
// ((PHGSQLite)phg).close();
// } catch (Exception exc) {
// exc.printStackTrace();
// }
// }
// private static boolean containsSeq(AnchorDataPHG aData, String geneSeq) {
// String sequence = aData.sequence();
// int index = sequence.indexOf(geneSeq);
// return (index != -1) ? true : false;
// }
//
// /**
// * @param args
// */
// public static void main(String[] args) {
// // TODO Auto-generated method stub
// long time = System.nanoTime();
// String db = "/Users/lcj34/notes_files/repgen/wgs_pipeline/agpv4_phg_dbs/hackathon_trimmedAnchors_refAndAssemblies.db";
// String refGenome = "Zea_mays.AGPv4.dna.toplevel.fa";
// String refLine = "B73Ref";
// String assemblyLine = "W22Assembly";
// String hapLine = "W22_Haplotype_Caller";
// String geneFile = "/Users/lcj34/notes_files/repgen/wgs_pipeline/junit_official_test_cases/all10Chrom_anchors_in_proteome_final.txt";
// String outputDir = ""; // will be a workdir on cbsu
// if (args.length != 7) {
// System.out.println(" Expecting 7 input arguments in this order: " );
// System.out.println(" Full path name for database to be loaded,");
// System.out.println(" Path name of reference fasta,");
// System.out.println(" Name of Ref Line, e.g. B73Ref");
// System.out.println(" Name of Assembly line, eg W22Assembly,");
// System.out.println(" Name of Assembly line from hapltoype_caller, e.g W22_Hapltype_Caller or NONE");
// System.out.println(" Name of file containing gene proteome data ,");
// System.out.println(" Name of directory to write output files, including trailing /");
// System.out.println(" Please fix the arguments and try again - aborting the run");
// return;
// }
// try {
// db = args[0];
// refGenome = args[1];
// refLine = args[2];
// assemblyLine=args[3];
// hapLine = args[4];
// geneFile = args[5];
// outputDir = args[6];
// } catch (Exception exc) {
// exc.printStackTrace();
// }
// System.out.println("Begin FindProteomeGenes");
// processMain(db, refGenome, refLine,assemblyLine,hapLine,geneFile,outputDir);
// System.out.println("\nFinished !! Total Time: " + (System.nanoTime() - time)/1e9 + " seconds");
//
// }
//
}