All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.pipelineTests.EvaluateGVCFbyKnownSNPTest2 Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.pipelineTests;

import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import org.junit.*;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.parseNucleotideDiploidByte;

/**
 * Create unit test to evaluate GVCF SNPs by comparing to known SNPs.
 * Read a GVCF file from practical haplotype pipeline.
 * Read in known SNP file (VCF).
 * Identify taxa from VCF that corresponds to GVCF taxa.
 * Loop through SNPs from GVCF and compare PHG SNPs to known SNPs.
 * Power: how many of the known SNPs are called in PHG?
 * Type I error: how frequently does PHG differ from known SNPs?
 * 

* Note: W22-B73 IBD Chr 8: 122-147Mb * Created by edbuckler on 6/22/17. */ public class EvaluateGVCFbyKnownSNPTest2 { private static String testLine = "Ki11"; private static String refLine = "B73"; //private static String workingDir ="/Users/sej65/Documents/Hackathon_Jun17/unitTestFiles/"; private static String workingDir = "/Users/edbuckler/Box Sync/Hackhaton_files_June_2017/W22_unit_test_files/"; private static String gvcfFile = workingDir + "W22_haplotype_caller_output.g.vcf.gz"; private static String knownSNPs = workingDir + "TUM8Lines_Maize600k_elitelines_AGPv4_Tasselsorted.vcf.gz"; private static String referenceGenomeFile = workingDir + "Zea_mays.AGPv4.dna.toplevel.fa.gz"; private static SimpleGVCFReader simpleGVCFReader; private static GenotypeTable knownSNPgt; private static int minDepth = 3; @BeforeClass public static void setUpClass() throws Exception { System.out.println("EvaluateGVCFbyKnownSNPTest.setUpClass"); System.out.println("Reading GVCF file: " + gvcfFile); simpleGVCFReader = new SimpleGVCFReader(gvcfFile, referenceGenomeFile, 100000000, minDepth); System.out.println("Reading VCF file: " + knownSNPs); knownSNPgt = ImportUtils.readFromVCF(knownSNPs); } @AfterClass public static void tearDownClass() { } @Before public void setUp() { } @After public void tearDown() { } /** * Test of writeToHapmap method, of class ExportUtils. */ @Test public void testSimpleGVCFReader() throws IOException { System.out.println("EvaluateGVCFbyKnownSNPTest.testSimpleGVCFReader"); Assert.assertEquals("W22 GVCF at 1:118295 for alternate allele", "C", simpleGVCFReader.genotype(1, 118295)); Assert.assertEquals("W22 GVCF at 1:118452 for reference allele", "A", simpleGVCFReader.genotype(1, 118452)); Assert.assertEquals("W22 GVCF at 1:100000 for not in gvcf", "N", simpleGVCFReader.genotype(1, 100000)); Assert.assertEquals("W22 GVCF at 1:100000 for reference allele", true, simpleGVCFReader.inReferenceRange(1, 110800)); Assert.assertEquals("W22 GVCF at 1:210960 not in reference", "CTCTTACCACTCCATA", simpleGVCFReader.genotype(1, 210960)); Assert.assertEquals("W22 GVCF at 1:2114400 deletion", "AAT", simpleGVCFReader.genotype(1, 2114400)); } @Test public void testPHGvsKnownSNPs() { testPHGvsKnownSNPs("W22","B73",true, true); // testPHGvsKnownSNPs("B73","B73",false, false); // testPHGvsKnownSNPs("Ki11","B73",false, false); } private void testPHGvsKnownSNPs(String testLine, String refLine, boolean errorReport, boolean regionReport) { System.out.println("EvaluateGVCFbyKnownSNPTest.testPHGvsknownSNPs"); int w22index = knownSNPgt.taxa().indexOf(testLine); int b73index = knownSNPgt.taxa().indexOf(refLine); List goodChromosomes = Lists.newArrayList(2, 3, 4, 7, 8, 9); //W22 IBD with W22Rstd int gvcfAgreesWithKnown = 0; int refCallsDoNotMatch = 0; int mismatchGVCFisRef = 0; int mismatchGVCFisAlt1 = 0; int mismatchGVCFisAlt2 = 0; int hetIndelMissing = 0; Multiset agreeMap = TreeMultiset.create(); Multiset mismatchGVCFisRefMap = TreeMultiset.create(); Multiset mismatchGVCFisAlt1Map = TreeMultiset.create(); Multiset mismatchGVCFisAlt2Map = TreeMultiset.create(); if(errorReport) System.out.println("Mismatch\tErrorType\tSiteNumber\tChromosome\tPosition\tGVCF" + testLine + "\tArray" + testLine + "\t" + "ReferenceAllele\tArray" + refLine + "\tInRefenceRange"); for (int sites = 0; sites < knownSNPgt.numberOfSites(); sites++) { Position currentPosition = knownSNPgt.positions().get(sites); if (currentPosition.getChromosome().getChromosomeNumber() > 10) continue; if (!goodChromosomes.contains(currentPosition.getChromosome().getChromosomeNumber())) continue; Position binPosition = new GeneralPosition.Builder(currentPosition.getChromosome(), (currentPosition.getPosition() / 1_000_000) * 1_000_000).build(); List calls = Lists.newArrayList(//gVCFcallOfTest, arrayCallofTest, referenceAllele, arrayCallOfReference parseNucleotideDiploidByte(simpleGVCFReader.genotype(currentPosition)).orElse(UNKNOWN_DIPLOID_ALLELE), parseNucleotideDiploidByte(knownSNPgt.genotypeAsString(w22index, sites)).orElse(UNKNOWN_DIPLOID_ALLELE), parseNucleotideDiploidByte(simpleGVCFReader.reference(currentPosition)).orElse(UNKNOWN_DIPLOID_ALLELE), parseNucleotideDiploidByte(knownSNPgt.genotypeAsString(b73index, sites)).orElse(UNKNOWN_DIPLOID_ALLELE) ); //remove the indel, heterozygous, or reference genome problems if (calls.stream().anyMatch(call -> !NucleotideAlignmentConstants.isHomozygousACGT(call))) { hetIndelMissing++; continue; } if (calls.get(2) != calls.get(3)) { refCallsDoNotMatch++; continue; } //count the correct calls if (calls.get(0) == calls.get(1)) { gvcfAgreesWithKnown++; agreeMap.add(binPosition); continue; } //classifying the mismatches String errorType = ""; if (calls.get(0) == calls.get(2)) { mismatchGVCFisRef++; errorType = "mismatchGVCFisRef"; mismatchGVCFisRefMap.add(binPosition); } else if (calls.get(1) == calls.get(2)) { mismatchGVCFisAlt1++; errorType = "mismatchGVCFisAlt1"; mismatchGVCFisAlt1Map.add(binPosition); } //two states at site else { mismatchGVCFisAlt2++; errorType = "mismatchGVCFisAlt2"; mismatchGVCFisAlt2Map.add(binPosition); } //three states at site String callsToString = calls.stream().map(NucleotideAlignmentConstants::getNucleotideIUPAC).collect(Collectors.joining("\t")); if(errorReport) System.out.printf("Mismatch\t%s\t%d\t%d\t%d\t%s\t%s\n", errorType, sites, currentPosition.getChromosome().getChromosomeNumber(), currentPosition.getPosition(), callsToString, simpleGVCFReader.inReferenceRange(currentPosition)); } if(regionReport) { System.out.println("Chromosome\tBinPosition\tAgreeCnt\tmismatchGVCFisRefCnt\tmismatchGVCFisAlt1\tmismatchGVCFisAlt2Cnt"); for (Position position : agreeMap.elementSet()) { System.out.println(position.getChromosome().getChromosomeNumber() + "\t" + position.getPosition() + "\t" + agreeMap.count(position) + "\t" + mismatchGVCFisRefMap.count(position) + "\t" + mismatchGVCFisAlt1Map.count(position) + "\t" + mismatchGVCFisAlt2Map.count(position)); } } System.out.println("RefLine\t" + refLine); System.out.println("TestLine\t" + testLine); System.out.println("TestChromosomes\t"+ goodChromosomes.stream().map(i -> i.toString()).collect(Collectors.joining(","))); System.out.println("minDepth\t" + minDepth); System.out.println("HetIndelMissing\t" + hetIndelMissing); System.out.println("refCallsDoNotMatch\t" + refCallsDoNotMatch); System.out.println("gvcfAgreesWithKnown\t" + gvcfAgreesWithKnown + "\t" + agreeMap.entrySet().stream() .mapToInt(e -> e.getCount()).average().orElse(-1)); System.out.println("mismatchGVCFisRef\t" + mismatchGVCFisRef + "\t" + mismatchGVCFisRefMap.entrySet().stream() .mapToInt(e -> e.getCount()).average().orElse(-1)); System.out.println("mismatchGVCFisAlt1\t" + mismatchGVCFisAlt1 + "\t" + mismatchGVCFisAlt1Map.entrySet().stream() .mapToInt(e -> e.getCount()).average().orElse(-1)); System.out.println("mismatchGVCFisAlt2\t" + mismatchGVCFisAlt2 + "\t" + mismatchGVCFisAlt2Map.entrySet().stream() .mapToInt(e -> e.getCount()).average().orElse(-1)); double totalScored = gvcfAgreesWithKnown + mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2; double totalErrorRate = (mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2) / totalScored; double twoAlleleErrors = (mismatchGVCFisRef + mismatchGVCFisAlt1) / totalScored; System.out.println("totalScored\t" + totalScored); System.out.println("totalErrorRate\t" + totalErrorRate); System.out.println("twoAlleleErrors\t" + twoAlleleErrors); //Assert.assertTrue("Error rate at two state above 1.2%",twoAlleleErrors<0.012); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy