net.maizegenetics.pangenome.pipelineTests.EvaluateGVCFbyKnownSNPTest2 Maven / Gradle / Ivy
Show all versions of phg Show documentation
package net.maizegenetics.pangenome.pipelineTests;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import org.junit.*;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.parseNucleotideDiploidByte;
/**
* Create unit test to evaluate GVCF SNPs by comparing to known SNPs.
* Read a GVCF file from practical haplotype pipeline.
* Read in known SNP file (VCF).
* Identify taxa from VCF that corresponds to GVCF taxa.
* Loop through SNPs from GVCF and compare PHG SNPs to known SNPs.
* Power: how many of the known SNPs are called in PHG?
* Type I error: how frequently does PHG differ from known SNPs?
*
* Note: W22-B73 IBD Chr 8: 122-147Mb
* Created by edbuckler on 6/22/17.
*/
public class EvaluateGVCFbyKnownSNPTest2 {
private static String testLine = "Ki11";
private static String refLine = "B73";
//private static String workingDir ="/Users/sej65/Documents/Hackathon_Jun17/unitTestFiles/";
private static String workingDir = "/Users/edbuckler/Box Sync/Hackhaton_files_June_2017/W22_unit_test_files/";
private static String gvcfFile = workingDir + "W22_haplotype_caller_output.g.vcf.gz";
private static String knownSNPs = workingDir + "TUM8Lines_Maize600k_elitelines_AGPv4_Tasselsorted.vcf.gz";
private static String referenceGenomeFile = workingDir + "Zea_mays.AGPv4.dna.toplevel.fa.gz";
private static SimpleGVCFReader simpleGVCFReader;
private static GenotypeTable knownSNPgt;
private static int minDepth = 3;
@BeforeClass
public static void setUpClass() throws Exception {
System.out.println("EvaluateGVCFbyKnownSNPTest.setUpClass");
System.out.println("Reading GVCF file: " + gvcfFile);
simpleGVCFReader = new SimpleGVCFReader(gvcfFile, referenceGenomeFile, 100000000, minDepth);
System.out.println("Reading VCF file: " + knownSNPs);
knownSNPgt = ImportUtils.readFromVCF(knownSNPs);
}
@AfterClass
public static void tearDownClass() {
}
@Before
public void setUp() {
}
@After
public void tearDown() {
}
/**
* Test of writeToHapmap method, of class ExportUtils.
*/
@Test
public void testSimpleGVCFReader() throws IOException {
System.out.println("EvaluateGVCFbyKnownSNPTest.testSimpleGVCFReader");
Assert.assertEquals("W22 GVCF at 1:118295 for alternate allele", "C", simpleGVCFReader.genotype(1, 118295));
Assert.assertEquals("W22 GVCF at 1:118452 for reference allele", "A", simpleGVCFReader.genotype(1, 118452));
Assert.assertEquals("W22 GVCF at 1:100000 for not in gvcf", "N", simpleGVCFReader.genotype(1, 100000));
Assert.assertEquals("W22 GVCF at 1:100000 for reference allele", true, simpleGVCFReader.inReferenceRange(1, 110800));
Assert.assertEquals("W22 GVCF at 1:210960 not in reference", "CTCTTACCACTCCATA", simpleGVCFReader.genotype(1, 210960));
Assert.assertEquals("W22 GVCF at 1:2114400 deletion", "AAT", simpleGVCFReader.genotype(1, 2114400));
}
@Test
public void testPHGvsKnownSNPs() {
testPHGvsKnownSNPs("W22","B73",true, true);
// testPHGvsKnownSNPs("B73","B73",false, false);
// testPHGvsKnownSNPs("Ki11","B73",false, false);
}
private void testPHGvsKnownSNPs(String testLine, String refLine, boolean errorReport, boolean regionReport) {
System.out.println("EvaluateGVCFbyKnownSNPTest.testPHGvsknownSNPs");
int w22index = knownSNPgt.taxa().indexOf(testLine);
int b73index = knownSNPgt.taxa().indexOf(refLine);
List goodChromosomes = Lists.newArrayList(2, 3, 4, 7, 8, 9); //W22 IBD with W22Rstd
int gvcfAgreesWithKnown = 0;
int refCallsDoNotMatch = 0;
int mismatchGVCFisRef = 0;
int mismatchGVCFisAlt1 = 0;
int mismatchGVCFisAlt2 = 0;
int hetIndelMissing = 0;
Multiset agreeMap = TreeMultiset.create();
Multiset mismatchGVCFisRefMap = TreeMultiset.create();
Multiset mismatchGVCFisAlt1Map = TreeMultiset.create();
Multiset mismatchGVCFisAlt2Map = TreeMultiset.create();
if(errorReport) System.out.println("Mismatch\tErrorType\tSiteNumber\tChromosome\tPosition\tGVCF" + testLine + "\tArray" + testLine + "\t" +
"ReferenceAllele\tArray" + refLine + "\tInRefenceRange");
for (int sites = 0; sites < knownSNPgt.numberOfSites(); sites++) {
Position currentPosition = knownSNPgt.positions().get(sites);
if (currentPosition.getChromosome().getChromosomeNumber() > 10) continue;
if (!goodChromosomes.contains(currentPosition.getChromosome().getChromosomeNumber())) continue;
Position binPosition = new GeneralPosition.Builder(currentPosition.getChromosome(), (currentPosition.getPosition() / 1_000_000) * 1_000_000).build();
List calls = Lists.newArrayList(//gVCFcallOfTest, arrayCallofTest, referenceAllele, arrayCallOfReference
parseNucleotideDiploidByte(simpleGVCFReader.genotype(currentPosition)).orElse(UNKNOWN_DIPLOID_ALLELE),
parseNucleotideDiploidByte(knownSNPgt.genotypeAsString(w22index, sites)).orElse(UNKNOWN_DIPLOID_ALLELE),
parseNucleotideDiploidByte(simpleGVCFReader.reference(currentPosition)).orElse(UNKNOWN_DIPLOID_ALLELE),
parseNucleotideDiploidByte(knownSNPgt.genotypeAsString(b73index, sites)).orElse(UNKNOWN_DIPLOID_ALLELE)
);
//remove the indel, heterozygous, or reference genome problems
if (calls.stream().anyMatch(call -> !NucleotideAlignmentConstants.isHomozygousACGT(call))) {
hetIndelMissing++;
continue;
}
if (calls.get(2) != calls.get(3)) {
refCallsDoNotMatch++;
continue;
}
//count the correct calls
if (calls.get(0) == calls.get(1)) {
gvcfAgreesWithKnown++;
agreeMap.add(binPosition);
continue;
}
//classifying the mismatches
String errorType = "";
if (calls.get(0) == calls.get(2)) {
mismatchGVCFisRef++;
errorType = "mismatchGVCFisRef";
mismatchGVCFisRefMap.add(binPosition);
} else if (calls.get(1) == calls.get(2)) {
mismatchGVCFisAlt1++;
errorType = "mismatchGVCFisAlt1";
mismatchGVCFisAlt1Map.add(binPosition);
} //two states at site
else {
mismatchGVCFisAlt2++;
errorType = "mismatchGVCFisAlt2";
mismatchGVCFisAlt2Map.add(binPosition);
} //three states at site
String callsToString = calls.stream().map(NucleotideAlignmentConstants::getNucleotideIUPAC).collect(Collectors.joining("\t"));
if(errorReport) System.out.printf("Mismatch\t%s\t%d\t%d\t%d\t%s\t%s\n", errorType, sites, currentPosition.getChromosome().getChromosomeNumber(),
currentPosition.getPosition(), callsToString, simpleGVCFReader.inReferenceRange(currentPosition));
}
if(regionReport) {
System.out.println("Chromosome\tBinPosition\tAgreeCnt\tmismatchGVCFisRefCnt\tmismatchGVCFisAlt1\tmismatchGVCFisAlt2Cnt");
for (Position position : agreeMap.elementSet()) {
System.out.println(position.getChromosome().getChromosomeNumber() + "\t" + position.getPosition() + "\t" +
agreeMap.count(position) + "\t" + mismatchGVCFisRefMap.count(position)
+ "\t" + mismatchGVCFisAlt1Map.count(position) + "\t" + mismatchGVCFisAlt2Map.count(position));
}
}
System.out.println("RefLine\t" + refLine);
System.out.println("TestLine\t" + testLine);
System.out.println("TestChromosomes\t"+ goodChromosomes.stream().map(i -> i.toString()).collect(Collectors.joining(",")));
System.out.println("minDepth\t" + minDepth);
System.out.println("HetIndelMissing\t" + hetIndelMissing);
System.out.println("refCallsDoNotMatch\t" + refCallsDoNotMatch);
System.out.println("gvcfAgreesWithKnown\t" + gvcfAgreesWithKnown + "\t" + agreeMap.entrySet().stream()
.mapToInt(e -> e.getCount()).average().orElse(-1));
System.out.println("mismatchGVCFisRef\t" + mismatchGVCFisRef + "\t" + mismatchGVCFisRefMap.entrySet().stream()
.mapToInt(e -> e.getCount()).average().orElse(-1));
System.out.println("mismatchGVCFisAlt1\t" + mismatchGVCFisAlt1 + "\t" + mismatchGVCFisAlt1Map.entrySet().stream()
.mapToInt(e -> e.getCount()).average().orElse(-1));
System.out.println("mismatchGVCFisAlt2\t" + mismatchGVCFisAlt2 + "\t" + mismatchGVCFisAlt2Map.entrySet().stream()
.mapToInt(e -> e.getCount()).average().orElse(-1));
double totalScored = gvcfAgreesWithKnown + mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2;
double totalErrorRate = (mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2) / totalScored;
double twoAlleleErrors = (mismatchGVCFisRef + mismatchGVCFisAlt1) / totalScored;
System.out.println("totalScored\t" + totalScored);
System.out.println("totalErrorRate\t" + totalErrorRate);
System.out.println("twoAlleleErrors\t" + twoAlleleErrors);
//Assert.assertTrue("Error rate at two state above 1.2%",twoAlleleErrors<0.012);
}
}