net.maizegenetics.pangenome.pipelineTests.EvaluateGVCFbyKnownSNPTest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.pipelineTests;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.ImportUtils;
import org.junit.*;
import java.io.IOException;
import java.util.List;
/**
* Create unit test to evaluate GVCF SNPs by comparing to known SNPs.
* Read a GVCF file from practical haplotype pipeline.
* Read in known SNP file (VCF).
* Identify taxa from VCF that corresponds to GVCF taxa.
* Loop through SNPs from GVCF and compare PHG SNPs to known SNPs.
* Power: how many of the known SNPs are called in PHG?
* Type I error: how frequently does PHG differ from known SNPs?
* Created by edbuckler on 6/22/17.
*/
public class EvaluateGVCFbyKnownSNPTest {
private static String testLine="W22";
private static String refLine="B73";
//private static String workingDir ="/Users/sej65/Documents/Hackathon_Jun17/unitTestFiles/";
private static String workingDir ="/Users/edbuckler/Box Sync/Hackhaton_files_June_2017/W22_unit_test_files/";
private static String gvcfFile = workingDir+"W22_haplotype_caller_output.g.vcf.gz";
private static String knownSNPs = workingDir+"TUM8Lines_Maize600k_elitelines_AGPv4_Tasselsorted.vcf.gz";
private static String referenceGenomeFile = workingDir+"Zea_mays.AGPv4.dna.toplevel.fa.gz";
private static SimpleGVCFReader simpleGVCFReader;
private static GenotypeTable knownSNPgt;
@BeforeClass
public static void setUpClass() throws Exception {
// new SortGenotypeFilePlugin(null,false)
// .inputFile(knownSNPs)
// .outputFile("/Users/edbuckler/temp/W22_unit_test_files/TUMPLANTBREEDING_Maize600k_elitelines_AGPv4_Tasselsorted.vcf.gz")
// .run();
System.out.println("EvaluateGVCFbyKnownSNPTest.setUpClass");
//GenotypeTable phgGVCF = ImportUtils.readFromVCF(gvcfFile);
System.out.println("Reading GVCF file: " + gvcfFile);
simpleGVCFReader=new SimpleGVCFReader(gvcfFile, referenceGenomeFile, 100000000, -1);
System.out.println("Reading VCF file: " + knownSNPs);
knownSNPgt = ImportUtils.readFromVCF(knownSNPs);
}
@AfterClass
public static void tearDownClass() {
}
@Before
public void setUp() {
}
@After
public void tearDown() {
}
/**
* Test of writeToHapmap method, of class ExportUtils.
*/
@Test
public void testSimpleGVCFReader() throws IOException {
System.out.println("EvaluateGVCFbyKnownSNPTest.testSimpleGVCFReader");
Assert.assertEquals("W22 GVCF at 1:118295 for alternate allele", "C",simpleGVCFReader.genotype(1,118295));
Assert.assertEquals("W22 GVCF at 1:118452 for reference allele", "A",simpleGVCFReader.genotype(1,118452));
Assert.assertEquals("W22 GVCF at 1:100000 for not in gvcf", "N",simpleGVCFReader.genotype(1,100000));
Assert.assertEquals("W22 GVCF at 1:100000 for reference allele", true,simpleGVCFReader.inReferenceRange(1,110800));
Assert.assertEquals("W22 GVCF at 1:210960 not in reference", "CTCTTACCACTCCATA", simpleGVCFReader.genotype(1, 210960));
Assert.assertEquals("W22 GVCF at 1:2114400 deletion", "AAT", simpleGVCFReader.genotype(1, 2114400));
}
@Test
public void testPHGvsKnownSNPs() {
System.out.println("EvaluateGVCFbyKnownSNPTest.testPHGvsknownSNPs");
int w22index = knownSNPgt.taxa().indexOf(testLine);
int b73index = knownSNPgt.taxa().indexOf(refLine);
List goodChromosomes= Lists.newArrayList(2,3,4,7,8,9);
int gvcfAgreesWithKnown = 0;
int b73RefAndB73ArrayDoNotMatch = 0;
int indelOrMissing = 0;
int w22ArrayMatchesB73ButGVCFDoesNotMatch = 0;
int gvcfMatchesB73ref = 0; //but doesn't match W22 array
int other = 0;
Multiset agreeMap= TreeMultiset.create();
Multiset errorMap= TreeMultiset.create();
for (int sites = 0; sites < knownSNPgt.numberOfSites(); sites++) {
Position currentPosition=knownSNPgt.positions().get(sites);
if(currentPosition.getChromosome().getChromosomeNumber()>10) continue;
if(!goodChromosomes.contains(currentPosition.getChromosome().getChromosomeNumber())) continue;
Position binPosition=new GeneralPosition.Builder(currentPosition.getChromosome(),(currentPosition.getPosition()/1_000_000)*1_000_000).build();
String gvcfCall = simpleGVCFReader.genotype(knownSNPgt.positions().chromosome(sites).getChromosomeNumber(), knownSNPgt.positions().chromosomalPosition(sites));
String knownSNPcall = knownSNPgt.genotypeAsString(w22index, sites);
String b73KnownSNPcall= knownSNPgt.genotypeAsString(b73index, sites);
String refCall=simpleGVCFReader.reference(knownSNPgt.positions().chromosome(sites).getChromosomeNumber(), knownSNPgt.positions().chromosomalPosition(sites));
// System.out.println("Mismatch Site: " + sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
// + knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
// + b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
if (knownSNPcall.contains("-") || gvcfCall.contains("-") || b73KnownSNPcall.contains("-") || refCall.contains("-")
|| knownSNPcall.length()>1 || gvcfCall.length()>1 || b73KnownSNPcall.length()>1 || refCall.length()>1
|| knownSNPcall.equals("N") || gvcfCall.equals("N") || b73KnownSNPcall.equals("N") || refCall.equals("N")) {
indelOrMissing++;}
else if (!b73KnownSNPcall.equals(refCall)) b73RefAndB73ArrayDoNotMatch++;
else if (gvcfCall.equals(knownSNPcall)) {
gvcfAgreesWithKnown++;
agreeMap.add(binPosition);
}
else if (gvcfCall.equals(b73KnownSNPcall)) {
gvcfMatchesB73ref++; //
System.out.println("GVCF Ref But not array Site: " + sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
+ knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
+ b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
}
else if (knownSNPcall.equals(b73KnownSNPcall) && !gvcfCall.equals(knownSNPcall)) {
w22ArrayMatchesB73ButGVCFDoesNotMatch++;
errorMap.add(binPosition);
System.out.println("Mismatch Site: " + sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
+ knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
+ b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
}
else {
other++;
System.out.println("Other Site: " + sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
+ knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
+ b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
}
}
System.out.println("indelOrMissing\t" + indelOrMissing);
System.out.println("b73RefAndB73ArrayDoNotMatch\t" + b73RefAndB73ArrayDoNotMatch);
System.out.println("gvcfAgreesWithKnown\t" + gvcfAgreesWithKnown);
System.out.println("gvcfMatchesB73ref\t" + gvcfMatchesB73ref);
System.out.println("w22ArrayMatchesB73ButGVCFDoesNotMatch\t" + w22ArrayMatchesB73ButGVCFDoesNotMatch);
System.out.println("other\t" + other);
for (Position position : agreeMap.elementSet()) {
System.out.println(position.toString()+"\t"+agreeMap.count(position)+"\t"+errorMap.count(position));
}
}
}