All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.pipelineTests.EvaluateGVCFbyKnownSNPTest Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.pipelineTests;

import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.ImportUtils;
import org.junit.*;

import java.io.IOException;
import java.util.List;

/**
 * Create unit test to evaluate GVCF SNPs by comparing to known SNPs.
 * Read a GVCF file from practical haplotype pipeline.
 * Read in known SNP file (VCF).
 * Identify taxa from VCF that corresponds to GVCF taxa.
 * Loop through SNPs from GVCF and compare PHG SNPs to known SNPs.
 * Power: how many of the known SNPs are called in PHG?
 * Type I error: how frequently does PHG differ from known SNPs?
 * Created by edbuckler on 6/22/17.
 */
public class EvaluateGVCFbyKnownSNPTest {
    private static String testLine="W22";
    private static String refLine="B73";

    //private static String workingDir ="/Users/sej65/Documents/Hackathon_Jun17/unitTestFiles/";
    private static String workingDir ="/Users/edbuckler/Box Sync/Hackhaton_files_June_2017/W22_unit_test_files/";

    private static String gvcfFile = workingDir+"W22_haplotype_caller_output.g.vcf.gz";
    private static String knownSNPs = workingDir+"TUM8Lines_Maize600k_elitelines_AGPv4_Tasselsorted.vcf.gz";
    private static String referenceGenomeFile = workingDir+"Zea_mays.AGPv4.dna.toplevel.fa.gz";
    private static SimpleGVCFReader simpleGVCFReader;
    private static GenotypeTable knownSNPgt;

    @BeforeClass
    public static void setUpClass() throws Exception {
//        new SortGenotypeFilePlugin(null,false)
//                .inputFile(knownSNPs)
//                .outputFile("/Users/edbuckler/temp/W22_unit_test_files/TUMPLANTBREEDING_Maize600k_elitelines_AGPv4_Tasselsorted.vcf.gz")
//                .run();


        System.out.println("EvaluateGVCFbyKnownSNPTest.setUpClass");
        //GenotypeTable phgGVCF = ImportUtils.readFromVCF(gvcfFile);
        System.out.println("Reading GVCF file: " + gvcfFile);
        simpleGVCFReader=new SimpleGVCFReader(gvcfFile, referenceGenomeFile, 100000000, -1);
        System.out.println("Reading VCF file: " + knownSNPs);
        knownSNPgt = ImportUtils.readFromVCF(knownSNPs);

    }

    @AfterClass
    public static void tearDownClass() {
    }

    @Before
    public void setUp() {
    }

    @After
    public void tearDown() {
    }

    /**
     * Test of writeToHapmap method, of class ExportUtils.
     */
    @Test
    public void testSimpleGVCFReader() throws IOException {
        System.out.println("EvaluateGVCFbyKnownSNPTest.testSimpleGVCFReader");
        Assert.assertEquals("W22 GVCF at 1:118295 for alternate allele", "C",simpleGVCFReader.genotype(1,118295));
        Assert.assertEquals("W22 GVCF at 1:118452 for reference allele", "A",simpleGVCFReader.genotype(1,118452));
        Assert.assertEquals("W22 GVCF at 1:100000 for not in gvcf", "N",simpleGVCFReader.genotype(1,100000));
        Assert.assertEquals("W22 GVCF at 1:100000 for reference allele", true,simpleGVCFReader.inReferenceRange(1,110800));
        Assert.assertEquals("W22 GVCF at 1:210960 not in reference", "CTCTTACCACTCCATA", simpleGVCFReader.genotype(1, 210960));
        Assert.assertEquals("W22 GVCF at 1:2114400 deletion", "AAT", simpleGVCFReader.genotype(1, 2114400));

    }

    @Test
    public void testPHGvsKnownSNPs() {
        System.out.println("EvaluateGVCFbyKnownSNPTest.testPHGvsknownSNPs");
        int w22index = knownSNPgt.taxa().indexOf(testLine);
        int b73index = knownSNPgt.taxa().indexOf(refLine);
        List goodChromosomes= Lists.newArrayList(2,3,4,7,8,9);

        int gvcfAgreesWithKnown = 0;
        int b73RefAndB73ArrayDoNotMatch = 0;
        int indelOrMissing = 0;
        int w22ArrayMatchesB73ButGVCFDoesNotMatch = 0;
        int gvcfMatchesB73ref = 0; //but doesn't match W22 array
        int other = 0;

        Multiset agreeMap= TreeMultiset.create();
        Multiset errorMap= TreeMultiset.create();

        for (int sites = 0; sites < knownSNPgt.numberOfSites(); sites++) {
            Position currentPosition=knownSNPgt.positions().get(sites);
            if(currentPosition.getChromosome().getChromosomeNumber()>10) continue;
            if(!goodChromosomes.contains(currentPosition.getChromosome().getChromosomeNumber())) continue;
            Position binPosition=new GeneralPosition.Builder(currentPosition.getChromosome(),(currentPosition.getPosition()/1_000_000)*1_000_000).build();

            String gvcfCall = simpleGVCFReader.genotype(knownSNPgt.positions().chromosome(sites).getChromosomeNumber(), knownSNPgt.positions().chromosomalPosition(sites));
            String knownSNPcall = knownSNPgt.genotypeAsString(w22index, sites);
            String b73KnownSNPcall= knownSNPgt.genotypeAsString(b73index, sites);
            String refCall=simpleGVCFReader.reference(knownSNPgt.positions().chromosome(sites).getChromosomeNumber(), knownSNPgt.positions().chromosomalPosition(sites));

//            System.out.println("Mismatch Site: " +  sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
//                    + knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
//                    + b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));

            if (knownSNPcall.contains("-") || gvcfCall.contains("-") || b73KnownSNPcall.contains("-") || refCall.contains("-")
                    || knownSNPcall.length()>1 || gvcfCall.length()>1 || b73KnownSNPcall.length()>1 || refCall.length()>1
                    || knownSNPcall.equals("N") || gvcfCall.equals("N") || b73KnownSNPcall.equals("N") || refCall.equals("N")) {
                indelOrMissing++;}
            else if (!b73KnownSNPcall.equals(refCall)) b73RefAndB73ArrayDoNotMatch++;
            else if (gvcfCall.equals(knownSNPcall)) {
                gvcfAgreesWithKnown++;
                agreeMap.add(binPosition);
            }
            else if (gvcfCall.equals(b73KnownSNPcall)) {
                gvcfMatchesB73ref++;  //
                System.out.println("GVCF Ref But not array Site: " +  sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
                        + knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
                        + b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
            }
            else if (knownSNPcall.equals(b73KnownSNPcall) && !gvcfCall.equals(knownSNPcall)) {
                w22ArrayMatchesB73ButGVCFDoesNotMatch++;
                errorMap.add(binPosition);
                System.out.println("Mismatch Site: " +  sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
                        + knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
                        + b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
            }
            else {
                other++;
                System.out.println("Other Site: " +  sites + "\t" + knownSNPgt.positions().chromosomeName(sites) + "\t"
                        + knownSNPgt.positions().chromosomalPosition(sites) + "\t" + knownSNPcall + "\t" + gvcfCall +"\t"
                        + b73KnownSNPcall +"\t" + refCall +"\t" + simpleGVCFReader.inReferenceRange(currentPosition));
            }

        }

        System.out.println("indelOrMissing\t" + indelOrMissing);
        System.out.println("b73RefAndB73ArrayDoNotMatch\t" + b73RefAndB73ArrayDoNotMatch);
        System.out.println("gvcfAgreesWithKnown\t" + gvcfAgreesWithKnown);
        System.out.println("gvcfMatchesB73ref\t" + gvcfMatchesB73ref);
        System.out.println("w22ArrayMatchesB73ButGVCFDoesNotMatch\t" + w22ArrayMatchesB73ButGVCFDoesNotMatch);
        System.out.println("other\t" + other);

        for (Position position : agreeMap.elementSet()) {
            System.out.println(position.toString()+"\t"+agreeMap.count(position)+"\t"+errorMap.count(position));
        }

    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy