net.maizegenetics.pangenome.pipelineTests.CompareToKnownSNPPlugin Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
package net.maizegenetics.pangenome.pipelineTests;

import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.util.List;
import java.util.stream.Collectors;

import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;

/**
 * Plugin to compare A GenotypeTable to a known trusted GenotypeTable
 * Inputs:
 * -DataSet of at least 2 GenotypeTables, First one is the known SNP set you wish to compare to, the rest are compared in order.
 * -bedFile, bed file of regions you want to compare.  If not specified will compare all base pairs in the known SNP set
 *
 * Outputs:
 * - Will print out the Number of HetIndelMissing base pairs, the number of missing base pairs, the number of base pairs compared and the error rate to the console.
 *
 *
 * Created by zrm22 on 12/18/17.
 */
public class CompareToKnownSNPPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(CompareToKnownSNPPlugin.class);

    PluginParameter anchorRegionBed = new PluginParameter.Builder("bedFile",null, String.class)
            .required(false)
            .inFile()
            .build();


    public CompareToKnownSNPPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    @Override
    public DataSet processData(DataSet input) {
        List genotypeTables = input.getDataOfType(GenotypeTable.class);

        if(genotypeTables.size()<2) {
            throw new IllegalStateException("CompareToKnownSNPPlugin must have at least 2 genotype tables input");
        }

        //The first GenotypeTable needs to be the known trusted SNP set, the rest are to be compared to the known SNP set
        GenotypeTable knownSNPs = (GenotypeTable)genotypeTables.get(0).getData();

        for(Datum currentDatum : genotypeTables) {
            GenotypeTable currentGenotypeTable = (GenotypeTable)currentDatum.getData();
            if(!currentGenotypeTable.equals(knownSNPs)) {
                computeErrorRate(knownSNPs, currentGenotypeTable, parseAnchorBed(anchorRegionBed()));
            }
        }

        return null;
    }

    /**
     * Parse the bed file,  This will improve the Power counts as we have less positions.
     * @param bedFileName
     * @return
     */
    private RangeSet parseAnchorBed(String bedFileName) {
        if(bedFileName==null || bedFileName.equals("")) return null;
        RangeSet anchorRanges = TreeRangeSet.create();

        try(BufferedReader reader = Utils.getBufferedReader(bedFileName)) {
            String currentLine = "";
            while((currentLine = reader.readLine()) != null) {
                String[] currentLineSplit = currentLine.split("\t");
                anchorRanges.add(Range.closed(Position.of(currentLineSplit[0],Integer.parseInt(currentLineSplit[1])+1),
                        Position.of(currentLineSplit[0], Integer.parseInt(currentLineSplit[2]))));
            }
        }
        catch(Exception e) {
            throw new IllegalStateException("Error parsing Bed File:", e);
        }

        return anchorRanges;
    }

    /**
     * Compute Error Rates against the knownSNPs genotypeTable
     * @param knownSNPs
     * @param testGenotypeTable
     * @param setOfAnchorPositions
     */
    private void computeErrorRate(GenotypeTable knownSNPs, GenotypeTable testGenotypeTable, RangeSet setOfAnchorPositions) {
        TaxaList knownTaxa = knownSNPs.taxa();

        System.out.println("Taxon\t#HetIndelMissing\t#Missing\t#ScoredBps\tErrorRate\tsitesMissing");
        //loop through each knownTaxa and check to make sure the testGenotypeTable has the same taxaName
        for(Taxon currentTaxon : knownTaxa) {
            //If the testGenotypeTable does not have the correct taxon name, skip
            if (!testGenotypeTable.taxa().contains(currentTaxon)) {
                continue;
            }

            int knownIndex = knownTaxa.indexOf(currentTaxon);
            int testIndex = testGenotypeTable.taxa().indexOf(currentTaxon);

            int gvcfAgreesWithKnown = 0;
            int siteMissing = 0;
            int refCallsDoNotMatch = 0;
            int mismatchGVCFisRef = 0;
            int mismatchGVCFisAlt1 = 0;
            int mismatchGVCFisAlt2 = 0;
            int hetIndelMissing = 0;
            int missing = 0;

            int currentTestPositionIndex = 0;

            for (int sites = 0; sites < knownSNPs.numberOfSites(); sites++) {
                Position currentPosition = knownSNPs.positions().get(sites);
                //Check to see if we moved chromosomes.  If we did, we need to slide the testPosition index up until we are back on the right chromosome
                //We also need to put a check in to make sure we dont loop forever.
                while(Integer.parseInt(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().getName()) < Integer.parseInt(currentPosition.getChromosome().getName()) && currentTestPositionIndex < testGenotypeTable.positions().size()-1) {
                    Chromosome testChr = testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome();
                    Position testPos = testGenotypeTable.positions().get(currentTestPositionIndex);
                    currentTestPositionIndex++;
                }
//                while(!currentPosition.getChromosome().equals(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome()) && sites+1 < knownSNPs.numberOfSites()) {
                while(Integer.parseInt(currentPosition.getChromosome().getName()) < Integer.parseInt(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().getName()) ) {
//                    Chromosome chr = testGenotypeTable.positions().get(currentTestPositionIndex);
                    sites++;
                    currentPosition = knownSNPs.positions().get(sites);
                }


//                System.out.println(currentTestPositionIndex+ " " +sites + " "+ testGenotypeTable.positions().size());
                //If the current Test position is below the currently evaluated Known SNP position, we need to slide up the test index some more
                while(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().equals(currentPosition.getChromosome())
                        && testGenotypeTable.positions().get(currentTestPositionIndex).getPosition() < currentPosition.getPosition()
                && currentTestPositionIndex < testGenotypeTable.positions().size()-1) {
                    currentTestPositionIndex++;
                }
                if(currentTestPositionIndex>=testGenotypeTable.positions().size()-1 || sites >= knownSNPs.numberOfSites())
                    break;

                int testPositionIndex = -1;
                //If the chrom and position match, we update the testPositionIndex, otherwise it means we looped past the current query and it should be missing.
                //Because Positions must be sorted in GenotypeTable, we can do this safely.
                if(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().equals(currentPosition.getChromosome())
                        && testGenotypeTable.positions().get(currentTestPositionIndex).getPosition() == currentPosition.getPosition()) {
                    testPositionIndex = currentTestPositionIndex;
                }
                //Check to see if the position is in our anchor file
                if (setOfAnchorPositions != null && !setOfAnchorPositions.contains(Position.of(currentPosition.getChromosome(),currentPosition.getPosition()))) continue;

                //Get the reference calls
                byte testRefByte = UNKNOWN_DIPLOID_ALLELE;
                if (testPositionIndex >=0) {
                    testRefByte = testGenotypeTable.referenceAllele(testPositionIndex);
                    testRefByte = (byte)((testRefByte) | (testRefByte << 4));
                }

                byte testCallByte = UNKNOWN_DIPLOID_ALLELE;
                if(testPositionIndex >=0) {
                    testCallByte = testGenotypeTable.genotype(testIndex, testPositionIndex);
                }

                byte knownCallByte = knownSNPs.genotype(knownIndex,sites);
                byte knownRefByte = knownSNPs.referenceAllele(sites);
                knownRefByte = (byte)((knownRefByte) | (knownRefByte << 4));


                //Catch if we have any site out of bounds errors
                try {
                    List calls = Lists.newArrayList(//gVCFcallOfTest, arrayCallofTest, referenceAllele, arrayCallOfReference
                            testCallByte,
                            knownCallByte,
                            testRefByte,
                            knownRefByte
                    );

//                    if(currentTaxon.getName().equals("B73") ) {
//                        System.out.println(currentPosition + " " + calls.stream().map(it -> "" + it).collect(Collectors.joining(",")) + " TestIndex:" + testIndex + " TestPosIndex:" + testPositionIndex + " Sites:" + currentPosition.getPosition());
//                        System.out.println("\t"+testGenotypeTable.positions().get(currentTestPositionIndex));
//                    }
                    if(calls.get(2) == UNKNOWN_DIPLOID_ALLELE) {
                        siteMissing++;
//                        if(currentTaxon.getName().equals("B73") && currentPosition.getPosition() >= 150827101 && currentPosition.getPosition() <= 150831598 && currentPosition.getChromosome().getName().equals("10") ) {
//                        if(currentTaxon.getName().equals("B73") ) {
//                            System.out.println(currentPosition + " " + calls.stream().map(it -> "" + it).collect(Collectors.joining(",")) + " TestIndex:" + testIndex + " TestPosIndex:" + testPositionIndex + " Sites:" + currentPosition.getPosition());
//                            System.out.println("\t"+testGenotypeTable.positions().get(currentTestPositionIndex));
//                        }
                    }
                    //remove the indel, heterozygous, or reference genome problems
                    if(calls.stream().anyMatch(call -> call == UNKNOWN_DIPLOID_ALLELE)) {
                        missing++;
                    }
                    if (calls.stream().anyMatch(call -> !NucleotideAlignmentConstants.isHomozygousACGT(call))) {
//                        if(calls.stream().anyMatch(call -> call == UNKNOWN_DIPLOID_ALLELE)) {
//
//                        }
//                        else {
//                            if(currentTaxon.getName().equals("B73")) {
//                                System.out.println(currentPosition + " " + calls.stream().map(it -> "" + it).collect(Collectors.joining(",")) + " TestIndex:" + testIndex + " TestPosIndex:" + testPositionIndex + " Sites:" + currentPosition.getPosition());
//                            }
//                        }
                        hetIndelMissing++;
                        continue;
                    }
                    if (calls.get(2) != calls.get(3)) {
                        refCallsDoNotMatch++;
                        continue;
                    }
                    //count the correct calls
                    if (calls.get(0) == calls.get(1)) {
                        gvcfAgreesWithKnown++;
                        continue;
                    }
                    if (calls.get(0) == calls.get(2)) {
                        mismatchGVCFisRef++;
                    } else if (calls.get(1) == calls.get(2)) {
                        mismatchGVCFisAlt1++;
                    }  //two states at site
                    else {
                        mismatchGVCFisAlt2++;
                    } //three states at site

                }
                catch(Exception e) {
                    throw new IllegalStateException("Error comparing calls:",e);
                }
            }

            //Compute the number of bps scored and the error rate
            double totalScored = gvcfAgreesWithKnown + mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2;
            double totalErrorRate = (mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2) / totalScored;

            System.out.println(currentTaxon.getName() + "\t" + hetIndelMissing + "\t" + missing+ "\t" + totalScored + "\t" + totalErrorRate+"\t"+siteMissing);
        }

    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "Compare To Known SNPs";
    }

    @Override
    public String getToolTipText() {
        return "Compare GenotypeTables to a known GenotypeTable and compute Error rates.";
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(CompareToKnownSNPPlugin.class);
    // }

    /**
     * Convenience method to run plugin with one return object.
     */
    // TODO: Replace  with specific type.
//    public  runPlugin(DataSet input) {
//        return () performFunction(input).getData(0).getData();
//    }

    /**
     * Bed File
     *
     * @return Bed File
     */
    public String anchorRegionBed() {
        return anchorRegionBed.value();
    }

    /**
     * Set Bed File. Bed File
     *
     * @param value Bed File
     *
     * @return this plugin
     */
    public CompareToKnownSNPPlugin anchorRegionBed(String value) {
        anchorRegionBed = new PluginParameter<>(anchorRegionBed, value);
        return this;
    }
}