net.maizegenetics.pangenome.pipelineTests.CompareToKnownSNPPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.pipelineTests;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.util.List;
import java.util.stream.Collectors;
import static net.maizegenetics.dna.snp.GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
/**
* Plugin to compare A GenotypeTable to a known trusted GenotypeTable
* Inputs:
* -DataSet of at least 2 GenotypeTables, First one is the known SNP set you wish to compare to, the rest are compared in order.
* -bedFile, bed file of regions you want to compare. If not specified will compare all base pairs in the known SNP set
*
* Outputs:
* - Will print out the Number of HetIndelMissing base pairs, the number of missing base pairs, the number of base pairs compared and the error rate to the console.
*
*
* Created by zrm22 on 12/18/17.
*/
public class CompareToKnownSNPPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(CompareToKnownSNPPlugin.class);
PluginParameter anchorRegionBed = new PluginParameter.Builder("bedFile",null, String.class)
.required(false)
.inFile()
.build();
public CompareToKnownSNPPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
List genotypeTables = input.getDataOfType(GenotypeTable.class);
if(genotypeTables.size()<2) {
throw new IllegalStateException("CompareToKnownSNPPlugin must have at least 2 genotype tables input");
}
//The first GenotypeTable needs to be the known trusted SNP set, the rest are to be compared to the known SNP set
GenotypeTable knownSNPs = (GenotypeTable)genotypeTables.get(0).getData();
for(Datum currentDatum : genotypeTables) {
GenotypeTable currentGenotypeTable = (GenotypeTable)currentDatum.getData();
if(!currentGenotypeTable.equals(knownSNPs)) {
computeErrorRate(knownSNPs, currentGenotypeTable, parseAnchorBed(anchorRegionBed()));
}
}
return null;
}
/**
* Parse the bed file, This will improve the Power counts as we have less positions.
* @param bedFileName
* @return
*/
private RangeSet parseAnchorBed(String bedFileName) {
if(bedFileName==null || bedFileName.equals("")) return null;
RangeSet anchorRanges = TreeRangeSet.create();
try(BufferedReader reader = Utils.getBufferedReader(bedFileName)) {
String currentLine = "";
while((currentLine = reader.readLine()) != null) {
String[] currentLineSplit = currentLine.split("\t");
anchorRanges.add(Range.closed(Position.of(currentLineSplit[0],Integer.parseInt(currentLineSplit[1])+1),
Position.of(currentLineSplit[0], Integer.parseInt(currentLineSplit[2]))));
}
}
catch(Exception e) {
throw new IllegalStateException("Error parsing Bed File:", e);
}
return anchorRanges;
}
/**
* Compute Error Rates against the knownSNPs genotypeTable
* @param knownSNPs
* @param testGenotypeTable
* @param setOfAnchorPositions
*/
private void computeErrorRate(GenotypeTable knownSNPs, GenotypeTable testGenotypeTable, RangeSet setOfAnchorPositions) {
TaxaList knownTaxa = knownSNPs.taxa();
System.out.println("Taxon\t#HetIndelMissing\t#Missing\t#ScoredBps\tErrorRate\tsitesMissing");
//loop through each knownTaxa and check to make sure the testGenotypeTable has the same taxaName
for(Taxon currentTaxon : knownTaxa) {
//If the testGenotypeTable does not have the correct taxon name, skip
if (!testGenotypeTable.taxa().contains(currentTaxon)) {
continue;
}
int knownIndex = knownTaxa.indexOf(currentTaxon);
int testIndex = testGenotypeTable.taxa().indexOf(currentTaxon);
int gvcfAgreesWithKnown = 0;
int siteMissing = 0;
int refCallsDoNotMatch = 0;
int mismatchGVCFisRef = 0;
int mismatchGVCFisAlt1 = 0;
int mismatchGVCFisAlt2 = 0;
int hetIndelMissing = 0;
int missing = 0;
int currentTestPositionIndex = 0;
for (int sites = 0; sites < knownSNPs.numberOfSites(); sites++) {
Position currentPosition = knownSNPs.positions().get(sites);
//Check to see if we moved chromosomes. If we did, we need to slide the testPosition index up until we are back on the right chromosome
//We also need to put a check in to make sure we dont loop forever.
while(Integer.parseInt(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().getName()) < Integer.parseInt(currentPosition.getChromosome().getName()) && currentTestPositionIndex < testGenotypeTable.positions().size()-1) {
Chromosome testChr = testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome();
Position testPos = testGenotypeTable.positions().get(currentTestPositionIndex);
currentTestPositionIndex++;
}
// while(!currentPosition.getChromosome().equals(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome()) && sites+1 < knownSNPs.numberOfSites()) {
while(Integer.parseInt(currentPosition.getChromosome().getName()) < Integer.parseInt(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().getName()) ) {
// Chromosome chr = testGenotypeTable.positions().get(currentTestPositionIndex);
sites++;
currentPosition = knownSNPs.positions().get(sites);
}
// System.out.println(currentTestPositionIndex+ " " +sites + " "+ testGenotypeTable.positions().size());
//If the current Test position is below the currently evaluated Known SNP position, we need to slide up the test index some more
while(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().equals(currentPosition.getChromosome())
&& testGenotypeTable.positions().get(currentTestPositionIndex).getPosition() < currentPosition.getPosition()
&& currentTestPositionIndex < testGenotypeTable.positions().size()-1) {
currentTestPositionIndex++;
}
if(currentTestPositionIndex>=testGenotypeTable.positions().size()-1 || sites >= knownSNPs.numberOfSites())
break;
int testPositionIndex = -1;
//If the chrom and position match, we update the testPositionIndex, otherwise it means we looped past the current query and it should be missing.
//Because Positions must be sorted in GenotypeTable, we can do this safely.
if(testGenotypeTable.positions().get(currentTestPositionIndex).getChromosome().equals(currentPosition.getChromosome())
&& testGenotypeTable.positions().get(currentTestPositionIndex).getPosition() == currentPosition.getPosition()) {
testPositionIndex = currentTestPositionIndex;
}
//Check to see if the position is in our anchor file
if (setOfAnchorPositions != null && !setOfAnchorPositions.contains(Position.of(currentPosition.getChromosome(),currentPosition.getPosition()))) continue;
//Get the reference calls
byte testRefByte = UNKNOWN_DIPLOID_ALLELE;
if (testPositionIndex >=0) {
testRefByte = testGenotypeTable.referenceAllele(testPositionIndex);
testRefByte = (byte)((testRefByte) | (testRefByte << 4));
}
byte testCallByte = UNKNOWN_DIPLOID_ALLELE;
if(testPositionIndex >=0) {
testCallByte = testGenotypeTable.genotype(testIndex, testPositionIndex);
}
byte knownCallByte = knownSNPs.genotype(knownIndex,sites);
byte knownRefByte = knownSNPs.referenceAllele(sites);
knownRefByte = (byte)((knownRefByte) | (knownRefByte << 4));
//Catch if we have any site out of bounds errors
try {
List calls = Lists.newArrayList(//gVCFcallOfTest, arrayCallofTest, referenceAllele, arrayCallOfReference
testCallByte,
knownCallByte,
testRefByte,
knownRefByte
);
// if(currentTaxon.getName().equals("B73") ) {
// System.out.println(currentPosition + " " + calls.stream().map(it -> "" + it).collect(Collectors.joining(",")) + " TestIndex:" + testIndex + " TestPosIndex:" + testPositionIndex + " Sites:" + currentPosition.getPosition());
// System.out.println("\t"+testGenotypeTable.positions().get(currentTestPositionIndex));
// }
if(calls.get(2) == UNKNOWN_DIPLOID_ALLELE) {
siteMissing++;
// if(currentTaxon.getName().equals("B73") && currentPosition.getPosition() >= 150827101 && currentPosition.getPosition() <= 150831598 && currentPosition.getChromosome().getName().equals("10") ) {
// if(currentTaxon.getName().equals("B73") ) {
// System.out.println(currentPosition + " " + calls.stream().map(it -> "" + it).collect(Collectors.joining(",")) + " TestIndex:" + testIndex + " TestPosIndex:" + testPositionIndex + " Sites:" + currentPosition.getPosition());
// System.out.println("\t"+testGenotypeTable.positions().get(currentTestPositionIndex));
// }
}
//remove the indel, heterozygous, or reference genome problems
if(calls.stream().anyMatch(call -> call == UNKNOWN_DIPLOID_ALLELE)) {
missing++;
}
if (calls.stream().anyMatch(call -> !NucleotideAlignmentConstants.isHomozygousACGT(call))) {
// if(calls.stream().anyMatch(call -> call == UNKNOWN_DIPLOID_ALLELE)) {
//
// }
// else {
// if(currentTaxon.getName().equals("B73")) {
// System.out.println(currentPosition + " " + calls.stream().map(it -> "" + it).collect(Collectors.joining(",")) + " TestIndex:" + testIndex + " TestPosIndex:" + testPositionIndex + " Sites:" + currentPosition.getPosition());
// }
// }
hetIndelMissing++;
continue;
}
if (calls.get(2) != calls.get(3)) {
refCallsDoNotMatch++;
continue;
}
//count the correct calls
if (calls.get(0) == calls.get(1)) {
gvcfAgreesWithKnown++;
continue;
}
if (calls.get(0) == calls.get(2)) {
mismatchGVCFisRef++;
} else if (calls.get(1) == calls.get(2)) {
mismatchGVCFisAlt1++;
} //two states at site
else {
mismatchGVCFisAlt2++;
} //three states at site
}
catch(Exception e) {
throw new IllegalStateException("Error comparing calls:",e);
}
}
//Compute the number of bps scored and the error rate
double totalScored = gvcfAgreesWithKnown + mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2;
double totalErrorRate = (mismatchGVCFisRef + mismatchGVCFisAlt1 + mismatchGVCFisAlt2) / totalScored;
System.out.println(currentTaxon.getName() + "\t" + hetIndelMissing + "\t" + missing+ "\t" + totalScored + "\t" + totalErrorRate+"\t"+siteMissing);
}
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Compare To Known SNPs";
}
@Override
public String getToolTipText() {
return "Compare GenotypeTables to a known GenotypeTable and compute Error rates.";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(CompareToKnownSNPPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace with specific type.
// public runPlugin(DataSet input) {
// return () performFunction(input).getData(0).getData();
// }
/**
* Bed File
*
* @return Bed File
*/
public String anchorRegionBed() {
return anchorRegionBed.value();
}
/**
* Set Bed File. Bed File
*
* @param value Bed File
*
* @return this plugin
*/
public CompareToKnownSNPPlugin anchorRegionBed(String value) {
anchorRegionBed = new PluginParameter<>(anchorRegionBed, value);
return this;
}
}