org.broadinstitute.hellbender.tools.walkers.varianteval.AlleleFrequencyQC Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.varianteval;
import htsjdk.samtools.metrics.MetricsFile;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.metrics.MetricsUtils;
import org.broadinstitute.hellbender.metrics.analysis.AlleleFrequencyQCMetric;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.report.GATKReport;
import org.broadinstitute.hellbender.utils.report.GATKReportTable;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* This tool uses VariantEval to bin variants in Thousand Genomes by allele frequency. For each bin, we compare the
* expected allele frequency from Thousand Genomes with the observed allele frequency in the input VCF. This was
* designed with arrays in mind, as a way to discover potential bugs in our pipeline. It uses the results from
* VariantEval to generate a simplified metric that returns a modified chi squared statistic (sum of the squared
* difference between the two allele frequencies in each bin, allowing a constant variance) as well as its associated
* p-value. The original VariantEval results can be returned by giving the debug variable a filename.
**/
public class AlleleFrequencyQC extends VariantEval {
@Argument(shortName = "pvalue-threshold",
doc = "Threshold to cut off the pvalue.")
protected double threshold = 0.05;
@Argument(shortName = "allowed-variance",
doc = "Variance allowed in calculating the modified chi squared statistic.")
protected double allowedVariance = 0.01;
@Argument(shortName = "debug-file",
doc = "File to save the results from variant eval for debugging", optional = true)
protected File debugFile;
final static private String R_SCRIPT = "plotAlleleFrequencyQC.R";
protected File metricOutput;
protected String sample;
@Override
public void onTraversalStart() {
variantEvalArgs.noStandardModules = true;
variantEvalArgs.modulesToUse = Collections.singletonList("VariantAFEvaluator");
variantEvalArgs.keepSitesWithAC0 = true;
variantEvalArgs.noStandardStratifications = true;
variantEvalArgs.stratificationsToUse = Arrays.asList("AlleleFrequency", "Filter");
variantEvalArgs.setAFScale(VariantEvalArgumentCollection.StratifyingScale.LOGARITHMIC);
variantEvalArgs.setUseCompAFStratifier(true);
metricOutput = outFile; // output file for summary metrics
// have to set the output file for variant eval; if not given a debug file to return the variant eval results
// from, this will just be a temporary file that will be deleted after the tool runs
outFile = debugFile == null ? IOUtils.createTempFile("variant_eval" ,".txt") : debugFile;
sample = getHeaderForVariants().getOtherHeaderLine("sampleAlias").getValue();
super.onTraversalStart();
}
@Override
public Object onTraversalSuccess() {
super.onTraversalSuccess();
final GATKReportTable table = new GATKReport(outFile).getTable(engine.getModulesToUse().get(0));
final List columnNames = table.getColumnInfo().stream().map(c -> c.getColumnName()).collect(Collectors.toList());
// this is a map of allele frequency bin : length 2 list of observed allele frequencies ( one for comp, one for eval )
final Map © 2015 - 2025 Weber Informatics LLC | Privacy Policy