org.broadinstitute.hellbender.tools.walkers.varianteval.VariantEval Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.tools.walkers.varianteval;

import htsjdk.samtools.util.IOUtil;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MultiVariantInputArgumentCollection;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.MultiVariantWalkerGroupedOnStart;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.Utils;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;

import java.io.File;
import java.util.Collections;
import java.util.List;

/**
 *
 *
 * 
 * Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of
 * raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls
 * to a genotyping chip; number of s per sample; etc. Furthermore, it is often useful to stratify these metrics
 * by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid
 * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
 * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
 * and stratification modules.
 * 
 *
 * Input
 * 
 * One or more variant sets to evaluate plus any number of comparison sets.
 * 
 *
 * Output
 * 
 * Evaluation tables detailing the results of the eval modules which were applied.
 * For example:
 * 
 *  * output.eval.grp:
 * ##:GATKReport.v0.1 CountVariants : Counts different classes of variants in the sample
 * CountVariants  CompFeatureInput  CpG      EvalFeatureInput  JexlExpression  Novelty  nProcessedLoci  nCalledLoci  nRefLoci  nVariantLoci  variantRate ...
 * CountVariants  dbsnp             CpG      eval              none            all      65900028        135770       0         135770        0.00206024  ...
 * CountVariants  dbsnp             CpG      eval              none            known    65900028        47068        0         47068         0.00071423  ...
 * CountVariants  dbsnp             CpG      eval              none            novel    65900028        88702        0         88702         0.00134601  ...
 * CountVariants  dbsnp             all      eval              none            all      65900028        330818       0         330818        0.00502000  ...
 * CountVariants  dbsnp             all      eval              none            known    65900028        120685       0         120685        0.00183133  ...
 * CountVariants  dbsnp             all      eval              none            novel    65900028        210133       0         210133        0.00318866  ...
 * CountVariants  dbsnp             non_CpG  eval              none            all      65900028        195048       0         195048        0.00295976  ...
 * CountVariants  dbsnp             non_CpG  eval              none            known    65900028        73617        0         73617         0.00111710  ...
 * CountVariants  dbsnp             non_CpG  eval              none            novel    65900028        121431       0         121431        0.00184265  ...
 * ...
 * 
 * 
 *
 * Usage examples
 *  * gatk VariantEval \
 *   -R reference.fasta \
 *   -O output.eval.grp \
 *   --eval set1:set1.vcf \
 *   --eval set2:set2.vcf \
 *   [--comp comp.vcf]
 * 
 *
 * Count Mendelian violations for each family in a callset with multiple families (and provided pedigree)
 *  * gatk VariantEval \
 *   -R reference.fasta \
 *   -O output.MVs.byFamily.table \
 *   --eval multiFamilyCallset.vcf \
 *   -no-ev -noST \
 *   -ST Family \
 *   -EV MendelianViolationEvaluator
 * 
 *
 * Caveat
 *
 * Some stratifications and evaluators are incompatible with each other due to their respective memory requirements,
 * such as AlleleCount and VariantSummary, or Sample and VariantSummary. If you specify such a combination, the program
 * will output an error message and ask you to disable one of these options. We do not currently provide an exhaustive
 * list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy
 * command line, to rapidly ascertain whether it will work or not.
 *
 */
@CommandLineProgramProperties(
        summary = "Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of " +
                "raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls " +
                "to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics " +
                "by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid " +
                "degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in " +
                "evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation " +
                "and stratification modules.",
        oneLineSummary = "General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)",
        programGroup = VariantEvaluationProgramGroup.class
)
@DocumentedFeature
@BetaFeature
public class VariantEval extends MultiVariantWalkerGroupedOnStart {
    protected VariantEvalEngine engine;

    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            doc="File to which variants should be written")
    protected File outFile;

    @ArgumentCollection
    protected VariantEvalArgumentCollection variantEvalArgs = new VariantEvalArgumentCollection();

    /**
     * Note that the --list argument requires a fully resolved and correct command-line to work.
     */
    @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", optional=true)
    protected Boolean LIST = false;

    @Override
    protected MultiVariantInputArgumentCollection getMultiVariantInputArgumentCollection() {
        return new MultiVariantInputArgumentCollection() {
            private static final long serialVersionUID = 1L;

            @Override
            public List getDrivingVariantPaths() {
                //driving variants will be determined by initializeDrivingVariants()
                return Collections.emptyList();
            }
        };
    }

    @Override
    protected void initializeDrivingVariants() {
        getDrivingVariantsFeatureInputs().addAll(variantEvalArgs.getFeatureInputsForDrivingVariants());

        super.initializeDrivingVariants();
    }

    @Override
    public void onTraversalStart() {
        // Just list the modules, and exit quickly.
        if (LIST) { listModulesAndExit(); }

        Utils.nonNull(outFile);
        IOUtil.assertFileIsWritable(outFile);

        engine = new VariantEvalEngine(variantEvalArgs, this.features, getTraversalIntervals(), getSequenceDictionaryForDrivingVariants(), getSamplesForVariants());
    }

    /**
     * List all of the available evaluation modules, then exit successfully
     */
    public void listModulesAndExit() {
        logger.info("Available stratification modules:");
        logger.info("(Standard modules are starred)");

        for (String name: VariantEvalEngine.getStratifierClasses().keySet()) {

            logger.info("\t" + name + (VariantEvalEngine.getRequiredStratificationNames().contains(name) || VariantEvalEngine.getStandardStratificationNames().contains(name) ? "*" : ""));
        }
        logger.info("");

        logger.info("Available evaluation modules:");
        logger.info("(Standard modules are starred)");
        for (String veName : VariantEvalEngine.getEvaluatorClasses().keySet()) {
            logger.info("\t" + veName + (VariantEvalEngine.getStandardEvaluatorNames().contains(veName) ? "*" : ""));
        }
        logger.info("");

        System.exit(0);
    }

    @Override
    public void apply(final List variantContexts, final ReferenceContext referenceContext, final List readsContexts) {
        engine.apply(variantContexts, referenceContext);
    }

    @Override
    public Object onTraversalSuccess() {
        logger.info("Finalizing variant report");

        engine.finalizeReport(outFile);

        return null;
    }

    @Override
    public List> getPluginDescriptors() {
        // Reason this override is needed:
        // Because getDefaultReadFilters() is nearly always included (see GATKTool), this pulls in SampleReadFilter,
        // which has an argument named "sample", which conflicts with the local argument.  we get an exception like:
        // org.broadinstitute.barclay.argparser.CommandLineException$CommandLineParserInternalException: [sample, sample] has already been used.
        // at org.broadinstitute.barclay.argparser.CommandLineArgumentParser.handleArgumentAnnotation(CommandLineArgumentParser.java:1002)
        // therefore override the method and return an empty list.
        return Collections.emptyList();
    }
}