package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;


/**
 * Validate a VCF file with a strict set of criteria
 *
 *
 * <p>This tool is designed to validate the adherence of a file to VCF format. The tool will validate .g.vcf GVCF
 * format files as well. For VCF specifications, see https://samtools.github.io/hts-specs/.
 * Besides standard adherence to the VCF specification, this tool performs additional strict validations to ensure
 * that the information contained within the file is correctly encoded. These include:</p>
 *
 * <ul>
 *   <li>REF - correctness of the reference base(s)</li>
 *   <li>CHR_COUNTS - accuracy of AC and AN values</li>
 *   <li>IDS - tests against rsIDs when a dbSNP file is provided via `--dbsnp`</li>
 *   <li>ALLELES - that all alternate alleles are present in at least one sample</li>
 * </ul>
 *
 * <p>By default the tool applies all the strict validations unless you indicate which ones should be
 * excluded using `--validation-type-to-exclude`. You can exclude as many types as you want. Furthermore, you
 * can exclude all strict validations with the special code `ALL`. In this case the tool will only test for
 * adherence to the VCF specification.</p>
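 *
 * <p>For orientation, the strict checks above map closely onto validation helpers defined on htsjdk's
 * `VariantContext`. The sketch below is an illustration only, not part of this tool: it applies two of those
 * helpers directly through htsjdk's `VCFFileReader`; both calls throw a `TribbleException` when a record fails:</p>
 * <pre>
 * // illustration only; requires java.io.File, htsjdk.variant.vcf.VCFFileReader and
 * // htsjdk.variant.variantcontext.VariantContext
 * try (final VCFFileReader reader = new VCFFileReader(new File("input.vcf"), false)) {
 *     for (final VariantContext vc : reader) {
 *         vc.validateAlternateAlleles();  // ALLELES: every ALT allele appears in at least one genotype
 *         vc.validateChromosomeCounts();  // CHR_COUNTS: AC/AN annotations agree with the genotypes
 *     }
 * }
 * </pre>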

 * <h3>Input</h3>
 * <p>A VCF file to validate.</p>
 *
 * <h3>Usage examples</h3>
 *
 * <p>Minimally validate a file for adherence to VCF format:</p>
 * <pre>
 * gatk ValidateVariants \
 *   -V cohort.vcf.gz
 * </pre>
 *
 * <p>Validate a GVCF for adherence to VCF format, including REF allele match:</p>
 * <pre>
 * gatk ValidateVariants \
 *   -V sample.g.vcf.gz \
 *   -R reference.fasta \
 *   -gvcf
 * </pre>
 *
 * <p>To perform VCF format and all strict validations:</p>
 * <pre>
 * gatk ValidateVariants \
 *   -R ref.fasta \
 *   -V input.vcf \
 *   --dbsnp dbsnp.vcf
 * </pre>
 *
 * <p>To perform only VCF format tests:</p>
 * <pre>
 * gatk ValidateVariants \
 *   -R ref.fasta \
 *   -V input.vcf \
 *   --validation-type-to-exclude ALL
 * </pre>
 *
 * <p>To perform all validations except the strict `ALLELES` validation:</p>
 * <pre>
 * gatk ValidateVariants \
 *   -R ref.fasta \
 *   -V input.vcf \
 *   --validation-type-to-exclude ALLELES \
 *   --dbsnp dbsnp.vcf
 * </pre>
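 *
 * <p>To report validation failures as warnings instead of stopping at the first error, a sketch based on the
 * `--warn-on-errors` flag declared below (this variant is not one of the examples shipped with the tool):</p>
 * <pre>
 * gatk ValidateVariants \
 *   -R ref.fasta \
 *   -V input.vcf \
 *   --warn-on-errors
 * </pre>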
 */
@CommandLineProgramProperties(
        summary = "Validates a VCF file with an extra strict set of criteria.",
        oneLineSummary = "Validate VCF",
        programGroup = VariantEvaluationProgramGroup.class
)
@DocumentedFeature
public final class ValidateVariants extends VariantWalker {
    static final Logger logger = LogManager.getLogger(ValidateVariants.class);

    public static final String GVCF_VALIDATE = "validate-GVCF";
    public static final String DO_NOT_VALIDATE_FILTERED_RECORDS = "do-not-validate-filtered-records";

    public enum ValidationType {

        /**
         * Makes reference to all extra-strict tests listed below.
         */
        ALL,

        /**
         * Check whether the reported reference base in the VCF is the same as the corresponding base in the
         * actual reference.
         */
        REF,

        /**
         * Checks whether the variant IDs exist; only relevant if the user provides a dbSNP VCF file (see {@link #dbsnp}).
         */
        IDS,

        /**
         * Check whether all alternative alleles participate in a genotype call of at least one sample.
         */
        ALLELES,

        /**
         * Check that the AN and AC annotations are consistent with the number of calls, the alleles and the number
         * of times these are called across samples.
         */
        CHR_COUNTS;

        /**
         * Unmodifiable set of concrete validation types.
         *
         * <p>These are all types except {@link #ALL}.</p>
         */
        public static final Set<ValidationType> CONCRETE_TYPES;

        static {
            final Set<ValidationType> cts = new LinkedHashSet<>(values().length - 1);
            for (final ValidationType v : values()) {
                if (v != ALL) {
                    cts.add(v);
                }
            }
            CONCRETE_TYPES = Collections.unmodifiableSet(cts);
        }
    }

    @ArgumentCollection
    DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

    @Argument(fullName = "validation-type-to-exclude",
            shortName = "Xtype",
            doc = "which validation type to exclude from a full strict validation",
            optional = true)
    List<ValidationType> excludeTypes = new ArrayList<>();

    /**
     * By default, even filtered records are validated.
     */
    @Argument(fullName = DO_NOT_VALIDATE_FILTERED_RECORDS,
            shortName = "do-not-validate-filtered-records",
            doc = "skip validation on filtered records",
            optional = true,
            mutex = GVCF_VALIDATE)
    Boolean DO_NOT_VALIDATE_FILTERED = false;

    @Argument(fullName = "warn-on-errors",
            shortName = "warn-on-errors",
            doc = "just emit warnings on errors instead of terminating the run at the first instance",
            optional = true)
    Boolean WARN_ON_ERROR = false;

    /**
     * Validate this file as a GVCF. In particular, every variant must include a NON_REF allele, and every base
     * in the territory under consideration must be covered by a variant (or a reference block).
     * If you specified intervals (using -L or -XL) to restrict analysis to a subset of genomic regions,
     * those intervals will need to be covered in a valid GVCF.
     */
    @Argument(fullName = GVCF_VALIDATE,
            shortName = "gvcf",
            doc = "Validate this file as a GVCF",
            optional = true,
            mutex = DO_NOT_VALIDATE_FILTERED_RECORDS)
    Boolean VALIDATE_GVCF = false;

    /**
     * Fail if the GVCF contains positions that overlap. Overlapping variants are allowed because they are assumed
     * to be on separate haplotypes.
     */
    @Argument(fullName = "fail-gvcf-on-overlap",
            shortName = "no-overlaps",
            doc = "Fail if GVCF contains positions that overlap (except two variants overlapping)",
            optional = true,
            mutex = DO_NOT_VALIDATE_FILTERED_RECORDS)
    Boolean FAIL_ON_OVERLAP = false;

    /**
     * Contains the final set of validations to apply.
     */
    private Collection<ValidationType> validationTypes;

    private GenomeLocSortedSet genomeLocSortedSet;

    private CachingIndexedFastaSequenceFile referenceReader;

    // information to keep track of when validating a GVCF
    private SimpleInterval previousInterval;
    private int previousStart = -1;
    private String previousContig = null;
    private boolean previousIntervalIsReference = true;
    private boolean sawOverlap = false;
    private SimpleInterval firstOverlap;

    @Override
    public void onTraversalStart() {
        if (VALIDATE_GVCF) {
            final SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();

            if (seqDictionary == null) {
                throw new UserException("Validating a GVCF requires a sequence dictionary but no dictionary was able to be constructed from your input.");
            }

            genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
        }
        validationTypes = calculateValidationTypesToApply(excludeTypes);

        // warn the user if certain requested validations cannot be done due to lack of arguments
        if (dbsnp.dbsnp == null && (validationTypes.contains(ValidationType.ALL) || validationTypes.contains(ValidationType.IDS))) {
            logger.warn("IDS validation cannot be done because no DBSNP file was provided");
            logger.warn("Other possible validations will still be performed");
        }
        if (!hasReference() && (validationTypes.contains(ValidationType.ALL) || validationTypes.contains(ValidationType.REF))) {
            logger.warn("REF validation cannot be done because no reference file was provided");
            logger.warn("Other possible validations will still be performed");
        }

        if (hasReference()) {
            referenceReader = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
        }
    }

    @Override
    public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) {
        if (DO_NOT_VALIDATE_FILTERED && vc.isFiltered()) {
            return;
        }
        // get the true reference allele
        final Allele reportedRefAllele = vc.getReference();
        final int refLength = reportedRefAllele.length();

        final Allele observedRefAllele = hasReference() ? Allele.create(ReferenceUtils.getRefBasesAtPosition(referenceReader, vc.getContig(), vc.getStart(), refLength)) : null;

        final Set<String> rsIDs = getRSIDs(featureContext);

        if (VALIDATE_GVCF) {
            final SimpleInterval refInterval = ref.getInterval();

            validateVariantsOrder(vc);

            final boolean thisIntervalIsReference = vc.getAlternateAlleles().size() == 1 && vc.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE);

            // GenomeLocSortedSet will automatically merge intervals that are overlapping when setting `mergeIfIntervalOverlaps`
            // to true. In a GVCF most blocks are adjacent to each other so they wouldn't normally get merged. We check
            // if the current record is adjacent to the previous record and "overlap" them if they are so our set is as
            // small as possible while still containing the same bases.
            if (previousInterval != null && previousInterval.overlapsWithMargin(refInterval, 0) &&
                    (previousIntervalIsReference || thisIntervalIsReference)) {
                logger.warn("Current interval " + refInterval.toString() + " overlaps previous interval ending at " + previousInterval.getEnd());
                sawOverlap = true;
                firstOverlap = refInterval;
            }
            final int start = (previousInterval != null && previousInterval.overlapsWithMargin(refInterval, 1)) ?
                    previousInterval.getStart() : refInterval.getStart();
            final int end = (previousInterval != null && previousInterval.overlapsWithMargin(refInterval, 1)) ?
                    Math.max(previousInterval.getEnd(), vc.getEnd()) : vc.getEnd();
            final GenomeLoc possiblyMergedGenomeLoc = genomeLocSortedSet.getGenomeLocParser().createGenomeLoc(refInterval.getContig(), start, end);
            genomeLocSortedSet.add(possiblyMergedGenomeLoc, true);

            previousInterval = new SimpleInterval(possiblyMergedGenomeLoc);
            previousStart = vc.getStart();
            previousIntervalIsReference = thisIntervalIsReference;
            validateGVCFVariant(vc);
        }

        for (final ValidationType t : validationTypes) {
            try {
                applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
            } catch (TribbleException e) {
                throwOrWarn(new UserException.FailsStrictValidation(drivingVariantFile.getRawInputString(), t, e.getMessage()));
            }
        }
    }

    @Override
    public Object onTraversalSuccess() {
        if (VALIDATE_GVCF) {
            final GenomeLocSortedSet intervalArgumentGenomeLocSortedSet;
            final SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();

            if (intervalArgumentCollection.intervalsSpecified()) {
                intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(),
                        IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));
            } else {
                intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(seqDictionary);
            }

            final GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(genomeLocSortedSet);
            if (uncoveredIntervals.coveredSize() > 0) {
                final UserException e = new UserException.ValidationFailure("A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() +
                        " loci with no VariantContext covering it. The first uncovered segment is:" +
                        uncoveredIntervals.iterator().next());
                throwOrWarn(e);
            }

            if (FAIL_ON_OVERLAP && sawOverlap) {
                final UserException e = new UserException.ValidationFailure("This GVCF contained overlapping reference blocks. The first overlapping interval is " + firstOverlap.toString());
                throwOrWarn(e);
            }
        }
        return null;
    }

    /*
     * Returns the list of rsIDs overlapping the current variant that we're walking over.
     * If there's no rsID, or if no dbSNP file was passed in as an argument,
     * an empty set is returned (and then no validation is performed, see applyValidationType).
     */
    private Set<String> getRSIDs(FeatureContext featureContext) {
        Set<String> rsIDs = new LinkedHashSet<>();
        for (VariantContext rsID : featureContext.getValues(dbsnp.dbsnp)) {
            rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR)));
        }
        return rsIDs;
    }

    /**
     * Given the types to exclude, calculate the final set of types to validate.
     *
     * @param excludeTypes types to exclude.
     *
     * @return the final set of types to validate. May be empty.
     */
    private Collection<ValidationType> calculateValidationTypesToApply(final List<ValidationType> excludeTypes) {
        // create a local, temporary list so that the original list provided by the user doesn't get modified
        List<ValidationType> excludeTypesTemp = new ArrayList<>(excludeTypes);
        if (VALIDATE_GVCF && !excludeTypesTemp.contains(ValidationType.ALLELES)) {
            // Note: in a future version allele validation might be OK for GVCFs, if that happens
            // this will be more complicated.
            logger.warn("GVCF format is currently incompatible with allele validation. Not validating Alleles.");
            excludeTypesTemp.add(ValidationType.ALLELES);
        }
        if (excludeTypesTemp.isEmpty()) {
            return Collections.singleton(ValidationType.ALL);
        }
        final Set<ValidationType> excludeTypeSet = new LinkedHashSet<>(excludeTypesTemp);
        if (excludeTypesTemp.size() != excludeTypeSet.size()) {
            logger.warn("found repeated, redundant validation types listed using the --validation-type-to-exclude argument");
        }

        if (excludeTypeSet.contains(ValidationType.ALL)) {
            if (excludeTypeSet.size() > 1) {
                logger.warn("found ALL in the --validation-type-to-exclude list together with other concrete type exclusions that are redundant");
            }
            return Collections.emptyList();
        } else {
            final Set<ValidationType> result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES);
            result.removeAll(excludeTypeSet);
            if (result.contains(ValidationType.REF) && !hasReference()) {
                throw new UserException.MissingReference("Validation type " + ValidationType.REF.name() + " was selected but no reference was provided.", true);
            }
            return result;
        }
    }

    private void validateVariantsOrder(final VariantContext vc) {
        // Check if the current VC belongs to the same contig as the previous one.
        // If not, reset the start position to -1.
        if (previousContig == null || !previousContig.equals(vc.getContig())) {
            previousContig = vc.getContig();
            previousStart = -1;
        }

        // if the next VC refers to a previous genomic position, throw an error
        // Note that HaplotypeCaller can emit variants that start inside of a deletion on another haplotype,
        // making v2's start less than the deletion's end
        if (previousStart > -1 && vc.getStart() < previousStart) {
            final UserException e = new UserException(String.format("In a GVCF all records must be ordered. Record: %s covers a position previously traversed.",
                    vc.toStringWithoutGenotypes()));
            throwOrWarn(e);
        }
    }

    private void validateGVCFVariant(final VariantContext vc) {
        if (!vc.hasAllele(Allele.NON_REF_ALLELE)) {
            final UserException e = new UserException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s",
                    Allele.NON_REF_STRING, vc.toStringWithoutGenotypes()));
            throwOrWarn(e);
        }
    }

    private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set<String> rsIDs, ValidationType t) {
        // Note: VariantContext.validateRSIDs blows up on an empty list (but works fine with null).
        // The workaround is to not pass an empty list.
        switch (t) {
            case ALL:
                if (hasReference()) {
                    if (!rsIDs.isEmpty()) {
                        vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
                    } else {
                        vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
                        vc.validateAlternateAlleles();
                        vc.validateChromosomeCounts();
                    }
                } else {
                    if (rsIDs.isEmpty()) {
                        vc.validateAlternateAlleles();
                        vc.validateChromosomeCounts();
                    } else {
                        vc.validateAlternateAlleles();
                        vc.validateChromosomeCounts();
                        vc.validateRSIDs(rsIDs);
                    }
                }
                break;
            case REF:
                vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
                break;
            case IDS:
                if (!rsIDs.isEmpty()) {
                    vc.validateRSIDs(rsIDs);
                }
                break;
            case ALLELES:
                vc.validateAlternateAlleles();
                break;
            case CHR_COUNTS:
                vc.validateChromosomeCounts();
                break;
        }
    }

    private void throwOrWarn(UserException e) {
        if (WARN_ON_ERROR) {
            logger.warn("***** " + e.getMessage() + " *****");
        } else {
            throw e;
        }
    }
}



