org.broadinstitute.hellbender.tools.walkers.variantutils.ValidateVariants Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.walkers.variantutils;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
/**
* Validate a VCF file with a strict set of criteria
*
* This tool is designed to validate the adherence of a file to VCF format. The tool will validate .g.vcf GVCF
* format files as well. For VCF specifications, see
* https://samtools.github.io/hts-specs/.
* Besides standard adherence to the VCF specification, this tool performs additional strict validations to ensure
* that the information contained within the file is correctly encoded. These include:
*
*
*
* - REF - correctness of the reference base(s)
* - CHR_COUNTS - accuracy of AC and AN values
* - IDS - tests against rsIDs when a dbSNP file is provided (requires a dbsnp VCF provided via `--dbsnp`).
* - ALLELES - that all alternate alleles are present in at least one sample
*
*
*
* By default the tool applies all the strict validations unless you indicate which one should be
* excluded using `--validation-type-to-exclude`. You can exclude as many types as you want. Furthermore, you
* can exclude all strict validations with the special code `ALL`. In this case the tool will only test for
* adherence to the VCF specification.
*
*
* Input
*
* A VCF file to validate.
*
*
* Usage examples
*
* Minimally validate a file for adherence to VCF format:
* gatk ValidateVariants \
* -V cohort.vcf.gz
*
* Validate a GVCF for adherence to VCF format, including REF allele match:
* gatk ValidateVariants \
* -V sample.g.vcf.gz \
* -R reference.fasta
* -gvcf
*
* To perform VCF format and all strict validations:
*
* gatk ValidateVariants \
* -R ref.fasta \
* -V input.vcf \
* --dbsnp dbsnp.vcf
*
*
* To perform only VCF format tests:
*
* gatk ValidateVariants
* -R ref.fasta \
* -V input.vcf \
* --validation-type-to-exclude ALL
*
*
* To perform all validations except the strict `ALLELE` validation:
*
* gatk ValidateVariants \
* -R ref.fasta \
* -V input.vcf \
* --validation-type-to-exclude ALLELES \
* --dbsnp dbsnp.vcf
*
*
*/
@CommandLineProgramProperties(
summary = "Validates a VCF file with an extra strict set of criteria.",
oneLineSummary = "Validate VCF",
programGroup = VariantEvaluationProgramGroup.class
)
@DocumentedFeature
public final class ValidateVariants extends VariantWalker {
static final Logger logger = LogManager.getLogger(ValidateVariants.class);
public static final String GVCF_VALIDATE = "validate-GVCF";
public static final String DO_NOT_VALIDATE_FILTERED_RECORDS = "do-not-validate-filtered-records";
public enum ValidationType {
/**
* Makes reference to all extra-strict tests listed below.
*/
ALL,
/**
* Check whether the reported reference base in the VCF is the same as the corresponding base in the
* actual reference.
*/
REF,
/**
* Checks whether the variant IDs exists, only relevant if the user indicates a DBSNP vcf file (see {@link #dbsnp}).
*/
IDS,
/**
* Check whether all alternative alleles participate in a genotype call of at least on sample.
*/
ALLELES,
/**
* Check that the AN and AC annotations are consistent with the number of calls, alleles and then number these
* are called across samples.
*/
CHR_COUNTS;
/**
* Unmodifiable set of concrete validation types.
*
* These are all types except {@link #ALL}.
*/
public static final Set CONCRETE_TYPES;
static {
final Set cts = new LinkedHashSet<>(values().length - 1);
for (final ValidationType v : values()) {
if (v != ALL)
cts.add(v);
}
CONCRETE_TYPES = Collections.unmodifiableSet(cts);
}
}
@ArgumentCollection
DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@Argument(fullName = "validation-type-to-exclude",
shortName = "Xtype",
doc = "which validation type to exclude from a full strict validation",
optional = true)
List excludeTypes = new ArrayList<>();
/**
* By default, even filtered records are validated.
*/
@Argument(fullName = DO_NOT_VALIDATE_FILTERED_RECORDS,
shortName = "do-not-validate-filtered-records",
doc = "skip validation on filtered records",
optional = true,
mutex = GVCF_VALIDATE)
Boolean DO_NOT_VALIDATE_FILTERED = false;
@Argument(fullName = "warn-on-errors",
shortName = "warn-on-errors",
doc = "just emit warnings on errors instead of terminating the run at the first instance",
optional = true)
Boolean WARN_ON_ERROR = false;
/**
* Validate this file as a gvcf. In particular, every variant must include a allele, and that
* every base in the territory under consideration is covered by a variant (or a reference block).
* If you specifed intervals (using -L or -XL) to restrict analysis to a subset of genomic regions,
* those intervals will need to be covered in a valid gvcf.
*/
@Argument(fullName = GVCF_VALIDATE,
shortName = "gvcf",
doc = "Validate this file as a GVCF",
optional = true,
mutex = DO_NOT_VALIDATE_FILTERED_RECORDS)
Boolean VALIDATE_GVCF = false;
/**
* Fail if GVCF contains positions that overlap. Overlapping variants are allowed because they are assumed to be on
* separate haplotypes.
*/
@Argument(fullName = "fail-gvcf-on-overlap",
shortName = "no-overlaps",
doc = "Fail if GVCF contains positions that overlap (except two variants overlapping)",
optional = true,
mutex = DO_NOT_VALIDATE_FILTERED_RECORDS)
Boolean FAIL_ON_OVERLAP = false;
/**
* Contains final set of validation to apply.
*/
private Collection validationTypes;
private GenomeLocSortedSet genomeLocSortedSet;
private CachingIndexedFastaSequenceFile referenceReader;
// information to keep track of when validating a GVCF
private SimpleInterval previousInterval;
private int previousStart = -1;
private String previousContig = null;
private boolean previousIntervalIsReference = true;
private boolean sawOverlap = false;
private SimpleInterval firstOverlap;
@Override
public void onTraversalStart() {
if (VALIDATE_GVCF) {
final SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();
if (seqDictionary == null)
throw new UserException("Validating a GVCF requires a sequence dictionary but no dictionary was able to be constructed from your input.");
genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
}
validationTypes = calculateValidationTypesToApply(excludeTypes);
//warn user if certain requested validations cannot be done due to lack of arguments
if(dbsnp.dbsnp == null && (validationTypes.contains(ValidationType.ALL) || validationTypes.contains(ValidationType.IDS)))
{
logger.warn("IDS validation cannot be done because no DBSNP file was provided");
logger.warn("Other possible validations will still be performed");
}
if(!hasReference() && (validationTypes.contains(ValidationType.ALL) || validationTypes.contains(ValidationType.REF)))
{
logger.warn("REF validation cannot be done because no reference file was provided");
logger.warn("Other possible validations will still be performed");
}
if(hasReference()) {
referenceReader = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier());
}
}
@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) {
if (DO_NOT_VALIDATE_FILTERED && vc.isFiltered()) {
return;
}
// get the true reference allele
final Allele reportedRefAllele = vc.getReference();
final int refLength = reportedRefAllele.length();
final Allele observedRefAllele = hasReference() ? Allele.create(ReferenceUtils.getRefBasesAtPosition(referenceReader, vc.getContig(), vc.getStart(), refLength)) : null;
final Set rsIDs = getRSIDs(featureContext);
if (VALIDATE_GVCF) {
final SimpleInterval refInterval = ref.getInterval();
validateVariantsOrder(vc);
final boolean thisIntervalIsReference = vc.getAlternateAlleles().size() == 1 && vc.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE);
// GenomeLocSortedSet will automatically merge intervals that are overlapping when setting `mergeIfIntervalOverlaps`
// to true. In a GVCF most blocks are adjacent to each other so they wouldn't normally get merged. We check
// if the current record is adjacent to the previous record and "overlap" them if they are so our set is as
// small as possible while still containing the same bases.
if (previousInterval != null && previousInterval.overlapsWithMargin(refInterval, 0) &&
(previousIntervalIsReference || thisIntervalIsReference)) {
logger.warn("Current interval " + refInterval.toString() + " overlaps previous interval ending at " + previousInterval.getEnd());
sawOverlap = true;
firstOverlap = refInterval;
}
final int start = (previousInterval != null && previousInterval.overlapsWithMargin(refInterval, 1)) ?
previousInterval.getStart() : refInterval.getStart();
final int end = (previousInterval != null && previousInterval.overlapsWithMargin(refInterval, 1)) ?
Math.max(previousInterval.getEnd(), vc.getEnd()) : vc.getEnd();
final GenomeLoc possiblyMergedGenomeLoc = genomeLocSortedSet.getGenomeLocParser().createGenomeLoc(refInterval.getContig(), start, end);
genomeLocSortedSet.add(possiblyMergedGenomeLoc, true);
previousInterval = new SimpleInterval(possiblyMergedGenomeLoc);
previousStart = vc.getStart();
previousIntervalIsReference = thisIntervalIsReference;
validateGVCFVariant(vc);
}
for (final ValidationType t : validationTypes) {
try{
applyValidationType(vc, reportedRefAllele, observedRefAllele, rsIDs, t);
} catch (TribbleException e) {
throwOrWarn(new UserException.FailsStrictValidation(drivingVariantFile.getRawInputString(), t, e.getMessage()));
}
}
}
@Override
public Object onTraversalSuccess() {
if (VALIDATE_GVCF) {
final GenomeLocSortedSet intervalArgumentGenomeLocSortedSet;
final SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();
if (intervalArgumentCollection.intervalsSpecified()){
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(), IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));
} else {
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(seqDictionary);
}
final GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(genomeLocSortedSet);
if (uncoveredIntervals.coveredSize() > 0) {
final UserException e = new UserException.ValidationFailure("A GVCF must cover the entire region. Found " + uncoveredIntervals.coveredSize() +
" loci with no VariantContext covering it. The first uncovered segment is:" +
uncoveredIntervals.iterator().next());
throwOrWarn(e);
}
if (FAIL_ON_OVERLAP && sawOverlap) {
final UserException e = new UserException.ValidationFailure("This GVCF contained overlapping reference blocks. The first overlapping interval is " +
firstOverlap.toString());
throwOrWarn(e);
}
}
return null;
}
/*
* Returns the list of RSIDs overlapping the current variant that we're walking over.
* If there's no RSID or if there was not dbsnp file passed in as an argument,
* an empty set is returned (and then no validation is performed, see applyValidationType.
*/
private Set getRSIDs(FeatureContext featureContext) {
Set rsIDs = new LinkedHashSet<>();
for (VariantContext rsID : featureContext.getValues(dbsnp.dbsnp)) {
rsIDs.addAll(Arrays.asList(rsID.getID().split(VCFConstants.ID_FIELD_SEPARATOR)));
}
return rsIDs;
}
/**
* Given the validation type and exclusion type, calculate the final set of type to validate.
* @param excludeTypes types to exclude.
*
* @return the final set of type to validate. May be empty.
*/
private Collection calculateValidationTypesToApply(final List excludeTypes) {
//creates local, temp list so that original list provided by user doesn't get modified
List excludeTypesTemp = new ArrayList<>(excludeTypes);
if (VALIDATE_GVCF && !excludeTypesTemp.contains(ValidationType.ALLELES)) {
// Note: in a future version allele validation might be OK for GVCFs, if that happens
// this will be more complicated.
logger.warn("GVCF format is currently incompatible with allele validation. Not validating Alleles.");
excludeTypesTemp.add(ValidationType.ALLELES);
}
if (excludeTypesTemp.isEmpty()) {
return Collections.singleton(ValidationType.ALL);
}
final Set excludeTypeSet = new LinkedHashSet<>(excludeTypesTemp);
if (excludeTypesTemp.size() != excludeTypeSet.size()) {
logger.warn("found repeat redundant validation types listed using the --validation-type-to-exclude argument");
}
if (excludeTypeSet.contains(ValidationType.ALL)) {
if (excludeTypeSet.size() > 1) {
logger.warn("found ALL in the --validation-type-to-exclude list together with other concrete type exclusions that are redundant");
}
return Collections.emptyList();
} else {
final Set result = new LinkedHashSet<>(ValidationType.CONCRETE_TYPES);
result.removeAll(excludeTypeSet);
if (result.contains(ValidationType.REF) && !hasReference()) {
throw new UserException.MissingReference("Validation type " + ValidationType.REF.name() + " was selected but no reference was provided.", true);
}
return result;
}
}
private void validateVariantsOrder(final VariantContext vc) {
// Check if the current VC belongs to the same contig as the previous one.
// If not, reset the start position to -1.
if (previousContig == null || !previousContig.equals(vc.getContig())) {
previousContig = vc.getContig();
previousStart = -1;
}
//if next VC refers to a previous genomic position, throw an error
//Note that HaplotypeCaller can emit variants that start inside of a deletion on another haplotype,
// making v2's start less than the deletion's end
if (previousStart > -1 && vc.getStart() < previousStart) {
final UserException e = new UserException(String.format("In a GVCF all records must ordered. Record: %s covers a position previously traversed.",
vc.toStringWithoutGenotypes()));
throwOrWarn(e);
}
}
private void validateGVCFVariant(final VariantContext vc) {
if (!vc.hasAllele(Allele.NON_REF_ALLELE)) {
final UserException e = new UserException(String.format("In a GVCF all records must contain a %s allele. Offending record: %s",
Allele.NON_REF_STRING, vc.toStringWithoutGenotypes()));
throwOrWarn(e);
}
}
private void applyValidationType(VariantContext vc, Allele reportedRefAllele, Allele observedRefAllele, Set rsIDs, ValidationType t) {
// Note: VariantContext.validateRSIDs blows up on an empty list (but works fine with null).
// The workaround is to not pass an empty list.
switch( t ) {
case ALL:
if(hasReference())
{
if(!rsIDs.isEmpty())
{
vc.extraStrictValidation(reportedRefAllele, observedRefAllele, rsIDs);
}
else{
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
vc.validateAlternateAlleles();
vc.validateChromosomeCounts();
}
}
else{
if (rsIDs.isEmpty())
{
vc.validateAlternateAlleles();
vc.validateChromosomeCounts();
}
else{
vc.validateAlternateAlleles();
vc.validateChromosomeCounts();
vc.validateRSIDs(rsIDs);
}
}
break;
case REF:
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
break;
case IDS:
if (!rsIDs.isEmpty()) {
vc.validateRSIDs(rsIDs);
}
break;
case ALLELES:
vc.validateAlternateAlleles();
break;
case CHR_COUNTS:
vc.validateChromosomeCounts();
break;
}
}
private void throwOrWarn(UserException e) {
if (WARN_ON_ERROR) {
logger.warn("***** " + e.getMessage() + " *****");
} else {
throw e;
}
}
}