All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.variantutils.UpdateVCFSequenceDictionary Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.exceptions.UserException;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;

/**
 * Updates the reference contigs in the header of the VCF format file, i.e. the reference dictionary, using the
 * dictionary from a variant, alignment, reference, or dictionary file.
 *
 * 

This tool is designed to update the sequence dictionary in a variant file using a dictionary from another variant, * alignment, dictionary, or reference file. The dictionary must be valid, i.e. must contain a sequence record, for all * variants in the target file. The dictionary lines start with '##contig='. *

* *

* By specifying both --replace and --disable-sequence-dictionary-validation, one can force replace an invalid * sequence dictionary in a variant file with a valid sequence dictionary in another file. *

* *

Usage example

* *

Use the contig dictionary from a BAM (SQ lines) to replace an existing dictionary in the header of a VCF.

*
 * gatk UpdateVCFSequenceDictionary \
 *     -V cohort.vcf.gz \
 *     --source-dictionary sample.bam \
 *     --output cohort_replacedcontiglines.vcf.gz \
 *     --replace=true
 * 
* *

Use a reference dictionary to add reference contig lines to a VCF without any.

*
 * gatk UpdateVCFSequenceDictionary \
 *     -V resource.vcf.gz \
 *     --source-dictionary reference.dict \
 *     --output resource_newcontiglines.vcf.gz
 * 
* *

Use the reference set to add contig lines to a VCF without any.

*
 * gatk UpdateVCFSequenceDictionary \
 *     -V resource.vcf.gz \
 *     -R reference.fasta \
 *     --output resource_newcontiglines.vcf.gz
 * 
* *

* The -O argument specifies the name of the updated file. The --source-dictionary argument specifies the * input sequence dictionary. The --replace argument is optional, and forces the replacement of the dictionary * if the input file already has a dictionary. *

* */ @CommandLineProgramProperties( summary = "Updates the sequence dictionary in a variant file using the dictionary from another variant, " + "alignment, dictionary, or reference file. The dictionary must be valid for all variants in the " + "target file.", oneLineSummary = "Updates the sequence dictionary in a variant file.", programGroup = VariantManipulationProgramGroup.class ) @DocumentedFeature public final class UpdateVCFSequenceDictionary extends VariantWalker { private static final Logger logger = LogManager.getLogger(UpdateVCFSequenceDictionary.class); @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="File to which updated variants should be written") public GATKPath outFile = null; public final static String DICTIONARY_ARGUMENT_NAME = "source-dictionary"; @Argument(fullName=DICTIONARY_ARGUMENT_NAME, doc="A variant, alignment, dictionary, or reference file to use as a dictionary source " + "(optional if the sequence dictionary source is specified as a reference argument). The dictionary " + "presented must be valid (contain a sequence record) for each sequence that is referenced by any " + "variant in the input file.", optional=true) GATKPath dictionarySource; // optional since a reference can be provided as a dictionary source instead public final static String REPLACE_ARGUMENT_NAME = "replace"; @Argument(fullName=REPLACE_ARGUMENT_NAME, doc="Force replacement of the dictionary if the input file already has a dictionary", optional=true) boolean replace = false; // optional since a reference can be provided as a dictionary source instead private VariantContextWriter vcfWriter; private SAMSequenceDictionary sourceDictionary; @Override public void onTraversalStart() { VCFHeader inputHeader = getHeaderForVariants(); VCFHeader outputHeader = inputHeader == null ? new VCFHeader() : new VCFHeader(inputHeader.getMetaDataInInputOrder(), inputHeader.getGenotypeSamples()) ; getDefaultToolVCFHeaderLines().forEach(line -> outputHeader.addMetaDataLine(line)); sourceDictionary = getBestAvailableSequenceDictionary(); // If -replace is set, do not need to check the sequence dictionary for validity here -- it will still be // checked in our normal sequence dictionary validation. Warn and require opt-in via -replace if we're about to // clobber a valid sequence dictionary. Check the input file directly via the header rather than using the // engine, since it might dig one up from an index. if (!replace) { SAMSequenceDictionary oldDictionary = inputHeader == null ? null : inputHeader.getSequenceDictionary(); if (oldDictionary != null && !oldDictionary.getSequences().isEmpty()) { throw new CommandLineException.BadArgumentValue( String.format( "The input variant file %s already contains a sequence dictionary. " + "Use %s to force the dictionary to be replaced.", getDrivingVariantsFeatureInput().getName(), REPLACE_ARGUMENT_NAME ) ); } } outputHeader.setSequenceDictionary(sourceDictionary); vcfWriter = createVCFWriter(outFile); vcfWriter.writeHeader(outputHeader); } @Override public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) { // Validate each variant against the source dictionary manually SAMSequenceRecord samSeqRec = sourceDictionary.getSequence(vc.getContig()); if (samSeqRec == null) { throw new CommandLineException.BadArgumentValue( String.format( "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " + "that is not present in the provided dictionary", vc.getID(), vc.getContig() ) ); } else if (vc.getEnd() > samSeqRec.getSequenceLength()) { throw new CommandLineException.BadArgumentValue( String.format( "The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " + "that ends at a position (%d) that exceeds the length of that sequence (%d) in the provided dictionary", vc.getID(), vc.getContig(), vc.getEnd(), samSeqRec.getSequenceLength() ) ); } vcfWriter.add(vc); } /** * Close out the new variants file. */ @Override public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); } } // Override getBestAvailableSequenceDictionary() to ensure that the new sequence dictionary specified by // the user (and not the sequence dictionary from the VCF we're trying to update) is used uniformly by all // callers. Otherwise, the wrong dictionary would be used when writing the index for the output vcf. // @Override public SAMSequenceDictionary getBestAvailableSequenceDictionary() { final SAMSequenceDictionary resultDictionary; final SAMSequenceDictionary masterDictionary = getMasterSequenceDictionary(); if (dictionarySource == null) { if (masterDictionary != null) { // We'll accept the master dictionary if one was specified. Using the master dictionary // arg will result in sequence dictionary validation. logger.warn("Using the dictionary supplied via the {} argument", StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME); resultDictionary = masterDictionary; } else if (hasReference()) { resultDictionary = getReferenceDictionary(); } else { throw new CommandLineException.MissingArgument( DICTIONARY_ARGUMENT_NAME, "A dictionary source file or reference file must be provided"); } } else { if (masterDictionary != null) { throw new CommandLineException(String.format("Only one of %s or %s may be specified on the command line", StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, DICTIONARY_ARGUMENT_NAME)); } resultDictionary = SAMSequenceDictionaryExtractor.extractDictionary(dictionarySource.toPath()); if (resultDictionary == null || resultDictionary.getSequences().isEmpty()) { throw new CommandLineException.BadArgumentValue( String.format( "The specified dictionary source has an empty or invalid sequence dictionary: %s", dictionarySource) ); } } if( seqValidationArguments.performSequenceDictionaryValidation() && resultDictionary != null && dictionaryHasMissingLengths(resultDictionary)) { throw new UserException.SequenceDictionaryIsMissingContigLengths(dictionarySource.getRawInputString(), resultDictionary); } return resultDictionary; } private boolean dictionaryHasMissingLengths(final SAMSequenceDictionary resultDictionary) { return resultDictionary.getSequences().stream().anyMatch(s -> s.getSequenceLength() == SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy