org.broadinstitute.hellbender.tools.walkers.variantutils.UpdateVCFSequenceDictionary Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.walkers.variantutils;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.exceptions.UserException;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
/**
* Updates the reference contigs in the header of the VCF format file, i.e. the reference dictionary, using the
* dictionary from a variant, alignment, reference, or dictionary file.
*
* This tool is designed to update the sequence dictionary in a variant file using a dictionary from another variant,
* alignment, dictionary, or reference file. The dictionary must be valid, i.e. must contain a sequence record, for all
* variants in the target file. The dictionary lines start with '##contig='.
*
*
*
* By specifying both --replace and --disable-sequence-dictionary-validation, one can force replace an invalid
* sequence dictionary in a variant file with a valid sequence dictionary in another file.
*
*
* Usage example
*
* Use the contig dictionary from a BAM (SQ lines) to replace an existing dictionary in the header of a VCF.
*
* gatk UpdateVCFSequenceDictionary \
* -V cohort.vcf.gz \
* --source-dictionary sample.bam \
* --output cohort_replacedcontiglines.vcf.gz \
* --replace=true
*
*
* Use a reference dictionary to add reference contig lines to a VCF without any.
*
* gatk UpdateVCFSequenceDictionary \
* -V resource.vcf.gz \
* --source-dictionary reference.dict \
* --output resource_newcontiglines.vcf.gz
*
*
* Use the reference set to add contig lines to a VCF without any.
*
* gatk UpdateVCFSequenceDictionary \
* -V resource.vcf.gz \
* -R reference.fasta \
* --output resource_newcontiglines.vcf.gz
*
*
*
* The -O argument specifies the name of the updated file. The --source-dictionary argument specifies the
* input sequence dictionary. The --replace argument is optional, and forces the replacement of the dictionary
* if the input file already has a dictionary.
*
*
*/
@CommandLineProgramProperties(
summary = "Updates the sequence dictionary in a variant file using the dictionary from another variant, " +
"alignment, dictionary, or reference file. The dictionary must be valid for all variants in the " +
"target file.",
oneLineSummary = "Updates the sequence dictionary in a variant file.",
programGroup = VariantManipulationProgramGroup.class
)
@DocumentedFeature
public final class UpdateVCFSequenceDictionary extends VariantWalker {
private static final Logger logger = LogManager.getLogger(UpdateVCFSequenceDictionary.class);
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="File to which updated variants should be written")
public GATKPath outFile = null;
public final static String DICTIONARY_ARGUMENT_NAME = "source-dictionary";
@Argument(fullName=DICTIONARY_ARGUMENT_NAME,
doc="A variant, alignment, dictionary, or reference file to use as a dictionary source " +
"(optional if the sequence dictionary source is specified as a reference argument). The dictionary " +
"presented must be valid (contain a sequence record) for each sequence that is referenced by any " +
"variant in the input file.",
optional=true)
GATKPath dictionarySource; // optional since a reference can be provided as a dictionary source instead
public final static String REPLACE_ARGUMENT_NAME = "replace";
@Argument(fullName=REPLACE_ARGUMENT_NAME,
doc="Force replacement of the dictionary if the input file already has a dictionary",
optional=true)
boolean replace = false; // optional since a reference can be provided as a dictionary source instead
private VariantContextWriter vcfWriter;
private SAMSequenceDictionary sourceDictionary;
@Override
public void onTraversalStart() {
VCFHeader inputHeader = getHeaderForVariants();
VCFHeader outputHeader = inputHeader == null ?
new VCFHeader() :
new VCFHeader(inputHeader.getMetaDataInInputOrder(), inputHeader.getGenotypeSamples()) ;
getDefaultToolVCFHeaderLines().forEach(line -> outputHeader.addMetaDataLine(line));
sourceDictionary = getBestAvailableSequenceDictionary();
// If -replace is set, do not need to check the sequence dictionary for validity here -- it will still be
// checked in our normal sequence dictionary validation. Warn and require opt-in via -replace if we're about to
// clobber a valid sequence dictionary. Check the input file directly via the header rather than using the
// engine, since it might dig one up from an index.
if (!replace) {
SAMSequenceDictionary oldDictionary =
inputHeader == null ? null : inputHeader.getSequenceDictionary();
if (oldDictionary != null && !oldDictionary.getSequences().isEmpty()) {
throw new CommandLineException.BadArgumentValue(
String.format(
"The input variant file %s already contains a sequence dictionary. " +
"Use %s to force the dictionary to be replaced.",
getDrivingVariantsFeatureInput().getName(),
REPLACE_ARGUMENT_NAME
)
);
}
}
outputHeader.setSequenceDictionary(sourceDictionary);
vcfWriter = createVCFWriter(outFile);
vcfWriter.writeHeader(outputHeader);
}
@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext ref, final FeatureContext featureContext) {
// Validate each variant against the source dictionary manually
SAMSequenceRecord samSeqRec = sourceDictionary.getSequence(vc.getContig());
if (samSeqRec == null) {
throw new CommandLineException.BadArgumentValue(
String.format(
"The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
"that is not present in the provided dictionary",
vc.getID(),
vc.getContig()
)
);
} else if (vc.getEnd() > samSeqRec.getSequenceLength()) {
throw new CommandLineException.BadArgumentValue(
String.format(
"The input variant file contains a variant (ID: \"%s\") with a reference to a sequence (\"%s\") " +
"that ends at a position (%d) that exceeds the length of that sequence (%d) in the provided dictionary",
vc.getID(),
vc.getContig(),
vc.getEnd(),
samSeqRec.getSequenceLength()
)
);
}
vcfWriter.add(vc);
}
/**
* Close out the new variants file.
*/
@Override
public void closeTool() {
if (vcfWriter != null) {
vcfWriter.close();
}
}
// Override getBestAvailableSequenceDictionary() to ensure that the new sequence dictionary specified by
// the user (and not the sequence dictionary from the VCF we're trying to update) is used uniformly by all
// callers. Otherwise, the wrong dictionary would be used when writing the index for the output vcf.
//
@Override
public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
final SAMSequenceDictionary resultDictionary;
final SAMSequenceDictionary masterDictionary = getMasterSequenceDictionary();
if (dictionarySource == null) {
if (masterDictionary != null) {
// We'll accept the master dictionary if one was specified. Using the master dictionary
// arg will result in sequence dictionary validation.
logger.warn("Using the dictionary supplied via the {} argument", StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME);
resultDictionary = masterDictionary;
}
else if (hasReference()) {
resultDictionary = getReferenceDictionary();
} else {
throw new CommandLineException.MissingArgument(
DICTIONARY_ARGUMENT_NAME, "A dictionary source file or reference file must be provided");
}
} else {
if (masterDictionary != null) {
throw new CommandLineException(String.format("Only one of %s or %s may be specified on the command line",
StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME,
DICTIONARY_ARGUMENT_NAME));
}
resultDictionary = SAMSequenceDictionaryExtractor.extractDictionary(dictionarySource.toPath());
if (resultDictionary == null || resultDictionary.getSequences().isEmpty()) {
throw new CommandLineException.BadArgumentValue(
String.format(
"The specified dictionary source has an empty or invalid sequence dictionary: %s",
dictionarySource)
);
}
}
if( seqValidationArguments.performSequenceDictionaryValidation()
&& resultDictionary != null
&& dictionaryHasMissingLengths(resultDictionary)) {
throw new UserException.SequenceDictionaryIsMissingContigLengths(dictionarySource.getRawInputString(), resultDictionary);
}
return resultDictionary;
}
private boolean dictionaryHasMissingLengths(final SAMSequenceDictionary resultDictionary) {
return resultDictionary.getSequences().stream().anyMatch(s -> s.getSequenceLength() == SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
}
}