All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.variantutils.LeftAlignAndTrimVariants Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VcfUtils;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;

import java.nio.file.Path;
import java.util.*;

/**
 * Left-align indels in a variant callset
 *
 * 

* This tool takes a VCF file, left-aligns the indels and trims common bases from indels, * leaving them with a minimum representation. The same indel can often be placed at multiple positions and still * represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position * this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic * sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels. *

* *

Input

*

* A variant call set to left-align and trim. *

* *

Output

*

* A left-aligned VCF. *

* *

Usage examples

* *

Left align and trim alleles

*
 * gatk LeftAlignAndTrimVariants \
 *   -R reference.fasta \
 *   -V input.vcf \
 *   -O output.vcf
 * 
* *

Left align and don't trim alleles

*
 * gatk LeftAlignAndTrimVariants \
 *   -R reference.fasta \
 *   -V input.vcf \
 *   -O output.vcf \
 *   --dont-trim-alleles
 * 
* *

Left align and trim alleles, process alleles <= 208 bases

*
 * gatk LeftAlignAndTrimVariants \
 *   -R reference.fasta \
 *   -V input.vcf \
 *   -O output.vcf \
 *   --max-indel-length 208
 * 
*

Split multiallics into biallelics, left align and trim alleles

*
 * gatk LeftAlignAndTrimVariants \
 *   -R reference.fasta \
 *   -V input.vcf \
 *   -O output.vcf \
 *   --split-multi-allelics
 * 
* *

Split multiallelics into biallics, left align but don't trim alleles, and store the original AC, AF, and AN values

*
 * gatk LeftAlignAndTrimVariants \
 *   -R reference.fasta \
 *   -V input.vcf \
 *   -O output.vcf \
 *   --split-multi-allelics \
 *   --dont-trim-alleles \
 *   --keep-original-ac
 * 
*

Left align variants up to 2000 bases to the left (default is at most left aligning 1000 bases to left)

*
 * gatk LeftAlignAndTrimVariants \
 *   -R reference.fasta \
 *   -V input.vcf \
 *   -O output.vcf \
 *   --max-leading-bases 2000
 * 
*/ @CommandLineProgramProperties( summary = "This tool takes a VCF file, left-aligns the indels and trims common bases from indels," + "leaving them with a minimum representation. The same indel can often be placed at multiple positions and still" + "represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position" + "this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic" + "sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.", oneLineSummary = "Left align and trim vairants", programGroup = VariantManipulationProgramGroup.class ) @DocumentedFeature public class LeftAlignAndTrimVariants extends VariantWalker { public static final String DONT_TRIM_ALLELES_LONG_NAME = "dont-trim-alleles"; public static final String DONT_TRIM_ALLELES_SHORT_NAME = "no-trim"; public static final String SPLIT_MULTIALLELEICS_LONG_NAME = "split-multi-allelics"; public static final String KEEP_ORIGINAL_AC_LONG_NAME = "keep-original-ac"; public static final String MAX_INDEL_LENGTH_LONG_NAME = "max-indel-length"; public static final int DEFAULT_MAX_LEADING_BASES= 1000; @VisibleForTesting static final int DEFAULT_MAX_INDEL_SIZE = 200; public static final String MAX_LEADING_BASES_LONG_NAME = "max-leading-bases"; /** * Output file to which to write left aligned variants */ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which variants should be written") public GATKPath outFile = null; /** * If this argument is set, bases common to all alleles will not be removed and will not leave their minimal representation. */ @Argument(fullName = DONT_TRIM_ALLELES_LONG_NAME, shortName = DONT_TRIM_ALLELES_SHORT_NAME, doc = "Do not Trim alleles to remove bases common to all of them", optional = true) protected boolean dontTrimAlleles = false; /** * If this argument is set, split multiallelic records and left-align individual alleles. * If this argument is not set, multiallelic records are not attempted to left-align and will be copied as is. */ @Argument(fullName = SPLIT_MULTIALLELEICS_LONG_NAME, doc = "Split multiallelic records and left-align individual alleles", optional = true) protected boolean splitMultiallelics = false; /** * When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the * subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called * AC_Orig, AF_Orig, and AN_Orig. */ @Argument(fullName = KEEP_ORIGINAL_AC_LONG_NAME, doc = "Store the original AC, AF, and AN values after subsetting", optional = true) private boolean keepOriginalChrCounts = false; /** * Maximum indel size to realign. Indels larger than this will be left unadjusted. */ @Argument(fullName = MAX_INDEL_LENGTH_LONG_NAME, doc = "Set maximum indel size to realign", optional = true) protected int maxIndelSize = DEFAULT_MAX_INDEL_SIZE; /** * Distance in reference to look back before allele */ @Argument(fullName = MAX_LEADING_BASES_LONG_NAME, doc = "Set max reference window size to look back before allele", optional = true) protected int maxLeadingBases = DEFAULT_MAX_LEADING_BASES; @Hidden @Argument(fullName = "suppress-reference-path", optional = true, doc = "Suppress reference path in output for test result differencing") private boolean suppressReferencePath = false; private VariantContextWriter vcfWriter = null; private VCFHeader vcfHeader = null; VariantContext lastVariant; @Override public void onTraversalStart() { final Map vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants()); final SortedSet vcfSamples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); // Initialize VCF header lines final Path refPath = referenceArguments.getReferencePath(); final Set actualLines = VcfUtils.updateHeaderContigLines(createVCFHeaderLineList(vcfHeaders), refPath, getReferenceDictionary(), suppressReferencePath); vcfWriter = createVCFWriter(outFile); vcfHeader = new VCFHeader(actualLines, vcfSamples); vcfWriter.writeHeader(vcfHeader); } /** * Prepare the VCF header lines */ private Set createVCFHeaderLineList(Map vcfHeaders) { final Set headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true); headerLines.addAll(getDefaultToolVCFHeaderLines()); if (splitMultiallelics) { GATKVariantContextUtils.addChromosomeCountsToHeader(headerLines); } if (keepOriginalChrCounts) { headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY)); headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY)); } return headerLines; } /** * Left aligns variants in repetitive regions. Also trims alleles and splits multiallelics to biallelics, if desired */ @Override public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) { final List vcList; if (splitMultiallelics) { if (vc.getGenotypes().stream().anyMatch(g -> g.hasAnyAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))) { vcList = GATKVariantContextUtils.splitSomaticVariantContextToBiallelics(vc, false, vcfHeader); } else { vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, keepOriginalChrCounts); } } else { vcList = Collections.singletonList(vc); } for (final VariantContext splitVariant : vcList) { final List indelLengths = splitVariant.getIndelLengths(); final int indelLength = indelLengths == null ? 0 : indelLengths.stream().map(Math::abs).max(Integer::compareTo).orElse(0); if (indelLength > maxIndelSize) { logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --max-indel-length >= %d", "Indel is too long", indelLength, splitVariant.getContig(), splitVariant.getStart(), indelLength)); lastVariant = splitVariant; vcfWriter.add(splitVariant); } else { final int distanceToLastVariant = (lastVariant != null && splitVariant.contigsMatch(lastVariant)) ? splitVariant.getStart() - lastVariant.getEnd() : Integer.MAX_VALUE; lastVariant = GATKVariantContextUtils.leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant - 1), !dontTrimAlleles); vcfWriter.add(lastVariant); } } } /** * Reference is required for left alignment */ @Override public boolean requiresReference() { return true; } /** * Print out message of how many variants were realigned * * @return */ @Override public Object onTraversalSuccess() { return "SUCCESS"; } /** * Close out the new variants file. */ @Override public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy