All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.fasta.FastaAlternateReferenceMaker Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.fasta;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import picard.cmdline.programgroups.ReferenceProgramGroup;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;


/**
 * Generate an alternative reference sequence over the specified interval
 *
 * 

Given a variant callset, this tool replaces the reference bases at variation sites with the bases supplied in the * corresponding callset records. Additionally, it allows for one or more "snp-mask" VCFs to set overlapping bases to 'N'.

* *

The output format can be partially controlled using the provided command-line arguments. * Specify intervals with the usual -L argument to output only the reference bases within your intervals. * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a * separate fasta sequence (named numerically in order).

* *

Caveats

*
    *
  • If there are multiple variants that start at a site, it chooses one of them randomly.
  • *
  • When there are overlapping indels (but with different start positions) only the first will be chosen.
  • *
  • This tool works only for SNPs and for simple indels (but not for things like complex substitutions).
  • *
  • This tool works only with VCF files. Using GVCF files as input may result in unexpected behavior.
  • *
*

Input

*

* The reference, requested intervals, and any number of variant files. *

* *

Output

*

* A FASTA file representing the requested intervals. *

* *

Usage example

*
 * gatk FastaAlternateReferenceMaker \
 *   -R reference.fasta \
 *   -O output.fasta \
 *   -L input.intervals \
 *   -V input.vcf \
 *
 *   [--snp-mask mask.vcf]
 * 
* */ @DocumentedFeature @CommandLineProgramProperties(summary = "Create an alternative fasta reference by inserting variants from a vcf into an existing reference sequence.", oneLineSummary = "Create an alternative reference by combining a fasta with a vcf.", programGroup = ReferenceProgramGroup.class) public class FastaAlternateReferenceMaker extends FastaReferenceMaker { //Pre-allocated as a lame optimization private static final byte[] NO_BASES = {}; private static final byte[] N_BYTES = {'N'}; private static final byte[] n_BYTES = {'n'}; private static final byte[] A_BYTES = {'A'}; private static final byte[] a_BYTES = {'a'}; private static final byte[] G_BYTES = {'G'}; private static final byte[] g_BYTES = {'g'}; private static final byte[] C_BYTES = {'C'}; private static final byte[] c_BYTES = {'c'}; private static final byte[] T_BYTES = {'T'}; private static final byte[] t_BYTES = {'t'}; private static final String EMPTY_BASE = " "; /** * Variants from this input file are used by this tool to construct an alternate reference. */ @Argument(fullName= StandardArgumentDefinitions.VARIANT_LONG_NAME, shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, doc="A source of variants to merge with the reference sequence.") protected FeatureInput variants; public static final String SNP_MASK_LONG_NAME = "snp-mask"; /** * SNPs from this file are used as a mask (inserting N's in the sequence) when constructing the alternate reference */ @Argument(fullName= SNP_MASK_LONG_NAME, doc="SNP mask VCF file", optional=true) protected FeatureInput snpmask; public static final String SNP_MASK_PRIORITY_LONG_NAME = "snp-mask-priority"; /** * Gives priority to a SNP mask over an input VCF for a site. Only has an effect if the --snp-mask argument is used. */ @Argument(fullName= SNP_MASK_PRIORITY_LONG_NAME, doc="Give the snp mask priority over the input VCF.", optional=true) protected boolean snpmaskPriority = false; public static final String USE_IUPAC_SAMPLE_LONG_NAME = "use-iupac-sample"; /** * This option will generate an error if the specified sample does not exist in the VCF. * Non-diploid (or non-called) genotypes are ignored. */ @Argument(fullName= USE_IUPAC_SAMPLE_LONG_NAME, doc = "If specified, heterozygous SNP sites will be output using IUPAC ambiguity codes given the genotypes for this sample", optional=true) private String iupacSample = null; private int deletionBasesRemaining = 0; @Override public void onTraversalStart() { super.onTraversalStart(); if (snpmaskPriority && snpmask == null){ throw new CommandLineException("Cannot specify --" + SNP_MASK_PRIORITY_LONG_NAME + " without " + " --" + SNP_MASK_LONG_NAME); } if ( iupacSample != null ) { final VCFHeader variantHeader = (VCFHeader) getHeaderForFeatures(variants); final ArrayList samples = variantHeader.getSampleNamesInOrder(); if( samples == null || !samples.contains(iupacSample)) { throw new UserException.BadInput("the IUPAC sample specified is not present in the provided VCF file"); } } } @Override public void apply(final ReferenceContext ref, final ReadsContext reads, final FeatureContext features) { final byte[] bases = handlePosition(ref.getInterval(), ref.getBase(), features); final SimpleInterval interval = ref.getInterval(); if( bases.length == 0){ advancePosition(interval); } else { for( byte base : bases) { addToReference(interval, base); } } } private byte[] handlePosition(SimpleInterval interval, byte base, FeatureContext features) { if (deletionBasesRemaining > 0) { deletionBasesRemaining--; return NO_BASES; } // If we have a mask at this site, use it if ( snpmaskPriority ){ if (isMasked(features) ) return N_BYTES; } // Check to see if we have a called snp for ( final VariantContext vc : features.getValues(variants) ) { if ( vc.isFiltered() || vc.getStart() != interval.getStart() ) continue; if ( vc.isSimpleDeletion()) { deletionBasesRemaining = vc.getReference().length() - 1; // delete the next n bases, not this one return baseToByteArray(base); } else if ( vc.isSimpleInsertion() || vc.isSNP() ) { // Get the first alt allele that is not a spanning deletion. If none present, use the empty allele final Optional optionalAllele = getFirstConcreteAltAllele(vc.getAlternateAlleles()); final Allele allele = optionalAllele.orElseGet(() -> Allele.create(EMPTY_BASE, false)); if ( vc.isSimpleInsertion() ) { return allele.getBases(); } else { final String iupacBase = (iupacSample != null) ? getIUPACBase(vc.getGenotype(iupacSample)) : allele.toString(); return iupacBase.getBytes(); } } } if ( !snpmaskPriority ){ if ( isMasked(features)) { return N_BYTES; } } // if we got here then we're just ref return baseToByteArray(base); } private byte[] baseToByteArray(byte base) { switch(base) { case 'A': return A_BYTES; case 'a': return a_BYTES; case 'C': return C_BYTES; case 'c': return c_BYTES; case 'G': return G_BYTES; case 'g': return g_BYTES; case 'T': return T_BYTES; case 't': return t_BYTES; case 'N': return N_BYTES; case 'n': return n_BYTES; default: return new byte[]{base}; } } /** * Get the first non-symbolic, non-spanning deletion, called allele * @param altAlleles the alternate alleles * @return the first non spanning deletion allele or null */ private Optional getFirstConcreteAltAllele(final List altAlleles ) { return altAlleles.stream() .filter(allele -> !allele.isSymbolic()) .filter(allele -> !GATKVCFConstants.isSpanningDeletion(allele)) .filter(Allele::isCalled) .findFirst(); } /** * Mask a SNP (inserting N's in the sequence) * * @param features the Reference Metadata available at a particular site in the genome * @return mask at the locus or null if no SNP at that locus */ private boolean isMasked(final FeatureContext features){ return features.getValues(snpmask).stream().anyMatch(VariantContext::isSNP); } /** * Returns the IUPAC encoding for the given genotype or the reference base if not possible * * @param genotype the genotype to encode * @return non-null, non-empty String of bases */ private String getIUPACBase(final Genotype genotype) { Utils.nonNull(genotype, () -> "The genotype is null for sample " + iupacSample); // If we have a spanning deletion, if both alleles are spanning deletions, use the empty allele. Otherwise, use the allele that is not a // spanning deletion. if ( genotype.getAlleles().contains(Allele.SPAN_DEL) ) { if ( genotype.isHomVar() ) { return EMPTY_BASE; } else { return genotype.getAllele(0).equals(Allele.SPAN_DEL) ? genotype.getAllele(1).getBaseString() : genotype.getAllele(0).getBaseString(); } } if ( !genotype.isHet() ) { return genotype.getAllele(0).getBaseString(); } final byte allele1 = genotype.getAllele(0).getBases()[0]; final byte allele2 = genotype.getAllele(1).getBases()[0]; return new String(new byte[] {BaseUtils.basesToIUPAC(allele1, allele2)}); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy