org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.variantutils;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.VariantContextUtils;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import java.nio.file.Path;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.filters.*;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import picard.cmdline.programgroups.VariantManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.ChromosomeCounts;
import org.broadinstitute.hellbender.tools.walkers.genotyper.AlleleSubsettingUtils;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
import org.broadinstitute.hellbender.utils.samples.MendelianViolation;
import org.broadinstitute.hellbender.utils.samples.SampleDB;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.*;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Select a subset of variants from a VCF file
*
* This tool makes it possible to select a subset of variants based on various criteria in order to facilitate certain
* analyses. Examples of such analyses include comparing and contrasting cases vs. controls, extracting variant or
* non-variant loci that meet certain requirements, or troubleshooting some unexpected results, to name a few.
*
*
* There are many different options for selecting subsets of variants from a larger callset:
*
* - Extract one or more samples from a callset based on either a complete sample name or a pattern match.
* - Specify criteria for inclusion that place thresholds on annotation values, e.g. "DP > 1000" (depth of
* coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These criteria are written
* as "JEXL expressions", which are documented in the
* article about using JEXL expressions.
* - Provide concordance or discordance tracks in order to include or exclude variants that are also present
* in other given callsets.
* - Select variants based on criteria like their type (e.g. INDELs only), evidence of mendelian violation,
* filtering status, allelicity, etc.
*
*
*
* There are also several options for recording the original values of certain annotations which are recalculated
* when one subsets the new callset, trims alleles, etc.
*
* Input
*
* A variant call set in VCF format from which a subset can be selected.
*
*
* Output
*
* A new VCF file containing the selected subset of variants.
*
*
* * Usage examples
* Select SNPs
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V input.vcf \
* --select-type-to-include SNP \
* -O output.vcf
*
*
* Query Chromosome 20 Variants from a GenomicsDB
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V gendb://genomicsDB \
* -L 20 \
* -O output.chr20.vcf
*
*
* Use JEXL Expressions to filter variants by INFO fields
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V input.vcf \
* -select "AF > 0.001" \
* -O output.vcf
*
*
* Use JEXL Expressions to filter variants by genotype fields. The variant will be kept in the output vcf if at least one sample meets the criterion.
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V input.vcf \
* -select-genotype "GQ > 50" \
* -O output.vcf
*
*
* Filter by a genotype field in a specific sample (e.g. NA12878) by directly accessing the VariantContext object with -select. Contrast this to -select-genotype (see above), which looks at the genotype field for all samples.
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V input.vcf \
* -select "vc.getGenotype('NA12878').getGQ() > 35" \
* -O output.vcf
*
*
* We recommend that a -select expression that contains the logical-or be split into two separate -select arguments, as in the first example. This approach is robust to the case where one of the arguments to the logical-or evaluates to null. For example, the filtering expression of the following command is equivalent to "AF > 0.01 || MQ > 55".
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V input.vcf \
* -select "AF > 0.01" \
* -select "MQ > 55" \
* -O output.vcf
*
*
* On the other hand, the logical-and expression must be a single -select expression.
*
* gatk SelectVariants \
* -R Homo_sapiens_assembly38.fasta \
* -V input.vcf \
* -select "AF > 0.01 && MQ > 55" \
* -O output.vcf
*
*/
@CommandLineProgramProperties(
summary = "This tool makes it possible to select a subset of variants based on various criteria in order to facilitate certain " +
"analyses. Examples include comparing and contrasting cases vs. controls, extracting variant or non-variant loci that meet " +
"certain requirements, or troubleshooting some unexpected results, to name a few.",
oneLineSummary = "Select a subset of variants from a VCF file",
programGroup = VariantManipulationProgramGroup.class
)
@DocumentedFeature
public final class SelectVariants extends VariantWalker {
private static final int MAX_FILTERED_GENOTYPES_DEFAULT_VALUE = Integer.MAX_VALUE;
private static final double MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 1.0;
private static final int MIN_FILTERED_GENOTYPES_DEFAULT_VALUE = 0;
private static final double MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE = 0.0;
private static final int MAX_NOCALL_NUMBER_DEFAULT_VALUE = Integer.MAX_VALUE;
private static final double MAX_NOCALL_FRACTION_DEFAULT_VALUE = 1.0;
private static final List GVCF_EXTENSIONS = Arrays.asList(".g.vcf", ".g.vcf.gz", ".gvcf", ".gvcf.gz");
/**
* A site is considered discordant if there exists some sample in the variant track that has a non-reference
* genotype and either the site isn't present in this track, the sample isn't present in this track, or the
* sample is called reference in this track.
*/
@Argument(fullName="discordance", shortName = "disc",
doc="Output variants not called in this comparison track", optional=true)
private FeatureInput discordanceTrack;
/**
* A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
* in both the variant and concordance tracks or (2) every sample present in the variant track is present in the
* concordance track and they have the sample genotype call.
*/
@Argument(fullName="concordance", shortName = "conc",
doc="Output variants also called in this comparison track", optional=true)
private FeatureInput concordanceTrack;
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="Path to which variants should be written")
public GATKPath vcfOutput = null;
/**
* This argument can be specified multiple times in order to provide multiple sample names, or to specify
* the name of one or more files containing sample names. File names must use the extension ".args", and the
* expected file format is simply plain text with one sample name per line. Note that sample exclusion takes
* precedence over inclusion, so that if a sample is in both lists it will be excluded.
*/
@Argument(fullName=StandardArgumentDefinitions.SAMPLE_NAME_LONG_NAME, shortName=StandardArgumentDefinitions.SAMPLE_NAME_SHORT_NAME, doc="Include genotypes from this sample", optional=true)
private Set sampleNames = new LinkedHashSet<>(0);
/**
* Using a regular expression allows you to match multiple sample names that have that pattern in common. This
* argument can be specified multiple times in order to use multiple different matching patterns.
*/
@Argument(fullName="sample-expressions", shortName="se",
doc="Regular expression to select multiple samples", optional=true)
private Set sampleExpressions = new LinkedHashSet<>(0);
/**
* Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be
* excluded. This argument can be specified multiple times in order to provide multiple sample names, or to
* specify the name of one or more files containing sample names. File names must use the extension ".args",
* and the expected file format is simply plain text with one sample name per line.
*/
@Argument(fullName="exclude-sample-name", shortName="xl-sn", doc="Exclude genotypes from this sample", optional=true)
private Set XLsampleNames = new LinkedHashSet<>(0);
/**
* Using a regular expression allows you to match multiple sample names that have that pattern in common. Note that
* sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded. This
* argument can be specified multiple times in order to use multiple different matching patterns.
*/
@Argument(fullName="exclude-sample-expressions", shortName="xl-se",
doc="List of sample expressions to exclude", optional=true)
private Set XLsampleExpressions = new LinkedHashSet<>(0);
/**
* See example commands above for detailed usage examples. The expressions given to this argument
* should either refer to INFO fields, or access FORMAT field with the VariantContext object
* e.g. -select "vc.getGenotype('NA12878').getGQ() > 35"
*/
public static final String SELECT_NAME = "select";
@Argument(fullName=SELECT_NAME, doc="A filtering expression in terms of either INFO fields or the VariantContext object). If the expression evaluates to true for a variant, it will be kept in the output vcf.", optional=true)
private ArrayList selectExpressions = new ArrayList<>();
/**
* JEXL expressions to be applied to genotype (FORMAT) fields e.g. GQ, AD.
* If at least one of the samples meets the criteria, the variant will be included in the output.
*/
public static final String GENOTYPE_SELECT_SHORT_NAME = "select-genotype";
public static final String GENOTYPE_SELECT_LONG_NAME = "select-genotype-expressions";
@Argument(fullName=GENOTYPE_SELECT_LONG_NAME, shortName=GENOTYPE_SELECT_SHORT_NAME, doc="A filtering expression in terms of FORMAT fields. If the expression evaluates to true for a variant, it will be kept in the output vcf.", optional=true)
private ArrayList selectGenotypeExpressions = new ArrayList<>();
/**
* This flag is provided to allow the user to do JEXL filtering
* before subsetting the format fields, in particular the case where the filtering is done
* on INFO fields only, which may improve speed when working with a large cohort vcf that contains genotypes for
* thousands of samples (format fields).
*/
public static final String APPLY_JEXL_FIRST_SHORT_NAME = "jexl-first";
public static final String APPLY_JEXL_FIRST_LONG_NAME = "apply-jexl-filters-first";
@Argument(fullName=APPLY_JEXL_FIRST_LONG_NAME, shortName=APPLY_JEXL_FIRST_SHORT_NAME, doc="Apply JEXL-based filtering before subsetting samples", optional=true)
private boolean applyJexlFiltersBeforeFilteringGenotypes = false;
/**
* Invert the selection criteria for -select.
*/
@Argument(shortName="invert-select", doc="Invert the selection criteria for -select", optional=true)
private boolean invertSelect = false;
/*
* If this flag is enabled, sites that are found to be non-variant after the subsetting procedure (i.e. where none
* of the selected samples display evidence of variation) will be excluded from the output.
*/
@Argument(fullName="exclude-non-variants", doc="Don't include non-variant sites", optional=true)
private boolean excludeNonVariants = false;
/**
* If this flag is enabled, sites that have been marked as filtered (i.e. have anything other than `.` or `PASS`
* in the FILTER field) will be excluded from the output.
*/
@Argument(fullName="exclude-filtered", doc="Don't include filtered sites", optional=true)
private boolean excludeFiltered = false;
/**
* The default behavior of this tool is to remove bases common to all remaining alleles after subsetting
* operations have been completed, leaving only their minimal representation. If this flag is enabled, the original
* alleles will be preserved as recorded in the input VCF.
*/
@Argument(fullName="preserve-alleles", doc="Preserve original alleles, do not trim", optional=true)
private boolean preserveAlleles = false;
/**
* When this flag is enabled, all alternate alleles that are not present in the (output) samples will be removed.
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
* removed and the record will contain a '.' in the ALT column. Note also that sites-only VCFs, by definition, do
* not include the alternate allele in any genotype calls. Further note that PLs will be trimmed appropriately,
* removing likelihood information (even for homozygous reference calls).
*/
@Argument(fullName="remove-unused-alternates",
doc="Remove alternate alleles not present in any genotypes", optional=true)
private boolean removeUnusedAlternates = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many
* alleles are listed in the ALT column of a VCF. For example, a multiallelic record such as:
* 1 100 . A AAA,AAAAA
* will be excluded if `-restrict-alleles-to BIALLELIC` is used, because there are two alternate alleles, whereas a
* record such as:
* 1 100 . A T
* will be included in that case, but would be excluded if `-restrict-alleles-to MULTIALLELIC` is used.
* Valid options are ALL (default), MULTIALLELIC or BIALLELIC.
*/
@Argument(fullName="restrict-alleles-to",
doc="Select only variants of a particular allelicity", optional=true)
private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
/**
* When subsetting a callset, this tool recalculates the AC, AF, and AN values corresponding to the contents of the
* subset. If this flag is enabled, the original values of those annotations will be stored in new annotations called
* AC_Orig, AF_Orig, and AN_Orig.
*/
@Argument(fullName="keep-original-ac",
doc="Store the original AC, AF, and AN values after subsetting", optional=true)
private boolean keepOriginalChrCounts = false;
/**
* When subsetting a callset, this tool recalculates the site-level (INFO field) DP value corresponding to the
* contents of the subset. If this flag is enabled, the original value of the DP annotation will be stored in
* a new annotation called DP_Orig.
*/
@Argument(fullName="keep-original-dp",
doc="Store the original DP value after subsetting", optional=true)
private boolean keepOriginalDepth = false;
/**
* If this flag is enabled, this tool will select only variants that correspond to a mendelian violation as
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
* `-ped` argument.
*/
@Argument(fullName="mendelian-violation", doc="Output mendelian violation sites only", optional=true)
private Boolean mendelianViolations = false;
/**
* If this flag is enabled, this tool will select only variants that do not correspond to a mendelian violation as
* determined on the basis of family structure. Requires passing a pedigree file using the engine-level
* `-ped` argument.
*/
@Argument(fullName="invert-mendelian-violation",
doc="Output non-mendelian violation sites only", optional=true)
private Boolean invertMendelianViolations = false;
/**
* This argument specifies the genotype quality (GQ) threshold that all members of a trio must have in order
* for a site to be accepted as a mendelian violation. Note that the `-mv` flag must be set for this argument
* to have an effect.
*/
@Argument(fullName="mendelian-violation-qual-threshold",
doc="Minimum GQ score for each trio member to accept a site as a violation", optional=true)
private double mendelianViolationQualThreshold = 0;
@Argument(fullName=StandardArgumentDefinitions.PEDIGREE_FILE_LONG_NAME, shortName=StandardArgumentDefinitions.PEDIGREE_FILE_SHORT_NAME, doc="Pedigree file", optional=true)
private GATKPath pedigreeFile = null;
/**
* The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be
* randomly selected from the input callset. Note that this is done using a probabilistic function, so the final
* result is not guaranteed to carry the exact fraction requested. Can be used for large fractions.
*/
public static final String FRACTION_TO_KEEP_SHORT_NAME = "fraction";
public static final String FRACTION_TO_KEEP_LONG_NAME = "select-random-fraction";
@Argument(fullName=FRACTION_TO_KEEP_LONG_NAME, shortName=FRACTION_TO_KEEP_SHORT_NAME,
doc="Select a fraction of variants at random from the input", optional=true)
private double fractionToKeep = 1.0;
/**
* The value of this argument should be a number between 0 and 1 specifying the fraction of total variants to be
* randomly selected from the input callset and set to no-call (./). Note that this is done using a probabilistic
* function, so the final result is not guaranteed to carry the exact fraction requested. Can be used for large fractions.
*/
@Argument(fullName="remove-fraction-genotypes",
doc="Select a fraction of genotypes at random from the input and sets them to no-call", optional=true)
private double fractionGenotypes = 0;
/**
* This argument selects particular kinds of variants out of a list. If left empty, there is no type selection
* and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP,
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
*/
@Argument(fullName="select-type-to-include", shortName="select-type",
doc="Select only a certain type of variants from the input file", optional=true)
private List typesToInclude = new ArrayList<>();
/**
* This argument excludes particular kinds of variants out of a list. If left empty, there is no type selection
* and all variant types are considered for other selection criteria. Valid types are INDEL, SNP, MIXED, MNP,
* SYMBOLIC, NO_VARIATION. Can be specified multiple times.
*/
@Argument(fullName="select-type-to-exclude", shortName="xl-select-type",
doc="Do not select certain type of variants from the input file", optional=true)
private List typesToExclude = new ArrayList<>();
/**
* When this argument is set, NON_REF alleles will not be considered for the variant type determination. This is
* necessary because every variant in a GVCF file would otherwise be assigned the type MIXED, which makes it
* impossible to filter for e.g. SNPs. If only NON_REF alleles are present at a given site it will still be
* considered SYMBOLIC.
*/
@Argument(fullName="ignore-non-ref-in-types",
doc="If set, NON_REF alleles will be ignored for variant type determination, which is required for filtering GVCF files by type", optional=true)
private boolean ignoreNonRefInTypes = false;
/**
* List of IDs (or a .list file containing ids) to select. The tool will only select variants whose ID
* field is present in this list of IDs. The matching is done by exact string matching. If a file, the file
* name must end in ".list", and the expected file format is simply plain text with one ID per line.
*/
@Argument(fullName="keep-ids", shortName="ids", doc="List of variant rsIDs to select", optional=true)
private Set rsIDsToKeep = new HashSet<>();
/**
* List of IDs (or a .list file containing ids) to exclude. The tool will exclude variants whose ID
* field is present in this list of IDs. The matching is done by exact string matching. If a file, the
* file name must end in ".list", and the expected file format is simply plain text with one ID per line.
*/
@Argument(fullName="exclude-ids", shortName="xl-ids", doc="List of variant rsIDs to exclude", optional=true)
private Set rsIDsToRemove = new HashSet<>();
/**
* If set to true, before we start processing each VariantContext,
* we fully-decode the genotypes i.e. convert each genotype field
* from a String to appropriate data types such as integers and arrays.
* (Not immediately clear why this is useful.)
*/
@Hidden
@Argument(fullName="fully-decode", doc="If true, the incoming VariantContext will be fully decoded", optional=true)
private boolean fullyDecode = false;
/**
* If this argument is provided, indels that are larger than the specified size will be excluded.
*/
@Argument(fullName="max-indel-size", optional=true, doc="Maximum size of indels to include")
private int maxIndelSize = Integer.MAX_VALUE;
/**
* If this argument is provided, indels that are smaller than the specified size will be excluded.
*/
@Argument(fullName="min-indel-size", optional=true, doc="Minimum size of indels to include")
private int minIndelSize = 0;
/**
* If this argument is provided, select sites where at most a maximum number of samples are filtered at the genotype level.
*/
@Argument(fullName="max-filtered-genotypes", optional=true, doc="Maximum number of samples filtered at the genotype level")
private int maxFilteredGenotypes = MAX_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where at least a minimum number of samples are filtered at
* the genotype level.
*/
@Argument(fullName="min-filtered-genotypes", optional=true,
doc="Minimum number of samples filtered at the genotype level")
private int minFilteredGenotypes = MIN_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where a fraction or less of the samples are filtered at
* the genotype level.
*/
@Argument(fullName="max-fraction-filtered-genotypes",
optional=true, doc="Maximum fraction of samples filtered at the genotype level")
private double maxFractionFilteredGenotypes = MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where a fraction or more of the samples are filtered at
* the genotype level.
*/
@Argument(fullName="min-fraction-filtered-genotypes", optional=true,
doc="Maximum fraction of samples filtered at the genotype level")
private double minFractionFilteredGenotypes = MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where at most the given number of samples have no-call genotypes.
*/
@Argument(fullName="max-nocall-number", optional=true,
doc="Maximum number of samples with no-call genotypes")
private int maxNOCALLnumber = MAX_NOCALL_NUMBER_DEFAULT_VALUE;
/**
* If this argument is provided, select sites where at most the given fraction of samples have no-call genotypes.
*/
@Argument(fullName="max-nocall-fraction", optional=true,
doc="Maximum fraction of samples with no-call genotypes")
private double maxNOCALLfraction = MAX_NOCALL_FRACTION_DEFAULT_VALUE;
/**
* If this argument is provided, set filtered genotypes to no-call (./.).
*/
@Argument(fullName="set-filtered-gt-to-nocall", optional=true, doc="Set filtered genotypes to no-call")
private boolean setFilteredGenotypesToNocall = false;
/**
* Info annotation fields to be dropped (specified by key)
*/
@Argument(fullName = "drop-info-annotation", shortName = "DA", optional = true, doc = "Info annotations to drop from output vcf. Annotations to be dropped are specified by their key.")
private List infoAnnotationsToDrop = new ArrayList<>();
/**
* Genotype annotation fields to be dropped (specified by key)
*/
@Argument(fullName = "drop-genotype-annotation", shortName = "DGA", optional = true, doc = "Genotype annotations to drop from output vcf. Annotations to be dropped are specified by their key.")
private List genotypeAnnotationsToDrop = new ArrayList<>();
@Hidden
@Argument(fullName="allow-nonoverlapping-command-line-samples", optional=true,
doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.")
private boolean allowNonOverlappingCommandLineSamples = false;
@Hidden
@Argument(fullName="suppress-reference-path", optional=true,
doc="Suppress reference path in output for test result differencing")
private boolean suppressReferencePath = false;
@Hidden
@Argument(fullName="fail-on-unsorted-genotype", optional=true,
doc="Throw an exception if the genotype field is unsorted")
private boolean failOnUnsortedGenotype = false;
@ArgumentCollection
private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection();
private VariantContextWriter vcfWriter = null;
private enum NumberAlleleRestriction {
ALL,
BIALLELIC,
MULTIALLELIC
}
private SortedSet samples = new TreeSet<>();
private boolean noSamplesSpecified = false;
private Set selectedTypes = new LinkedHashSet<>();
private final ArrayList selectNames = new ArrayList<>();
private final ArrayList selectGenotypeNames = new ArrayList<>();
private List infoJexls = null;
private List genotypeJexls = null;
private boolean discordanceOnly = false;
private boolean concordanceOnly = false;
private MendelianViolation mv = null;
private SampleDB sampleDB = null;
// Random number generator for the genotypes to remove
private final Random randomGenotypes = new Random();
private final List diploidNoCallAlleles = GATKVariantContextUtils.noCallAlleles(2);
private final Map ploidyToNumberOfAlleles = new LinkedHashMap();
@Override
protected GenomicsDBOptions getGenomicsDBOptions() {
if (genomicsDBOptions == null) {
genomicsDBOptions = new GenomicsDBOptions(referenceArguments.getReferencePath(), genomicsdbArgs);
}
return genomicsDBOptions;
}
// We do not output the vcf entries in the order they arrive, as trimming alleles may change the start position
// (e.g. at a multiallelic site, see #6444).
final private PriorityQueue pendingVariants = new PriorityQueue<>(Comparator.comparingInt(VariantContext::getStart));
/**
* Set up the VCF writer, the sample expressions and regexs, filters inputs, and the JEXL matcher
*
*/
@Override
public void onTraversalStart() {
final Map vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
final List genotypeField = getHeaderForVariants().getGenotypeSamples();
if(!ParsingUtils.isSorted(genotypeField)){
// We do not warn the user if there are only a few samples
if(genotypeField.size() > 10) {
logger.warn("***************************************************************************************************************************");
logger.warn("* Detected unsorted genotype fields on input. *");
logger.warn("* SelectVariants will sort the genotypes on output which could result in slow traversal as it involves genotype parsing. *");
logger.warn("***************************************************************************************************************************");
}
if(failOnUnsortedGenotype){
throw new UserException.ValidationFailure("Input file genotypes are unsorted and we are in strict genotype ordering validation mode.");
}
}
// Initialize VCF header lines
final Set headerLines = createVCFHeaderLineList(vcfHeaders);
// It's not necessary that the user supply select names for the JEXL expressions, since those
// expressions will only be needed for omitting records. Make up the select names here.
IntStream.range(0, selectExpressions.size()).forEach(i -> selectNames.add(String.format("select-%d", i)));
IntStream.range(0, selectGenotypeExpressions.size()).forEach(i -> selectGenotypeNames.add(String.format("genotype-select-%d", i)));
// These are maps of type (name, JEXL expression class)
// Note that infoJexls could also contain JEXL expressions that access genotype fields via the VariantContext object
infoJexls = VariantContextUtils.initializeMatchExps(selectNames, selectExpressions);
genotypeJexls = VariantContextUtils.initializeMatchExps(selectGenotypeNames, selectGenotypeExpressions);
// Prepare the sample names and types to be used by the corresponding filters
samples = createSampleNameInclusionList(vcfHeaders);
selectedTypes = createSampleTypeInclusionList();
// Look at the parameters to decide which analysis to perform
discordanceOnly = discordanceTrack != null;
if (discordanceOnly) {
logger.info("Selecting only variants discordant with the track: " + discordanceTrack.getName());
}
concordanceOnly = concordanceTrack != null;
if (concordanceOnly) {
logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName());
}
if (mendelianViolations) {
sampleDB = SampleDB.createSampleDBFromPedigree(pedigreeFile);
mv = new MendelianViolation(mendelianViolationQualThreshold, false, true);
}
if (fractionToKeep < 1.0) {
logger.info("Selecting approximately " + 100.0* fractionToKeep + "% of the variants at random from the variant track");
}
//TODO: this should be refactored/consolidated as part of
// https://github.com/broadinstitute/gatk/issues/121 and
// https://github.com/broadinstitute/gatk/issues/1116
Set actualHeaderLines = null;
SAMSequenceDictionary sequenceDictionary = null;
if (hasReference()) {
Path refPath = referenceArguments.getReferencePath();
sequenceDictionary= this.getReferenceDictionary();
actualHeaderLines = VcfUtils.updateHeaderContigLines(headerLines, refPath, sequenceDictionary, suppressReferencePath);
} else {
sequenceDictionary = getHeaderForVariants().getSequenceDictionary();
if (sequenceDictionary != null) {
actualHeaderLines = VcfUtils.updateHeaderContigLines(headerLines, null, sequenceDictionary, suppressReferencePath);
} else {
actualHeaderLines = headerLines;
}
}
if (!infoAnnotationsToDrop.isEmpty()) {
for (final String infoField : infoAnnotationsToDrop) {
logger.info(String.format("Will drop info annotation: %s",infoField));
}
}
if (!genotypeAnnotationsToDrop.isEmpty()) {
for (final String genotypeAnnotation : genotypeAnnotationsToDrop) {
logger.info(String.format("Will drop genotype annotation: %s",genotypeAnnotation));
}
}
if (!ignoreNonRefInTypes && (!typesToInclude.isEmpty() || !typesToExclude.isEmpty()) &&
GVCF_EXTENSIONS.stream().anyMatch(extension -> getDrivingVariantsFeatureInput().hasExtension(extension))) {
logger.warn("Filtering by variant type and GVCF input detected, but --ignore-non-ref-in-types argument is not set. Variant types will likely not be filtered correctly. Consider setting this argument for meaningful results.");
}
final Path outPath = vcfOutput.toPath();
vcfWriter = createVCFWriter(outPath);
vcfWriter.writeHeader(new VCFHeader(actualHeaderLines, samples));
}
@Override
public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) {
/*check for pending variants to write out
since variant starts will only be moved further right, we can write out a pending variant if the current variant start is after the pending variant start
variant record locations can move to the right due to allele trimming if preserveAlleles is false
*/
while (!pendingVariants.isEmpty() && (pendingVariants.peek().getStart()<=vc.getStart() || !(pendingVariants.peek().getContig().equals(vc.getContig())))) {
vcfWriter.add(pendingVariants.poll());
}
// fullyDecode means to convert from strings to appropriate data types (e.g. int, array)
if (fullyDecode) {
vc = vc.fullyDecode(getHeaderForVariants(), lenientVCFProcessing);
}
if (applyFirstRoundOfFiltering(vc, featureContext)){
return;
}
// Filtered genotypes are ones with the FT FORMAT field
if (considerFilteredGenotypes()) {
final int numFilteredSamples = numFilteredGenotypes(vc);
final double fractionFilteredGenotypes = samples.isEmpty() ? 0.0 : numFilteredSamples / samples.size();
if (numFilteredSamples > maxFilteredGenotypes || numFilteredSamples < minFilteredGenotypes ||
fractionFilteredGenotypes > maxFractionFilteredGenotypes || fractionFilteredGenotypes < minFractionFilteredGenotypes)
return;
}
if (considerNoCallGenotypes()) {
final int numNoCallSamples = numNoCallGenotypes(vc);
final double fractionNoCallGenotypes = samples.isEmpty() ? 0.0 : ((double) numNoCallSamples) / samples.size();
if (numNoCallSamples > maxNOCALLnumber || fractionNoCallGenotypes > maxNOCALLfraction)
return;
}
VariantContext result = subsetGenotypesBySampleNames(vc, preserveAlleles, removeUnusedAlternates);
if ( setFilteredGenotypesToNocall ) {
final VariantContextBuilder builder = new VariantContextBuilder(result);
GATKVariantContextUtils.setFilteredGenotypeToNocall(builder, result, setFilteredGenotypesToNocall, this::getGenotypeFilters);
result = builder.make();
}
// After subsetting samples (e.g. to a single sample), it's possible that:
// 1. none of the remaining samples has the variant in question (i.e. site is no longer polymorphic);
// 2. the only remaining alternate allele is spanning deletion (*)
// If this is the case, we call it a non-variant and remove it from the output based on {@code excludeNonVariants}
if (excludeNonVariants) {
// It would be cleaner to say "if (excludeNonVariants && nonVariant) {...}", but isPolymorphicInSamples() could be relatively expensive,
// when we have many samples, so we call it only when excludeNonVariants is set to true.
final boolean nonVariant = ! result.isPolymorphicInSamples() || GATKVariantContextUtils.isSpanningDeletionOnly(result);
if (nonVariant) {
return;
}
}
if (!applyJexlFiltersBeforeFilteringGenotypes && ! passesJexlFilters(result)){
return;
}
pendingVariants.add(dropAnnotations(result, infoAnnotationsToDrop, genotypeAnnotationsToDrop));
}
/**
* Group all the initial checks we do before subsetting samples.
* If any of these predicates evaluates to true we are safe to remove the present variant from the output,
* return (i.e. short-circuit) out of apply(), and skip to the next variant.
*/
private boolean applyFirstRoundOfFiltering(final VariantContext vc, final FeatureContext featureContext){
// Since SelectVariants does not modify the filter field,
// we can check this before doing expensive operations
if (excludeFiltered && vc.isFiltered()){
return true;
}
if (fractionToKeep < 1.0 && Utils.getRandomGenerator().nextDouble() >= fractionToKeep){
return true;
}
if (mendelianViolations && invertLogic((mv.countFamilyViolations(sampleDB, samples, vc) == 0), invertMendelianViolations)) {
return true;
}
if (discordanceOnly && !isDiscordant(vc, featureContext.getValues(discordanceTrack))) {
return true;
}
if (concordanceOnly && !isConcordant(vc, featureContext.getValues(concordanceTrack))) {
return true;
}
if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic()) {
return true;
}
if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic()) {
return true;
}
if (containsIndelLargerOrSmallerThan(vc, maxIndelSize, minIndelSize)) {
return true;
}
if (applyJexlFiltersBeforeFilteringGenotypes && !passesJexlFilters(vc)) {
return true;
}
return false;
}
/**
* Applies JEXL filters
*
* - Expressions that contain the logical-and (&&) should appear in a single -select argument.
* - When an annotation is absent, the logical-or expression returns false, even when the other argument evaluates to true.
* (This behavior may be overridden by changing {@code JEXLMap.howToTreatMissingValues}).
* - When multiple -select arguments are given, the logical expressions are combined with the logical-or operator. In particular,
* we do not currently support complex logical expressions involving both logical-and's and logical-or's. e.g. (x || y) && z
*
* When both the INFO (--select) and genotype (--select-genotype) filters are given, the current behavior is to take the logical-or of the resulting boolean values.
* We should support combining by logical-and if the users request it.
*/
private boolean passesJexlFilters(final VariantContext vc){
if (infoJexls.isEmpty() && genotypeJexls.isEmpty()){
return true;
}
try {
// ##### Apply INFO JEXL filters #####
// Note that an element in infoJexls can either be an info JEXL or a genotype field accessed via the VariantContext object (e.g. vc.getGenotype('NA12878')...)
for (VariantContextUtils.JexlVCMatchExp jexl : infoJexls) {
// If invert-select is set to true, we take the complement (i.e. "not") of each jexl expression,
// then take the logical-or across the expressions in infoJexls.
//
// For example, given the command line
// "-select AF > 0.01 -select ReadPosRankSum < -20.0"
// If invert-select is false, the expression is understood to be "AF > 0.01 || ReadPosRankSum < -20.0"
// If invert-select is true, it is "!(AF > 0.01) || !(ReadPosRankSum < -20.0)", or
// Note that this is not equivalent to "!(AF > 0.01 || ReadPosRankSum < -20.0)"
// Notice here that calling the match method without the genotype g leads to genotype g being set to null,
// which is fine since infoJexls should not refer to genotype fields (except via vc.getGenotype())
if (invertLogic(VariantContextUtils.match(vc, jexl), invertSelect)){
return true;
}
}
// ##### Apply Genotype JEXL filters #####
for (VariantContextUtils.JexlVCMatchExp jexl : genotypeJexls) {
// We separate the genotype vs INFO cases into two separate arguments because we don't want to be
// looping over all genotypes by default
for (Genotype g : vc.getGenotypes()){
if (invertLogic(VariantContextUtils.match(vc, g, jexl), invertSelect)){
return true;
}
}
}
} catch (IllegalArgumentException e) {
// The IAE thrown by htsjdk already includes an informative error message ("Invalid JEXL
// expression detected...")
throw new UserException(e.getMessage() +
"\nSee https://gatk.broadinstitute.org/hc/en-us/articles/360035891011-JEXL-filtering-expressions for documentation on using JEXL in GATK", e);
}
return false;
}
/**
* write out all remaining pending variants
*/
@Override
public Object onTraversalSuccess() {
while(!pendingVariants.isEmpty()) {
vcfWriter.add(pendingVariants.poll());
}
return null;
}
private VariantContext dropAnnotations(final VariantContext vc, final List infoAnnotationsToDrop,
List genotypeAnnotationsToDrop) {
if (infoAnnotationsToDrop.isEmpty() && genotypeAnnotationsToDrop.isEmpty()) {
return vc;
}
final VariantContextBuilder rmAnnotationsBuilder = new VariantContextBuilder(vc);
for (String infoField : infoAnnotationsToDrop) {
rmAnnotationsBuilder.rmAttribute(infoField);
}
if (!genotypeAnnotationsToDrop.isEmpty()) {
final ArrayList genotypesToWrite = new ArrayList<>();
for (Genotype genotype : vc.getGenotypes()) {
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype).noAttributes();
final Map attributes = new HashMap<>(genotype.getExtendedAttributes());
for (String genotypeAnnotation : genotypeAnnotationsToDrop) {
attributes.remove(genotypeAnnotation);
}
genotypeBuilder.attributes(attributes);
genotypesToWrite.add(genotypeBuilder.make());
}
rmAnnotationsBuilder.genotypes(GenotypesContext.create(genotypesToWrite));
}
return rmAnnotationsBuilder.make();
}
/**
* Get the genotype filters
*
* @param vc the variant context
* @param g the genotype
* @return list of genotype filter names
*/
private List getGenotypeFilters(final VariantContext vc, final Genotype g) {
final List filters = new ArrayList<>();
if (g.isFiltered()) {
filters.add(g.getFilters());
}
return filters;
}
/**
* Close out the new variants file.
*/
@Override
public void closeTool() {
if (vcfWriter != null) {
vcfWriter.close();
}
}
/**
* Create filters for variant types, ids, and genomic intervals.
*/
@Override
protected CountingVariantFilter makeVariantFilter() {
CountingVariantFilter compositeFilter = new CountingVariantFilter(VariantFilterLibrary.ALLOW_ALL_VARIANTS);
if (!selectedTypes.isEmpty()) {
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantTypesVariantFilter(selectedTypes, ignoreNonRefInTypes)));
}
if (rsIDsToKeep != null && !rsIDsToKeep.isEmpty()) {
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantIDsVariantFilter(rsIDsToKeep)));
}
if (rsIDsToRemove != null && !rsIDsToRemove.isEmpty()) {
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantIDsVariantFilter(rsIDsToRemove).negate()));
}
return compositeFilter;
}
/**
* Prepare the sample names to be included(/excluded) in the output by the names filter.
*/
private SortedSet createSampleNameInclusionList(Map vcfHeaders) {
final SortedSet vcfSamples = VcfUtils.getSortedSampleSet(vcfHeaders, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
final Collection samplesFromExpressions = Utils.filterCollectionByExpressions(vcfSamples, sampleExpressions, false);
// first, find any samples that were listed on the command line but don't exist in the header
final Set samplesNotInHeader = new LinkedHashSet<>(samplesFromExpressions.size()+sampleNames.size());
samplesNotInHeader.addAll(sampleNames);
samplesNotInHeader.removeAll(vcfSamples);
// second, add the requested samples
samples.addAll(sampleNames);
samples.addAll(samplesFromExpressions);
// report any requested samples that don't exist in the header and remove them from the list we're accumulating
logger.debug(Utils.join(",", samplesNotInHeader));
if (!samplesNotInHeader.isEmpty()) {
if (allowNonOverlappingCommandLineSamples) {
logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored.");
samples.removeAll(samplesNotInHeader);
}
else {
throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s",
"Samples entered on command line (through -sf or -sn) that are not present in the VCF.",
"A list of these samples:",
Utils.join(",", samplesNotInHeader),
"To ignore these samples, run with --allow-nonoverlapping-command-line-samples"));
}
}
// if none were requested, we want all of them
if (samples.isEmpty()) {
samples.addAll(vcfSamples);
noSamplesSpecified = true;
}
// Exclude samples take precedence over include - remove any excluded samples
final Collection XLsamplesFromExpressions = Utils.filterCollectionByExpressions(vcfSamples, XLsampleExpressions, false);
samples.removeAll(XLsampleNames);
samples.removeAll(XLsamplesFromExpressions);
noSamplesSpecified = noSamplesSpecified &&
XLsampleNames.isEmpty() &&
XLsamplesFromExpressions.isEmpty();
if (samples.isEmpty() && !noSamplesSpecified) {
throw new UserException("All samples requested to be included were also requested to be excluded.");
}
if (!noSamplesSpecified) {
for (String sample : samples) {
logger.info("Including sample '" + sample + "'");
}
}
return samples;
}
/**
* Prepare the type inclusion list to be used by the type filter
*/
private Set createSampleTypeInclusionList() {
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (typesToInclude.isEmpty()) {
selectedTypes.addAll(Arrays.asList(VariantContext.Type.values()));
}
else {
selectedTypes.addAll(typesToInclude);
}
// Exclude types take precedence over include - remove specified exclude types
selectedTypes.removeAll(typesToExclude);
return selectedTypes;
}
/**
* Prepare the VCF header lines
*/
private Set createVCFHeaderLineList(Map vcfHeaders) {
final Set headerLines = VCFUtils.smartMergeHeaders(vcfHeaders.values(), true);
headerLines.addAll(getDefaultToolVCFHeaderLines());
if (keepOriginalChrCounts) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AC_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AF_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_AN_KEY));
}
if (keepOriginalDepth) {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.ORIGINAL_DP_KEY));
}
for (final String key : ChromosomeCounts.keyNames) {
headerLines.removeIf(line->line instanceof VCFInfoHeaderLine && ((VCFInfoHeaderLine)line).getID().equals(key));
headerLines.add(VCFStandardHeaderLines.getInfoLine(key));
}
headerLines.removeIf(line->line instanceof VCFInfoHeaderLine && ((VCFInfoHeaderLine)line).getID().equals(VCFConstants.DEPTH_KEY));
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY));
//remove header lines for info field and genotype annotations being dropped
headerLines.removeIf(l->l instanceof VCFInfoHeaderLine && infoAnnotationsToDrop.contains(((VCFInfoHeaderLine)l).getID()));
headerLines.removeIf(l->l instanceof VCFFormatHeaderLine && genotypeAnnotationsToDrop.contains(((VCFFormatHeaderLine)l).getID()));
return headerLines;
}
/**
* Invert logic if specified
*
* @param logic boolean logical operation value
* @param invert whether to invert logic
* @return invert logic if invert flag is true, otherwise leave the logic
*/
private static boolean invertLogic(final boolean logic, final boolean invert){
return invert ? !logic : logic;
}
/*
* Determines if any of the alternate alleles are greater than the max indel size or less than the min indel size
*
* @param vc the variant context to check
* @param max-indel-size the maximum size of allowed indels
* @param min-indel-size the minimum size of allowed indels
* @return true if the VC contains an indel larger than max-indel-size or less than the min-indel-size, false otherwise
*
* Protected for unit test access
*/
protected static boolean containsIndelLargerOrSmallerThan(final VariantContext vc, final int maxIndelSize, final int minIndelSize) {
final List lengths = vc.getIndelLengths();
if (lengths == null)
return false;
for (final Integer indelLength : lengths) {
if (Math.abs(indelLength) > maxIndelSize || Math.abs(indelLength) < minIndelSize)
return true;
}
return false;
}
/**
* Find the number of no-call genotypes
*
* @param vc the variant context
* @return number of filtered samples
*/
private int numNoCallGenotypes(final VariantContext vc) {
return numGenotypes(vc, Genotype::isNoCall);
}
/**
* Find the number of filtered samples
*
* @param vc the variant context
* @return number of filtered samples
*/
private int numFilteredGenotypes(final VariantContext vc) {
return numGenotypes(vc, g -> g.isFiltered() && !g.getFilters().isEmpty());
}
/**
* Find the number of samples passing the given filter.
*
* @param vc the variant
* @param f predicate by which to filter genotypes
* @return number of filtered samples
*/
private int numGenotypes(final VariantContext vc, final Predicate f) {
return vc == null ? 0 : (int)vc.getGenotypes(samples).stream().filter(f).count();
}
/**
* Checks if vc has a variant call for (at least one of) the samples.
*
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)
* @param compVCs the comparison VariantContext (discordance)
* @return true VariantContexts are discordant, false otherwise
*/
private boolean isDiscordant (final VariantContext vc, final Collection compVCs) {
if (vc == null) {
return false;
}
// if we're not looking at specific samples then the absence of a compVC means discordance
if (noSamplesSpecified) {
return (compVCs == null || compVCs.isEmpty());
}
// check if we find it in the variant rod
final GenotypesContext genotypes = vc.getGenotypes(samples);
for (final Genotype g : genotypes) {
if (sampleHasVariant(g)) {
// There is a variant called (or filtered with not exclude filtered option set) that is not HomRef for at least one of the samples.
if (compVCs == null) {
return true;
}
// Look for this sample in the all vcs of the comp ROD track.
boolean foundVariant = false;
for (final VariantContext compVC : compVCs) {
if (haveSameGenotypes(g, compVC.getGenotype(g.getSampleName()))) {
foundVariant = true;
break;
}
}
// if (at least one sample) was not found in all VCs of the comp ROD, we have discordance
if (!foundVariant)
return true;
}
}
return false; // we only get here if all samples have a variant in the comp rod.
}
/**
* Checks if the two variants have the same genotypes for the selected samples
*
* @param vc the variant rod VariantContext.
* @param compVCs the comparison VariantContext
* @return true if VariantContexts are concordant, false otherwise
*/
private boolean isConcordant (final VariantContext vc, final Collection compVCs) {
if (vc == null || compVCs == null || compVCs.isEmpty()) {
return false;
}
// if we're not looking for specific samples then the fact that we have both VCs is enough to call it concordant.
if (noSamplesSpecified) {
return true;
}
// make a list of all samples contained in this variant VC that are being tracked by the user command line arguments.
final Set variantSamples = vc.getSampleNames();
variantSamples.retainAll(samples);
// check if we can find all samples from the variant rod in the comp rod.
for (final String sample : variantSamples) {
boolean foundSample = false;
for (final VariantContext compVC : compVCs) {
final Genotype varG = vc.getGenotype(sample);
final Genotype compG = compVC.getGenotype(sample);
if (haveSameGenotypes(varG, compG)) {
foundSample = true;
break;
}
}
// if at least one sample doesn't have the same genotype, we don't have concordance
if (!foundSample) {
return false;
}
}
return true;
}
private boolean sampleHasVariant(final Genotype g) {
return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !excludeFiltered)));
}
private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) {
if (g1 == null || g2 == null) {
return false;
}
if ((g1.isCalled() && g2.isFiltered()) ||
(g2.isCalled() && g1.isFiltered()) ||
(g1.isFiltered() && g2.isFiltered() && excludeFiltered)) {
return false;
}
final List a1s = g1.getAlleles();
final List a2s = g2.getAlleles();
return (a1s.containsAll(a2s) && a2s.containsAll(a1s));
}
/**
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
*
* @param vc the VariantContext record to subset
* @param preserveAlleles should we trim constant sequence from the beginning and/or end of all alleles, or preserve it?
* @param removeUnusedAlternates removes alternate alleles with AC=0
* @return the subsetted VariantContext
*/
private VariantContext subsetGenotypesBySampleNames(final VariantContext vc, final boolean preserveAlleles, final boolean removeUnusedAlternates) {
//subContextFromSamples() always decodes the vc, which is a fairly expensive operation. Avoid if possible
if (noSamplesSpecified && !removeUnusedAlternates) {
return vc;
}
// strip out the alternate alleles that aren't being used
final VariantContext sub = vc.subContextFromSamples(samples, removeUnusedAlternates);
// If no subsetting of samples or alleles happened, exit now
if (sub.getNSamples() == vc.getNSamples() && sub.getNAlleles() == vc.getNAlleles()) {
return vc;
}
GenotypesContext newGC;
if (sub.getNAlleles() != vc.getNAlleles()) {
// fix the PL and AD values if sub has fewer alleles than original vc
final GenotypesContext subGenotypesWithOldAlleles = sub.getGenotypes(); //we need sub for the right samples, but PLs still go with old alleles
newGC = sub.getNAlleles() == vc.getNAlleles() ? subGenotypesWithOldAlleles :
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(),
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES);
} else {
newGC = sub.getGenotypes();
}
// Change a fraction of the genotypes to NOCALL if requested
if (fractionGenotypes > 0) {
final List genotypes = newGC.stream().map(genotype -> randomGenotypes.nextDouble() > fractionGenotypes ? genotype :
new GenotypeBuilder(genotype).alleles(getNoCallAlleles(genotype.getPloidy())).noGQ().make()).collect(Collectors.toList());
newGC = GenotypesContext.create(new ArrayList<>(genotypes));
}
// since the VC has been subset (either by sample or allele), we need to strip out the MLE tags
final VariantContextBuilder builder = new VariantContextBuilder(sub);
builder.rmAttributes(Arrays.asList(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY));
builder.genotypes(newGC);
addAnnotations(builder, vc, sub.getSampleNames());
final VariantContext subset = builder.make();
return preserveAlleles ? subset : GATKVariantContextUtils.trimAlleles(subset,true,true);
}
/**
* Get the ploidy number of NO-CALL alleles
*
* @param ploidy number of sets of chromosomes
* @return the NO-CALL alleles
*/
private List getNoCallAlleles(final int ploidy) {
return ploidy == 2 ? diploidNoCallAlleles : GATKVariantContextUtils.noCallAlleles(ploidy);
}
/*
* Add annotations to the new VC
*
* @param builder the new VC to annotate
* @param originalVC the original VC
* @param selectedSampleNames the post-selection list of sample names
*/
private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC, final Set selectedSampleNames) {
if (fullyDecode) {
return; // TODO -- annotations are broken with fully decoded data
}
if (keepOriginalChrCounts) {
final int[] indexOfOriginalAlleleForNewAllele;
final List newAlleles = builder.getAlleles();
final int numOriginalAlleles = originalVC.getNAlleles();
// if the alleles already match up, we can just copy the previous list of counts
if (numOriginalAlleles == newAlleles.size()) {
indexOfOriginalAlleleForNewAllele = null;
}
// otherwise we need to parse them and select out the correct ones
else {
indexOfOriginalAlleleForNewAllele = new int[newAlleles.size() - 1];
Arrays.fill(indexOfOriginalAlleleForNewAllele, -1);
// note that we don't care about the reference allele at position 0
for (int newI = 1; newI < newAlleles.size(); newI++) {
final Allele newAlt = newAlleles.get(newI);
for (int oldI = 0; oldI < numOriginalAlleles - 1; oldI++) {
if (newAlt.equals(originalVC.getAlternateAllele(oldI), false)) {
indexOfOriginalAlleleForNewAllele[newI - 1] = oldI;
break;
}
}
}
}
if (originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
builder.attribute(GATKVCFConstants.ORIGINAL_AC_KEY,
getReorderedAttributes(originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY), indexOfOriginalAlleleForNewAllele));
}
if (originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
builder.attribute(GATKVCFConstants.ORIGINAL_AF_KEY,
getReorderedAttributes(originalVC.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY), indexOfOriginalAlleleForNewAllele));
}
if (originalVC.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY)) {
builder.attribute(GATKVCFConstants.ORIGINAL_AN_KEY, originalVC.getAttribute(VCFConstants.ALLELE_NUMBER_KEY));
}
}
VariantContextUtils.calculateChromosomeCounts(builder, false);
if (keepOriginalDepth && originalVC.hasAttribute(VCFConstants.DEPTH_KEY)) {
builder.attribute(GATKVCFConstants.ORIGINAL_DP_KEY, originalVC.getAttribute(VCFConstants.DEPTH_KEY));
}
boolean sawDP = false;
int depth = 0;
for (final String sample : selectedSampleNames ) {
final Genotype g = originalVC.getGenotype(sample);
if (!g.isFiltered()) {
if (g.hasDP()) {
depth += g.getDP();
sawDP = true;
}
}
}
if (sawDP) {
builder.attribute(VCFConstants.DEPTH_KEY, depth);
}
}
/**
* Pulls out the appropriate tokens from the old ordering of an attribute to the new ordering
*
* @param attribute the non-null attribute (from the INFO field)
* @param oldToNewIndexOrdering the mapping from new to old ordering
* @return non-null Object attribute
*/
private Object getReorderedAttributes(final Object attribute, final int[] oldToNewIndexOrdering) {
// if the ordering is the same, then just use the original attribute
if (oldToNewIndexOrdering == null) {
return attribute;
}
// break the original attributes into separate tokens; unfortunately, this means being smart about class types
final Object[] tokens;
if (attribute.getClass().isArray()) {
tokens = (Object[]) attribute;
} else if (List.class.isAssignableFrom(attribute.getClass())) {
tokens = ((List) attribute).toArray();
} else {
tokens = attribute.toString().split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
}
Utils.validateArg(Arrays.stream(oldToNewIndexOrdering).allMatch(index -> index < tokens.length), () ->
"the old attribute has an incorrect number of elements: " + attribute);
return Arrays.stream(oldToNewIndexOrdering).mapToObj(index -> tokens[index]).collect(Collectors.toList());
}
/**
* Should the number of filtered genotypes be considered for filtering?
* @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise
*/
private boolean considerFilteredGenotypes(){
// Returns true if default has been overridden, otherwise false.
return maxFilteredGenotypes != MAX_FILTERED_GENOTYPES_DEFAULT_VALUE ||
minFilteredGenotypes != MIN_FILTERED_GENOTYPES_DEFAULT_VALUE ||
maxFractionFilteredGenotypes != MAX_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE ||
minFractionFilteredGenotypes != MIN_FRACTION_FILTERED_GENOTYPES_DEFAULT_VALUE;
}
/**
* Should the number of no-call genotypes be considered for filtering?
*
* @return true if any of the filtered genotype samples arguments is used (not the default value), false otherwise
*/
private boolean considerNoCallGenotypes(){
return maxNOCALLnumber != MAX_NOCALL_NUMBER_DEFAULT_VALUE ||
maxNOCALLfraction != MAX_NOCALL_FRACTION_DEFAULT_VALUE;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy