org.broadinstitute.hellbender.engine.VariantWalkerBase Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.CountingVariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.transformers.VariantTransformer;
import org.broadinstitute.hellbender.utils.IndexUtils;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

import java.util.Spliterator;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Base class for variant walkers, which process variants from one or more sources of variants,
 * with optional contextual information from a reference, sets of reads, and/or supplementary sources of
 * Features.
 *
 * Subclasses must implement the {@link #traverse} method to process variants, {@link #initializeDrivingVariants},
 * {@link #getHeaderForVariants},
 * {@link #getSequenceDictionaryForDrivingVariants},
 * {@link #getSpliteratorForDrivingVariants}, and may optionally implement {@link #onTraversalStart},
 * {@link #onTraversalSuccess} and/or {@link #closeTool}.
 */
public abstract class VariantWalkerBase extends WalkerBase {

    /**
     * Default value to control the size of the cache for our driving variants input(s)
     * (specifically, the number of additional bases worth of overlapping records to cache for
     * queries on the driving variants).
     */
    public static final int DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES = 100_000;

    /**
     * Optional user-supplied interval filtering mode for output variants. When left {@code null},
     * {@link #getVariantOutputFilteringMode()} defers to the superclass default.
     */
    @Argument(fullName = StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME,
            doc = "Restrict the output variants to ones that match the specified intervals according to the specified matching mode.",
            optional = true)
    @Advanced
    public IntervalFilteringVcfWriter.Mode userOutputVariantIntervalFilteringMode = null;

    // Various options for reading from a GenomicsDB; lazily created by getGenomicsDBOptions()
    protected GenomicsDBOptions genomicsDBOptions;

    /**
     * @return always {@code true}
     */
    @Override
    public boolean requiresFeatures() { return true; }

    /**
     * @return the label {@code "variants"} for records reported by the progress meter
     */
    @Override
    public String getProgressMeterRecordLabel() { return "variants"; }

    /**
     * Returns the GenomicsDB options for this tool, creating them from the reference path on first use
     * and returning the same instance on subsequent calls.
     *
     * @return the (possibly newly created) GenomicsDB options
     */
    @Override
    protected GenomicsDBOptions getGenomicsDBOptions() {
        if (genomicsDBOptions == null) {
            genomicsDBOptions = new GenomicsDBOptions(referenceArguments.getReferencePath());
        }
        return genomicsDBOptions;
    }

    /**
     * Creates the {@link FeatureManager} unconditionally and then initializes the driving variants
     * via {@link #initializeDrivingVariants}.
     */
    @Override
    void initializeFeatures() {

        // We override this method to prevent our FeatureManager from being set to null when no side feature inputs
        // are specified, since VariantWalkers always have at least one (driving) variants input. Note that the query
        // lookahead size used here for side inputs is not necessarily the same as for driving variants, which is
        // determined by {@link #getDrivingVariantCacheLookAheadBases}.
        //
        // TODO: I think reducing the lookahead for side inputs from DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES to
        // TODO: FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES will likely hurt performance for tools like VQSR,
        // TODO: but let's test it
        features = new FeatureManager(this, DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
                                      getGenomicsDBOptions());
        initializeDrivingVariants();
    }

    /**
     * Overriding the superclass method to preferentially
     * choose the sequence dictionary from the driving source of variants.
     *
     * @return the driving variants' dictionary when it exists and was not synthesized from a feature index;
     *         otherwise the superclass's best dictionary, falling back to the driving variants' dictionary
     *         (which may be {@code null}) when the superclass has none
     */
    @Override
    public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
        final SAMSequenceDictionary dictFromDrivingVariants = getSequenceDictionaryForDrivingVariants();

        // A real (non-index-derived) dictionary from the driving variants always wins.
        if (dictFromDrivingVariants != null && !IndexUtils.isSequenceDictionaryFromIndex(dictFromDrivingVariants)) {
            return dictFromDrivingVariants;
        }

        // Either there is no driving dictionary, or it looks like it was synthesized from a feature index:
        // see if there is a better dictionary available from another source (i.e., the reference).
        final SAMSequenceDictionary otherDictionary = super.getBestAvailableSequenceDictionary();
        return otherDictionary != null ? otherDictionary : dictFromDrivingVariants;
    }

    /**
     * Process the feature inputs that represent the primary driving source(s) of variants for this tool, and
     * perform any necessary header and sequence dictionary validation. Called by the framework during feature
     * initialization.
     */
    protected abstract void initializeDrivingVariants();

    /**
     * Return the VCFHeader to be used for the driving variants for this tool. The value returned will usually
     * have been prepared in {@link #initializeDrivingVariants}
     * @return VCFHeader to be used for the driving variants.
     */
    public abstract VCFHeader getHeaderForVariants();

    /**
     * Returns the interval filtering mode for output variants: the user-specified mode when one was
     * provided, otherwise the default supplied by the superclass.
     *
     * @return the output variant interval filtering mode to use
     */
    @Override
    public IntervalFilteringVcfWriter.Mode getVariantOutputFilteringMode() {
        // Fall back to whatever default the superclass provides when the user gave no explicit mode
        return userOutputVariantIntervalFilteringMode != null
                ? userOutputVariantIntervalFilteringMode
                : super.getVariantOutputFilteringMode();
    }

    /**
     * Return the primary sequence dictionary to be used for the driving variants for this tool. The value returned
     * will usually have been prepared in {@link #initializeDrivingVariants}
     *
     * @return the sequence dictionary for the driving variants
     */
    protected abstract SAMSequenceDictionary getSequenceDictionaryForDrivingVariants();

    /**
     * Return a spliterator to be used to iterate over the elements of the driving variants.
     *
     * @return a spliterator over the driving variants
     */
    protected abstract Spliterator<VariantContext> getSpliteratorForDrivingVariants();

    /**
     * When performing a query on the driving variants input(s), the number of additional bases beyond the end
     * of the query for which overlapping variants should be pre-fetched and cached.
     *
     * Defaults to {@link #DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES}
     *
     * Subclasses can customize this value by overriding this method.
     *
     * @return the number of additional bases to prefetch and cache beyond a query on the driving variants
     */
    protected int getDrivingVariantCacheLookAheadBases() {
        return DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES;
    }

    /**
     * Returns the pre-filter variant transformer (simple or composite) that will be applied to the variants before filtering.
     * The default implementation uses the {@link VariantTransformer#identity()}.
     * Default implementation of {@link #traverse()} calls this method once before iterating over the variants and reuses
     * the transformer object to avoid object allocation.
     *
     * Subclasses can extend to provide own transformers (i.e. override and call super).
     * Multiple transformers can be composed by using {@link VariantTransformer} composition methods.
     *
     * @return the transformer to apply to each variant before filtering
     */
    public VariantTransformer makePreVariantFilterTransformer() {
        return VariantTransformer.identity();
    }

    /**
     * Returns the post-filter variant transformer (simple or composite) that will be applied to the variants after filtering.
     * The default implementation uses the {@link VariantTransformer#identity()}.
     * Default implementation of {@link #traverse()} calls this method once before iterating over the variants and reuses
     * the transformer object to avoid object allocation.
     *
     * Subclasses can extend to provide own transformers (i.e. override and call super).
     * Multiple transformers can be composed by using {@link VariantTransformer} composition methods.
     *
     * @return the transformer to apply to each variant after filtering
     */
    public VariantTransformer makePostVariantFilterTransformer() {
        return VariantTransformer.identity();
    }

    /**
     * Returns a stream over the variants, which are:
     *
     * 1. Transformed with {@link #makePreVariantFilterTransformer()}.
     * 2. Filtered with {@code filter}.
     * 3. Transformed with {@link #makePostVariantFilterTransformer()}.
     *
     * @param filter the filter applied between the pre- and post-filter transformers
     * @return a sequential stream of transformed, filtered variants from the driving variants
     */
    protected Stream<VariantContext> getTransformedVariantStream(final CountingVariantFilter filter) {
        final VariantTransformer preTransformer  = makePreVariantFilterTransformer();
        final VariantTransformer postTransformer = makePostVariantFilterTransformer();
        return getTransformedVariantStream(getSpliteratorForDrivingVariants(),
                preTransformer,
                filter,
                postTransformer);
    }

    /**
     * Returns a stream over the variants returned by source, which are:
     *
     * 1. Transformed with preTransformer.
     * 2. Filtered with filter.
     * 3. Transformed with postTransformer.
     *
     * @param source          spliterator supplying the variants to stream
     * @param preTransformer  transformer applied to each variant before filtering
     * @param filter          filter deciding which transformed variants are kept
     * @param postTransformer transformer applied to each variant that passes the filter
     * @return a sequential (non-parallel) stream of transformed, filtered variants
     */
    protected Stream<VariantContext> getTransformedVariantStream(
            final Spliterator<VariantContext> source,
            final VariantTransformer preTransformer,
            final CountingVariantFilter filter,
            final VariantTransformer postTransformer) {
        // 'false' requests a sequential stream
        return StreamSupport.stream(source, false)
                .map(preTransformer)
                .filter(filter)
                .map(postTransformer);
    }

    /**
     * Returns the variant filter (simple or composite) that will be applied to the variants before calling {@link #apply}.
     * The default implementation filters nothing.
     * Default implementation of {@link #traverse()} calls this method once before iterating
     * over the variants and reuses the filter object to avoid object allocation. Nevertheless, keeping state in filter objects is strongly discouraged.
     *
     * Subclasses can extend to provide own filters (ie override and call super).
     * Multiple filters can be composed by using {@link VariantFilter} composition methods.
     *
     * @return a counting filter wrapping {@link VariantFilterLibrary#ALLOW_ALL_VARIANTS}
     */
    protected CountingVariantFilter makeVariantFilter() {
        return new CountingVariantFilter(VariantFilterLibrary.ALLOW_ALL_VARIANTS);
    }

}