package org.broadinstitute.hellbender.engine;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.tribble.FeatureReader;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.IndexFeatureFile;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBConstants;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.utils.IndexUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.genomicsdb.model.GenomicsDBExportConfiguration;
import org.genomicsdb.reader.GenomicsDBFeatureReader;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration;
/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
* on the genome in a format supported by our file parsing framework, Tribble. Examples of Features are
* VCF records and hapmap records.
*
* Two basic operations are available on this data source:
*
* -Iteration over all Features in this data source, optionally restricted to Features overlapping
* a set of intervals if intervals are provided via {@link #setIntervalsForTraversal(List)}. Traversal
* by a set of intervals requires the file to have been indexed using the bundled tool IndexFeatureFile.
* The set of intervals provided MUST be non-overlapping and sorted in increasing order of start position.
*
* -Targeted queries by one interval at a time. This also requires the file to have been indexed using
* the bundled tool IndexFeatureFile. Targeted queries by one interval at a time are unaffected by
* any intervals for full traversal set via {@link #setIntervalsForTraversal(List)}.
*
* To improve performance in the case of targeted queries by one interval at a time, this class uses a caching
* scheme that is optimized for the common access pattern of multiple separate queries over intervals with
* gradually increasing start positions. It optimizes for this use case by pre-fetching records immediately
* following each interval during a query and caching them. Performance will suffer if the access pattern is
* random, involves queries over intervals with DECREASING start positions instead of INCREASING start positions,
* or involves lots of very large jumps forward on the genome or lots of contig switches. Query caching
* can be disabled, if desired.
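*
* A minimal usage sketch (the VCF path and interval below are hypothetical, for illustration only):
* <pre>{@code
* try ( FeatureDataSource<VariantContext> source = new FeatureDataSource<>("variants.vcf") ) {
*     for ( VariantContext variant : source ) {
*         // full traversal over all records in the file
*     }
*     // targeted query; requires an index created with the bundled tool IndexFeatureFile
*     List<VariantContext> overlapping = source.queryAndPrefetch(new SimpleInterval("1", 100, 200));
* }
* }</pre>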
*
* @param <T> The type of Feature returned by this data source
*/
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);
/**
* Feature reader used to retrieve records from our file
*/
private final FeatureReader<T> featureReader;
/**
* Iterator representing an open traversal over this data source initiated via a call to {@link #iterator}
* (null if there is no open traversal). We need this to ensure that each iterator is properly closed,
* and to enforce the constraint (required by Tribble) that we never have more than one iterator open
* over our feature reader.
*/
private CloseableTribbleIterator<T> currentIterator;
/**
* Our intervals for traversal. If set, restricts full traversals initiated via {@link #iterator} to
* return only Features overlapping this set of intervals. Does not affect individual queries
* initiated via {@link #query(SimpleInterval)} and/or {@link #queryAndPrefetch(Locatable)}.
*/
private List<SimpleInterval> intervalsForTraversal;
/**
* Cache containing Features from recent queries initiated via {@link #query(SimpleInterval)} and/or
* {@link #queryAndPrefetch(Locatable)}. This is guaranteed to start at the start position of the
* most recent query, but will typically end well after the end of the most recent query. Designed to
* improve performance of the common access pattern involving multiple queries across nearby intervals
* with gradually increasing start positions.
*/
private final FeatureCache<T> queryCache;
/**
* When we experience a cache miss (ie., a query interval not fully contained within our cache) and need
* to re-populate the Feature cache from disk to satisfy a query, this controls the number of extra bases
* AFTER the end of our interval to fetch. Should be sufficiently large so that typically a significant number
* of subsequent queries will be cache hits (ie., query intervals fully contained within our cache) before
* we have another cache miss and need to go to disk again.
*/
private final int queryLookaheadBases;
/**
* Holds information about the path this datasource reads from.
*/
private final FeatureInput<T> featureInput;
/**
* True if this datasource is backed by a file that has an associated index file, false if it doesn't
*/
private final boolean hasIndex;
/**
* True if this datasource supports efficient random access queries.
*
* For a file, this is the same as {@link #hasIndex}, but there are non-file data sources (eg., GenomicsDB)
* that don't have a separate index file but do support random access.
*/
private final boolean supportsRandomAccess;
/**
* Default value for queryLookaheadBases, if none is specified. This is designed to be large enough
* so that in typical usage (ie., query intervals with gradually increasing start locations) there will
* be a substantial number of cache hits between cache misses, reducing the number of times we need to
* repopulate the cache from disk.
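*
* As a hypothetical example, a caller expecting widely-spaced queries might widen the lookahead window
* via the four-argument constructor (the path, name, and window size here are illustrative only):
* <pre>{@code
* FeatureDataSource<VariantContext> source =
*         new FeatureDataSource<>("variants.vcf.gz", "myVariants", 10_000, VariantContext.class);
* }</pre>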
*/
public static final int DEFAULT_QUERY_LOOKAHEAD_BASES = 1000;
/**
* Creates a FeatureDataSource backed by the provided File. The data source will have an automatically
* generated name, and will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES})
* during queries that produce cache misses.
*
* @param featureFile file containing Features
*/
public FeatureDataSource(final File featureFile) {
this(featureFile, null);
}
/**
* Creates a FeatureDataSource backed by the provided path. The data source will have an automatically
* generated name, and will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES})
* during queries that produce cache misses.
*
* @param featurePath path or URI to source of Features
*/
public FeatureDataSource(final String featurePath) {
this(featurePath, null, DEFAULT_QUERY_LOOKAHEAD_BASES, null);
}
/**
* Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical
* name. We will look ahead the default number of bases ({@link #DEFAULT_QUERY_LOOKAHEAD_BASES}) during queries
* that produce cache misses.
*
* @param featureFile file containing Features
* @param name logical name for this data source (may be null)
*/
public FeatureDataSource(final File featureFile, final String name) {
this(featureFile, name, DEFAULT_QUERY_LOOKAHEAD_BASES);
}
/**
* Creates a FeatureDataSource backed by the provided File and assigns this data source the specified logical
* name. We will look ahead the specified number of bases during queries that produce cache misses.
*
* @param featureFile file containing Features
* @param name logical name for this data source (may be null)
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
*/
public FeatureDataSource(final File featureFile, final String name, final int queryLookaheadBases) {
this(Utils.nonNull(featureFile).getAbsolutePath(), name, queryLookaheadBases, null);
}
/**
* Creates a FeatureDataSource backed by the resource at the provided path.
*
* @param featurePath path to file or GenomicsDB url containing features
* @param name logical name for this data source (may be null)
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
*/
public FeatureDataSource(final String featurePath, final String name, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType) {
this(new FeatureInput<>(featurePath, name != null ? name : featurePath), queryLookaheadBases, targetFeatureType);
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType) {
this(featureInput, queryLookaheadBases, targetFeatureType, 0, 0);
}
/**
* Creates a FeatureDataSource backed by the resource at the provided path.
*
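* As a hypothetical example, reading a VCF from Google Cloud Storage with 40 MB prefetch buffers for
* both the data and its index (the bucket path and buffer sizes are illustrative only):
* <pre>{@code
* FeatureDataSource<VariantContext> source =
*         new FeatureDataSource<>("gs://my-bucket/variants.vcf.gz", null, DEFAULT_QUERY_LOOKAHEAD_BASES,
*                                 VariantContext.class, 40, 40);
* }</pre>
*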
* @param featurePath path to file or GenomicsDB url containing features
* @param name logical name for this data source (may be null)
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
* @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
* @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
*/
public FeatureDataSource(final String featurePath, final String name, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
this(new FeatureInput<>(featurePath, name != null ? name : featurePath), queryLookaheadBases, targetFeatureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer);
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
* @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
* @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer) {
this(featureInput, queryLookaheadBases, targetFeatureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
new GenomicsDBOptions());
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
* @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
* @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
* @param reference the reference genome corresponding to the data to be read
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference) {
this(featureInput, queryLookaheadBases, targetFeatureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
new GenomicsDBOptions(reference));
}
/**
* Creates a FeatureDataSource backed by the provided FeatureInput. We will look ahead the specified number of bases
* during queries that produce cache misses.
*
* @param featureInput a FeatureInput specifying a source of Features
* @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
* @param targetFeatureType When searching for a {@link FeatureCodec} for this data source, restrict the search to codecs
* that produce this type of Feature. May be null, which results in an unrestricted search.
* @param cloudPrefetchBuffer MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
* @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
* @param genomicsDBOptions options and info for reading from a GenomicsDB; may be null
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions) {
Utils.validateArg(queryLookaheadBases >= 0, "Query lookahead bases must be >= 0");
this.featureInput = Utils.nonNull(featureInput, "featureInput must not be null");
if (IOUtils.isGenomicsDBPath(featureInput)) {
Utils.nonNull(genomicsDBOptions, "GenomicsDBOptions must not be null. Calling tool may not read from a GenomicsDB data source.");
}
// Create a feature reader without requiring an index. We will require one ourselves as soon as
// a query by interval is attempted.
this.featureReader = getFeatureReader(featureInput, targetFeatureType,
BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer),
genomicsDBOptions);
if (IOUtils.isGenomicsDBPath(featureInput)) {
// GenomicsDB URIs have no associated index file to read from, but they do support random access
this.hasIndex = false;
this.supportsRandomAccess = true;
} else if (featureReader instanceof AbstractFeatureReader) {
this.hasIndex = ((AbstractFeatureReader<T, ?>) featureReader).hasIndex();
this.supportsRandomAccess = hasIndex;
} else {
throw new GATKException("Found a feature input that was neither GenomicsDB or a Tribble AbstractFeatureReader. Input was " + featureInput.toString() + ".");
}
// Due to a bug in HTSJDK, unindexed block compressed input files may fail to parse completely. For safety,
// these files have been disabled. See https://github.com/broadinstitute/gatk/issues/4224 for discussion
if (!hasIndex && IOUtil.hasBlockCompressedExtension(featureInput.getFeaturePath())) {
throw new UserException.MissingIndex(featureInput.toString(), "Support for unindexed block-compressed files has been temporarily disabled. Try running IndexFeatureFile on the input.");
}
this.currentIterator = null;
this.intervalsForTraversal = null;
this.queryCache = new FeatureCache<>();
this.queryLookaheadBases = queryLookaheadBases;
}
final void printCacheStats() {
queryCache.printCacheStatistics(getName());
}
@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper,
final GenomicsDBOptions genomicsDBOptions) {
if (IOUtils.isGenomicsDBPath(featureInput.getFeaturePath())) {
Utils.nonNull(genomicsDBOptions);
try {
if (genomicsDBOptions.getReference() == null) {
throw new UserException.MissingReference("You must provide a reference if you want to load from GenomicsDB");
}
try {
final File referenceAsFile = genomicsDBOptions.getReference().toFile();
return (FeatureReader<T>) getGenomicsDBFeatureReader(featureInput, referenceAsFile, genomicsDBOptions);
} catch (final UnsupportedOperationException e){
throw new UserException.BadInput("GenomicsDB requires that the reference be a local file.", e);
}
} catch (final ClassCastException e) {
throw new UserException("GenomicsDB inputs can only be used to provide VariantContexts.", e);
}
} else {
final FeatureCodec<T, ?> codec = getCodecForFeatureInput(featureInput, targetFeatureType);
return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper);
}
}
/**
* Get a new FeatureCodec instance to use for a FeatureInput. Avoid re-discovering which codec class to
* use by checking to see if the FeatureInput already has a cached codec class. If not, discover the codec class
* and cache it for next time.
*
* @return A new FeatureCodec instance to use for the FeatureInput.
*/
@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureCodec<T, ?> getCodecForFeatureInput(final FeatureInput<T> featureInput,
final Class<? extends Feature> targetFeatureType) {
final FeatureCodec<T, ?> codec;
final Class<FeatureCodec<T, ?>> codecClass = featureInput.getFeatureCodecClass();
if (codecClass == null) {
final Path featurePath = featureInput.toPath();
IOUtils.assertFileIsReadable(featurePath);
codec = (FeatureCodec<T, ?>) FeatureManager.getCodecForFile(featurePath, targetFeatureType);
featureInput.setFeatureCodecClass((Class<FeatureCodec<T, ?>>) codec.getClass());
} else {
try {
codec = codecClass.getDeclaredConstructor().newInstance();
} catch (final InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
throw new GATKException("Unable to automatically instantiate codec " + codecClass.getName());
}
}
return codec;
}
private static <T extends Feature> AbstractFeatureReader<T, ?> getTribbleFeatureReader(final FeatureInput<T> featureInput, final FeatureCodec<T, ?> codec,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper) {
Utils.nonNull(codec);
try {
// Must get the path to the data file from the codec here:
final String absoluteRawPath = featureInput.getRawInputString();
// Instruct the reader factory to not require an index. We will require one ourselves as soon as
// a query by interval is attempted.
final boolean requireIndex = false;
// Only apply the wrappers if the feature input is in a remote location which will benefit from prefetching.
if (BucketUtils.isEligibleForPrefetching(featureInput)) {
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, cloudWrapper, cloudIndexWrapper);
} else {
return AbstractFeatureReader.getFeatureReader(absoluteRawPath, null, codec, requireIndex, Utils.identityFunction(), Utils.identityFunction());
}
} catch (final TribbleException e) {
throw new GATKException("Error initializing feature reader for path " + featureInput.getFeaturePath(), e);
}
}
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final GATKPath path, final File reference, final GenomicsDBOptions genomicsDBOptions) {
final String workspace = IOUtils.getGenomicsDBAbsolutePath(path);
if (workspace == null) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from non-GenomicsDB input path " + path);
} else if (Files.notExists(IOUtils.getPath(workspace.endsWith("/") ? workspace : workspace + "/"))) {
throw new UserException("GenomicsDB workspace " + path + " does not exist");
}
final String callsetJson = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
final String vidmapJson = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
IOUtils.assertPathsAreReadable(callsetJson, vidmapJson, vcfHeader);
try {
final GenomicsDBExportConfiguration.ExportConfiguration exportConfigurationBuilder =
createExportConfiguration(workspace, callsetJson, vidmapJson, vcfHeader, genomicsDBOptions);
if (genomicsDBOptions.useBCFCodec()) {
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new BCF2Codec(), Optional.empty());
} else {
return new GenomicsDBFeatureReader<>(exportConfigurationBuilder, new VCFCodec(), Optional.empty());
}
} catch (final IOException e) {
throw new UserException("Couldn't create GenomicsDBFeatureReader", e);
}
}
/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
* otherwise attempts to create a sequence dictionary from the index file (if present).
* Returns null if no dictionary could be created from either the header or the index.
*/
public SAMSequenceDictionary getSequenceDictionary() {
SAMSequenceDictionary dict = null;
final Object header = getHeader();
if (header instanceof VCFHeader) {
dict = ((VCFHeader) header).getSequenceDictionary();
}
if (dict != null && !dict.isEmpty()) {
return dict;
}
if (hasIndex) {
return IndexUtils.createSequenceDictionaryFromFeatureIndex(new File(featureInput.getFeaturePath()));
}
return null;
}
/**
* Restricts traversals of this data source via {@link #iterator} to only return Features that overlap the provided
* intervals. Calls to {@link #query(SimpleInterval)} and/or {@link #queryAndPrefetch(Locatable)} are not
* affected by these intervals.
*
* Intervals MUST be non-overlapping and sorted in order of increasing start position, otherwise traversal
* results will be incorrect.
*
* Passing in a null or empty interval List clears the intervals for traversal, making future iterations
* over this data source unrestricted by intervals.
*
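* As a hypothetical example (note the intervals are sorted by start position and non-overlapping):
* <pre>{@code
* source.setIntervalsForTraversal(Arrays.asList(new SimpleInterval("1", 100, 200),
*                                               new SimpleInterval("1", 500, 600)));
* for ( VariantContext variant : source ) {
*     // sees only records overlapping the two intervals above
* }
* }</pre>
*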
* @param intervals Our next full traversal will return only Features overlapping these intervals
*/
public void setIntervalsForTraversal(final List<SimpleInterval> intervals) {
// Treat null and empty interval lists the same
intervalsForTraversal = (intervals != null && !intervals.isEmpty()) ? intervals : null;
if (intervalsForTraversal != null && !supportsRandomAccess) {
throw new UserException("Input " + featureInput.getFeaturePath() + " must support random access to enable traversal by intervals. " +
"If it's a file, please index it using the bundled tool " + IndexFeatureFile.class.getSimpleName());
}
}
/**
* Gets an iterator over all Features in this data source, restricting traversal to Features
* overlapping our intervals if intervals were provided via {@link #setIntervalsForTraversal(List)}
*
* Calling this method invalidates (closes) any previous iterator obtained from this method.
*
* @return an iterator over all Features in this data source, limited to Features that overlap the intervals supplied via {@link #setIntervalsForTraversal(List)} (if intervals were provided)
*/
@Override
public Iterator<T> iterator() {
// Tribble documentation states that having multiple iterators open simultaneously over the same FeatureReader
// results in undefined behavior
closeOpenIterationIfNecessary();
try {
// Save the iterator returned so that we can close it properly later
currentIterator = intervalsForTraversal != null ? new FeatureIntervalIterator<>(intervalsForTraversal, featureReader, featureInput.getFeaturePath())
: featureReader.iterator();
return currentIterator;
} catch (final IOException e) {
throw new GATKException("Error creating iterator over file " + featureInput.getFeaturePath(), e);
}
}
/**
* Gets an iterator over all Features in this data source that overlap the provided interval.
*
* This operation is not affected by intervals provided via {@link #setIntervalsForTraversal(List)}.
*
* Requires the backing file to have been indexed using the IndexFeatureFile tool, and to
* be sorted in increasing order of start position for each contig.
*
* Query results are cached to improve the performance of future queries during typical access
* patterns. See notes to the class as a whole for a description of the caching strategy.
*
* Calling this method potentially invalidates (closes) any other open iterator obtained
* from this data source via a call to {@link #iterator}
*
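* As a hypothetical example (the interval values are illustrative only):
* <pre>{@code
* Iterator<VariantContext> results = source.query(new SimpleInterval("20", 1000000, 1001000));
* while ( results.hasNext() ) {
*     final VariantContext variant = results.next();
* }
* }</pre>
*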
* @param interval retrieve all Features overlapping this interval
* @return an iterator over all Features in this data source that overlap the provided interval
*/
@Override
public Iterator<T> query(final SimpleInterval interval) {
return queryAndPrefetch(interval).iterator();
}
/**
* Returns a List of all Features in this data source that overlap the provided interval.
*
* This operation is not affected by intervals provided via {@link #setIntervalsForTraversal(List)}.
*
* Requires the backing file to have been indexed using the IndexFeatureFile tool, and to
* be sorted in increasing order of start position for each contig.
*
* Query results are cached to improve the performance of future queries during typical access
* patterns. See notes to the class as a whole for a description of the caching strategy.
*
* Calling this method potentially invalidates (closes) any other open iterator obtained
* from this data source via a call to {@link #iterator}
*
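* As a hypothetical example (the interval values are illustrative only), fetching the overlapping
* records as a List:
* <pre>{@code
* List<VariantContext> overlapping = source.queryAndPrefetch(new SimpleInterval("20", 1000000, 1001000));
* }</pre>
*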
* @param interval retrieve all Features overlapping this interval
* @return a List of all Features in this data source that overlap the provided interval
*/
public List<T> queryAndPrefetch(final Locatable interval) {
if (!supportsRandomAccess) {
throw new UserException("Input " + featureInput.getFeaturePath() + " must support random access to enable queries by interval. " +
"If it's a file, please index it using the bundled tool " + IndexFeatureFile.class.getSimpleName());
}
// If the query can be satisfied using existing cache contents, prepare for retrieval
// by discarding all Features at the beginning of the cache that end before the start
// of our query interval.
if (queryCache.cacheHit(interval)) {
queryCache.trimToNewStartPosition(interval.getStart());
}
// Otherwise, we have a cache miss, so go to disk to refill our cache.
else {
refillQueryCache(interval);
}
// Return the subset of our cache that overlaps our query interval
return queryCache.getCachedFeaturesUpToStopPosition(interval.getEnd());
}
/**
* Refill our cache from disk after a cache miss. Will prefetch Features overlapping an additional
* queryLookaheadBases bases after the end of the provided interval, in addition to those overlapping
* the interval itself.
*
* Calling this has the side effect of invalidating (closing) any currently-open iteration over
* this data source.
*
* @param interval the query interval that produced a cache miss
*/
private void refillQueryCache(final Locatable interval) {
// Tribble documentation states that having multiple iterators open simultaneously over the same FeatureReader
// results in undefined behavior
closeOpenIterationIfNecessary();
// Expand the end of our query by the configured number of bases, in anticipation of probable future
// queries with slightly larger start/stop positions.
//
// Note that it doesn't matter if we go off the end of the contig in the process, since
// our reader's query operation is not aware of (and does not care about) contig boundaries.
// Note: we use addExact to blow up on overflow rather than propagate negative results downstream
final SimpleInterval queryInterval = new SimpleInterval(interval.getContig(), interval.getStart(), Math.addExact(interval.getEnd(), queryLookaheadBases));
// Query iterator over our reader will be immediately closed after re-populating our cache
try (final CloseableTribbleIterator<T> queryIter = featureReader.query(queryInterval.getContig(), queryInterval.getStart(), queryInterval.getEnd())) {
queryCache.fill(queryIter, queryInterval);
} catch (final IOException e) {
throw new GATKException("Error querying file " + featureInput + " over interval " + interval, e);
}
}
/**
* Get the logical name of this data source.
*
* @return the logical name of this data source
*/
public String getName() {
return featureInput.getName();
}
/**
* Gets the header associated with this data source
*
* @return header associated with this data source as an Object
*/
public Object getHeader() {
return featureReader.getHeader();
}
/**
* Permanently close this data source, invalidating any open iteration over it, and making it invalid for future
* iterations and queries.
*/
@Override
public void close() {
closeOpenIterationIfNecessary();
logger.debug(String.format("Cache statistics for FeatureInput %s:", featureInput));
queryCache.printCacheStatistics();
try {
if (featureReader != null) {
featureReader.close();
}
} catch (final IOException e) {
throw new GATKException("Error closing Feature reader for input " + featureInput);
}
}
/**
* Close the iterator currently open over this data source, if there is one.
*/
private void closeOpenIterationIfNecessary() {
if (currentIterator != null) {
currentIterator.close();
currentIterator = null;
}
}
}