package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.MergingIterator;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextComparator;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VcfUtils;

import java.nio.file.Path;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * MultiVariantDataSource aggregates multiple FeatureDataSources of variants, and enables traversals and queries
 * over those sources through a single interface.
 *
 * Two basic operations are available on this data source:
 *
 * -Iteration over all Variants in the data sources, optionally restricted to Variants overlapping
 *  a set of intervals if intervals are provided via {@link #setIntervalsForTraversal(List)}. Traversal
 *  by a set of intervals requires the files to have been indexed using the bundled tool IndexFeatureFile.
 *  The set of intervals provided MUST be non-overlapping and sorted in increasing order of start position.
 *
 * -Targeted queries by one interval at a time. This also requires the files to have been indexed using
 *  the bundled tool IndexFeatureFile. Targeted queries by one interval at a time are unaffected by
 *  any intervals for full traversal set via {@link #setIntervalsForTraversal(List)}.
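 *
 * For illustration only, a minimal usage sketch ({@code input.vcf.gz} is a hypothetical,
 * indexed input file):
 *
 * <pre>{@code
 * final List<FeatureInput<VariantContext>> featureInputs =
 *         Collections.singletonList(new FeatureInput<>("input.vcf.gz"));
 * try (final MultiVariantDataSource source = new MultiVariantDataSource(featureInputs, 1000)) {
 *     // Full traversal, merged and sorted across all inputs:
 *     final Iterator<VariantContext> all = source.iterator();
 *     // Targeted query over a single interval (requires indexed inputs):
 *     final Iterator<VariantContext> hits = source.query(new SimpleInterval("1", 100, 200));
 * }
 * }</pre>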
 */
public final class MultiVariantDataSource implements GATKDataSource, AutoCloseable {
    private static final Logger logger = LogManager.getLogger(MultiVariantDataSource.class);

    /**
     * List of FeatureDataSource objects aggregated by this MultiVariantDataSource
     */
    private final List<FeatureDataSource<VariantContext>> featureDataSources = new ArrayList<>();

    /**
     * Merged VCF header used for this (aggregate) source, derived from the individual sources.
     */
    final private VCFHeader mergedHeader;

    /**
     * Iterator representing an open traversal over this data source initiated via a call to {@link #iterator}
     * (null if there is no open traversal). We need this to ensure that each iterator is properly closed,
     * and to enforce the constraint (required by Tribble) that we never have more than one iterator open
     * over our feature reader.
     */
    private CloseableIterator<VariantContext> currentIterator;
    private SortedSet<String> mergedSamples;

    /**
     * Creates a MultiVariantDataSource backed by the provided FeatureInputs. We will look ahead the specified number of bases
     * during queries that produce cache misses.
     *
     * @param featureInputs List of FeatureInput<VariantContext> specifying sources of VariantContexts
     * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
     */
    public MultiVariantDataSource(final List<FeatureInput<VariantContext>> featureInputs, final int queryLookaheadBases) {
        this(featureInputs, queryLookaheadBases, 0, 0, null, false);
    }

    /**
     * Creates a MultiVariantDataSource backed by the provided FeatureInputs. We will look ahead the specified number of bases
     * during queries that produce cache misses.
     *
     * @param featureInputs List of FeatureInput<VariantContext> specifying sources of VariantContexts
     * @param queryLookaheadBases look ahead this many bases during queries that produce cache misses
     * @param cloudPrefetchBuffer  MB size of caching/prefetching wrapper for the data, if on Google Cloud (0 to disable).
     * @param cloudIndexPrefetchBuffer MB size of caching/prefetching wrapper for the index, if on Google Cloud (0 to disable).
     * @param reference reference to use when creating FeatureDataSources, may be null, only needed by GenomicsDB
     * @param skipDictionaryValidation if true, skip cross-validation of the sequence dictionaries from the individual inputs
     */
    public MultiVariantDataSource(final List<FeatureInput<VariantContext>> featureInputs, final int queryLookaheadBases, final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final Path reference, final boolean skipDictionaryValidation) {
        Utils.validateArg(queryLookaheadBases >= 0, "Query lookahead bases must be >= 0");
        Utils.validateArg(featureInputs != null && featureInputs.size() > 0, "FeatureInputs list must be non-null and non-empty");

        featureInputs.forEach(
                featureInput -> featureDataSources.add(
                        new FeatureDataSource<>(featureInput, queryLookaheadBases, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
                                                reference, true)));

        // Ensure that the merged header and sequence dictionary that we use are in sync with each
        // other, and reflect the actual dictionaries used to do validation:
        //
        // 1) Cross validate the sequence dictionaries from each data source (which may be derived from an index
        //    in the case where it's not embedded in the input file) to ensure they're mutually compatible
        // 2) Create and cache a merged header using versions of the individual headers from each data source that
        //    have been updated to include the actual dictionary returned from that data source
        //
        if (!skipDictionaryValidation) {
            validateAllSequenceDictionaries();
        }
        mergedHeader = getMergedHeader();
        mergedSamples = getSortedSamples();
        if ((mergedHeader == null || mergedHeader.getSequenceDictionary() == null) && featureInputs.size() > 1) {
            throw new UserException(
                    "No sequence dictionary was found for any input. When using multiple inputs, at least one input " +
                    "must have a sequence dictionary, or an index from which a sequence dictionary can be derived.");
        }

    }

    /**
     * Returns the aggregate sequence dictionary for this source of Variants. Uses the dictionary resulting
     * from merging available individual VCF headers (if present) for variant inputs.
     *
     * @return the sequence dictionary derived from the input sources, or null if no dictionary could be created
     * from any header or index file.
     */
    public SAMSequenceDictionary getSequenceDictionary() {
        return mergedHeader == null ? null : mergedHeader.getSequenceDictionary();
    }

    /**
     * Gets the merged header associated with this data source
     *
     * @return the merged VCFHeader associated with this data source
     */
    public VCFHeader getHeader() {
        return mergedHeader;
    }

    /**
     * Restricts traversals of this data source via {@link #iterator} to only return variants that overlap the provided
     * intervals. Calls to {@link #query(SimpleInterval)} are not affected by these intervals.
     *
     * Intervals MUST be non-overlapping and sorted in order of increasing start position, otherwise traversal
     * results will be incorrect.
     *
     * Passing in a null or empty interval List clears the intervals for traversal, making future iterations
     * over this data source unrestricted by intervals.
     *
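     * For example, a sketch restricting the next traversal to two sorted, non-overlapping
     * intervals (hypothetical {@code source}):
     * <pre>{@code
     * source.setIntervalsForTraversal(Arrays.asList(
     *         new SimpleInterval("1", 1, 1000),
     *         new SimpleInterval("1", 2000, 3000)));
     * }</pre>
     *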
     * @param intervals Our next full traversal will return only variants overlapping these intervals
     */
    public void setIntervalsForTraversal( final List<SimpleInterval> intervals ) {
        featureDataSources.forEach(ds -> ds.setIntervalsForTraversal(intervals));
    }

    /**
     * Gets an iterator over all variants in this data source, restricting traversal to variants
     * overlapping our intervals if intervals were provided via {@link #setIntervalsForTraversal(List)}
     *
     * Calling this method invalidates (closes) any previous iterator obtained from this method.
     *
     * @return an iterator over all variants in this data source, limited to variants that overlap the intervals supplied via {@link #setIntervalsForTraversal(List)} (if intervals were provided)
     */
    @Override
    public Iterator<VariantContext> iterator() {
        return getMergedIteratorFromDataSources(ds -> ds.iterator());
    }

    /**
     * Gets an iterator over all Variants in this data source that overlap the provided interval.
     *
     * This operation is not affected by intervals provided via {@link #setIntervalsForTraversal(List)}.
     *
     * Requires the backing files to have been indexed using the IndexFeatureFile tool, and to
     * be sorted in increasing order of start position for each contig.
     *
     * Query results are cached to improve the performance of future queries during typical access
     * patterns. See {@link FeatureDataSource} for a description of the caching strategy.
     *
     * Calling this method potentially invalidates (closes) any other open iterator obtained
     * from this data source via a call to {@link #iterator}
     *
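     * For example, a sketch of a single targeted query against a hypothetical {@code source}:
     * <pre>{@code
     * final Iterator<VariantContext> overlapping = source.query(new SimpleInterval("20", 10000, 10100));
     * }</pre>
     *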
     * @param interval retrieve all Variants overlapping this interval
     * @return an iterator over all Variants in this data source that overlap the provided interval
     */
    @Override
    public Iterator<VariantContext> query( final SimpleInterval interval ) {
        return getMergedIteratorFromDataSources(ds -> ds.queryAndPrefetch(interval).iterator());
    }

    /**
     * Close any existing iterator, create a new iterator and update the local cached iterator reference.
     * @param iteratorFromSource function to retrieve individual iterator, to be applied to each data source
     * @return a single merged iterator over the aggregated data sources
     */
    private Iterator<VariantContext> getMergedIteratorFromDataSources(
            final Function<FeatureDataSource<VariantContext>, Iterator<VariantContext>> iteratorFromSource) {

        // Tribble documentation states that having multiple iterators open simultaneously over the same FeatureReader
        // results in undefined behavior
        closeOpenIterationIfNecessary();

        if (featureDataSources.size() > 1) {
            final List<CloseableIterator<VariantContext>> iterators = new ArrayList<>(featureDataSources.size());
            featureDataSources.forEach(ds -> iterators.add(getCloseableIteratorWrapper(iteratorFromSource.apply(ds))));

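            // Merge the per-source iterators into a single stream ordered by the merged sequence
            // dictionary's contig order, then by start position.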
            final VariantContextComparator varComparator = new VariantContextComparator(getSequenceDictionary());
            currentIterator = new MergingIterator<>(varComparator, iterators);
        } else {
            currentIterator = getCloseableIteratorWrapper(iteratorFromSource.apply(featureDataSources.get(0)));
        }
        return currentIterator;
    }

    /**
     * Get the logical name of this data source.
     *
     * @return the logical name of this data source
     */
    public String getName() {
        return "MultiVariantDataSource: ("
                + Utils.join(", ", featureDataSources.stream().map(fds -> fds.getName()).collect(Collectors.toList()))
                + ")";
    }

    /**
     * Permanently close this data source, invalidating any open iteration over it, and making it invalid for future
     * iterations and queries.
     */
    @Override
    public void close() {
        closeOpenIterationIfNecessary();
        featureDataSources.forEach(dataSource -> dataSource.close());
    }

    private SAMSequenceDictionary getMergedSequenceDictionary(VCFHeader header) {
        return header != null ? header.getSequenceDictionary() : null;
    }

    /**
     * Merge and sort the samples from each header, requiring that sample names be unique across the inputs
     */
    private SortedSet<String> getSortedSamples() {
        final Map<String, VCFHeader> headers = featureDataSources
                .stream()
                .collect(Collectors.toMap(ds -> ds.getName(), ds -> (VCFHeader) ds.getHeader()));
        
        return VcfUtils.getSortedSampleSet(headers, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
    }

    /**
     * Update each individual header with the sequence dictionary returned by the corresponding data source;
     * then merge the resulting headers.
     */
    private VCFHeader getMergedHeader() {
        final List<VCFHeader> headers = featureDataSources
                .stream()
                .map(ds -> getHeaderWithUpdatedSequenceDictionary(ds))
                .collect(Collectors.toList());

        // Now merge the headers using htsjdk, which is pretty permissive, and which only works properly
        // because of the cross-dictionary validation done in validateAllSequenceDictionaries.
        return headers.size() > 1 ?
                new VCFHeader(VCFUtils.smartMergeHeaders(headers, true)) :
                headers.get(0);
    }

    /**
     * We want the headers that are used to create the merged header to have the same sequence dictionary
     * that was returned from the data source and used during validation (which may or may not be the
     * one that was embedded in the input file itself), so get the header from the data source and
     * update it to include the actual sequence dictionary if it doesn't already have one.
     */
    private VCFHeader getHeaderWithUpdatedSequenceDictionary(final FeatureDataSource<VariantContext> dataSource) {
        final VCFHeader header = (VCFHeader) dataSource.getHeader();
        final SAMSequenceDictionary sourceDict = dataSource.getSequenceDictionary();
        if (header.getSequenceDictionary() == null && sourceDict != null) {
            header.setSequenceDictionary(sourceDict);
        }
        return header;
    }

    /**
     * GATKTool only validates individual feature dictionaries against the reference dictionary, so cross-validate
     * all of the dictionaries against each other here by ensuring that each contig found in any dictionary has the
     * same length (and MD5, when a value is present for that contig in both dictionaries) in every other dictionary
     * in which it's present.
     */
    private void validateAllSequenceDictionaries() {
        final Map<String, FeatureDataSource<VariantContext>> contigMap = new HashMap<>();
        featureDataSources.forEach(
            ds -> getDataSourceDictionaryAndValidate(ds, contigMap)
        );
    }

    private void getDataSourceDictionaryAndValidate(final FeatureDataSource<VariantContext> ds,
                                                    final Map<String, FeatureDataSource<VariantContext>> contigMap) {
        final SAMSequenceDictionary dictionary = ds.getSequenceDictionary();
        if (dictionary == null) {
            logger.warn(
                    "A sequence dictionary is required for each input when using multiple inputs, and one could" +
                            " not be obtained for feature input: " + ds.getName() +
                            ". The input may not exist or may not have a valid header");
        } else {
            //This is HORRIFICALLY inefficient -- tools with many inputs should instead skip cross-validation
            // by overriding doDictionaryCrossValidation to return false, and require a reference
            dictionary.getSequences().forEach(
                    sourceSequence -> validateContigAgainstPreviousDataSource(sourceSequence, dictionary, contigMap, ds)
            );
        }
    }

    private void validateContigAgainstPreviousDataSource(final SAMSequenceRecord sourceSequence,
                  final SAMSequenceDictionary dictionary,
                  final Map<String, FeatureDataSource<VariantContext>> contigMap,
                  final FeatureDataSource<VariantContext> ds) {
        final String sourceSequenceName = sourceSequence.getSequenceName();
        final FeatureDataSource<VariantContext> previousDataSource = contigMap.get(sourceSequenceName);
        if (previousDataSource != null) {
            final SAMSequenceDictionary previousDictionary = previousDataSource.getSequenceDictionary();
            final SAMSequenceRecord previousSequence = previousDictionary.getSequence(sourceSequenceName);
            validateSequenceDictionaryRecords(
                    ds.getName(), dictionary, sourceSequence,
                    previousDataSource.getName(), previousDictionary, previousSequence);
        } else {
            contigMap.put(sourceSequenceName, ds);
        }
    }

    // Cross validate the length and md5 for a pair of sequence records.
    private void validateSequenceDictionaryRecords(
            final String sourceDataSourceName,
            final SAMSequenceDictionary sourceDictionary,
            final SAMSequenceRecord sourceSequence,
            final String targetDataSourceName,
            final SAMSequenceDictionary targetDictionary,
            final SAMSequenceRecord targetSequence)
    {
        if (!SequenceDictionaryUtils.sequenceRecordsAreEquivalent(sourceSequence, targetSequence)) {
            final String msg = String.format("Incompatible sequences found (%s: %d) and (%s: %d)",
                    sourceSequence.getSequenceName(),
                    sourceSequence.getSequenceLength(),
                    targetSequence.getSequenceName(),
                    targetSequence.getSequenceLength());
            throw new UserException.IncompatibleSequenceDictionaries(
                    msg,
                    sourceDataSourceName,
                    sourceDictionary,
                    targetDataSourceName,
                    targetDictionary
            );
        } else {
            final String targetMd5 = targetSequence.getMd5();
            final String sourceMd5 = sourceSequence.getMd5();
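            // Exactly one of the two records has an MD5 for this contig: warn rather than fail,
            // since a dictionary derived from an index legitimately lacks MD5 values.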
            if (Utils.xor(targetMd5 == null, sourceMd5 == null)) {
                final String msg = String.format(
                        "The MD5 value (%s) for sequence (%s) is present in at least one input sequence dictionary" +
                        " but missing in at least one other input sequence dictionary. In the case where an input VCF" +
                        " contains no embedded sequence dictionary, one may have been derived from the accompanying" +
                        " index; such derived dictionaries do not contain MD5 values and can result in this warning" +
                        " message.",
                        targetMd5 == null ? sourceMd5 : targetMd5,
                        sourceSequence.getSequenceName()
                );
                logger.warn(msg);
            } else if (!Objects.equals(targetMd5, sourceMd5)) {
                final String msg = String.format("Incompatible sequence MD5 values found (%s: %s) and (%s: %s)",
                        sourceSequence.getSequenceName(),
                        sourceMd5,
                        targetSequence.getSequenceName(),
                        targetMd5);
                throw new UserException.IncompatibleSequenceDictionaries(
                        msg,
                        sourceDataSourceName,
                        sourceDictionary,
                        targetDataSourceName,
                        targetDictionary
                );
            }
        }
    }

    /**
     * Close the iterator currently open over the data sources, if there is one.
     */
    private void closeOpenIterationIfNecessary() {
        if (currentIterator != null) {
            currentIterator.close();
            currentIterator = null;
        }
    }

    /**
     * Wrap the sourceIterator in a CloseableIterator to make it usable as a MergingIterator source.
     */
    private CloseableIterator<VariantContext> getCloseableIteratorWrapper(final Iterator<VariantContext> sourceIterator) {
        Utils.nonNull(sourceIterator);

        return new CloseableIterator<VariantContext>() {
            private Iterator<VariantContext> delegateIterator = sourceIterator;
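            // close() simply drops the delegate so that hasNext() returns false; the underlying
            // reader's lifetime is managed by its owning FeatureDataSource.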
            @Override
            public void close() { delegateIterator = null; }

            @Override
            public boolean hasNext() {
                return delegateIterator != null && delegateIterator.hasNext();
            }

            @Override
            public VariantContext next() {
                if (!hasNext()) {
                    throw new NoSuchElementException("hasNext should be called before next");
                }
                return delegateIterator.next();
            }
        };
    }

    /**
     * Return the lexicographically sorted set of unique sample names merged across the input data sources
     */
    public SortedSet<String> getSamples() {
        return mergedSamples;
    }
}