All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.vcf.processor.VariantIteratorProducer Maven / Gradle / Ivy

There is a newer version: 3.3.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2015 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.vcf.processor;

import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import picard.vcf.processor.util.PredicateFilterDecoratingClosableIterator;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * A mechanism for iterating over {@link CloseableIterator} of {@link VariantContext}s in in some fashion, given VCF files and optionally
 * an interval list.
 * 
 * The produced iterators may perform on-the-fly filtering of the produced {@link VariantContext}s.
 *
 * @author mccowan
 */
public abstract class VariantIteratorProducer {
    final static int ONE_HUNDRED_MILLION = (int) 100e6;
    /** 
     * Renders the embodied regions of the VCF files in the form of {@link htsjdk.samtools.util.CloseableIterator}s over
     * {@link VariantContext}s.  The iterator may perform on-the-fly filtering of these elements.
     */
    public abstract Iterable> iterators();

    /** Closes any latent file handles that may have been opened by calls to {@link #iterators()}. */
    public abstract void close();

    /**
     * Produces a chunking with segments of size 100 megabases (or less if a contig boundary is reached), that also performs on-the-fly
     * filtering of {@link VariantContext}
     */
    public static VariantIteratorProducer byHundredMegabaseChunksWithOnTheFlyFilteringByInterval(final List vcfs, final IntervalList intervalList) {
        return new Threadsafe(VcfFileSegmentGenerator.byWholeContigSubdividingWithWidth(ONE_HUNDRED_MILLION), vcfs, intervalList);
    }

    /** Produces a chunking with segments of size 100 megabases (or less if a contig boundary is reached). */
    public static VariantIteratorProducer byHundredMegabaseChunks(final List vcfs) {
        return new Threadsafe(VcfFileSegmentGenerator.byWholeContigSubdividingWithWidth(ONE_HUNDRED_MILLION), vcfs, null);
    }

    /**
     * A {@link VariantIteratorProducer} that is based on a given {@link VcfFileSegmentGenerator} and a list of VCFs.  The chunks are ordered by VCF, and
     * then by whatever ordering of segments are produced by {@link VcfFileSegmentGenerator#forVcf(java.io.File)} for each of those VCFs.
     * 

* The iterators produced by this class are safe to share between multiple threads. *

* If an {@link IntervalList} provided, the produced iterators will perform on-the-fly filtering and calls to {@link Iterator#next()} will * only include {@link VariantContext}s that fall within the regions described by that list. *

* This class maintains a {@link ThreadLocal} of {@link VCFFileReader} to ensure that each thread has its own, and at most one, reader per * VCF. It also guarantees that, so long as a thread closes its "queried into" readers after expiring them, there is only one extant * "queried into" iterator per thread per VCF. * * @author mccowan */ static class Threadsafe extends VariantIteratorProducer { final static Log LOG = Log.getInstance(Threadsafe.class); /** A list of the segments for which the corresponding {@link VariantContext}s will be produced. */ final List segments; final OverlapDetector intervalsOfInterestDetector; /** Maps directly to {@link #segments}; useful for determining if a given variant falls into multiple segments (don't double-count!). */ final Map> multiSegmentDetectorPerFile = new CollectionUtil.DefaultingMap<>(f -> new OverlapDetector<>(0, 0), true); Threadsafe(final VcfFileSegmentGenerator segmenter, final List vcfs) { this(segmenter, vcfs, null); } Threadsafe(final VcfFileSegmentGenerator segmenter, final List vcfs, final IntervalList intervals) { if (intervals != null) { final List uniques = intervals.uniqued(false).getIntervals(); this.intervalsOfInterestDetector = new OverlapDetector<>(0, 0); intervalsOfInterestDetector.addAll(uniques, uniques); } else { intervalsOfInterestDetector = null; } /** * Prepare {@link #segments} and {@link multiSegmentDetectorPerFile}. First, we only want to accumulate segments that are * interesting, e.g., only ones that do not lie completely outside of the interval list provided by the consumer (if it was * provided). * * Then, break up our {@link VcfFileSegment}s by file, and build a corresponding {@link OverlapDetector} for those segments. This * will be used downstream to ensure that we don't emit a given VC from a VCF more than once. * * While we're at it, since this class expects the provided {@link VcfFileSegmentGenerator} produces non-overlapping segments, * assert that this is true. */ final VcfFileSegmentGenerator interestingSegmentSegmenter = intervalsOfInterestDetector == null ? segmenter : VcfFileSegmentGenerator.excludingNonOverlaps(segmenter, intervalsOfInterestDetector); segments = new ArrayList<>(); for (final File vcf : vcfs) { for (final VcfFileSegment segment : interestingSegmentSegmenter.forVcf(vcf)) { segments.add(segment); } } for (final VcfFileSegment segment : segments) { final Interval segmentInterval = segment.correspondingInterval(); final OverlapDetector vcfSpecificDetector = multiSegmentDetectorPerFile.get(segment.vcf()); if (vcfSpecificDetector.getOverlaps(segmentInterval).isEmpty()) { vcfSpecificDetector.addLhs(segment, new Interval(segment.contig(), segment.start(), segment.stop())); } else { throw new IllegalArgumentException(String.format( "Provided segmenting strategy produced overlapping intervals; %s overlaps with: %s", segment, Joiner.on(", ").join(vcfSpecificDetector.getOverlaps(segmentInterval)) )); } } } /** * All of the {@link VCFFileReader}s opened by this object, across all threads. (We can't get this * value out of {@link #localVcfFileReaders} because of the way {@link java.lang.ThreadLocal} works. */ final Collection allReaders = Collections.synchronizedCollection(new ArrayList<>()); /** * A map maintained for each thread that contains the {@link htsjdk.variant.vcf.VCFFileReader}s that it has opened for a * given VCF. (Threads never share {@link htsjdk.variant.vcf.VCFFileReader}s. *

* This is a {@link CollectionUtil.DefaultingMap} to effectively produce readers on-the-fly. Note that it also adds every produced * reader into {@link #allReaders}. */ final ThreadLocal> localVcfFileReaders = new ThreadLocal>() { @Override protected CollectionUtil.DefaultingMap initialValue() { return new CollectionUtil.DefaultingMap<>(file -> { final VCFFileReader reader = new VCFFileReader(file); LOG.debug(String.format("Producing a reader of %s for %s.", file, Thread.currentThread())); allReaders.add(reader); return reader; }, true); } }; /** * Converts a {@link VcfFileSegment} into a {@link VariantContext} iterator. Applies filtering via {@link #intervalsOfInterestDetector} * if it is defined. */ private CloseableIterator iteratorForSegment(final VcfFileSegment segment) { final CloseableIterator query = localVcfFileReaders.get() // Get the collection of VCF file readers local to this thread .get(segment.vcf()) // Get or generate the reader for this segment's VCF file .query(segment.contig(), segment.start(), segment.stop()); // Query the segment // Then wrap the iterator in a on-the-fly interval-list based filter, if requested. final Collection> filters = new ArrayList<>(); if (intervalsOfInterestDetector != null) { filters.add(new OverlapsPredicate()); } filters.add(new NonUniqueVariantPredicate(segment)); return new PredicateFilterDecoratingClosableIterator<>(query, filters); } @Override public Iterable> iterators() { return FluentIterable.from(segments).transform(this::iteratorForSegment); } @Override public void close() { final Iterator i = allReaders.iterator(); while (i.hasNext()) { i.next().close(); i.remove(); } } /** * A predicate that I had difficulty naming. The value of this predicate is that it ensures that no single variant is produced multiple * times from a call to {@link Threadsafe#iterators()}. It works by asking each variant "Hey variant, which of the * {@link VcfFileSegment}s that this {@link Threadsafe} is producing variants from do you fall into (and thus, * would be produced by a corresponding call to {@link VCFFileReader#query(String, int, int)}? If it's more than one, we're only going * to emit you from the query for the very first of those {@link VcfFileSegment}s." */ final class NonUniqueVariantPredicate implements Predicate { final VcfFileSegment sourceSegment; NonUniqueVariantPredicate(final VcfFileSegment sourceSegment) { this.sourceSegment = sourceSegment; } @Override public boolean apply(final VariantContext vc) { if (vc.getStart() == vc.getEnd()) { // This isn't a multi-allelic segment, so it can only be produced by a single segment. return true; } final Collection intersectingSegments = multiSegmentDetectorPerFile.get(sourceSegment.vcf()).getOverlaps(new Interval(vc.getContig(), vc.getStart(), vc.getEnd())); if (intersectingSegments.size() < 2) { // There's only one segment that produces this variant return true; } // The convention is: only emit the VC if it is produced from the first segment that can produce it in the list. final int sourceSegmentIndex = segments.indexOf(sourceSegment); LOG.debug("Found wide variant spanning multiple source segments: ", vc); for (final VcfFileSegment intersectingSegment : intersectingSegments) { if (segments.indexOf(intersectingSegment) < sourceSegmentIndex) { // There is a segment that produces this variant earlier in the segment list, exclude it. return false; } } LOG.debug("Emitting wide variant because it belongs to first segment: ", vc); return true; } } /** A predicate answering if the provided {@link VariantContext} overlaps {@link #intervalsOfInterestDetector}. */ final class OverlapsPredicate implements Predicate { @Override public boolean apply(final VariantContext vc) { final boolean include = !intervalsOfInterestDetector.getOverlaps(new Interval(vc.getContig(), vc.getStart(), vc.getEnd())).isEmpty(); if (!include) LOG.debug("Filtering variant at ", vc.getContig(), ":", vc.getStart(), "-", vc.getEnd()); return include; } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy