All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.engine.AssemblyRegionIterator Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMFileHeader;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.activityprofile.ActivityProfile;
import org.broadinstitute.hellbender.utils.activityprofile.ActivityProfileState;
import org.broadinstitute.hellbender.utils.activityprofile.BandPassActivityProfile;
import org.broadinstitute.hellbender.utils.downsampling.DownsamplingMethod;
import org.broadinstitute.hellbender.utils.iterators.IntervalLocusIterator;
import org.broadinstitute.hellbender.utils.iterators.ReadCachingIterator;
import org.broadinstitute.hellbender.utils.locusiterator.IntervalAlignmentContextIterator;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.util.*;

/**
 * Given a {@link MultiIntervalShard} of {@link GATKRead}, iterates over each {@link AssemblyRegion} within
 * that shard, using the provided {@link AssemblyRegionEvaluator} to determine the boundaries between assembly
 * regions.
 *
 * Loads the reads from the shard as lazily as possible to minimize memory usage.
 *
 * This iterator represents the core of the {@link AssemblyRegionWalker} traversal.
 *
 * NOTE: the provided shard must have appropriate read filters set on it for this traversal type (ie., unmapped
 * and malformed reads must be filtered out).
 */
public class AssemblyRegionIterator implements Iterator {
    private static final Logger logger = LogManager.getLogger(AssemblyRegionIterator.class);

    private final MultiIntervalShard readShard;
    private final SAMFileHeader readHeader;
    private final ReferenceDataSource reference;
    private final FeatureManager features;
    private final AssemblyRegionEvaluator evaluator;
    final AssemblyRegionArgumentCollection assemblyRegionArgs;
    
    private AssemblyRegion readyRegion;
    private Queue pendingRegions;
    private List previousRegionReads;
    private final ReadCachingIterator readCachingIterator;
    private Queue readCache;
    private final Iterator locusIterator;
    private final LocusIteratorByState libs;
    private final ActivityProfile activityProfile;
    private Queue pendingAlignmentData;

    /**
     * Constructs an AssemblyRegionIterator over a provided read shard
     *  @param readShard MultiIntervalShard containing the reads that will go into the assembly regions.
     *                  Must have a MAPPED filter set on it.
     * @param readHeader header for the reads
     * @param reference source of reference bases (may be null)
     * @param features source of arbitrary features (may be null)
     * @param evaluator evaluator used to determine whether a locus is active
     */
    public AssemblyRegionIterator(final MultiIntervalShard readShard,
                                  final SAMFileHeader readHeader,
                                  final ReferenceDataSource reference,
                                  final FeatureManager features,
                                  final AssemblyRegionEvaluator evaluator,
                                  final AssemblyRegionArgumentCollection assemblyRegionArgs,
                                  final boolean trackPileups ) {

        Utils.nonNull(readShard);
        Utils.nonNull(readHeader);
        Utils.nonNull(evaluator);
        assemblyRegionArgs.validate();


        this.readShard = readShard;
        this.readHeader = readHeader;
        this.reference = reference;
        this.features = features;
        this.evaluator = evaluator;
        this.assemblyRegionArgs = assemblyRegionArgs;

        this.readyRegion = null;
        this.previousRegionReads = null;
        this.pendingRegions = new ArrayDeque<>();
        this.readCachingIterator = new ReadCachingIterator(readShard.iterator());
        this.readCache = new ArrayDeque<>();
        this.activityProfile = new BandPassActivityProfile(assemblyRegionArgs.maxProbPropagationDistance, assemblyRegionArgs.activeProbThreshold, BandPassActivityProfile.MAX_FILTER_SIZE, BandPassActivityProfile.DEFAULT_SIGMA, readHeader);
        this.pendingAlignmentData = trackPileups ? new ArrayDeque<>() : null;

        // We wrap our LocusIteratorByState inside an IntervalAlignmentContextIterator so that we get empty loci
        // for uncovered locations. This is critical for reproducing GATK 3.x behavior!
        this.libs = new LocusIteratorByState(readCachingIterator, DownsamplingMethod.NONE, ReadUtils.getSamplesFromHeader(readHeader), readHeader, true);
        final IntervalLocusIterator intervalLocusIterator = new IntervalLocusIterator(readShard.getIntervals().iterator());
        this.locusIterator = new IntervalAlignmentContextIterator(libs, intervalLocusIterator, readHeader.getSequenceDictionary());

        readyRegion = loadNextAssemblyRegion();
    }

    @Override
    public boolean hasNext() {
        return readyRegion != null;
    }

    @Override
    public AssemblyRegion next() {
        if (!hasNext()) {
            throw new NoSuchElementException("next() called when there were no more elements");
        }

        final AssemblyRegion toReturn = readyRegion;
        previousRegionReads = toReturn.getReads();
        readyRegion = loadNextAssemblyRegion();
        return toReturn;
    }

    private AssemblyRegion loadNextAssemblyRegion() {
        AssemblyRegion nextRegion = null;

        while ( locusIterator.hasNext() && nextRegion == null ) {
            final AlignmentContext pileup = locusIterator.next();

            logger.debug(()->"AssemblyRegionIterator pileup: " + pileup);

            // Pop any new pending regions off of the activity profile. These pending regions will not become ready
            // until we've traversed all the reads that belong in them.
            //
            // Ordering matters here: need to check for forceConversion before adding current pileup to the activity profile
            if ( ! activityProfile.isEmpty() ) {
                final boolean forceConversion = pileup.getLocation().getStart() != activityProfile.getEnd() + 1;
                pendingRegions.addAll(activityProfile.popReadyAssemblyRegions(assemblyRegionArgs.assemblyRegionPadding, assemblyRegionArgs.minAssemblyRegionSize, assemblyRegionArgs.maxAssemblyRegionSize, forceConversion));
            }

            // Add the current pileup to the activity profile
            final SimpleInterval pileupInterval = new SimpleInterval(pileup);
            final ReferenceContext pileupRefContext = new ReferenceContext(reference, pileupInterval);
            final FeatureContext pileupFeatureContext = new FeatureContext(features, pileupInterval);


            final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, pileupFeatureContext);
            logger.debug(() -> profile.toString());
            activityProfile.add(profile);

            if (pendingAlignmentData!=null) {
                AlignmentAndReferenceContext alignmentAndRefContext = new AlignmentAndReferenceContext(pileup, pileupRefContext);
                alignmentAndRefContext.setActivityScore(profile.getOriginalActiveProb());
                pendingAlignmentData.add(alignmentAndRefContext);
            }

            // A pending region only becomes ready once our locus iterator has advanced beyond the end of its extended span
            // (this ensures that we've loaded all reads that belong in the new region)
            if ( ! pendingRegions.isEmpty() && IntervalUtils.isAfter(pileup.getLocation(), pendingRegions.peek().getPaddedSpan(), readHeader.getSequenceDictionary()) ) {
                nextRegion = pendingRegions.poll();
            }
        }

        // When we run out of loci, close out the activity profile, and close out any remaining pending regions one at a time
        // It may require multiple invocations before the pendingRegions queue is cleared out.
        if ( ! locusIterator.hasNext() ) {
            
            // Pull on the encapsulated LocusIteratorByState until it's exhausted, as the enclosing
            // IntervalLocusIterator is not guaranteed to exhaust it. This guarantees that the reads
            // in the final padded region end up in our read cache.
            while ( libs.hasNext() ) {
                libs.next();
            }

            if ( ! activityProfile.isEmpty() ) {
                // Pop the activity profile a final time with forceConversion == true
                pendingRegions.addAll(activityProfile.popReadyAssemblyRegions(assemblyRegionArgs.assemblyRegionPadding, assemblyRegionArgs.minAssemblyRegionSize, assemblyRegionArgs.maxAssemblyRegionSize, true));
            }

            // Grab the next pending region if there is one, unless we already have a region ready to go
            // (could happen if we finalize a region on the same iteration that we run out of loci in the locus iterator)
            if ( ! pendingRegions.isEmpty() && nextRegion == null ) {
                nextRegion = pendingRegions.poll();
            }
        }

        // If there's a region ready, fill it with reads before returning
        if ( nextRegion != null ) {
            fillNextAssemblyRegionWithReads(nextRegion);
            // fillnextessemblyregion; check you are on correct chr; if alignment data is not in the assembly region then pop it
            fillNextAssemblyRegionWithPileupData(nextRegion);
        }

        return nextRegion;
    }

    private void fillNextAssemblyRegionWithReads( final AssemblyRegion region ) {
        // First we need to check the previous region for reads that also belong in this region
        if ( previousRegionReads != null ) {
            for ( final GATKRead previousRegionRead : previousRegionReads ) {
                if ( region.getPaddedSpan().overlaps(previousRegionRead) ) {
                    region.add(previousRegionRead);
                }
            }
        }

        // Update our read cache with the reads newly-read by our locus iterator.
        // Note that the cache is (crucially) maintained in coordinate order.
        readCache.addAll(readCachingIterator.consumeCachedReads());

        // Add all reads from the cache that belong in this region
        while ( ! readCache.isEmpty() ) {
            final GATKRead nextRead = readCache.peek();

            // Stop once we encounter a read that starts after the end of the last region's span
            // (and leave it in the readCache)
            if ( IntervalUtils.isAfter(nextRead, region.getPaddedSpan(), readHeader.getSequenceDictionary()) ) {
                break;
            }

            // Ok, we're going to consume this read
            readCache.poll();

            // Add the read if it overlaps the region's extended span. If it doesn't, it must end before the
            // start of the region's extended span, so we discard it.
            if ( region.getPaddedSpan().overlaps(nextRead) ) {
                region.add(nextRead);
            }
        }
    }

    private void fillNextAssemblyRegionWithPileupData(final AssemblyRegion region){
        // Save ourselves the memory footprint and work of saving the pileups in the event they aren't needed for processing.
        if (pendingAlignmentData == null){
            return;
        }
        final List overlappingAlignmentData = new ArrayList<>();
        final Queue previousAlignmentData = new ArrayDeque<>();

        while (!pendingAlignmentData.isEmpty()) {
            final AlignmentContext pendingAlignmentContext = pendingAlignmentData.peek().getAlignmentContext();
            if (!pendingAlignmentContext.contigsMatch(region) ||
                pendingAlignmentContext.getStart() < region.getStart()) {
                pendingAlignmentData.poll(); // pop this
            } else {
                break;
            }
        }
        while (!pendingAlignmentData.isEmpty()) {
            final AlignmentContext pendingAlignmentContext = pendingAlignmentData.peek().getAlignmentContext();

            if (!pendingAlignmentContext.contigsMatch(region) ||
                    pendingAlignmentContext.getStart() <= region.getEnd()) {
                overlappingAlignmentData.add(pendingAlignmentData.poll()); // pop into overlappingAlignmentData
            } else {
                break;
            }
        }

        // reconstructing queue to contain items that may be in the next assembly region
        previousAlignmentData.addAll(overlappingAlignmentData);
        previousAlignmentData.addAll(pendingAlignmentData);
        pendingAlignmentData = previousAlignmentData;

        region.addAllAlignmentData(overlappingAlignmentData);
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() not supported by AssemblyRegionIterator");
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy