All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.engine.AssemblyRegion Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.util.*;
import java.util.stream.Collectors;

/**
 * Region of the genome that gets assembled by the local assembly engine.
 *
 * As AssemblyRegion is defined by two intervals -- a primary interval containing a territory for variant calling and a second,
 * padded, interval for assembly -- as well as the reads overlapping the padded interval.  Although we do not call variants in the padded interval,
 * assembling over a larger territory improves calls in the primary territory.
 *
 * This concept is complicated somewhat by the fact that these intervals are mutable and the fact that the AssemblyRegion onject lives on after
 * assembly during local realignment during PairHMM.  Here is an example of the life cycle of an AssemblyRegion:
 *
 * Suppose that the HaplotypeCaller engine finds an evidence for a het in a pileup at locus 400 -- that is, it produces
 * an {@code ActivityProfileState} with non-zero probability at site 400 and passes it to its {@code ActivityProfile}.
 * The {@code ActivityProfile} eventually produces an AssemblyRegion based on the {@code AssemblyRegionArgumentCollection} parameters.
 * Let's suppose that this initial region has primary span 350-450 and padded span 100 - 700.
 *
 * Next, the assembly engine assembles all reads that overlap the padded interval to find variant haplotypes and the variants
 * they contain.  The AssemblyRegion is then trimmed down to a new primary interval bound by all assembled variants within the original primary interval
 * and a new padded interval.  The amount of padding of the new padded interval around the variants depends on the needs of local realignment
 * and as such need not equal the original padding that was used for assembly.
 */
public final class AssemblyRegion implements Locatable {

    private final SAMFileHeader header;

    /**
     * The reads included in this assembly region.  May be empty upon creation, and expand / contract
     * as reads are added or removed from this region.
     */
    private final List reads;

    /**
     * The reads are specifically used for haplotype generation to kmerize reads to match with haplotype kmers.
     */
    private final List hardClippedPileupReads;

    /**
     * The active span in which this AssemblyRegion is responsible for calling variants
     */
    private final SimpleInterval activeSpan;

    /**
     * The padded span in which we perform assembly etc in order to call variants within the active span
     */
    private final SimpleInterval paddedSpan;

    /**
     * Does this region represent an active region (all isActiveProbs above threshold) or
     * an inactive region (all isActiveProbs below threshold)?
     */
    private boolean isActive;

    /**
     * Indicates whether the region has been finalized
     */
    private boolean hasBeenFinalized;

    private List alignmentData = new ArrayList<>();

    /**
     * Create a new AssemblyRegion containing no reads
     *  @param activeSpan the span of this active region
     * @param isActive indicates whether this is an active region, or an inactive one
     * @param padding the active region padding to use for this active region
     */
    public AssemblyRegion(final SimpleInterval activeSpan, final boolean isActive, final int padding, final SAMFileHeader header) {
        this(activeSpan, makePaddedSpan(activeSpan, padding, header), isActive, header);
    }

    private static SimpleInterval makePaddedSpan(final SimpleInterval activeSpan, final int padding, final SAMFileHeader header) {
        final String contig = activeSpan.getContig();
        final SAMSequenceRecord sequence = header.getSequence(contig);
        if( sequence == null) {
            throw new UserException.MissingContigInSequenceDictionary(contig, header.getSequenceDictionary());
        }
        return IntervalUtils.trimIntervalToContig(contig, activeSpan.getStart() - padding, activeSpan.getEnd() + padding, sequence.getSequenceLength());
    }

    /**
     * Create a new AssemblyRegion containing no reads
     * @param activeSpan the span of this active region
     * @param paddedSpan    the padded span of this active region
     * @param isActive indicates whether this is an active region, or an inactive one
     */
    public AssemblyRegion(final SimpleInterval activeSpan, final SimpleInterval paddedSpan, final boolean isActive, final SAMFileHeader header) {
        this.header = Utils.nonNull(header);
        this.activeSpan = Utils.nonNull(activeSpan);
        this.paddedSpan = Utils.nonNull(paddedSpan);

        Utils.validateArg( activeSpan.size() > 0, () -> "Active region cannot be of zero size, but got " + activeSpan);
        Utils.validate(paddedSpan.contains(activeSpan), "Padded span must contain active span.");

        reads = new ArrayList<>();
        hardClippedPileupReads = new ArrayList<>();
        this.isActive = isActive;
    }

    /**
     * Simple interface to create an assembly region that isActive without any profile state
     */
    public AssemblyRegion(final SimpleInterval activeSpan, final int padding, final SAMFileHeader header) {
        this(activeSpan, true, padding, header);
    }

    /**
     * Method for obtaining the alignment data which is attached to the assembly region.
     *
     * @return The list of AlignmentData objects associated with ActiveRegion.
     */
    public List getAlignmentData() {
        return alignmentData;
    }

    /**
     * Method for adding alignment data to the collection of AlignmentData associated with
     * the ActiveRegion.
     */
    public void addAllAlignmentData(List alignmentData) {
        this.alignmentData.addAll(alignmentData);
    }

    @Override
    public String getContig() {
        return activeSpan.getContig();
    }

    @Override
    public int getStart() {
        return activeSpan.getStart();
    }

    @Override
    public int getEnd() {
        return activeSpan.getEnd();
    }

    @Override
    public String toString() {
        return "AssemblyRegion "  + activeSpan.toString() + " active?=" + isActive + " nReads=" + reads.size();
    }

    /**
     * Does this region represent an active region (all isActiveProbs above threshold) or
     * an inactive region (all isActiveProbs below threshold)?
     */
    public boolean isActive() {
        return isActive;
    }

    /**
     * Override activity state of the region
     *
     * Note: Changing the isActive state after construction is a debug-level operation that only engine classes
     * like AssemblyRegionWalker should be able to do
     *
     * @param value new activity state of this region
     */
    void setIsActive(final boolean value) {
        isActive = value;
    }

    /**
     * Get the span of this assembly region including the padding value
     * @return a non-null SimpleInterval
     */
    public SimpleInterval getPaddedSpan() { return paddedSpan; }

    /**
     * Get the raw span of this assembly region (excluding the padding)
     * @return a non-null SimpleInterval
     */
    public SimpleInterval getSpan() { return activeSpan; }

    /**
     * Get an unmodifiable copy of the list of reads currently in this assembly region.
     *
     * The reads are sorted by their coordinate position.
     * @return an unmodifiable and inmutable copy of the reads in the assembly region.
    */
    public List getReads(){
        return Collections.unmodifiableList(new ArrayList<>(reads));
    }

    /**
     * Get an unmodifiable copy of the list of reads currently in this assembly region.
     *
     * The reads are sorted by their coordinate position.
     * @return an unmodifiable and inmutable copy of the reads in the assembly region.
     */
    public List getHardClippedPileupReads(){
        return Collections.unmodifiableList(new ArrayList<>(hardClippedPileupReads));
    }

    /**
     * Returns the header for the reads in this region.
     */
    public SAMFileHeader getHeader(){
        return header;
    }

    /**
     * Trim this region to just the span, producing a new assembly region without any reads that has only
     * the extent of newExtend intersected with the current extent
     * @param span the new extend of the active region we want
     * @param padding the padding size we want for the newly trimmed active region
     * @return a non-null, empty assembly region
     */
    public AssemblyRegion trim(final SimpleInterval span, final int padding) {
        Utils.nonNull(span, "Active region extent cannot be null");
        Utils.validateArg( padding >= 0, "the padding size must be 0 or greater");
        final SimpleInterval paddedSpan = span.expandWithinContig(padding, header.getSequenceDictionary());
        return trim(span, paddedSpan);
    }

    /**
     * Trim this region to no more than the span, producing a new assembly region with properly trimmed reads that
     * attempts to provide the best possible representation of this region covering the span.
     *
     * The challenge here is that span may (1) be larger than can be represented by this assembly region
     * + its original padding and (2) the padding must be symmetric on both sides.  This algorithm
     * therefore determines how best to represent span as a subset of the span of this
     * region with a padding value that captures as much of the span as possible.
     *
     * For example, suppose this active region is
     *
     * Active:    100-200 with padding of 50, so that the true span is 50-250
     * NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225
     *
     * Here we represent the assembly region as a region from 150-200 with 25 bp of padding.
     *
     * The overall constraint is that the region can never exceed the original region, and
     * the padding is chosen to maximize overlap with the desired region
     *
     * @param span the new extend of the active region we want
     * @return a non-null, empty active region
     */
    public AssemblyRegion trim(final SimpleInterval span, final SimpleInterval paddedSpan) {
        Utils.nonNull(span, "Active region extent cannot be null");
        Utils.nonNull(paddedSpan, "Active region padded span cannot be null");
        Utils.validateArg(paddedSpan.contains(span), "The requested padded span must fully contain the requested span");

        final SimpleInterval newActiveSpan = getSpan().intersect(span);
        final SimpleInterval newPaddedSpan = getPaddedSpan().intersect(paddedSpan);

        final AssemblyRegion result = new AssemblyRegion( newActiveSpan, newPaddedSpan, isActive, header );

        final List trimmedReads = reads.stream()
                .map(read -> {GATKRead clipped = ReadClipper.hardClipToRegion(read, newPaddedSpan.getStart(), newPaddedSpan.getEnd());
                              return clipped;})
                .filter(read -> !read.isEmpty() && read.overlaps(result.paddedSpan))
                .sorted(new ReadCoordinateComparator(header))
                .collect(Collectors.toList());

        result.clearReads();
        result.addAll(trimmedReads);
        return result;
    }


    /**
     * Add read to this region
     *
     * Read must have alignment start >= than the last read currently in this active region.
     *
     * @throws IllegalArgumentException if read doesn't overlap the padded region of this active region
     *
     * @param read a non-null GATKRead
     */
    public void add( final GATKRead read ) {
        addToReadCollectionAndValidate(read, reads);
    }

    private void addToReadCollectionAndValidate(final GATKRead read, final List collection) {
        Utils.nonNull(read, "Read cannot be null");
        final SimpleInterval readLoc = new SimpleInterval( read );
        Utils.validateArg(paddedSpan.overlaps(read), () ->
                "Read location " + readLoc + " doesn't overlap with active region padded span " + paddedSpan);

        if ( ! collection.isEmpty() ) {
            final GATKRead lastRead = collection.get(collection.size() - 1);
            Utils.validateArg(Objects.equals(lastRead.getContig(), read.getContig()), () ->
                    "Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read);
            Utils.validateArg( read.getStart() >= lastRead.getStart(), () ->
                    "Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getStart() + " attempting to add " + read + " at " + read.getStart());
        }

        collection.add( read );
    }

    /**
     * Get the number of reads currently in this region
     * @return an integer >= 0
     */
    public int size() { return reads.size(); }

    /**
     * Clear all of the reads currently in this region
     */
    public void clearReads() {
        reads.clear();
        hardClippedPileupReads.clear();
    }

    /**
     * Remove all of the reads in readsToRemove from this region
     * @param readsToRemove the set of reads we want to remove
     */
    public void removeAll( final Collection readsToRemove ) {
        Utils.nonNull(readsToRemove);
        reads.removeAll(readsToRemove);
    }

    /**
     * Add all readsToAdd to this region
     * @param readsToAdd a collection of readsToAdd to add to this active region
     */
    public void addAll(final Collection readsToAdd){
        Utils.nonNull(readsToAdd).forEach(r -> add(r));
    }

    public void addHardClippedPileupReads(final Collection readsToAdd) {
        Utils.nonNull(readsToAdd).forEach(r -> addToReadCollectionAndValidate(r, hardClippedPileupReads));
    }

    /**
     * Get the reference bases from referenceReader spanned by the padded span of this region,
     * including additional padding bp on either side.  If this expanded region would exceed the boundaries
     * of the active region's contig, the returned result will be truncated to only include on-genome reference
     * bases.
     *
     * @param referenceReader the source of the reference genome bases
     * @param padding the padding, in BP, we want to add to either side of this active region padded span
     * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for
     * @return a non-null array of bytes holding the reference bases in referenceReader
     */
    private static byte[] getReference(final ReferenceSequenceFile referenceReader, final int padding, final SimpleInterval genomeLoc) {
        Utils.nonNull(referenceReader, "referenceReader cannot be null");
        Utils.nonNull(genomeLoc, "genomeLoc cannot be null");
        Utils.validateArg( padding >= 0, () -> "padding must be a positive integer but got " + padding);
        Utils.validateArg( genomeLoc.size() > 0, () -> "GenomeLoc must have size > 0 but got " + genomeLoc);

        final String contig = genomeLoc.getContig();
        final SAMSequenceDictionary sequenceDictionary = referenceReader.getSequenceDictionary();
        final SAMSequenceRecord sequence = sequenceDictionary.getSequence(contig);
        if ( sequence == null ) {
            throw new UserException.MissingContigInSequenceDictionary("Contig: " + contig + " not found in reference dictionary." +
                    "\nPlease check that you are using a compatible reference for your data." +
                    "\nReference Contigs: " + ReadUtils.prettyPrintSequenceRecords(sequenceDictionary));
        }
        return referenceReader.getSubsequenceAt(contig,
                Math.max(1, genomeLoc.getStart() - padding),
                Math.min(sequence.getSequenceLength(), genomeLoc.getEnd() + padding)).getBases();
    }

    /**
     * See {@link #getAssemblyRegionReference} with padding == 0
     */
    public byte[] getAssemblyRegionReference( final ReferenceSequenceFile referenceReader ) {
        return getAssemblyRegionReference(referenceReader, 0);
    }

    /**
     * Get the reference bases from referenceReader spanned by the padded span of this active region,
     * including additional padding bp on either side.  If this expanded region would exceed the boundaries
     * of the active region's contig, the returned result will be truncated to only include on-genome reference
     * bases
     *
     * @param referenceReader the source of the reference genome bases
     * @param padding the padding, in BP, we want to add to either side of this active region padded region
     * @return a non-null array of bytes holding the reference bases in referenceReader
     */
    public byte[] getAssemblyRegionReference(final ReferenceSequenceFile referenceReader, final int padding ) {
        return getReference(referenceReader, padding, paddedSpan);
    }

    public void setFinalized(final boolean value) {
        hasBeenFinalized = value;
    }

    public boolean isFinalized() {
        return hasBeenFinalized;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy