All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.utils.pileup.PileupElement Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.utils.pileup;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.util.*;

import static org.broadinstitute.hellbender.utils.BaseUtils.Base.D;

/**
 * Represents an individual base in a reads pileup.
 */
public final class PileupElement {

    private static final EnumSet ON_GENOME_OPERATORS =
            EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D);

    public static final byte DELETION_BASE = D.base;
    public static final byte DELETION_QUAL = (byte) 16;
    public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87;
    public static final byte C_FOLLOWED_BY_INSERTION_BASE = (byte) 88;
    public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89;
    public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90;

    private final GATKRead read;         // the read this base belongs to
    private final int offset;            // the offset in the bases array for this base

    private final CigarElement currentCigarElement;
    private final int currentCigarOffset;
    private final int offsetInCurrentCigar;

    /**
     * Create a new pileup element
     *
     * @param read a non-null read to pileup
     * @param baseOffset the offset into the read's base / qual vector aligned to this position on the genome. If the
     *                   current cigar element is a deletion, offset should be the offset of the last M/=/X position.
     * @param currentElement a non-null CigarElement that indicates the cigar element aligning the read to the genome
     * @param currentCigarOffset the offset of currentElement in read.getCigar().getElement(currentCigarOffset) == currentElement)
     * @param offsetInCurrentCigar how far into the currentElement are we in our alignment to the genome?
     */
    public PileupElement(final GATKRead read,
                         final int baseOffset,
                         final CigarElement currentElement,
                         final int currentCigarOffset,
                         final int offsetInCurrentCigar) {
        // Note: bounds checking on the indices proved quite expensive and affected the performance of
        // the HaplotypeCaller, as this class is a major hotspot -- therefore we are living a little
        // dangerously by going without runtime bounds checks here.
        
        this.read = read;
        this.offset = baseOffset;
        this.currentCigarElement = currentElement;
        this.currentCigarOffset = currentCigarOffset;
        this.offsetInCurrentCigar = offsetInCurrentCigar;
    }

    /**
     * Create a new PileupElement that's a copy of toCopy
     * @param toCopy the element we want to copy
     */
    public PileupElement(final PileupElement toCopy) {
        this(toCopy.read, toCopy.offset, toCopy.currentCigarElement, toCopy.currentCigarOffset, toCopy.offsetInCurrentCigar);
    }

    /**
     * Create a pileup element for read at offset.
     *
     * offset must correspond to a valid read offset given the read's cigar, or an IllegalStateException will be throw
     *
     * @param read a read
     * @param offset the offset into the bases we'd like to use in the pileup
     * @return a valid PileupElement with read and at offset
     */
    public static PileupElement createPileupForReadAndOffset(final GATKRead read, final int offset) {
        final AlignmentStateMachine stateMachine = new AlignmentStateMachine(read);

        while ( stateMachine.stepForwardOnGenome() != null ) {
            if ( stateMachine.getReadOffset() == offset ) {
                return stateMachine.makePileupElement();
            }
        }

        throw new IllegalStateException("Tried to create a pileup for read " + read + " with offset " + offset +
                " but we never saw such an offset in the alignment state machine");
    }

    /**
     * Is this element a deletion w.r.t. the reference genome?
     *
     * @return true if this is a deletion, false otherwise
     */
    public boolean isDeletion() {
        return currentCigarElement.getOperator() == CigarOperator.D;
    }

    /**
     * Is the current element immediately before a deletion, but itself not a deletion?
     *
     * Suppose we are aligning a read with cigar 3M2D1M.  This function is true
     * if we are in the last cigar position of the 3M, but not if we are in the 2D itself.
     *
     * @return true if the next alignment position is a deletion w.r.t. the reference genome
     */
    public boolean isBeforeDeletionStart() {
        return ! isDeletion() && atEndOfCurrentCigar() && hasOperator(getNextOnGenomeCigarElement(), CigarOperator.D);
    }

    /**
     * Is the current element immediately after a deletion, but itself not a deletion?
     *
     * Suppose we are aligning a read with cigar 1M2D3M.  This function is true
     * if we are in the first cigar position of the 3M, but not if we are in the 2D itself or
     * in any but the first position of the 3M.
     *
     * @return true if the previous alignment position is a deletion w.r.t. the reference genome
     */
    public boolean isAfterDeletionEnd() {
        return ! isDeletion() && atStartOfCurrentCigar() && hasOperator(getPreviousOnGenomeCigarElement(), CigarOperator.D);
    }

    /**
     * Get the read for this pileup element. Returns the live object stored in this pileup element.
     * @return a non-null Read
     */
    public GATKRead getRead() {
        return read;
    }

    /**
     * Get the offset of the this element into the read that aligns that read's base to this genomic position.
     * If the current element is a deletion then offset is the offset of the last base containing offset.
     *
     * @return a valid offset into the read's bases
     */
    public int getOffset() {
        return offset;
    }

    /**
     * Get the base aligned to the genome at this location
     * If the current element is a deletion returns {@link org.broadinstitute.hellbender.utils.pileup.PileupElement#DELETION_BASE}.
     * @return a base encoded as a byte
     */
    public byte getBase() {
        return isDeletion() ? DELETION_BASE : read.getBase(offset);
    }

    /**
     * Get the base quality score of the base at this aligned position on the genome
     * @return a phred-scaled quality score as a byte
     */
    public byte getQual() {
        return isDeletion() ? DELETION_QUAL : read.getBaseQuality(offset);
    }

    /**
     * Get the Base Insertion quality at this pileup position
     * @return a phred-scaled quality score as a byte
     */
    public byte getBaseInsertionQual() {
        return isDeletion() ? DELETION_QUAL : ReadUtils.getBaseInsertionQualities(read)[offset];
    }

    /**
     * Get the Base Deletion quality at this pileup position
     * @return a phred-scaled quality score as a byte
     */
    public byte getBaseDeletionQual() {
        return isDeletion() ? DELETION_QUAL : ReadUtils.getBaseDeletionQualities(read)[offset];
    }

    /**
     * Helpful function to get the immediately following cigar element, for an insertion or deletion
     *
     * if this state precedes a deletion (i.e., next position on genome) or insertion (immediately between
     * this and the next position) returns the CigarElement corresponding to this event.  Otherwise returns
     * null.
     *
     * @return a CigarElement, or null if the next alignment state isn't an insertion or deletion.
     */
    private CigarElement getNextIndelCigarElement() {
        if ( isBeforeDeletionStart() ) {
            final CigarElement element = getNextOnGenomeCigarElement();
            Utils.validate(element != null && element.getOperator() == CigarOperator.D, () -> "Immediately before deletion but the next cigar element isn't a deletion " + element);
            return element;
        } else if ( isBeforeInsertion() ) {
            final CigarElement element = getBetweenNextPosition().get(0);
            Utils.validate(element.getOperator() == CigarOperator.I, () -> "Immediately before insertion but the next cigar element isn't an insertion " + element);
            return element;
        } else {
            return null;
        }
    }

    /**
     * Get the mapping quality of the read of this element
     * @return the mapping quality of the underlying read
     */
    public int getMappingQual() {
        return read.getMappingQuality();
    }

    @Override
    public String toString() {
        return String.format("%s @ %d = %c Q%d", read.getName(), offset, (char) getBase(), getQual());
    }

    /**
     * Get the cigar element aligning this element to the genome
     * @return a non-null CigarElement
     */
    public CigarElement getCurrentCigarElement() {
        return currentCigarElement;
    }

    /**
     * Get the offset of this cigar element in the Cigar of the current read (0-based)
     *
     * Suppose the cigar is 1M2D3I4D.  If we are in the 1M state this function returns
     * 0.  If we are in 2D, the result is 1.  If we are in the 4D, the result is 3.
     *
     * @return an offset into the read.getCigar() that brings us to the current cigar element
     */
    public int getCurrentCigarOffset() {
        return currentCigarOffset;
    }

    /**
     * Get the cigar elements that occur before the current position but after the previous position on the genome
     *
     * For example, if we are in the 3M state of 1M2I3M state then 2I occurs before this position.
     *
     * Note that this function does not care where we are in the current cigar element.  In the previous
     * example this list of elements contains the 2I state regardless of where you are in the 3M.
     *
     * Note this returns the list of all elements that occur between this and the prev site, so for
     * example we might have 5S10I2M and this function would return [5S, 10I].
     *
     * @return a non-null list of CigarElements
     */
    public List getBetweenPrevPosition() {
        return atStartOfCurrentCigar() ? getBetween(Direction.PREV) : Collections.emptyList();
    }

    /**
     * Get the cigar elements that occur after the current position but before the next position on the genome
     *
     * @see #getBetweenPrevPosition() for more details
     *
     * @return a non-null list of CigarElements
     */
    public List getBetweenNextPosition() {
        return atEndOfCurrentCigar() ? getBetween(Direction.NEXT) : Collections.emptyList();
    }

    /**
     * Get the length of an immediately following insertion or deletion event, or 0 if no such event exists
     *
     * Only returns a positive value when this pileup element is immediately before an indel.  Being
     * immediately before a deletion means that this pileup element isn't an deletion, and that the
     * next genomic alignment for this read is a deletion.  For the insertion case, this means
     * that an insertion cigar occurs immediately after this element, between this one and the
     * next genomic position.
     *
     * Note this function may be expensive, so multiple uses should be cached by the caller
     *
     * @return length of the event (number of inserted or deleted bases), or 0
     */
    public int getLengthOfImmediatelyFollowingIndel() {
        final CigarElement element = getNextIndelCigarElement();
        return element == null ? 0 : element.getLength();
    }

    /**
     * Get the bases for an insertion that immediately follows this alignment state, or null if none exists
     *
     * @see #getLengthOfImmediatelyFollowingIndel() for details on the meaning of immediately.
     *
     * If the immediately following state isn't an insertion, returns null
     *
     * @return actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
     */
    public String getBasesOfImmediatelyFollowingInsertion() {
        final CigarElement element = getNextIndelCigarElement();
        if ( element != null && element.getOperator() == CigarOperator.I ) {
            final int getFrom = offset + 1;
            final byte[] bases = Arrays.copyOfRange(read.getBases(), getFrom, getFrom + element.getLength());
            return new String(bases);
        } else {
            return null;
        }
    }

    /**
     * Get the offset into the *current* cigar element for this alignment position
     *
     * We can be anywhere from offset 0 (first position) to length - 1 of the current
     * cigar element aligning us to this genomic position.
     *
     * @return a valid offset into the current cigar element
     */
    public int getOffsetInCurrentCigar() {
        return offsetInCurrentCigar;
    }

    /**  Direction of search for some helper functions */
    @VisibleForTesting
    enum Direction { PREV(-1), NEXT(1);
        private final int increment;

        public int getIncrement() {
            return increment;
        }

        private Direction(final int increment){
            this.increment = increment;
        }
    }

    /**
     * Helper function to get cigar elements between this and either the prev or next genomic position.
     * Note that this method stops accumulating elements at the first "on-genome" operator it encounters.
     *
     * @param direction PREVIOUS if we want before, NEXT if we want after
     * @return a non-null list of cigar elements between this and the neighboring position in direction
     */
    private List getBetween(final Direction direction) {
        final int increment = direction.getIncrement();

        final List elements = new ArrayList<>();
        final List cigarElements = read.getCigarElements();
        final int nCigarElements = cigarElements.size();
        for ( int i = currentCigarOffset + increment; i >= 0 && i < nCigarElements; i += increment) {
            final CigarElement elt = cigarElements.get(i);
            if ( ON_GENOME_OPERATORS.contains(elt.getOperator()) ) {
                break;
            } else {
                elements.add(elt);
            }
        }
        if (increment < 0){
            Collections.reverse(elements);
        }
        return elements;
    }

    /**
     * Helper function to get cigar operator right next to this position.
     *
     * @param direction PREVIOUS if we want before, NEXT if we want after
     * @return the next Cigar operator in the given direction, or null if there is none
     */
    @VisibleForTesting
    CigarOperator getAdjacentOperator(final Direction direction) {
        final int increment = direction.getIncrement();

        final int i = currentCigarOffset + increment;
        if (i < 0 || i >= read.numCigarElements()) {
            return null;
        }
        return read.getCigarElement(i).getOperator();
    }
    /**
     * Get the cigar element of the previous genomic aligned position
     *
     * For example, we might have 1M2I3M, and be sitting at the someone in the 3M.  This
     * function would return 1M, as the 2I isn't on the genome.  Note this function skips
     * all of the positions that would occur in the current element.  So the result
     * is always 1M regardless of whether we're in the first, second, or third position of the 3M
     * cigar.
     *
     * @return a CigarElement, or null (indicating that no previous element exists)
     */
    public CigarElement getPreviousOnGenomeCigarElement() {
        return getNearestOnGenomeCigarElement(Direction.PREV);
    }

    /**
     * Get the cigar element of the next genomic aligned position
     *
     * @see #getPreviousOnGenomeCigarElement() for more details
     *
     * @return a CigarElement, or null (indicating that no next element exists)
     */
    public CigarElement getNextOnGenomeCigarElement() {
        return getNearestOnGenomeCigarElement(Direction.NEXT);
    }

    /**
     * Helper function to get the cigar element of the next or previous genomic position
     * @param direction the direction to look in
     * @return nearest on-genome CigarElement or null if no such element exists
     */
    private CigarElement getNearestOnGenomeCigarElement(final Direction direction) {
        final int increment = direction.getIncrement();
        final int nCigarElements = read.numCigarElements();

        for ( int i = currentCigarOffset + increment; i >= 0 && i < nCigarElements; i += increment) {
            final CigarElement elt = read.getCigarElement(i);
            if ( ON_GENOME_OPERATORS.contains(elt.getOperator()) ) {
                return elt;
            }
        }

        // getting here means that you didn't find anything
        return null;
    }

    /**
     * Does the cigar element (which may be null) have operation toMatch?
     *
     * @param maybeCigarElement a CigarElement that might be null
     * @param toMatch a CigarOperator we want to match against the one in maybeCigarElement
     * @return true if maybeCigarElement isn't null and has operator toMatch
     */
    private boolean hasOperator(final CigarElement maybeCigarElement, final CigarOperator toMatch) {
        return maybeCigarElement != null && maybeCigarElement.getOperator() == toMatch;
    }

    /**
     * Does an insertion occur immediately before the current position on the genome?
     *
     * @return true if an insertion occurs immediately before the current position on the genome, false otherwise
     */
    public boolean isAfterInsertion() { return isImmediatelyAfter(CigarOperator.I); }

    /**
     * Does an insertion occur immediately after the current position on the genome?
     *
     * @return true an insertion occurs immediately after the current position on the genome, false otherwise
     */
    public boolean isBeforeInsertion() {
        return isImmediatelyBefore(CigarOperator.I);
    }

    /**
     * Does a soft-clipping event occur immediately before the current position on the genome?
     *
     * @return true if a soft-clipping event occurs immediately before the current position on the genome, false otherwise
     */
    public boolean isAfterSoftClip() {
        return isImmediatelyAfter(CigarOperator.S);
    }

    /**
     * Does a soft-clipping event occur immediately after the current position on the genome?
     *
     * @return true if a soft-clipping event occurs immediately after the current position on the genome, false otherwise.
     */
    public boolean isBeforeSoftClip() {
        return isImmediatelyBefore(CigarOperator.S);
    }

    /**
     * @return true if a given operator event occurs immediately before the current position on the genome, false otherwise.
     */
    @VisibleForTesting
    boolean isImmediatelyAfter(final CigarOperator op) {
        return atStartOfCurrentCigar() && getAdjacentOperator(Direction.PREV) == op;
    }

    /**
    * @return true if a given operator event occurs immediately after the current position on the genome, false otherwise.
     */
    @VisibleForTesting
    boolean isImmediatelyBefore(final CigarOperator op) {
        return atEndOfCurrentCigar() && getAdjacentOperator(Direction.NEXT) == op;
    }

    /**
     * Does a soft-clipping event occur immediately before or after the current position on the genome?
     *
     * @return true if a soft-clipping event occurs immediately before or after the current position on the genome, false otherwise.
     */
    public boolean isNextToSoftClip() { return isAfterSoftClip() || isBeforeSoftClip(); }

    /**
     * Is the current position at the end of the current cigar?
     *
     * For example, if we are in element 3M, this function returns true if we are at offsetInCurrentCigar
     * of 2, but not 0 or 1.
     *
     * @return true if we're at the end of the current cigar
     */
    public boolean atEndOfCurrentCigar() {
        return offsetInCurrentCigar == currentCigarElement.getLength() - 1;
    }

    /**
     * Is the current position at the start of the current cigar?
     *
     * For example, if we are in element 3M, this function returns true if we are at offsetInCurrentCigar
     * of 0, but not 1 or 2.
     *
     * @return true if we're at the start of the current cigar
     */
    public boolean atStartOfCurrentCigar() {
        return offsetInCurrentCigar == 0;
    }

    /**
     * Can the base in this pileup element be used in comparative tests?
     *
     * @param p the pileup element to consider
     *
     * @return true if this base is part of a meaningful read for comparison, false otherwise
     */
    public static boolean isUsableBaseForAnnotation(final PileupElement p) {
        return !( p.isDeletion() ||
                p.getMappingQual() == 0 ||
                p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
                ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy