org.broadinstitute.hellbender.utils.pileup.ReadPileup Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gatk Show documentation
Development on GATK 4
The newest version!
package org.broadinstitute.hellbender.utils.pileup;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.qc.Pileup;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.fragments.FragmentCollection;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.util.*;
import java.util.function.Predicate;
import java.util.function.ToIntFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Represents a pileup of reads at a given position.
 */
public class ReadPileup implements Iterable {
    private final Locatable loc;
    private final List pileupElements;

    /** Constant used by samtools to downgrade a quality for overlapping reads that disagrees in their base. */
    public static final double SAMTOOLS_OVERLAP_LOW_CONFIDENCE = 0.8;

    /**
     * Create a new pileup at loc, using the reads and their corresponding
     * offsets.
     * Note: This constructor keeps an alias to the given list.
     */
    public ReadPileup(final Locatable loc, final List pileup) {
        this.loc = loc;
        this.pileupElements = pileup;
    }

    /**
     * Create a new pileup at loc, using an stratified pileup
     * Note: the current implementation of ReadPileup does not efficiently retrieve the stratified pileup
     */
    public ReadPileup(final Locatable loc, final Map stratifiedPileup) {
        // NOTE: this bit of code is a potential performance hotspot for the HaplotypeCaller.
        // This straightforward loop outperforms the equivalent streaming expression by over 2x.
        List allElements = new ArrayList<>(stratifiedPileup.size() * 1000);
        for ( final Map.Entry pileupEntry : stratifiedPileup.entrySet() ) {
            allElements.addAll(pileupEntry.getValue().pileupElements);
        }

        this.loc = loc;
        this.pileupElements = allElements;
    }

    /**
     * Create a new pileup without any aligned reads
     */
    public ReadPileup(final Locatable loc) {
        this(loc, new ArrayList());
    }

    /**
     * Create a new pileup with the given reads.
     */
    public ReadPileup(final Locatable loc, final List reads, final int offset) {
        this(loc, readsOffsetsToPileup(reads, offset));
    }

    /**
     * Create a new pileup with the given reads.
     */
    public ReadPileup(final Locatable loc, final List reads, final List offsets) {
        this(loc, readsOffsetsToPileup(reads, offsets));
    }

    /**
     * Get the pileup of reads covering a locus.  This is useful, for example, in VariantWalkers, which work on
     * ReadsContexts and not AlignmentContexts.
     */
    public ReadPileup(final Locatable loc, final Iterable reads) {
        final List pile = StreamSupport.stream(reads.spliterator(), false)
                .filter(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK.and(ReadFilterLibrary.NOT_DUPLICATE))
                .map(AlignmentStateMachine::new)
                .map(asm -> {
                    while ( asm.stepForwardOnGenome() != null && asm.getGenomePosition() < loc.getStart()) { }
                    return asm.getGenomePosition() == loc.getStart() ? asm.makePileupElement() : null;
                }).filter(Objects::nonNull).collect(Collectors.toList());

        this.loc = loc;
        pileupElements = pile;
    }

    /**
     * Returns the first element corresponding to the given read or null there is no such element.
     *
     * @param read or null if elements with no reads are to be retrieved from this pileup.
     * @return the first element corresponding to the given read or null there is no such element.
     */
    @VisibleForTesting
    PileupElement getElementForRead(final GATKRead read) {
        return getElementStream().filter(el -> Objects.equals(el.getRead(), read)).findAny().orElse(null);
    }

    /**
     * Helper routine for converting reads and offset lists to a PileupElement list.
     */
    private static List readsOffsetsToPileup(final List reads, final List offsets) {
        if (reads.size() != offsets.size()) {
            throw new IllegalArgumentException("Reads and offset lists have different sizes!");
        }

        final List pileup = new ArrayList<>(reads.size());
        for (int i = 0; i < reads.size(); i++) {
            pileup.add(PileupElement.createPileupForReadAndOffset(reads.get(i), offsets.get(i)));
        }

        return pileup;
    }

    /**
     * Helper routine for converting reads and a single offset to a PileupElement list.
     */
    private static List readsOffsetsToPileup(final List reads, final int offset) {
        return reads.stream().map(r -> PileupElement.createPileupForReadAndOffset(r, offset)).collect(Collectors.toList());
    }

    /**
     * Make a new pileup consisting of elements of this pileup that satisfy the predicate.
     * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
     */
    public ReadPileup makeFilteredPileup(final Predicate filter) {
        return new ReadPileup(loc, getElementStream().filter(filter).collect(Collectors.toList()));
    }

    /**
     * Make a new pileup from elements whose reads have read groups that agree with the given lane ID.
     * (if they have a name equal to the ID or starting with ID followed by a period ".").
     * Also, if both laneID and read group ID are {@code null}, the read will be included.
     * Returns empty pileup if no suitable elements are found.
     * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
     */
    public ReadPileup getPileupForLane(final String laneID) {
        return makeFilteredPileup(p -> {
            final GATKRead read = p.getRead();
            final String readGroupID = read.getReadGroup();
            if (laneID == null && readGroupID == null) {
                return true;
            }
            if (laneID != null && readGroupID != null) {
                final boolean laneSame = readGroupID.startsWith(laneID + "."); // lane is the same, but sample identifier is different
                final boolean exactlySame = readGroupID.equals(laneID);        // in case there is no sample identifier, they have to be exactly the same
                if (laneSame || exactlySame) {
                    return true;
                }
            }
            return false;
        });
    }

    /**
     * Make a new pileup from elements whose reads belong to the given sample.
     * Passing null sample as an argument retrieves reads whose read group or sample name is {@code null}
     * NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
     */
    public ReadPileup getPileupForSample(final String sample, final SAMFileHeader header) {
        return makeFilteredPileup(pe -> Objects.equals(ReadUtils.getSampleName(pe.getRead(), header), sample));
    }

    /**
     * Gets a set of the read groups represented in this pileup.
     * Note: contains null if a read has a null read group
     */
    public Set getReadGroupIDs() {
        return getElementStream().map(pe -> pe.getRead().getReadGroup()).collect(Collectors.toSet());
    }

    /**
     * Gets a set of the samples represented in this pileup.
     * Note: contains null if a read has a null read group or a null sample name.
     */
    public Set getSamples(final SAMFileHeader header) {
        return getElementStream().map(pe -> pe.getRead()).map(r -> ReadUtils.getSampleName(r, header)).collect(Collectors.toSet());
    }

    /**
     * Splits the ReadPileup by sample
     *
     * @param header            the header to retrieve the samples from
     * @param unknownSampleName the sample name if there is no read group/sample name; {@code null} if lack of RG is not expected
     * @return a Map of sample name to ReadPileup (including empty pileups for a sample)
     * @throws org.broadinstitute.hellbender.exceptions.UserException.ReadMissingReadGroup if unknownSampleName is {@code null} and there are reads without RG/sample name
     */
    public Map splitBySample(final SAMFileHeader header, final String unknownSampleName) {
        final Map toReturn = new HashMap<>();
        for (final String sample : getSamples(header)) {
            final ReadPileup pileupBySample = getPileupForSample(sample, header);
            if (sample != null) {
                toReturn.put(sample, pileupBySample);
            } else {
                if (unknownSampleName == null) {
                    throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead());
                }
                toReturn.put(unknownSampleName, pileupBySample);
            }
        }
        return toReturn;
    }

    /**
     * The best way to access PileupElements where you only care about the bases and quals in the pileup.
     * 
     * for (PileupElement p : this) { doSomething(p); }
     * 

     * Provides efficient iteration of the data.
     */
    @Override
    public Iterator iterator() {
        // Profiling has determined that returning a custom unmodifiable iterator is faster than
        // Collections.unmodifiableList(pileupElements).iterator()
        return new Iterator() {
            private final int len = pileupElements.size();
            private int i = 0;

            @Override
            public boolean hasNext() {
                return i < len;
            }

            @Override
            public PileupElement next() {
                return pileupElements.get(i++);
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException("Cannot remove from a pileup element iterator");
            }
        };
    }

    /**
     * Iterator over sorted by read start PileupElements.
     */
    public Iterator sortedIterator() {
        return getElementStream()
                .sorted((l, r) -> Integer.compare(l.getRead().getStart(), r.getRead().getStart()))
                .iterator();
    }

    /**
     * The number of elements in this pileup.
     */
    public int size() {
        return pileupElements.size();
    }

    /**
     * @return true if there are 0 elements in the pileup, false otherwise
     */
    public boolean isEmpty() {
        return size() == 0;
    }

    /**
     * @return the location of this pileup.
     */
    public Locatable getLocation() {
        return loc;
    }

    /**
     * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according
     * to BaseUtils.simpleBaseToBaseIndex for each base.
     * Deletions are not counted.
     */
    public int[] getBaseCounts() {
        final int[] counts = new int[4];

        for (final PileupElement pile : this) {
            // skip deletion sites
            if (!pile.isDeletion()) {
                final int index = BaseUtils.simpleBaseToBaseIndex(pile.getBase());
                if (index != -1) {
                    counts[index]++;
                }
            }
        }

        return counts;
    }

    /**
     * Fixes the quality of all the elements that come from an overlapping pair in the same way as
     * samtools does {@see tweak_overlap_quality function in
     * samtools}.
     * 

     * Setting the quality of one of the bases to 0 effectively removes the redundant base for
     * calling. In addition, if the bases overlap we have increased confidence if they agree (or
     * reduced if they don't). Thus, the algorithm proceeds as following:
     * 

     * 1. If the bases are the same, the quality of the first element is the sum of both qualities
     * and the quality of the second is reduced to 0.
     * 2. If the bases are different, the base with the highest quality is reduced with a factor of
     * 0.8, and the quality of the lowest is reduced to 0.
     * 
     * Note: Resulting qualities higher than {@link QualityUtils#MAX_SAM_QUAL_SCORE} are capped.
     */
    public void fixOverlaps() {
        final FragmentCollection fragments = FragmentCollection.create(this);
        fragments.getOverlappingPairs().stream()
                .forEach(
                        elements -> fixPairOverlappingQualities(elements.getLeft(), elements.getRight())
                );
    }

    /**
     * Fixes the quality of two elements that come from an overlapping pair in the same way as
     * samtools does {@see tweak_overlap_quality function in
     * samtools}.
     * The only difference with the samtools API is the cap for high values ({@link QualityUtils#MAX_SAM_QUAL_SCORE}).
     */
    @VisibleForTesting
    static void fixPairOverlappingQualities(final PileupElement firstElement,
                                            final PileupElement secondElement) {
        // only if they do not represent deletions
        if (!secondElement.isDeletion() && !firstElement.isDeletion()) {
            final byte[] firstQuals = firstElement.getRead().getBaseQualities();
            final byte[] secondQuals = secondElement.getRead().getBaseQualities();
            if (firstElement.getBase() == secondElement.getBase()) {
                // if both have the same base, extra confidence in the firts of them
                firstQuals[firstElement.getOffset()] =
                        (byte) (firstQuals[firstElement.getOffset()] + secondQuals[secondElement
                                .getOffset()]);
                // cap to maximum byte value
                if (firstQuals[firstElement.getOffset()] < 0
                        || firstQuals[firstElement.getOffset()] > QualityUtils.MAX_SAM_QUAL_SCORE) {
                    firstQuals[firstElement.getOffset()] = QualityUtils.MAX_SAM_QUAL_SCORE;
                }
                secondQuals[secondElement.getOffset()] = 0;
            } else {
                // if not, we lost confidence in the one with higher quality
                if (firstElement.getQual() >= secondElement.getQual()) {
                    firstQuals[firstElement.getOffset()] =
                            (byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * firstQuals[firstElement.getOffset()]);
                    secondQuals[secondElement.getOffset()] = 0;
                } else {
                    secondQuals[secondElement.getOffset()] =
                            (byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * secondQuals[secondElement.getOffset()]);
                    firstQuals[firstElement.getOffset()] = 0;
                }
            }
            firstElement.getRead().setBaseQualities(firstQuals);
            secondElement.getRead().setBaseQualities(secondQuals);
        }
    }

    public final static Comparator baseQualTieBreaker = new Comparator() {
        @Override
        public int compare(PileupElement o1, PileupElement o2) {
            return Byte.compare(o1.getQual(), o2.getQual());
        }
    };

    public final static Comparator mapQualTieBreaker = new Comparator() {
        @Override
        public int compare(PileupElement o1, PileupElement o2) {
            return Integer.compare(o1.getMappingQual(), o2.getMappingQual());
        }
    };

    /**
     * Returns a new ReadPileup where only one read from an overlapping read
     * pair is retained.  If the two reads in question disagree to their basecall,
     * neither read is retained.  If they agree on the base, the read with the higher
     * base quality observation is retained
     *
     * @return the newly filtered pileup
     */
    public ReadPileup getOverlappingFragmentFilteredPileup(SAMFileHeader header) {
        return getOverlappingFragmentFilteredPileup(true, baseQualTieBreaker, header);
    }

    /**
     * Returns a new ReadPileup where only one read from an overlapping read
     * pair is retained.  If discardDiscordant and the two reads in question disagree to their basecall,
     * neither read is retained.  Otherwise, the read with the higher
     * quality (base or mapping, depending on baseQualNotMapQual) observation is retained
     *
     * @return the newly filtered pileup
     */
    public ReadPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, Comparator tieBreaker, SAMFileHeader header) {
        List filteredPileupList = new ArrayList();

        for (ReadPileup pileup : this.splitBySample(header, null).values()) {
            Collection elements = filterSingleSampleForOverlaps(pileup, tieBreaker, discardDiscordant);
            filteredPileupList.addAll(elements);
        }

        return new ReadPileup(loc, filteredPileupList);
    }

    private Collection filterSingleSampleForOverlaps(ReadPileup pileup, Comparator tieBreaker, boolean discardDiscordant) {
        Map filteredPileup = new HashMap();
        Set readNamesDeleted = new HashSet<>();

        for (PileupElement p : pileup) {
            String readName = p.getRead().getName();

            // if we've never seen this read before, life is good
            if (!filteredPileup.containsKey(readName)) {
                if(!readNamesDeleted.contains(readName)) {
                    filteredPileup.put(readName, p);
                }
            } else {
                PileupElement existing = filteredPileup.get(readName);

                // if the reads disagree at this position, throw them all out.  Otherwise
                // keep the element with the highest quality score
                if (discardDiscordant && existing.getBase() != p.getBase()) {
                    filteredPileup.remove(readName);
                    readNamesDeleted.add(readName);
                } else if (tieBreaker.compare(existing, p) < 0) {
                    filteredPileup.put(readName, p);
                }
            }
        }
        return(filteredPileup.values());
    }

    @Override
    public String toString() {
        return String.format("%s %s %s %s",
                loc.getContig(),
                loc.getStart(),
                new String(getBases()),
                getQualsString());
    }

    /**
     * Format, assuming a single-sample, in a samtools-like string.
     * Each line represents a genomic position, consisting of chromosome name, coordinate,
     * reference base, read bases and read qualities
     *
     * @param ref the reference base
     * @return pileup line
     */
    public String getPileupString(final char ref) {
        // In the pileup format,
        return String.format("%s %s %c %s %s",
                getLocation().getContig(), getLocation().getStart(),    // chromosome name and coordinate
                ref,                                                     // reference base
                new String(getBases()),
                getQualsString());
    }

    /**
     * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time
     */
    public List getReads() {
        return getElementStream().map(pe -> pe.getRead()).collect(Collectors.toList());
    }

    private Stream getElementStream() {
        return pileupElements.stream();
    }

    /**
     * Returns the number of elements that satisfy the predicate.
     */
    public int getNumberOfElements(final Predicate peFilter) {
        Utils.nonNull(peFilter);
        //Note: pileups are small so int is fine.
        return (int) getElementStream().filter(peFilter).count();
    }

    /**
     * Returns a list of the offsets in this pileup.
     * Note: this call costs O(n) and allocates fresh lists each time
     */
    public List getOffsets() {
        return getElementStream().map(pe -> pe.getOffset()).collect(Collectors.toList());
    }

    //Extracts an int array by mapping each element in the pileup to int.
    private int[] extractIntArray(final ToIntFunction map) {
        return getElementStream().mapToInt(map).toArray();
    }

    /**
     * Returns an array of the bases in this pileup.
     * Note: this call costs O(n) and allocates fresh array each time
     */
    public byte[] getBases() {
        return toByteArray(extractIntArray(pe -> pe.getBase()));
    }

    /**
     * Returns an array of the quals in this pileup.
     * Note: this call costs O(n) and allocates fresh array each time
     */
    public byte[] getBaseQuals() {
        return toByteArray(extractIntArray(pe -> pe.getQual()));
    }

    //Converts array of ints to array of bytes by hard casting (loses precision if ints are large).
    private byte[] toByteArray(final int[] ints) {
        final byte[] bytes = new byte[ints.length];
        for (int i = 0; i < ints.length; i++) {
            bytes[i] = (byte) ints[i];
        }
        return bytes;
    }

    /**
     * Get an array of the mapping qualities.
     */
    public int[] getMappingQuals() {
        return extractIntArray(pe -> pe.getMappingQual());
    }

    private String getQualsString() {
        final byte[] quals = getBaseQuals();
        for (int i = 0; i < quals.length; i++) {
            quals[i] = (byte) (33 + quals[i]);  //as per SAM spec
        }
        return new String(quals);
    }
}