org.broadinstitute.hellbender.utils.pileup.ReadPileup Maven / Gradle / Ivy
package org.broadinstitute.hellbender.utils.pileup;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.qc.Pileup;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.fragments.FragmentCollection;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import java.util.*;
import java.util.function.Predicate;
import java.util.function.ToIntFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
* Represents a pileup of reads at a given position.
*/
public class ReadPileup implements Iterable {
private final Locatable loc;
private final List pileupElements;
/** Constant used by samtools to downgrade a quality for overlapping reads that disagrees in their base. */
public static final double SAMTOOLS_OVERLAP_LOW_CONFIDENCE = 0.8;
/**
* Create a new pileup at loc, using the reads and their corresponding
* offsets.
* Note: This constructor keeps an alias to the given list.
*/
public ReadPileup(final Locatable loc, final List pileup) {
this.loc = loc;
this.pileupElements = pileup;
}
/**
* Create a new pileup at loc, using an stratified pileup
* Note: the current implementation of ReadPileup does not efficiently retrieve the stratified pileup
*/
public ReadPileup(final Locatable loc, final Map stratifiedPileup) {
// NOTE: this bit of code is a potential performance hotspot for the HaplotypeCaller.
// This straightforward loop outperforms the equivalent streaming expression by over 2x.
List allElements = new ArrayList<>(stratifiedPileup.size() * 1000);
for ( final Map.Entry pileupEntry : stratifiedPileup.entrySet() ) {
allElements.addAll(pileupEntry.getValue().pileupElements);
}
this.loc = loc;
this.pileupElements = allElements;
}
/**
* Create a new pileup without any aligned reads
*/
public ReadPileup(final Locatable loc) {
this(loc, new ArrayList());
}
/**
* Create a new pileup with the given reads.
*/
public ReadPileup(final Locatable loc, final List reads, final int offset) {
this(loc, readsOffsetsToPileup(reads, offset));
}
/**
* Create a new pileup with the given reads.
*/
public ReadPileup(final Locatable loc, final List reads, final List offsets) {
this(loc, readsOffsetsToPileup(reads, offsets));
}
/**
* Get the pileup of reads covering a locus. This is useful, for example, in VariantWalkers, which work on
* ReadsContexts and not AlignmentContexts.
*/
public ReadPileup(final Locatable loc, final Iterable reads) {
final List pile = StreamSupport.stream(reads.spliterator(), false)
.filter(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK.and(ReadFilterLibrary.NOT_DUPLICATE))
.map(AlignmentStateMachine::new)
.map(asm -> {
while ( asm.stepForwardOnGenome() != null && asm.getGenomePosition() < loc.getStart()) { }
return asm.getGenomePosition() == loc.getStart() ? asm.makePileupElement() : null;
}).filter(Objects::nonNull).collect(Collectors.toList());
this.loc = loc;
pileupElements = pile;
}
/**
* Returns the first element corresponding to the given read or null there is no such element.
*
* @param read or null if elements with no reads are to be retrieved from this pileup.
* @return the first element corresponding to the given read or null there is no such element.
*/
@VisibleForTesting
PileupElement getElementForRead(final GATKRead read) {
return getElementStream().filter(el -> Objects.equals(el.getRead(), read)).findAny().orElse(null);
}
/**
* Helper routine for converting reads and offset lists to a PileupElement list.
*/
private static List readsOffsetsToPileup(final List reads, final List offsets) {
if (reads.size() != offsets.size()) {
throw new IllegalArgumentException("Reads and offset lists have different sizes!");
}
final List pileup = new ArrayList<>(reads.size());
for (int i = 0; i < reads.size(); i++) {
pileup.add(PileupElement.createPileupForReadAndOffset(reads.get(i), offsets.get(i)));
}
return pileup;
}
/**
* Helper routine for converting reads and a single offset to a PileupElement list.
*/
private static List readsOffsetsToPileup(final List reads, final int offset) {
return reads.stream().map(r -> PileupElement.createPileupForReadAndOffset(r, offset)).collect(Collectors.toList());
}
/**
* Make a new pileup consisting of elements of this pileup that satisfy the predicate.
* NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
*/
public ReadPileup makeFilteredPileup(final Predicate filter) {
return new ReadPileup(loc, getElementStream().filter(filter).collect(Collectors.toList()));
}
/**
* Make a new pileup from elements whose reads have read groups that agree with the given lane ID.
* (if they have a name equal to the ID or starting with ID followed by a period ".").
* Also, if both laneID and read group ID are {@code null}, the read will be included.
* Returns empty pileup if no suitable elements are found.
* NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
*/
public ReadPileup getPileupForLane(final String laneID) {
return makeFilteredPileup(p -> {
final GATKRead read = p.getRead();
final String readGroupID = read.getReadGroup();
if (laneID == null && readGroupID == null) {
return true;
}
if (laneID != null && readGroupID != null) {
final boolean laneSame = readGroupID.startsWith(laneID + "."); // lane is the same, but sample identifier is different
final boolean exactlySame = readGroupID.equals(laneID); // in case there is no sample identifier, they have to be exactly the same
if (laneSame || exactlySame) {
return true;
}
}
return false;
});
}
/**
* Make a new pileup from elements whose reads belong to the given sample.
* Passing null sample as an argument retrieves reads whose read group or sample name is {@code null}
* NOTE: the new pileup will not be independent of the old one (no deep copy of the underlying data is performed).
*/
public ReadPileup getPileupForSample(final String sample, final SAMFileHeader header) {
return makeFilteredPileup(pe -> Objects.equals(ReadUtils.getSampleName(pe.getRead(), header), sample));
}
/**
* Gets a set of the read groups represented in this pileup.
* Note: contains null if a read has a null read group
*/
public Set getReadGroupIDs() {
return getElementStream().map(pe -> pe.getRead().getReadGroup()).collect(Collectors.toSet());
}
/**
* Gets a set of the samples represented in this pileup.
* Note: contains null if a read has a null read group or a null sample name.
*/
public Set getSamples(final SAMFileHeader header) {
return getElementStream().map(pe -> pe.getRead()).map(r -> ReadUtils.getSampleName(r, header)).collect(Collectors.toSet());
}
/**
* Splits the ReadPileup by sample
*
* @param header the header to retrieve the samples from
* @param unknownSampleName the sample name if there is no read group/sample name; {@code null} if lack of RG is not expected
* @return a Map of sample name to ReadPileup (including empty pileups for a sample)
* @throws org.broadinstitute.hellbender.exceptions.UserException.ReadMissingReadGroup if unknownSampleName is {@code null} and there are reads without RG/sample name
*/
public Map splitBySample(final SAMFileHeader header, final String unknownSampleName) {
final Map toReturn = new HashMap<>();
for (final String sample : getSamples(header)) {
final ReadPileup pileupBySample = getPileupForSample(sample, header);
if (sample != null) {
toReturn.put(sample, pileupBySample);
} else {
if (unknownSampleName == null) {
throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead());
}
toReturn.put(unknownSampleName, pileupBySample);
}
}
return toReturn;
}
/**
* The best way to access PileupElements where you only care about the bases and quals in the pileup.
*
* for (PileupElement p : this) { doSomething(p); }
*
* Provides efficient iteration of the data.
*/
@Override
public Iterator iterator() {
// Profiling has determined that returning a custom unmodifiable iterator is faster than
// Collections.unmodifiableList(pileupElements).iterator()
return new Iterator() {
private final int len = pileupElements.size();
private int i = 0;
@Override
public boolean hasNext() {
return i < len;
}
@Override
public PileupElement next() {
return pileupElements.get(i++);
}
@Override
public void remove() {
throw new UnsupportedOperationException("Cannot remove from a pileup element iterator");
}
};
}
/**
* Iterator over sorted by read start PileupElements.
*/
public Iterator sortedIterator() {
return getElementStream()
.sorted((l, r) -> Integer.compare(l.getRead().getStart(), r.getRead().getStart()))
.iterator();
}
/**
* The number of elements in this pileup.
*/
public int size() {
return pileupElements.size();
}
/**
* @return true if there are 0 elements in the pileup, false otherwise
*/
public boolean isEmpty() {
return size() == 0;
}
/**
* @return the location of this pileup.
*/
public Locatable getLocation() {
return loc;
}
/**
* Get counts of A, C, G, T in order, which returns a int[4] vector with counts according
* to BaseUtils.simpleBaseToBaseIndex for each base.
* Deletions are not counted.
*/
public int[] getBaseCounts() {
final int[] counts = new int[4];
for (final PileupElement pile : this) {
// skip deletion sites
if (!pile.isDeletion()) {
final int index = BaseUtils.simpleBaseToBaseIndex(pile.getBase());
if (index != -1) {
counts[index]++;
}
}
}
return counts;
}
/**
* Fixes the quality of all the elements that come from an overlapping pair in the same way as
* samtools does {@see tweak_overlap_quality function in
* samtools}.
*
* Setting the quality of one of the bases to 0 effectively removes the redundant base for
* calling. In addition, if the bases overlap we have increased confidence if they agree (or
* reduced if they don't). Thus, the algorithm proceeds as following:
*
* 1. If the bases are the same, the quality of the first element is the sum of both qualities
* and the quality of the second is reduced to 0.
* 2. If the bases are different, the base with the highest quality is reduced with a factor of
* 0.8, and the quality of the lowest is reduced to 0.
*
* Note: Resulting qualities higher than {@link QualityUtils#MAX_SAM_QUAL_SCORE} are capped.
*/
public void fixOverlaps() {
final FragmentCollection fragments = FragmentCollection.create(this);
fragments.getOverlappingPairs().stream()
.forEach(
elements -> fixPairOverlappingQualities(elements.getLeft(), elements.getRight())
);
}
/**
* Fixes the quality of two elements that come from an overlapping pair in the same way as
* samtools does {@see tweak_overlap_quality function in
* samtools}.
* The only difference with the samtools API is the cap for high values ({@link QualityUtils#MAX_SAM_QUAL_SCORE}).
*/
@VisibleForTesting
static void fixPairOverlappingQualities(final PileupElement firstElement,
final PileupElement secondElement) {
// only if they do not represent deletions
if (!secondElement.isDeletion() && !firstElement.isDeletion()) {
final byte[] firstQuals = firstElement.getRead().getBaseQualities();
final byte[] secondQuals = secondElement.getRead().getBaseQualities();
if (firstElement.getBase() == secondElement.getBase()) {
// if both have the same base, extra confidence in the firts of them
firstQuals[firstElement.getOffset()] =
(byte) (firstQuals[firstElement.getOffset()] + secondQuals[secondElement
.getOffset()]);
// cap to maximum byte value
if (firstQuals[firstElement.getOffset()] < 0
|| firstQuals[firstElement.getOffset()] > QualityUtils.MAX_SAM_QUAL_SCORE) {
firstQuals[firstElement.getOffset()] = QualityUtils.MAX_SAM_QUAL_SCORE;
}
secondQuals[secondElement.getOffset()] = 0;
} else {
// if not, we lost confidence in the one with higher quality
if (firstElement.getQual() >= secondElement.getQual()) {
firstQuals[firstElement.getOffset()] =
(byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * firstQuals[firstElement.getOffset()]);
secondQuals[secondElement.getOffset()] = 0;
} else {
secondQuals[secondElement.getOffset()] =
(byte) (SAMTOOLS_OVERLAP_LOW_CONFIDENCE * secondQuals[secondElement.getOffset()]);
firstQuals[firstElement.getOffset()] = 0;
}
}
firstElement.getRead().setBaseQualities(firstQuals);
secondElement.getRead().setBaseQualities(secondQuals);
}
}
public final static Comparator baseQualTieBreaker = new Comparator() {
@Override
public int compare(PileupElement o1, PileupElement o2) {
return Byte.compare(o1.getQual(), o2.getQual());
}
};
public final static Comparator mapQualTieBreaker = new Comparator() {
@Override
public int compare(PileupElement o1, PileupElement o2) {
return Integer.compare(o1.getMappingQual(), o2.getMappingQual());
}
};
/**
* Returns a new ReadPileup where only one read from an overlapping read
* pair is retained. If the two reads in question disagree to their basecall,
* neither read is retained. If they agree on the base, the read with the higher
* base quality observation is retained
*
* @return the newly filtered pileup
*/
public ReadPileup getOverlappingFragmentFilteredPileup(SAMFileHeader header) {
return getOverlappingFragmentFilteredPileup(true, baseQualTieBreaker, header);
}
/**
* Returns a new ReadPileup where only one read from an overlapping read
* pair is retained. If discardDiscordant and the two reads in question disagree to their basecall,
* neither read is retained. Otherwise, the read with the higher
* quality (base or mapping, depending on baseQualNotMapQual) observation is retained
*
* @return the newly filtered pileup
*/
public ReadPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, Comparator tieBreaker, SAMFileHeader header) {
List filteredPileupList = new ArrayList();
for (ReadPileup pileup : this.splitBySample(header, null).values()) {
Collection elements = filterSingleSampleForOverlaps(pileup, tieBreaker, discardDiscordant);
filteredPileupList.addAll(elements);
}
return new ReadPileup(loc, filteredPileupList);
}
private Collection filterSingleSampleForOverlaps(ReadPileup pileup, Comparator tieBreaker, boolean discardDiscordant) {
Map filteredPileup = new HashMap();
Set readNamesDeleted = new HashSet<>();
for (PileupElement p : pileup) {
String readName = p.getRead().getName();
// if we've never seen this read before, life is good
if (!filteredPileup.containsKey(readName)) {
if(!readNamesDeleted.contains(readName)) {
filteredPileup.put(readName, p);
}
} else {
PileupElement existing = filteredPileup.get(readName);
// if the reads disagree at this position, throw them all out. Otherwise
// keep the element with the highest quality score
if (discardDiscordant && existing.getBase() != p.getBase()) {
filteredPileup.remove(readName);
readNamesDeleted.add(readName);
} else if (tieBreaker.compare(existing, p) < 0) {
filteredPileup.put(readName, p);
}
}
}
return(filteredPileup.values());
}
@Override
public String toString() {
return String.format("%s %s %s %s",
loc.getContig(),
loc.getStart(),
new String(getBases()),
getQualsString());
}
/**
* Format, assuming a single-sample, in a samtools-like string.
* Each line represents a genomic position, consisting of chromosome name, coordinate,
* reference base, read bases and read qualities
*
* @param ref the reference base
* @return pileup line
*/
public String getPileupString(final char ref) {
// In the pileup format,
return String.format("%s %s %c %s %s",
getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate
ref, // reference base
new String(getBases()),
getQualsString());
}
/**
* Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time
*/
public List getReads() {
return getElementStream().map(pe -> pe.getRead()).collect(Collectors.toList());
}
private Stream getElementStream() {
return pileupElements.stream();
}
/**
* Returns the number of elements that satisfy the predicate.
*/
public int getNumberOfElements(final Predicate peFilter) {
Utils.nonNull(peFilter);
//Note: pileups are small so int is fine.
return (int) getElementStream().filter(peFilter).count();
}
/**
* Returns a list of the offsets in this pileup.
* Note: this call costs O(n) and allocates fresh lists each time
*/
public List getOffsets() {
return getElementStream().map(pe -> pe.getOffset()).collect(Collectors.toList());
}
//Extracts an int array by mapping each element in the pileup to int.
private int[] extractIntArray(final ToIntFunction map) {
return getElementStream().mapToInt(map).toArray();
}
/**
* Returns an array of the bases in this pileup.
* Note: this call costs O(n) and allocates fresh array each time
*/
public byte[] getBases() {
return toByteArray(extractIntArray(pe -> pe.getBase()));
}
/**
* Returns an array of the quals in this pileup.
* Note: this call costs O(n) and allocates fresh array each time
*/
public byte[] getBaseQuals() {
return toByteArray(extractIntArray(pe -> pe.getQual()));
}
//Converts array of ints to array of bytes by hard casting (loses precision if ints are large).
private byte[] toByteArray(final int[] ints) {
final byte[] bytes = new byte[ints.length];
for (int i = 0; i < ints.length; i++) {
bytes[i] = (byte) ints[i];
}
return bytes;
}
/**
* Get an array of the mapping qualities.
*/
public int[] getMappingQuals() {
return extractIntArray(pe -> pe.getMappingQual());
}
private String getQualsString() {
final byte[] quals = getBaseQuals();
for (int i = 0; i < quals.length; i++) {
quals[i] = (byte) (33 + quals[i]); //as per SAM spec
}
return new String(quals);
}
}