org.broadinstitute.hellbender.tools.walkers.rnaseq.OverhangFixingManager Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.rnaseq;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.TextCigarCodec;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.Tuple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
import org.broadinstitute.hellbender.utils.read.*;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
/**
* The class manages reads and splices and tries to apply overhang clipping when appropriate. Overhangs correspond to
* places where a read is aligned partially over Intronic splits generated by SplitNCigarReads. The class is designed
* perform two passes identical passes, where in the first pass every place that the manager changes the relevant
* mate strand information is recorded to be repaired with setPredictedMateInformation(). Running the activateWriting()
* switches the tool to emit output to the writer.
* Important note: although for efficiency the manager does try to send reads to the underlying writer in coordinate
* sorted order, it does NOT guarantee that it will do so in every case! So unless there's a good reason not to,
* methods that instantiate this manager should pass in a writer that does not assume the reads are pre-sorted.
*/
public class OverhangFixingManager {
private final Map> mateChangedReads;
protected static final Logger logger = LogManager.getLogger(OverhangFixingManager.class);
private static final boolean DEBUG = false;
// how many reads should we store in memory before flushing the queue?
private final int maxRecordsInMemory;
// how many mismatches do we tolerate in the overhangs?
private final int maxMismatchesInOverhang;
// how many bases do we tolerate in the overhang before deciding not to clip?
private final int maxBasesInOverhang;
// should we not bother fixing overhangs?
private final boolean doNotFixOverhangs;
// should we process secondary reads at all
private final boolean processSecondaryReads;
// should reads be written to file output or recorded to repair mate information
private boolean outputToFile;
// header for the reads
private final SAMFileHeader header;
// where we ultimately write out our records
private final GATKReadWriter writer;
// fasta reference reader to check overhanging edges in the exome reference sequence
private final ReferenceSequenceFile referenceReader;
// the genome unclippedLoc parser
private final GenomeLocParser genomeLocParser;
// the read cache
private static final int INITIAL_CAPACITY = 5000;
private final PriorityQueue> waitingReadGroups;
private int waitingReads;
// the set of current splices to use
private final Set splices = new TreeSet<>(new SpliceComparator());
protected static final int MAX_SPLICES_TO_KEEP = 1000;
/**
*
* @param header header for the reads
* @param writer actual writer
* @param genomeLocParser the GenomeLocParser object
* @param referenceReader the reference reader
* @param maxRecordsInMemory max records to keep in memory
* @param maxMismatchesInOverhangs max number of mismatches permitted in the overhangs before requiring clipping
* @param maxBasesInOverhangs max number of bases permitted in the overhangs before deciding not to clip
* @param doNotFixOverhangs if true, don't clip overhangs at all
* @param processSecondaryReads if true, allow secondary reads to be overhang clipped. If false, secondary reads
* will still be written out to the writer but will not be edited by the clipper.
*/
public OverhangFixingManager(final SAMFileHeader header,
final GATKReadWriter writer,
final GenomeLocParser genomeLocParser,
final ReferenceSequenceFile referenceReader,
final int maxRecordsInMemory,
final int maxMismatchesInOverhangs,
final int maxBasesInOverhangs,
final boolean doNotFixOverhangs,
final boolean processSecondaryReads) {
this.header = header;
this.writer = writer;
this.genomeLocParser = genomeLocParser;
this.referenceReader = referenceReader;
this.maxRecordsInMemory = maxRecordsInMemory;
this.maxMismatchesInOverhang = maxMismatchesInOverhangs;
this.maxBasesInOverhang = maxBasesInOverhangs;
this.doNotFixOverhangs = doNotFixOverhangs;
this.waitingReadGroups = new PriorityQueue>(INITIAL_CAPACITY, new SplitReadComparator());
this.outputToFile = false;
this.mateChangedReads = new HashMap<>();
this.processSecondaryReads = processSecondaryReads;
}
final int getNReadsInQueue() { return waitingReads; }
/**
* For testing purposes only
*
* @return the list of reads currently in the queue
*/
List> getReadsInQueueForTesting() {
return new ArrayList>(waitingReadGroups);
}
/**
* For testing purposes only
*
* @return the list of splices currently in the queue
*/
public List getSplicesForTesting() {
return new ArrayList<>(splices);
}
/**
* Add a new observed split to the list to use
*
* @param contig the contig
* @param start the start of the split, inclusive
* @param end the end of the split, inclusive
* @return the splice created, for testing purposes
*/
@VisibleForTesting
public Splice addSplicePosition(final String contig, final int start, final int end) {
if ( doNotFixOverhangs ) {
return null;
}
// is this a new splice? if not, we are done
final Splice splice = new Splice(contig, start, end);
if ( splices.contains(splice) ) {
return null;
}
// initialize it with the reference context
// we don't want to do this until we know for sure that it's a new splice position
splice.initialize(referenceReader);
// clear the set of old split positions seen if we hit a new contig
final boolean sameContig = splices.isEmpty() || splices.iterator().next().loc.getContig().equals(contig);
if ( !sameContig ) {
splices.clear();
}
// run this position against the existing reads
waitingReadGroups.parallelStream().forEach( readGroup -> {
final int size = readGroup.size();
for (int i = 0; i < size; i++ ) {
fixSplit(readGroup.get(i), splice);
}
} );
splices.add(splice);
if ( splices.size() > MAX_SPLICES_TO_KEEP ) {
cleanSplices();
}
return splice;
}
/**
* Add a family of split reads to the manager
*
* @param readGroup the family of reads to add to the manager (families are assumed be supplementary alignments to each other)
*/
public void addReadGroup(final List readGroup) {
Utils.nonEmpty(readGroup, "readGroup added to manager is empty, which is not allowed");
// if the new read is on a different contig or we have too many reads, then we need to flush the queue and clear the map
final boolean tooManyReads = getNReadsInQueue() >= maxRecordsInMemory;
GATKRead topRead = ((getNReadsInQueue()>0)? waitingReadGroups.peek().get(0).read : null);
GATKRead firstNewGroup = readGroup.get(0);
final boolean encounteredNewContig = getNReadsInQueue() > 0
&& !topRead.isUnmapped()
&& !firstNewGroup.isUnmapped()
&& !topRead.getContig().equals(firstNewGroup.getContig());
if ( tooManyReads || encounteredNewContig ) {
if ( DEBUG ) {
logger.warn("Flushing queue on " + (tooManyReads ? "too many reads" : ("move to new contig: " + firstNewGroup.getContig() + " from " + topRead.getContig())) + " at " + firstNewGroup.getStart());
}
final int targetQueueSize = encounteredNewContig ? 0 : maxRecordsInMemory / 2;
writeReads(targetQueueSize);
}
List newReadGroup = readGroup.stream().map(this::getSplitRead).collect(Collectors.toList());
// Check every stored read for an overhang with the new splice
for ( final Splice splice : splices) {
for (int i = 0; i < newReadGroup.size(); i++) {
fixSplit(newReadGroup.get(i), splice);
}
}
// add the new reads to the queue
waitingReadGroups.add(newReadGroup);
waitingReads = waitingReads + newReadGroup.size();
}
/**
* Clean up the list of splices by removing the lowest half of sequential splices
*/
private void cleanSplices() {
final int targetQueueSize = splices.size() / 2;
final Iterator iter = splices.iterator();
for ( int i = 0; i < targetQueueSize; i++ ) {
iter.next();
iter.remove();
}
}
/**
* Try to fix the given splitRead using the given split
*
* @param splitRead the splitRead to fix
* @param splice the split (bad region to clip out)
*/
@VisibleForTesting
void fixSplit(final SplitRead splitRead, final Splice splice) {
// if the splitRead doesn't even overlap the split position then we can just exit
if ( splitRead.unclippedLoc == null || !splice.loc.overlapsP(splitRead.unclippedLoc) ) {
return;
}
// if processSecondaryReads == false, filter out clipping of secondary alignments
if (!processSecondaryReads && splitRead.read.isSecondaryAlignment()) {
return;
}
GenomeLoc readLoc = splitRead.unclippedLoc;
GATKRead read = splitRead.read;
// Compute the number of non-clipped bases consumed according to the cigar
final int readBasesLength = Utils.stream(read.getCigar().getCigarElements())
.filter(c -> c.getOperator().consumesReadBases() && !c.getOperator().isClipping())
.mapToInt(CigarElement::getLength)
.sum();
if ( isLeftOverhang(readLoc, splice.loc) ) {
final int overhang = splice.loc.getStop() - read.getStart() + 1;
if ( overhangingBasesMismatch(read.getBases(), read.getStart() - readLoc.getStart(), readBasesLength, splice.reference, splice.reference.length - overhang, overhang) ) {
final GATKRead clippedRead = ReadClipper.softClipByReadCoordinates(read, 0, splice.loc.getStop() - readLoc.getStart());
splitRead.setRead(clippedRead);
}
}
else if ( isRightOverhang(readLoc, splice.loc) ) {
final int overhang = readLoc.getStop() - splice.loc.getStart() + 1;
if ( overhangingBasesMismatch(read.getBases(), read.getLength() - overhang, readBasesLength, splice.reference, 0, read.getEnd() - splice.loc.getStart() + 1) ) {
final GATKRead clippedRead = ReadClipper.softClipByReadCoordinates(read, read.getLength() - overhang, read.getLength() - 1);
splitRead.setRead(clippedRead);
}
}
}
/**
* Is this a proper overhang on the left side of the read?
*
* @param readLoc the read's unclippedLoc
* @param spliceLoc the split's unclippedLoc
* @return true if it's a left side overhang
*/
protected static boolean isLeftOverhang(final GenomeLoc readLoc, final GenomeLoc spliceLoc) {
return readLoc.getStart() <= spliceLoc.getStop() && readLoc.getStart() > spliceLoc.getStart() && readLoc.getStop() > spliceLoc.getStop();
}
/**
* Is this a proper overhang on the right side of the read?
*
* @param readLoc the read's unclippedLoc
* @param spliceLoc the split's unclippedLoc
* @return true if it's a right side overhang
*/
protected static boolean isRightOverhang(final GenomeLoc readLoc, final GenomeLoc spliceLoc) {
return readLoc.getStop() >= spliceLoc.getStart() && readLoc.getStop() < spliceLoc.getStop() && readLoc.getStart() < spliceLoc.getStart();
}
/**
* Are there too many mismatches to the reference among the overhanging bases?
*
* @param read the read bases
* @param readStartIndex where to start on the read
* @param readLength the length of the read according to the reference, used to prevent overclipping
* soft-clipped output from SplitNCigarReads)
* @param reference the reference bases
* @param referenceStartIndex where to start on the reference
* @param spanToTest how many bases to test
* @return true if too many overhanging bases mismatch, false otherwise
*/
protected boolean overhangingBasesMismatch(final byte[] read,
final int readStartIndex,
final int readLength,
final byte[] reference,
final int referenceStartIndex,
final int spanToTest) {
// don't process too small a span, too large a span, or a span that is most of a read
if ( spanToTest < 1 || spanToTest > maxBasesInOverhang || spanToTest > readLength / 2 ) {
return false;
}
int numMismatchesSeen = 0;
for ( int i = 0; i < spanToTest; i++ ) {
if ( read[readStartIndex + i] != reference[referenceStartIndex + i] ) {
if ( ++numMismatchesSeen > maxMismatchesInOverhang) {
return true;
}
}
}
// we can still mismatch overall if at least half of the bases mismatch
return numMismatchesSeen >= ((spanToTest+1)/2);
}
/**
* Close out the manager stream by clearing the read cache
*/
public void flush() {
writeReads(0);
}
/**
* Writes read groups off the top of waitingReads until the total number of reads in the set is less than the
* target queue size. If outputToFile == false, then the it will instead mark the first item to setMateChanged in
* mateChagnedReads
*/
private void writeReads(int targetQueueSize) {
// write out all of the remaining reads
while ( getNReadsInQueue() > targetQueueSize ) {
List waitingGroup = waitingReadGroups.poll();
waitingReads = waitingReads - waitingGroup.size();
// Repair the supplementary groups together and add them into the writer
if (outputToFile) {
SplitNCigarReads.repairSupplementaryTags(waitingGroup.stream()
.map( r -> r.read )
.collect(Collectors.toList()), header);
for (SplitRead splitRead : waitingGroup) {
writer.addRead(splitRead.read);
}
// On the first traversal we want to store reads that would be the mate
} else {
// Don't mark the readgroup if it is secondary (mate information should always point to the primary alignment)
if (!waitingGroup.get(0).read.isSecondaryAlignment() && waitingGroup.get(0).hasBeenOverhangClipped()) {
waitingGroup.get(0).setMateChanged();
}
}
}
}
/**
* Activates output writing for the Overhang Fixing Manager. This command is used to allow the manager to write
* clipped and unclipped reads to the underlying file writer.
*/
public void activateWriting() {
if (outputToFile) {
throw new GATKException("Cannot activate writing for OverhangClippingManager multiple times");
}
flush();
splices.clear();
logger.info("Overhang Fixing Manager saved "+mateChangedReads.size()+" reads in the first pass");
outputToFile = true;
}
// class to represent the reads with their soft-clip-included GenomeLocs
public final class SplitRead {
// Relevant information to determine if the read has been clipped by the manager
private final Cigar oldCigar;
private final int oldStart;
public GATKRead read;
public GenomeLoc unclippedLoc;
public SplitRead(final GATKRead read) {
oldCigar = read.getCigar();
oldStart = read.getStart();
setRead(read);
}
public void setRead(final GATKRead read) {
this.read = read;
int softStart = read.getSoftStart();
int softEnd = read.getSoftEnd();
// Don't assign an unclipped loc if the read if it doesn't consume reference bases
if ( ! read.isUnmapped() && softStart < softEnd) {
unclippedLoc = genomeLocParser.createGenomeLoc(read.getContig(), softStart, softEnd);
}
}
// Returns true if either of the required mate information fields have been changed by the clipper
public boolean hasBeenOverhangClipped() {
return (!oldCigar.equals(read.getCigar())) || (oldStart != read.getStart());
}
// Adds the relevant information for repairing the mate to setMateChanged keyed on a string composed of the start position
public void setMateChanged() {
if (!read.isUnmapped()) {
mateChangedReads.put( makeKey(read.getName(), !read.isFirstOfPair(), oldStart ),
new Tuple<>(read.getStart(), TextCigarCodec.encode(read.getCigar())));
}
}
}
// Generates the string key to be used for storing mates that had their cigar strings changed between iterations over the input reads
@VisibleForTesting
static String makeKey(String name, boolean firstOfPair, int mateStart) {
// NOTE: We use the '@' character here because it is explicitly forbidden for use in read names by the SAM Spec and we don't want to risk accidental matches across read pairs
return name + "@" + (firstOfPair ? 1 : 0) + "@" + mateStart;
}
/**
* Will edit the mate MC tag and mate start position field for given read if that read has been recorded being edited
* by the OverhangFixingManager before. Returns true if the read was edited by the tool, false otherwise.
*
* @param read the read to be edited
*/
public boolean setPredictedMateInformation(GATKRead read) {
if (!outputToFile) {
return false;
}
if (!read.isEmpty() && read.isPaired()) {
String keystring = makeKey(read.getName(), read.isFirstOfPair(), read.getMateStart());
if (mateChangedReads.containsKey(keystring)) {
Tuple value = mateChangedReads.get(keystring);
// update the start position so it is accurate
read.setMatePosition(read.getMateContig(), value.a);
// if the MC tag is present, update it too
if (read.hasAttribute("MC")) {read.setAttribute("MC", value.b);}
return true;
}
}
return false;
}
// class to represent the comparator for the split reads
private final class SplitReadComparator implements Comparator>, Serializable {
private static final long serialVersionUID = 7956407034441782842L;
private final ReadCoordinateComparator readComparator;
public SplitReadComparator() {
readComparator = new ReadCoordinateComparator(header);
}
@Override
public int compare(final List readgroup1, final List readgroup2) {
return readComparator.compare(readgroup1.get(0).read, readgroup2.get(0).read);
}
}
@VisibleForTesting
SplitRead getSplitRead(final GATKRead read){
return new SplitRead(read);
}
// class to represent the split positions
protected final class Splice {
public final GenomeLoc loc;
public byte[] reference;
public Splice(final String contig, final int start, final int end) {
loc = genomeLocParser.createGenomeLoc(contig, start, end);
}
public void initialize(final ReferenceSequenceFile referenceReader) {
reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases();
}
@Override
public boolean equals(final Object other) {
return other != null && (other instanceof Splice) && this.loc.equals(((Splice)other).loc);
}
@Override
public int hashCode() {
return loc.hashCode();
}
}
// class to represent the comparator for the split reads
private final class SpliceComparator implements Comparator, Serializable {
private static final long serialVersionUID = -7783679773557594065L;
@Override
public int compare(final Splice position1, final Splice position2) {
return position1.loc.compareTo(position2.loc);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy