org.broadinstitute.hellbender.utils.read.CigarUtils Maven / Gradle / Ivy
package org.broadinstitute.hellbender.utils.read;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.TextCigarCodec;
import it.unimi.dsi.fastutil.booleans.BooleanArrayList;
import org.apache.commons.lang3.tuple.Triple;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWParameters;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.function.Predicate;
public final class CigarUtils {
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
// these values were chosen via optimization against the NA12878 knowledge base
public static final SWParameters NEW_SW_PARAMETERS = new SWParameters(200, -150, -260, -11);
// In Mutect2 and HaplotypeCaller reads are realigned to their *best* haplotypes, which is very different from a generic alignment.
// The {@code NEW_SW_PARAMETERS} penalize a substitution error more than an indel up to a length of 9 bases!
// Suppose, for example, that a read has a single substitution error, say C -> T, on its last base. Those parameters
// would prefer to extend a deletion until the next T on the reference is found in order to avoid the substitution, which is absurd.
// Since these parameters are for aligning a read to the biological sequence we believe it comes from, the parameters
// we choose should correspond to sequencer error. They *do not* have anything to do with the prevalence of true variation!
public static final SWParameters ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS = new SWParameters(10, -15, -30, -5);
private static final String SW_PAD = "NNNNNNNNNN";
private CigarUtils(){}
/**
* Inverts the order of the operators in the cigar.
* Eg 10M1D20M -> 20M1D10M
*/
public static Cigar invertCigar (final Cigar cigar) {
Utils.nonNull(cigar);
final List els = new ArrayList<>(cigar.getCigarElements());
Collections.reverse(els);
return new Cigar(els);
}
/**
* Compute the number of reference bases between the start (inclusive) and end (exclusive) cigar elements.
* Reference bases are counted as the number of positions w.r.t. the reference spanned by an alignment to that reference.
* For example original position = 10. cigar: 2M3I2D1M. If you remove the 2M the new starting position is 12.
* If you remove the 2M3I it is still 12. If you remove 2M3I2D (not reasonable cigar), you will get position 14.
*/
public static int countRefBasesAndClips(final List elems, final int cigarStartIndex, final int cigarEndIndex){
return countRefBasesAndMaybeAlsoClips(elems, cigarStartIndex, cigarEndIndex, true, true);
}
public static int countRefBasesAndSoftClips(final List elems, final int cigarStartIndex, final int cigarEndIndex){
return countRefBasesAndMaybeAlsoClips(elems, cigarStartIndex, cigarEndIndex, true, false);
}
private static int countRefBasesAndMaybeAlsoClips(final List elems, final int cigarStartIndex, final int cigarEndIndex, final boolean includeSoftClips, final boolean includeHardClips) {
Utils.nonNull(elems);
Utils.validateArg(cigarStartIndex >= 0 && cigarEndIndex <= elems.size() && cigarStartIndex <= cigarEndIndex, () -> "invalid index:" + 0 + " -" + elems.size());
int result = 0;
for (final CigarElement elem : elems.subList(cigarStartIndex, cigarEndIndex)) {
final CigarOperator op = elem.getOperator();
if (op.consumesReferenceBases() || (includeSoftClips && op == CigarOperator.SOFT_CLIP) || (includeHardClips && op == CigarOperator.HARD_CLIP)) {
result += elem.getLength();
}
}
return result;
}
/**
* Removes all clipping and padding operators from the cigar.
*/
public static Cigar removeClipsAndPadding(final Cigar cigar) {
Utils.nonNull(cigar, "cigar is null");
final List elements = new ArrayList<>(cigar.numCigarElements());
for ( final CigarElement ce : cigar.getCigarElements() ) {
if ( !isClipOrPaddingOperator(ce.getOperator()) ) {
elements.add(ce);
}
}
return new Cigar(elements);
}
private static boolean isClipOrPaddingOperator(final CigarOperator op) {
return op == CigarOperator.S || op == CigarOperator.H || op == CigarOperator.P;
}
/**
* Returns whether the list has any N operators.
*/
public static boolean containsNOperator(final List cigarElements) {
return Utils.nonNull(cigarElements).stream().anyMatch(el -> el.getOperator() == CigarOperator.N);
}
/**
* A good Cigar object obeys the following rules:
* - is valid as per SAM spec {@link Cigar#isValid(String, long)}.
* - has no consecutive I/D elements
* - does not start or end with deletions (with or without preceding clips).
*/
public static boolean isGood(final Cigar c) {
Utils.nonNull(c, "cigar is null");
//Note: the string comes from the SAMRecord so it must be a wellformed CIGAR (that is, in "\*|([0-9]+[MIDNSHPX=])+" as per SAM spec).
//We don't have to check that
if (c.isValid(null, -1) != null){ //if it's invalid, then it's not good
return false;
}
final List elems = c.getCigarElements();
return !(hasConsecutiveIndels(elems) || startsOrEndsWithDeletionIgnoringClips(elems));
}
/**
* Checks if cigar has consecutive I/D elements.
*/
private static boolean hasConsecutiveIndels(final List elems) {
boolean prevIndel = false;
for (final CigarElement elem : elems) {
final CigarOperator op = elem.getOperator();
final boolean isIndel = (op == CigarOperator.INSERTION || op == CigarOperator.DELETION);
if (prevIndel && isIndel) {
return true;
}
prevIndel = isIndel;
}
return false;
}
/**
* Checks if cigar starts with a deletion (ignoring any clips at the beginning).
*/
private static boolean startsOrEndsWithDeletionIgnoringClips(final List elems) {
for (final boolean leftSide : new boolean[] {true, false}) {
for (final CigarElement elem : leftSide ? elems : Lists.reverse(elems)) {
final CigarOperator op = elem.getOperator();
if (op == CigarOperator.DELETION) { //first non-clipping is deletion
return true;
} else if (!op.isClipping()) { // first non-clipping is non deletion
break;
}
}
}
return false;
}
/**
* Calculate the cigar elements for this path against the reference sequence.
*
* This assumes that the reference and alt sequence are haplotypes derived from a de Bruijn graph or SeqGraph and have the same
* ref source and ref sink vertices. That is, the alt sequence start and end are assumed anchored to the reference start and end, which
* occur at the ends of the padded assembly region. Hence, unlike read alignment, there is no concept of a start or end coordinate here.
* Furthermore, it is important to note that in the rare case that the alt cigar begins or ends with a deletion, we must keep the leading
* or trailing deletion in order to maintain the original reference span of the alt haplotype. This can occur, for example, when the ref
* haplotype starts with N repeats of a long sequence and the alt haplotype starts with N-1 repeats.
*
* @param aligner
* @param refSeq the reference sequence that all of the bases in this path should align to
* @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found
*/
public static Cigar calculateCigar(final byte[] refSeq, final byte[] altSeq, final SmithWatermanAligner aligner, final SWOverhangStrategy strategy) {
Utils.nonNull(refSeq, "refSeq");
Utils.nonNull(altSeq, "altSeq");
if ( altSeq.length == 0 ) {
// horrible edge case from the unit tests, where this path has no bases
return new Cigar(Collections.singletonList(new CigarElement(refSeq.length, CigarOperator.D)));
}
//Note: this is a performance optimization.
// If two strings are equal (a O(n) check) then it's trivial to get CIGAR for them.
// Furthermore, if their lengths are equal and their element-by-element comparison yields two or fewer mismatches
// it's also a trivial M-only CIGAR, because in order to have equal length one would need at least one insertion and
// one deletion, in which case two substitutions is a better alignment.
if (altSeq.length == refSeq.length){
int mismatchCount = 0;
for (int n = 0; n < refSeq.length && mismatchCount <= 2; n++) {
mismatchCount += (altSeq[n] == refSeq[n] ? 0 : 1);
}
if (mismatchCount <= 2) {
final Cigar matching = new Cigar();
matching.add(new CigarElement(refSeq.length, CigarOperator.MATCH_OR_MISMATCH));
return matching;
}
}
final Cigar nonStandard;
final String paddedRef = SW_PAD + new String(refSeq) + SW_PAD;
final String paddedPath = SW_PAD + new String(altSeq) + SW_PAD;
final SmithWatermanAlignment alignment = aligner.align(paddedRef.getBytes(), paddedPath.getBytes(), NEW_SW_PARAMETERS, strategy);
if ( isSWFailure(alignment) ) {
return null;
}
// cut off the padding bases
final int baseStart = SW_PAD.length();
final int baseEnd = paddedPath.length() - SW_PAD.length() - 1; // -1 because it's inclusive
final CigarBuilder.Result trimmedCigarAndDeletionsRemoved = AlignmentUtils.trimCigarByBases(alignment.getCigar(), baseStart, baseEnd);
nonStandard = trimmedCigarAndDeletionsRemoved.getCigar();
// leading deletion removed by cigar trimming shift the alignment start to the right
final int trimmedLeadingDeletions = trimmedCigarAndDeletionsRemoved.getLeadingDeletionBasesRemoved();
// trailing deletions should be kept in order to left-align
final int trimmedTrailingDeletions = trimmedCigarAndDeletionsRemoved.getTrailingDeletionBasesRemoved();
if ( trimmedTrailingDeletions > 0 ) {
nonStandard.add(new CigarElement(trimmedTrailingDeletions, CigarOperator.D));
}
final CigarBuilder.Result leftAlignmentResult = AlignmentUtils.leftAlignIndels(nonStandard, refSeq, altSeq, trimmedLeadingDeletions);
// we must account for possible leading deletions removed when trimming the padding and when left-aligning
// trailing deletions removed when trimming have already been restored for left-alignment, but left-alingment may have removed them again.
final int totalLeadingDeletionsRemoved = trimmedLeadingDeletions + leftAlignmentResult.getLeadingDeletionBasesRemoved();
final int totalTrailingDeletionsRemoved = leftAlignmentResult.getTrailingDeletionBasesRemoved();
if (totalLeadingDeletionsRemoved == 0 && totalTrailingDeletionsRemoved == 0) {
return leftAlignmentResult.getCigar();
} else {
final List resultElements = new ArrayList<>();
if (totalLeadingDeletionsRemoved > 0) {
resultElements.add(new CigarElement(totalLeadingDeletionsRemoved, CigarOperator.D));
}
resultElements.addAll(leftAlignmentResult.getCigar().getCigarElements());
if (totalTrailingDeletionsRemoved > 0) {
resultElements.add(new CigarElement(totalTrailingDeletionsRemoved, CigarOperator.D));
}
return new Cigar(resultElements);
}
}
/**
* Make sure that the SW didn't fail in some terrible way, and throw exception if it did
*/
private static boolean isSWFailure(final SmithWatermanAlignment alignment) {
// check that the alignment starts at the first base, which it should given the padding
if ( alignment.getAlignmentOffset() > 0 ) {
return true;
}
// check that we aren't getting any S operators (which would be very bad downstream)
for ( final CigarElement ce : alignment.getCigar().getCigarElements() ) {
if ( ce.getOperator() == CigarOperator.S )
return true;
// soft clips at the end of the alignment are really insertions
}
return false;
}
/**
* Returns the length of the original read considering all clippings based on this cigar.
*
* The result of applying this method on a empty cigar is zero.
*
* @param cigar the input cigar.
* @throws IllegalArgumentException if the input {@code cigar} is {@code null}.
* @return 0 or greater.
*/
public static int countUnclippedReadBases(final Cigar cigar) {
Utils.nonNull(cigar, "the input cigar cannot be null");
return cigar.getCigarElements().stream()
.filter(ce -> {
final CigarOperator operator = ce.getOperator();
return operator.isClipping() || operator.consumesReadBases();
})
.mapToInt(CigarElement::getLength)
.sum();
}
/**
* Count the number of soft- or hard- clipped bases from either the left or right end of a cigar
*/
public static int countClippedBases(final Cigar cigar, final ClippingTail tail, final CigarOperator typeOfClip) {
Utils.validateArg(typeOfClip.isClipping(), "typeOfClip must be a clipping operator");
final int size = cigar.numCigarElements();
if (size < 2) {
Utils.validateArg(size == 1 && !cigar.getFirstCigarElement().getOperator().isClipping(), "cigar is empty or completely clipped.");
return 0;
}
int result = 0;
for (int n = 0; n < size; n++) {
final int index = (tail == ClippingTail.LEFT_TAIL ? n : size - n - 1);
final CigarElement element = cigar.getCigarElement(index);
if (!element.getOperator().isClipping()) {
return result;
} else if (element.getOperator() == typeOfClip) {
result += element.getLength();
}
}
throw new IllegalArgumentException("Input cigar " + cigar + " is completely clipped.");
}
/**
* Count the number clipped bases (both soft and hard) from either the left or right end of a cigar
*/
public static int countClippedBases(final Cigar cigar, final ClippingTail tail) {
return countClippedBases(cigar, tail, CigarOperator.SOFT_CLIP) + countClippedBases(cigar, tail, CigarOperator.HARD_CLIP);
}
/**
* Count the number of soft- and hard-clipped bases over both ends of a cigar
*/
public static int countClippedBases(final Cigar cigar, final CigarOperator clippingType) {
return countClippedBases(cigar, ClippingTail.LEFT_TAIL, clippingType) + countClippedBases(cigar, ClippingTail.RIGHT_TAIL, clippingType);
}
/**
* Count the number of clipped bases (both soft and hard) over both ends of a cigar
*/
public static int countClippedBases(final Cigar cigar) {
return countClippedBases(cigar, ClippingTail.LEFT_TAIL) + countClippedBases(cigar, ClippingTail.RIGHT_TAIL);
}
public static int countAlignedBases(final Cigar cigar ) {
return Utils.nonNull(cigar).getCigarElements().stream()
.filter(cigarElement -> cigarElement.getOperator().isAlignment())
.mapToInt(CigarElement::getLength)
.sum();
}
/**
* replace soft clips (S) with match (M) operators, normalizing the result by all the transformations of the {@link CigarBuilder} class:
* merging consecutive identical operators and removing zero-length elements. For example 10S10M -> 20M and 10S10M10I10I -> 20M20I.
*/
public static Cigar revertSoftClips(final Cigar originalCigar) {
final CigarBuilder builder = new CigarBuilder();
for (final CigarElement element : originalCigar.getCigarElements()) {
if (element.getOperator() == CigarOperator.SOFT_CLIP) {
builder.add(new CigarElement(element.getLength(), CigarOperator.MATCH_OR_MISMATCH));
} else {
builder.add(element);
}
}
return builder.make();
}
/**
* Given a cigar string, soft clip up to leftClipEnd and soft clip starting at rightClipBegin
* @param start initial index to clip within read bases, inclusive
* @param stop final index to clip within read bases exclusive
* @param clippingOperator type of clipping -- must be either hard clip or soft clip
*/
public static Cigar clipCigar(final Cigar cigar, final int start, final int stop, CigarOperator clippingOperator) {
Utils.validateArg(clippingOperator.isClipping(), "Not a clipping operator");
final boolean clipLeft = start == 0;
final CigarBuilder newCigar = new CigarBuilder();
int elementStart = 0;
for (final CigarElement element : cigar.getCigarElements()) {
final CigarOperator operator = element.getOperator();
// copy hard clips
if (operator == CigarOperator.HARD_CLIP) {
newCigar.add(new CigarElement(element.getLength(), element.getOperator()));
continue;
}
final int elementEnd = elementStart + (operator.consumesReadBases() ? element.getLength() : 0);
// element precedes start or follows end of clip, copy it to new cigar
if (elementEnd <= start || elementStart >= stop) {
// edge case: deletions at edge of clipping are meaningless and we skip them
if (operator.consumesReadBases() || (elementStart != start && elementStart != stop)) {
newCigar.add(new CigarElement(element.getLength(), operator));
}
} else { // otherwise, some or all of the element is soft-clipped
final int unclippedLength = clipLeft ? elementEnd - stop : start - elementStart;
final int clippedLength = element.getLength() - unclippedLength;
if (unclippedLength <= 0) { // totally clipped
if (operator.consumesReadBases()) {
newCigar.add(new CigarElement(element.getLength(), clippingOperator));
}
} else if (clipLeft) {
newCigar.add(new CigarElement(clippedLength, clippingOperator));
newCigar.add(new CigarElement(unclippedLength, operator));
} else {
newCigar.add(new CigarElement(unclippedLength, operator));
newCigar.add(new CigarElement(clippedLength, clippingOperator));
}
}
elementStart = elementEnd;
}
return newCigar.make();
}
/**
* How many bases to the right does a read's alignment start shift given its cigar and the number of left soft clips
*/
public static int alignmentStartShift(final Cigar cigar, final int numClipped) {
int refBasesClipped = 0;
int elementStart = 0; // this and elementEnd are indices in the read's bases
for (final CigarElement element : cigar.getCigarElements()) {
final CigarOperator operator = element.getOperator();
// hard clips consume neither read bases nor reference bases and are irrelevant
if (operator == CigarOperator.HARD_CLIP) {
continue;
}
final int elementEnd = elementStart + (operator.consumesReadBases() ? element.getLength() : 0);
if (elementEnd <= numClipped) { // totally within clipped span -- this includes deletions immediately following clipping
refBasesClipped += operator.consumesReferenceBases() ? element.getLength() : 0;
} else if (elementStart < numClipped) { // clip in middle of element, which means the element necessarily consumes read bases
final int clippedLength = numClipped - elementStart;
refBasesClipped += operator.consumesReferenceBases() ? clippedLength : 0;
break;
}
elementStart = elementEnd;
}
return refBasesClipped;
}
/**
* Computes the corresponding distance needs to be walked on the read, given the Cigar and distance walked on the reference.
* @param cigar cigar along the 5-3 direction of read (when read is mapped to reverse strand, bwa mem output cigar should be inverted)
* @param start start position (1-based) on the read (note it should not count the hard clipped bases, as usual)
* @param refDist distance to walk on the reference
* @param backward whether to walk backwards along the read or not
* @return corresponding walk distance on read (always positive)
* @throws IllegalArgumentException if input cigar contains padding operation or 'N', or
* either of {@code start} or distance is non-positive, or
* {@code start} is larger than read length, or
* requested reference walk distance is longer than the total read bases in cigar, or
* computed read walk distance would "walk off" the read
*/
public static int computeAssociatedDistOnRead(final Cigar cigar, final int start, final int refDist, final boolean backward) {
Utils.validateArg(refDist > 0 && start > 0, () -> "start " + start + " or distance " + refDist + " is non-positive.");
final List elements = backward ? Lists.reverse(cigar.getCigarElements()) : cigar.getCigarElements();
final int readLength = elements.stream().mapToInt(ce -> ce.getOperator().consumesReadBases() ? ce.getLength() : 0).sum();
final int readBasesToSkip = backward ? readLength - start : start - 1;
int readBasesConsumed = 0;
int refBasesConsumed = 0;
for (final CigarElement element : elements){
final int readBasesConsumedBeforeElement = readBasesConsumed;
readBasesConsumed += element.getOperator().consumesReadBases() ? element.getLength() : 0;
// skip cigar elements that end before the read start or start after the reference end
if (readBasesConsumed <= readBasesToSkip) {
continue;
}
refBasesConsumed += element.getOperator().consumesReferenceBases() ? element.getLength() - Math.max(readBasesToSkip - readBasesConsumedBeforeElement, 0) : 0;
if (refBasesConsumed >= refDist) {
final int excessRefBasesInElement = Math.max(refBasesConsumed - refDist, 0);
return readBasesConsumed - readBasesToSkip - (element.getOperator().consumesReadBases() ? excessRefBasesInElement : 0);
}
}
throw new IllegalArgumentException("Cigar " + cigar + "does not contain at least " + refDist + " reference bases past red start " + start + ".");
}
/**
* Convert the 'I' CigarElement, if it is at either end (terminal) of the input cigar, to a corresponding 'S' operator.
* Note that we allow CIGAR of the format '10H10S10I10M', but disallows the format if after the conversion the cigar turns into a giant clip,
* e.g. '10H10S10I10S10H' is not allowed (if allowed, it becomes a giant clip of '10H30S10H' which is non-sense).
*
* @return a pair of number of clipped (hard and soft, including the ones from the converted terminal 'I') bases at the front and back of the
* input {@code cigarAlongInput5to3Direction}.
*
* @throws IllegalArgumentException when the checks as described above fail.
*/
public static Cigar convertTerminalInsertionToSoftClip(final Cigar cigar) {
if (cigar.numCigarElements() < 2 ) {
return cigar;
}
final CigarBuilder builder = new CigarBuilder();
for (int n = 0; n < cigar.numCigarElements(); n++) {
final CigarElement element = cigar.getCigarElement(n);
if (element.getOperator() != CigarOperator.INSERTION) { // not an insertion
builder.add(element);
} else if (n == 0 || n == cigar.numCigarElements() - 1) { // terminal insertion with no clipping -- convert to soft clip
builder.add(new CigarElement(element.getLength(), CigarOperator.SOFT_CLIP));
} else if (cigar.getCigarElement(n-1).getOperator().isClipping() || cigar.getCigarElement(n+1).getOperator().isClipping()) { // insertion preceding or following clip
builder.add(new CigarElement(element.getLength(), CigarOperator.SOFT_CLIP));
} else { // interior insertion
builder.add(element);
}
}
return builder.make();
}
}