
picard.sam.markduplicates.UmiAwareDuplicateSetIterator Maven / Gradle / Ivy
The newest version!
/*
* The MIT License
*
* Copyright (c) 2017 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* This acts as an iterator over duplicate sets. If a particular duplicate
* set consists of records that contain UMIs this iterator breaks up a single
* duplicate set into multiple duplicate based on the content of the UMIs.
* Since there may be sequencing errors in the UMIs, this class allows for
* simple error correction based on edit distances between the UMIs.
*
* @author fleharty
*/
package picard.sam.markduplicates;
import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.DuplicateSetIterator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import java.util.*;
/**
* UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator
* in such a way that each duplicate set may be broken up into subsets according
* to UMIs in the records. Some tolerance for errors in the UMIs is allowed, and
* the degree of this is controlled by the maxEditDistanceToJoin parameter.
*/
class UmiAwareDuplicateSetIterator implements CloseableIterator {
private final DuplicateSetIterator wrappedIterator;
private Iterator nextSetsIterator;
private final int maxEditDistanceToJoin;
private final String umiTag;
private final String molecularIdentifierTag;
private final boolean allowMissingUmis;
private boolean isOpen = false;
private final boolean duplexUmi;
private final Map umiMetricsMap;
private boolean haveWeSeenFirstRead = false;
private long observedUmiBases = 0;
/**
* Creates a UMI aware duplicate set iterator
*
* @param wrappedIterator Iterator of DuplicatesSets to use and break-up by UMI.
* @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups
* @param umiTag The tag used in the bam file that designates the UMI
* @param allowMissingUmis Allow for SAM Records that do not have UMIs
* @param umiMetricsMap Map of UMI Metrics indexed by library name
*/
UmiAwareDuplicateSetIterator(final DuplicateSetIterator wrappedIterator, final int maxEditDistanceToJoin,
final String umiTag, final String molecularIdentifierTag,
final boolean allowMissingUmis, final boolean duplexUmi,
final Map umiMetricsMap) {
this.wrappedIterator = wrappedIterator;
this.maxEditDistanceToJoin = maxEditDistanceToJoin;
this.umiTag = umiTag;
this.molecularIdentifierTag = molecularIdentifierTag;
this.allowMissingUmis = allowMissingUmis;
this.umiMetricsMap = umiMetricsMap;
this.duplexUmi = duplexUmi;
isOpen = true;
nextSetsIterator = Collections.emptyIterator();
}
@Override
public void close() {
isOpen = false;
wrappedIterator.close();
// Calculate derived fields for UMI metrics over each library
for (final UmiMetrics metrics : umiMetricsMap.values()) {
metrics.calculateDerivedFields();
}
}
@Override
public boolean hasNext() {
if (!isOpen) {
return false;
} else {
if (nextSetsIterator.hasNext() || wrappedIterator.hasNext()) {
return true;
} else {
isOpen = false;
return false;
}
}
}
@Override
public DuplicateSet next() {
if (!nextSetsIterator.hasNext()) {
process(wrappedIterator.next());
}
return nextSetsIterator.next();
}
/**
* Takes a duplicate set and breaks it up into possible smaller sets according to the UMI,
* and updates nextSetsIterator to be an iterator on that set of DuplicateSets.
*
* @param set Duplicate set that may be broken up into subsets according the UMIs
*/
private void process(final DuplicateSet set) {
// Ensure that the nextSetsIterator isn't already occupied
if (nextSetsIterator.hasNext()) {
throw new PicardException("nextSetsIterator is expected to be empty, but already contains data.");
}
final UmiGraph umiGraph = new UmiGraph(set, umiTag, molecularIdentifierTag, allowMissingUmis, duplexUmi);
// Get the UMI metrics for the library of this duplicate set, creating a new one if necessary.
final String library = set.getRepresentative().getReadGroup().getLibrary();
final UmiMetrics metrics = umiMetricsMap.computeIfAbsent(library, UmiMetrics::new);
final List duplicateSets = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin);
// Collect statistics on numbers of observed and inferred UMIs
// and total numbers of observed and inferred UMIs
for (final DuplicateSet ds : duplicateSets) {
final List records = ds.getRecords();
for (final SAMRecord rec : records) {
final String currentUmi = UmiUtil.getTopStrandNormalizedUmi(rec, umiTag, duplexUmi);
if (currentUmi != null) {
// All UMIs should be the same length, the code presently does not support variable length UMIs.
// If the UMI contains a N, we don't want to include it in our other metrics but we still want
// to keep track of it.
if (currentUmi.contains("N")) {
metrics.addUmiObservationN();
} else {
final int umiLength = UmiUtil.getUmiLength(currentUmi);
if (!haveWeSeenFirstRead) {
metrics.MEAN_UMI_LENGTH = umiLength;
haveWeSeenFirstRead = true;
} else {
if (metrics.MEAN_UMI_LENGTH != umiLength) {
throw new PicardException("UMIs of differing lengths were found.");
}
}
// Update UMI metrics associated with each record
// The hammingDistance between N and a base is a distance of 1. Comparing N to N is 0 distance.
final String inferredUmi = (String) rec.getTransientAttribute(UmiUtil.INFERRED_UMI_TRANSIENT_TAG);
metrics.OBSERVED_BASE_ERRORS += StringUtil.hammingDistance(currentUmi, inferredUmi);
observedUmiBases += umiLength;
metrics.addUmiObservation(currentUmi, inferredUmi);
}
}
}
}
// Update UMI metrics associated with each duplicate set
metrics.DUPLICATE_SETS_WITH_UMI += duplicateSets.size();
metrics.DUPLICATE_SETS_IGNORING_UMI++;
nextSetsIterator = duplicateSets.iterator();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy