picard.sam.markduplicates.UmiAwareDuplicateSetIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
picard
The newest version!
/*
 * The MIT License
 *
 * Copyright (c) 2017 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/**
 * This acts as an iterator over duplicate sets.  If a particular duplicate
 * set consists of records that contain UMIs this iterator breaks up a single
 * duplicate set into multiple duplicate based on the content of the UMIs.
 * Since there may be sequencing errors in the UMIs, this class allows for
 * simple error correction based on edit distances between the UMIs.
 *
 * @author fleharty
 */

package picard.sam.markduplicates;

import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.DuplicateSetIterator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;

import java.util.*;


/**
 * UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator
 * in such a way that each duplicate set may be broken up into subsets according
 * to UMIs in the records.  Some tolerance for errors in the UMIs is allowed, and
 * the degree of this is controlled by the maxEditDistanceToJoin parameter.
 */
class UmiAwareDuplicateSetIterator implements CloseableIterator {
    private final DuplicateSetIterator wrappedIterator;
    private Iterator nextSetsIterator;
    private final int maxEditDistanceToJoin;
    private final String umiTag;
    private final String molecularIdentifierTag;
    private final boolean allowMissingUmis;
    private boolean isOpen = false;
    private final boolean duplexUmi;
    private final Map umiMetricsMap;
    private boolean haveWeSeenFirstRead = false;

    private long observedUmiBases = 0;

    /**
     * Creates a UMI aware duplicate set iterator
     *
     * @param wrappedIterator       Iterator of DuplicatesSets to use and break-up by UMI.
     * @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups
     * @param umiTag                The tag used in the bam file that designates the UMI
     * @param allowMissingUmis      Allow for SAM Records that do not have UMIs
     * @param umiMetricsMap         Map of UMI Metrics indexed by library name
     */
    UmiAwareDuplicateSetIterator(final DuplicateSetIterator wrappedIterator, final int maxEditDistanceToJoin,
                                 final String umiTag, final String molecularIdentifierTag,
                                 final boolean allowMissingUmis, final boolean duplexUmi,
                                 final Map umiMetricsMap) {
        this.wrappedIterator = wrappedIterator;
        this.maxEditDistanceToJoin = maxEditDistanceToJoin;
        this.umiTag = umiTag;
        this.molecularIdentifierTag = molecularIdentifierTag;
        this.allowMissingUmis = allowMissingUmis;
        this.umiMetricsMap = umiMetricsMap;
        this.duplexUmi = duplexUmi;
        isOpen = true;
        nextSetsIterator = Collections.emptyIterator();
    }

    @Override
    public void close() {
        isOpen = false;
        wrappedIterator.close();

        // Calculate derived fields for UMI metrics over each library
        for (final UmiMetrics metrics : umiMetricsMap.values()) {
            metrics.calculateDerivedFields();
        }
    }

    @Override
    public boolean hasNext() {
        if (!isOpen) {
            return false;
        } else {
            if (nextSetsIterator.hasNext() || wrappedIterator.hasNext()) {
                return true;
            } else {
                isOpen = false;
                return false;
            }
        }
    }

    @Override
    public DuplicateSet next() {
        if (!nextSetsIterator.hasNext()) {
            process(wrappedIterator.next());
        }
        return nextSetsIterator.next();
    }

    /**
     * Takes a duplicate set and breaks it up into possible smaller sets according to the UMI,
     * and updates nextSetsIterator to be an iterator on that set of DuplicateSets.
     *
     * @param set Duplicate set that may be broken up into subsets according the UMIs
     */
    private void process(final DuplicateSet set) {

        // Ensure that the nextSetsIterator isn't already occupied
        if (nextSetsIterator.hasNext()) {
            throw new PicardException("nextSetsIterator is expected to be empty, but already contains data.");
        }

        final UmiGraph umiGraph = new UmiGraph(set, umiTag, molecularIdentifierTag, allowMissingUmis, duplexUmi);

        // Get the UMI metrics for the library of this duplicate set, creating a new one if necessary.
        final String library = set.getRepresentative().getReadGroup().getLibrary();
        final UmiMetrics metrics = umiMetricsMap.computeIfAbsent(library, UmiMetrics::new);

        final List duplicateSets = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin);

        // Collect statistics on numbers of observed and inferred UMIs
        // and total numbers of observed and inferred UMIs
        for (final DuplicateSet ds : duplicateSets) {
            final List records = ds.getRecords();

            for (final SAMRecord rec : records) {
                final String currentUmi = UmiUtil.getTopStrandNormalizedUmi(rec, umiTag, duplexUmi);

                if (currentUmi != null) {
                    // All UMIs should be the same length, the code presently does not support variable length UMIs.
                    // If the UMI contains a N, we don't want to include it in our other metrics but we still want
                    // to keep track of it.
                    if (currentUmi.contains("N")) {
                        metrics.addUmiObservationN();
                    } else {
                        final int umiLength = UmiUtil.getUmiLength(currentUmi);
                        if (!haveWeSeenFirstRead) {
                            metrics.MEAN_UMI_LENGTH = umiLength;
                            haveWeSeenFirstRead = true;
                        } else {
                            if (metrics.MEAN_UMI_LENGTH != umiLength) {
                                throw new PicardException("UMIs of differing lengths were found.");
                            }
                        }

                        // Update UMI metrics associated with each record
                        // The hammingDistance between N and a base is a distance of 1. Comparing N to N is 0 distance.
                        final String inferredUmi = (String) rec.getTransientAttribute(UmiUtil.INFERRED_UMI_TRANSIENT_TAG);
                        metrics.OBSERVED_BASE_ERRORS += StringUtil.hammingDistance(currentUmi, inferredUmi);
                        observedUmiBases += umiLength;
                        metrics.addUmiObservation(currentUmi, inferredUmi);
                    }
                }
            }
        }

        // Update UMI metrics associated with each duplicate set
        metrics.DUPLICATE_SETS_WITH_UMI += duplicateSets.size();
        metrics.DUPLICATE_SETS_IGNORING_UMI++;

        nextSetsIterator = duplicateSets.iterator();
    }
}