picard.sam.markduplicates.UmiGraph Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of picard Show documentation
picard
There is a newer version: 3.3.0
/*
 * The MIT License
 *
 * Copyright (c) 2016 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.sam.markduplicates;

import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.util.GraphUtils;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static java.util.stream.Collectors.counting;
/**
 * UmiGraph is used to identify UMIs that come from the same original source molecule.  The assumption
 * is that UMIs with small edit distances are likely to be read errors on the sequencer rather than
 * distinct molecules.
 *
 * The algorithm used here is to join all pairs of UMIs that are within maxEditDistanceToJoin.  It is possible
 * for a set of UMIs A, B and C to all be considered as part of the same source molecule even if two of the UMIs
 * have a Hamming distance larger than maxEditDistanceToJoin.  Suppose A = "ATCC", B = "AACC", and C = "AACG"
 * and maxEditDistanceToJoin = 1.  In this case, A and B are 1 Hamming distance so they are joined, and B and C
 * are 1 Hamming distance so they are joined.  Because A and B are joined and because B and C are joined, this results
 * in A and C being joined even though they have a distance of 2.
 *
 * @author fleharty
 */
public class UmiGraph {
    private final List records;      // SAMRecords from the original duplicate set considered to break up by UMI
    private final Map umiCounts;  // Map of UMI sequences and how many times they have been observed
    private final int[] duplicateSetID;         // ID of the duplicate set that the UMI belongs to, the index is the UMI ID
    private final String[] umi;                 // Sequence of actual UMI, the index is the UMI ID
    private final int numUmis;                  // Number of observed UMIs
    private final String umiTag;                // UMI tag used in the SAM/BAM/CRAM file ie. RX
    private final String assignedUmiTag;        // Assigned UMI tag used in the SAM/BAM/CRAM file ie. MI
    private final boolean allowMissingUmis;     // Allow for missing UMIs

    public UmiGraph(DuplicateSet set, String umiTag, String assignedUmiTag, boolean allowMissingUmis) {
        this.umiTag = umiTag;
        this.assignedUmiTag = assignedUmiTag;
        this.allowMissingUmis = allowMissingUmis;
        records = set.getRecords();

        // First ensure that all the reads have a UMI, if any reads are missing a UMI throw an exception unless allowMissingUmis is true
        for (SAMRecord rec : records) {
            if (rec.getStringAttribute(umiTag) == null) {
                if (allowMissingUmis) {
                    rec.setAttribute(umiTag, "");
                } else {
                    throw new PicardException("Read " + rec.getReadName() + " does not contain a UMI with the " + umiTag + " attribute.");
                }
            }
        }

        // Count the number of times each UMI occurs
        umiCounts = records.stream().collect(Collectors.groupingBy(p -> p.getStringAttribute(umiTag), counting()));

        // At first we consider every UMI as if it were its own duplicate set
        numUmis = umiCounts.size();
        umi = new String[numUmis];
        duplicateSetID = IntStream.rangeClosed(0, numUmis-1).toArray();

        int i = 0;
        for (String key : umiCounts.keySet()) {
            umi[i] = key;
            i++;
        }
    }

    List joinUmisIntoDuplicateSets(final int maxEditDistanceToJoin) {
        // Compare all UMIs to each other.  If they are within maxEditDistanceToJoin
        // join them to the same duplicate set using the union-find algorithm.

        GraphUtils.Graph umiGraph = new GraphUtils.Graph<>();
        for (int i = 0; i < numUmis; i++) {
            umiGraph.addNode(i);
            for (int j = i + 1; j < numUmis; j++) {
                if (StringUtil.isWithinHammingDistance(umi[i], umi[j], maxEditDistanceToJoin)) {
                    umiGraph.addEdge(i, j);
                }
            }
        }

        final Map umiClusterMap = umiGraph.cluster();
        // This ensures that all duplicate sets have unique IDs.  During Union-Find a tree is constructed
        // where each UMI points to parent UMI.  This ensures that all UMIs that belong to the same duplicate
        // set point to the same parent UMI.  Note that the parent UMI is only used as a representative UMI and
        // is not at all related to the assigned UMI.
        for (int i = 0; i < numUmis; i++) {
            duplicateSetID[i] = umiClusterMap.get(i);
        }

        final Map> duplicateSets = new HashMap<>();

        // Assign UMIs to duplicateSets
        final Map duplicateSetsFromUmis = getDuplicateSetsFromUmis();
        for (SAMRecord rec : records) {
            final String umi = rec.getStringAttribute(umiTag);
            final Integer duplicateSetIndex = duplicateSetsFromUmis.get(umi);

            if (duplicateSets.containsKey(duplicateSetIndex)) {
                duplicateSets.get(duplicateSetIndex).add(rec);
            } else {
                final List n = new ArrayList<>();
                n.add(rec);
                duplicateSets.put(duplicateSetIndex, n);
            }
        }

        final List duplicateSetList = new ArrayList<>();
        for (final Map.Entry> entry : duplicateSets.entrySet()) {
            final DuplicateSet ds = new DuplicateSet();
            final List recordList = entry.getValue();

            // Add records to the DuplicateSet
            recordList.forEach(ds::add);

            // For a particular duplicate set, identify the most common UMI
            // and use this as an assigned UMI.
            long maxCount = 0;
            String assignedUmi = null;
            for (SAMRecord rec : recordList) {
                final String umi = rec.getStringAttribute(umiTag);

                if (umiCounts.get(umi) > maxCount) {
                   maxCount = umiCounts.get(umi);
                   assignedUmi = umi;
                }
            }

            // Set the records to contain the assigned UMI
            for (final SAMRecord rec : recordList) {
                if (allowMissingUmis && rec.getStringAttribute(umiTag).isEmpty()) {
                    // The SAM spec doesn't support empty tags, so we set it to null if it is empty.
                    rec.setAttribute(umiTag, null);
                } else {
                    rec.setAttribute(assignedUmiTag, assignedUmi);
                }
            }

            duplicateSetList.add(ds);
        }

        return duplicateSetList;
    }

    // Create a map that maps a umi to the duplicateSetID
    private Map getDuplicateSetsFromUmis() {
        final Map duplicateSetsFromUmis = new HashMap<>();
        for (int i = 0; i < duplicateSetID.length; i++) {
            duplicateSetsFromUmis.put(umi[i], duplicateSetID[i]);
        }
        return duplicateSetsFromUmis;
    }
}