picard.sam.markduplicates.UmiGraph Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.util.GraphUtils;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static java.util.stream.Collectors.counting;
/**
* UmiGraph is used to identify UMIs that come from the same original source molecule. The assumption
* is that UMIs with small edit distances are likely to be read errors on the sequencer rather than
* distinct molecules.
*
* The algorithm used here is to join all pairs of UMIs that are within maxEditDistanceToJoin. It is possible
* for a set of UMIs A, B and C to all be considered as part of the same source molecule even if two of the UMIs
* have a Hamming distance larger than maxEditDistanceToJoin. Suppose A = "ATCC", B = "AACC", and C = "AACG"
* and maxEditDistanceToJoin = 1. In this case, A and B are 1 Hamming distance so they are joined, and B and C
* are 1 Hamming distance so they are joined. Because A and B are joined and because B and C are joined, this results
* in A and C being joined even though they have a distance of 2.
*
* @author fleharty
*/
public class UmiGraph {
private final List records; // SAMRecords from the original duplicate set considered to break up by UMI
private final Map umiCounts; // Map of UMI sequences and how many times they have been observed
private final int[] duplicateSetID; // ID of the duplicate set that the UMI belongs to, the index is the UMI ID
private final String[] umi; // Sequence of actual UMI, the index is the UMI ID
private final int numUmis; // Number of observed UMIs
private final String umiTag; // UMI tag used in the SAM/BAM/CRAM file ie. RX
private final String assignedUmiTag; // Assigned UMI tag used in the SAM/BAM/CRAM file ie. MI
private final boolean allowMissingUmis; // Allow for missing UMIs
public UmiGraph(DuplicateSet set, String umiTag, String assignedUmiTag, boolean allowMissingUmis) {
this.umiTag = umiTag;
this.assignedUmiTag = assignedUmiTag;
this.allowMissingUmis = allowMissingUmis;
records = set.getRecords();
// First ensure that all the reads have a UMI, if any reads are missing a UMI throw an exception unless allowMissingUmis is true
for (SAMRecord rec : records) {
if (rec.getStringAttribute(umiTag) == null) {
if (allowMissingUmis) {
rec.setAttribute(umiTag, "");
} else {
throw new PicardException("Read " + rec.getReadName() + " does not contain a UMI with the " + umiTag + " attribute.");
}
}
}
// Count the number of times each UMI occurs
umiCounts = records.stream().collect(Collectors.groupingBy(p -> p.getStringAttribute(umiTag), counting()));
// At first we consider every UMI as if it were its own duplicate set
numUmis = umiCounts.size();
umi = new String[numUmis];
duplicateSetID = IntStream.rangeClosed(0, numUmis-1).toArray();
int i = 0;
for (String key : umiCounts.keySet()) {
umi[i] = key;
i++;
}
}
List joinUmisIntoDuplicateSets(final int maxEditDistanceToJoin) {
// Compare all UMIs to each other. If they are within maxEditDistanceToJoin
// join them to the same duplicate set using the union-find algorithm.
GraphUtils.Graph umiGraph = new GraphUtils.Graph<>();
for (int i = 0; i < numUmis; i++) {
umiGraph.addNode(i);
for (int j = i + 1; j < numUmis; j++) {
if (StringUtil.isWithinHammingDistance(umi[i], umi[j], maxEditDistanceToJoin)) {
umiGraph.addEdge(i, j);
}
}
}
final Map umiClusterMap = umiGraph.cluster();
// This ensures that all duplicate sets have unique IDs. During Union-Find a tree is constructed
// where each UMI points to parent UMI. This ensures that all UMIs that belong to the same duplicate
// set point to the same parent UMI. Note that the parent UMI is only used as a representative UMI and
// is not at all related to the assigned UMI.
for (int i = 0; i < numUmis; i++) {
duplicateSetID[i] = umiClusterMap.get(i);
}
final Map> duplicateSets = new HashMap<>();
// Assign UMIs to duplicateSets
final Map duplicateSetsFromUmis = getDuplicateSetsFromUmis();
for (SAMRecord rec : records) {
final String umi = rec.getStringAttribute(umiTag);
final Integer duplicateSetIndex = duplicateSetsFromUmis.get(umi);
if (duplicateSets.containsKey(duplicateSetIndex)) {
duplicateSets.get(duplicateSetIndex).add(rec);
} else {
final List n = new ArrayList<>();
n.add(rec);
duplicateSets.put(duplicateSetIndex, n);
}
}
final List duplicateSetList = new ArrayList<>();
for (final Map.Entry> entry : duplicateSets.entrySet()) {
final DuplicateSet ds = new DuplicateSet();
final List recordList = entry.getValue();
// Add records to the DuplicateSet
recordList.forEach(ds::add);
// For a particular duplicate set, identify the most common UMI
// and use this as an assigned UMI.
long maxCount = 0;
String assignedUmi = null;
for (SAMRecord rec : recordList) {
final String umi = rec.getStringAttribute(umiTag);
if (umiCounts.get(umi) > maxCount) {
maxCount = umiCounts.get(umi);
assignedUmi = umi;
}
}
// Set the records to contain the assigned UMI
for (final SAMRecord rec : recordList) {
if (allowMissingUmis && rec.getStringAttribute(umiTag).isEmpty()) {
// The SAM spec doesn't support empty tags, so we set it to null if it is empty.
rec.setAttribute(umiTag, null);
} else {
rec.setAttribute(assignedUmiTag, assignedUmi);
}
}
duplicateSetList.add(ds);
}
return duplicateSetList;
}
// Create a map that maps a umi to the duplicateSetID
private Map getDuplicateSetsFromUmis() {
final Map duplicateSetsFromUmis = new HashMap<>();
for (int i = 0; i < duplicateSetID.length; i++) {
duplicateSetsFromUmis.put(umi[i], duplicateSetID[i]);
}
return duplicateSetsFromUmis;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy