
org.broadinstitute.hellbender.tools.sv.cluster.SVClusterEngine

package org.broadinstitute.hellbender.tools.sv.cluster;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.tools.sv.SVLocatable;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Base class for clustering items that possess start/end genomic coordinates. Efficient algorithms are implemented for
 * single-linkage and max-clique clustering that leverage the low dimensionality of position-based clustering criteria.
 * These algorithms are suitable for clustering problems testing for overlapping events in coordinate-sorted order, with
 * additional possible criteria, when the maximum feasible starting position for an item can be easily estimated
 * (e.g. reciprocal overlap, end-point distance).
 *
 * <p>Based on the method from Marschall T, Costa VG, Canzar S, et al. CLEVER: Clique-enumerating variant finder.
 * Bioinformatics. 2012;28(22):2875-2882.</p>
 *
 * <p>NOTE: a precise implementation of {@link SVClusterLinkage#getMaxClusterableStartingPosition(SVLocatable)}
 * is important for efficiency because it determines when a cluster can be finalized and omitted from further
 * clustering tests.</p>
 *
 * <p>A minimal usage sketch follows the class listing below.</p>
 */
public class SVClusterEngine {

    /**
     * Available clustering algorithms
     */
    public enum CLUSTERING_TYPE {
        SINGLE_LINKAGE,
        MAX_CLIQUE
    }

    private final Function<OutputCluster, SVCallRecord> collapser; // Flattens clusters into a single representative item for output
    private final SVClusterLinkage<SVCallRecord> linkage;
    private Map<Integer, Cluster> idToClusterMap; // Active clusters
    private final Map<Integer, SVCallRecord> idToItemMap; // Active items
    protected final CLUSTERING_TYPE clusteringType;
    private final Comparator<SVCallRecord> itemComparator;

    private String currentContig;
    private int nextItemId;
    private int nextClusterId;
    private int lastStart;
    private Integer minActiveStartingPositionItemId;

    /**
     * @param clusteringType algorithm choice
     * @param collapser function that ingests a collection of clustered items and returns a single representative item
     * @param linkage linkage used to test whether two items cluster together
     * @param dictionary sequence dictionary used for contig ordering
     */
    public SVClusterEngine(final CLUSTERING_TYPE clusteringType,
                           final Function<OutputCluster, SVCallRecord> collapser,
                           final SVClusterLinkage<SVCallRecord> linkage,
                           final SAMSequenceDictionary dictionary) {
        this.clusteringType = clusteringType;
        this.collapser = Utils.nonNull(collapser);
        this.linkage = Utils.nonNull(linkage);
        idToClusterMap = new HashMap<>();
        currentContig = null;
        idToItemMap = new HashMap<>();
        itemComparator = SVCallRecordUtils.getSVLocatableComparator(dictionary);
        nextItemId = 0;
        nextClusterId = 0;
        lastStart = 0;
        minActiveStartingPositionItemId = null;
    }

    @VisibleForTesting
    public Function<OutputCluster, SVCallRecord> getCollapser() {
        return collapser;
    }

    @VisibleForTesting
    public SVClusterLinkage<SVCallRecord> getLinkage() {
        return linkage;
    }

    public SVCallRecord getMinActiveStartingPositionItem() {
        Utils.validate(minActiveStartingPositionItemId == null || idToItemMap.containsKey(minActiveStartingPositionItemId),
                "Unregistered item id " + minActiveStartingPositionItemId);
        return idToItemMap.get(minActiveStartingPositionItemId);
    }

    /**
     * Returns true if there are no active clusters.
     */
    public final boolean isEmpty() {
        return idToClusterMap.isEmpty();
    }

    /**
     * Adds and clusters the given item. Note that items must be added in order of increasing start position.
     *
     * @param item item to cluster
     * @return records for any clusters that were completed by adding this item
     */
    public final List<SVCallRecord> addAndFlush(final SVCallRecord item) {
        // Start a new cluster if on a new contig
        if (!item.getContigA().equals(currentContig)) {
            final List<SVCallRecord> result = flush();
            currentContig = item.getContigA();
            lastStart = 0;
            seedCluster(registerItem(item));
            return result;
        } else {
            final int itemId = registerItem(item);
            final List<Integer> clusterIdsToProcess = cluster(itemId);
            return processClusters(clusterIdsToProcess);
        }
    }

    private final int registerItem(final SVCallRecord item) {
        Utils.validate(item.getPositionA() >= lastStart, "Items must be added in order of increasing start coordinate");
        lastStart = item.getPositionA();
        final int itemId = nextItemId++;
        idToItemMap.put(itemId, item);
        if (minActiveStartingPositionItemId == null
                || item.getPositionA() < getMinActiveStartingPositionItem().getPositionA()) {
            minActiveStartingPositionItemId = itemId;
        }
        return itemId;
    }

    private final int getMaxClusterableStartingPositionByIds(final Collection<Integer> itemIds) {
        Utils.nonNull(itemIds);
        Utils.nonEmpty(itemIds);
        return linkage.getMaxClusterableStartingPosition(itemIds.stream().map(this::getItem).collect(Collectors.toList()));
    }

    /**
     * Adds a newly registered item to the active clusters and determines which clusters are complete.
     *
     * @param itemId id of registered item to add
     * @return the IDs for clusters that are complete and ready for processing
     */
    private final List<Integer> cluster(final Integer itemId) {
        final SVCallRecord item = getItem(itemId);
        // Get list of item IDs from active clusters that cluster with this item
        final Set<Integer> linkedItems = idToClusterMap.values().stream().map(Cluster::getItemIds)
                .flatMap(List::stream)
                .distinct()
                .filter(other -> !other.equals(itemId) && linkage.areClusterable(item, getItem(other)))
                .collect(Collectors.toCollection(LinkedHashSet::new));

        // Find clusters to which this item belongs, and which active clusters we're definitely done with
        final List<Integer> clusterIdsToProcess = new ArrayList<>();
        // Clusters to which we simply add the item
        final List<Integer> clustersToAugment = new ArrayList<>();
        // New clusters, formed from subsets of currently active clusters, to which we will add the item
        final Set<List<Integer>> clustersToSeedWith = new HashSet<>(); // Use set to prevent creating duplicate clusters
        for (final Map.Entry<Integer, Cluster> entry : idToClusterMap.entrySet()) {
            final Integer clusterIndex = entry.getKey();
            final Cluster cluster = entry.getValue();
            final List<Integer> clusterItems = cluster.getItemIds();
            if (item.getPositionA() > cluster.getMaxClusterableStart()) {
                clusterIdsToProcess.add(clusterIndex); // This cluster is complete -- process it when we're done
            } else {
                if (clusteringType.equals(CLUSTERING_TYPE.MAX_CLIQUE)) {
                    final List<Integer> linkedClusterItems = clusterItems.stream().filter(linkedItems::contains)
                            .collect(Collectors.toList());
                    final int numLinkedItems = linkedClusterItems.size();
                    if (numLinkedItems == clusterItems.size()) {
                        clustersToAugment.add(clusterIndex);
                    } else if (numLinkedItems > 0) {
                        clustersToSeedWith.add(linkedClusterItems);
                    }
                } else if (clusteringType.equals(CLUSTERING_TYPE.SINGLE_LINKAGE)) {
                    final boolean matchesCluster = clusterItems.stream().anyMatch(linkedItems::contains);
                    if (matchesCluster) {
                        clustersToAugment.add(clusterIndex);
                    }
                } else {
                    throw new IllegalArgumentException("Clustering algorithm for type " + clusteringType.name() + " not implemented");
                }
            }
        }

        // Create new clusters from subsets (max-clique only)
        if (!clustersToSeedWith.isEmpty()) {
            // Currently existing clusters to which we will add the current item
            final List<Set<Integer>> augmentedClusterItemLists = clustersToAugment.stream()
                    .map(idToClusterMap::get)
                    .map(Cluster::getItemIds)
                    .map(HashSet::new)
                    .collect(Collectors.toList());
            // New clusters formed from subsets of the currently active clusters
            final List<Set<Integer>> triggeredClusterItemSets = new ArrayList<>(clustersToSeedWith.size() + augmentedClusterItemLists.size());
            triggeredClusterItemSets.addAll(clustersToSeedWith.stream().map(HashSet::new).collect(Collectors.toList()));
            triggeredClusterItemSets.addAll(augmentedClusterItemLists);
            triggeredClusterItemSets.sort(Comparator.comparingInt(Set::size));
            for (int i = 0; i < triggeredClusterItemSets.size(); i++) {
                final Set<Integer> seedItems = triggeredClusterItemSets.get(i);
                // Check that this cluster is not a sub-cluster of any of the others being created
                boolean isSubset = false;
                for (int j = i + 1; j < triggeredClusterItemSets.size(); j++) {
                    if (triggeredClusterItemSets.get(j).containsAll(seedItems)) {
                        isSubset = true;
                        break;
                    }
                }
                if (!isSubset) {
                    seedWithExistingCluster(itemId, seedItems);
                }
            }
        }

        // Add to or merge existing clusters
        if (clusteringType.equals(CLUSTERING_TYPE.SINGLE_LINKAGE)) {
            if (!clustersToAugment.isEmpty()) {
                combineClusters(clustersToAugment, itemId);
            }
        } else {
            for (final Integer clusterId : clustersToAugment) {
                addToCluster(clusterId, itemId);
            }
        }

        // If there weren't any matches, create a new singleton cluster
        if (clustersToAugment.isEmpty() && clustersToSeedWith.isEmpty()) {
            seedCluster(itemId);
        }

        return clusterIdsToProcess;
    }

    /**
     * Creates a new cluster by agglomerating clusters with the given ids together (and deleting them from the currently
     * active set), along with an additional item.
     *
     * @param clusterIds ids of clusters to combine
     * @param itemId id of item to add to new cluster
     */
    private final void combineClusters(final Collection<Integer> clusterIds, final Integer itemId) {
        final List<Cluster> clusters = clusterIds.stream().map(this::getCluster).collect(Collectors.toList());
        clusterIds.stream().forEach(idToClusterMap::remove);
        final List<Integer> clusterItems = clusters.stream()
                .map(Cluster::getItemIds)
                .flatMap(List::stream)
                .distinct()
                .collect(Collectors.toList());
        final List<Integer> newClusterItems = new ArrayList<>(clusterItems.size() + 1);
        newClusterItems.addAll(clusterItems);
        newClusterItems.add(itemId);
        idToClusterMap.put(nextClusterId++, new Cluster(getMaxClusterableStartingPositionByIds(newClusterItems), newClusterItems));
    }

    /**
     * Finalizes a single cluster, removing it from the currently active set and returning its collapsed record.
     */
    private final SVCallRecord processCluster(final int clusterIndex) {
        final Cluster cluster = getCluster(clusterIndex);
        idToClusterMap.remove(clusterIndex);
        final List<Integer> clusterItemIds = cluster.getItemIds();
        final OutputCluster outputCluster = new OutputCluster(clusterItemIds.stream().map(idToItemMap::get).collect(Collectors.toList()));
        final SVCallRecord result = collapser.apply(outputCluster);
        // Clean up item id map
        if (clusterItemIds.size() == 1) {
            // Singletons won't be present in any other clusters
            idToItemMap.remove(clusterItemIds.get(0));
        } else {
            // Need to check that items aren't present in any other clusters
            final Set<Integer> activeItemIds = idToClusterMap.values().stream()
                    .map(Cluster::getItemIds)
                    .flatMap(List::stream)
                    .collect(Collectors.toSet());
            final List<Integer> itemsToRemove = idToItemMap.keySet().stream()
                    .filter(i -> !activeItemIds.contains(i))
                    .collect(Collectors.toList());
            for (final Integer i : itemsToRemove) {
                idToItemMap.remove(i);
            }
        }
        // Update min active start position
        if (clusterItemIds.contains(minActiveStartingPositionItemId)) {
            findAndSetMinActiveStart();
        }
        return result;
    }

    /**
     * Scans active items for the current min active starting position.
     */
    private final void findAndSetMinActiveStart() {
        minActiveStartingPositionItemId = null;
        SVCallRecord minActiveStartingPositionItem = null;
        for (final Integer itemId : idToItemMap.keySet()) {
            final SVCallRecord item = idToItemMap.get(itemId);
            if (minActiveStartingPositionItemId == null || itemComparator.compare(item, minActiveStartingPositionItem) < 0) {
                minActiveStartingPositionItemId = itemId;
                minActiveStartingPositionItem = idToItemMap.get(itemId);
            }
        }
    }

    /**
     * Finalizes a set of clusters.
     */
    private final List<SVCallRecord> processClusters(final List<Integer> clusterIdsToProcess) {
        final List<SVCallRecord> result = new ArrayList<>(clusterIdsToProcess.size());
        for (final Integer clusterId : clusterIdsToProcess) {
            result.add(processCluster(clusterId));
        }
        return result;
    }

    /**
     * Finalizes all active clusters and returns their collapsed records. Also clears the currently active set of
     * clusters and items.
     */
    public final List<SVCallRecord> flush() {
        final List<Integer> clustersToFlush = new ArrayList<>(idToClusterMap.keySet());
        final List<SVCallRecord> result = new ArrayList<>(clustersToFlush.size());
        for (final Integer clusterId : clustersToFlush) {
            result.add(processCluster(clusterId));
        }
        idToItemMap.clear();
        minActiveStartingPositionItemId = null;
        nextItemId = 0;
        nextClusterId = 0;
        return result;
    }

    /**
     * Creates a new singleton cluster containing the given item.
     */
    private final void seedCluster(final Integer item) {
        final List<Integer> newClusters = new ArrayList<>(1);
        newClusters.add(item);
        idToClusterMap.put(nextClusterId++, new Cluster(linkage.getMaxClusterableStartingPosition(getItem(item)), newClusters));
    }

    /**
     * Creates a new cluster containing a new item and a set of existing items, and adds it to the set of active
     * clusters. Note that if an identical active cluster exists, it will not be added.
     *
     * @param item new item (assumed registered)
     * @param seedItems existing items
     */
    private final void seedWithExistingCluster(final Integer item, final Collection<Integer> seedItems) {
        final List<Integer> newClusterItems = new ArrayList<>(1 + seedItems.size());
        newClusterItems.addAll(seedItems);
        newClusterItems.add(item);
        idToClusterMap.put(nextClusterId++, new Cluster(getMaxClusterableStartingPositionByIds(newClusterItems), newClusterItems));
    }

    private final Cluster getCluster(final int id) {
        Utils.validateArg(idToClusterMap.containsKey(id), "Cluster ID " + id + " does not exist.");
        return idToClusterMap.get(id);
    }

    private final SVCallRecord getItem(final int id) {
        Utils.validateArg(idToItemMap.containsKey(id), "Item ID " + id + " does not exist.");
        return idToItemMap.get(id);
    }

    /**
     * Adds the item specified by {@code itemId} to the cluster specified by {@code clusterId}
     * and expands the cluster's clusterable interval.
     *
     * @param clusterId id of the target cluster
     * @param itemId id of the registered item to add
     */
    private final void addToCluster(final int clusterId, final Integer itemId) {
        final Cluster cluster = getCluster(clusterId);
        final List<Integer> clusterItems = cluster.getItemIds();
        clusterItems.add(itemId);
        final SVCallRecord item = getItem(itemId);
        final int itemClusterableStartPosition = linkage.getMaxClusterableStartingPosition(item);
        cluster.setMaxClusterableStart(Math.max(cluster.getMaxClusterableStart(), itemClusterableStartPosition));
    }

    public static final class OutputCluster {
        final List<SVCallRecord> items;

        public OutputCluster(final List<SVCallRecord> items) {
            this.items = items;
        }

        public List<SVCallRecord> getItems() {
            return items;
        }
    }

    /**
     * Container class for clustered items
     */
    private static final class Cluster {
        private int maxClusterableStart;
        private final List<Integer> itemIds;

        public Cluster(final int maxClusterableStart, final List<Integer> itemIds) {
            Utils.nonNull(itemIds);
            this.maxClusterableStart = maxClusterableStart;
            this.itemIds = itemIds;
        }

        public int getMaxClusterableStart() {
            return maxClusterableStart;
        }

        public void setMaxClusterableStart(final int position) {
            maxClusterableStart = position;
        }

        public List<Integer> getItemIds() {
            return itemIds;
        }

        /**
         * Note we do not check for equality on max clusterable start position, which could be dependent on the
         * state of the engine.
         */
        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof Cluster)) return false;
            Cluster cluster = (Cluster) o;
            return itemIds.equals(cluster.itemIds);
        }

        @Override
        public int hashCode() {
            return Objects.hash(itemIds);
        }
    }
}



