
com.bakdata.deduplication.clustering.ConsistentClustering Maven / Gradle / Ivy
/*
* The MIT License
*
* Copyright (c) 2018 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.clustering;
import com.bakdata.deduplication.candidate_selection.Candidate;
import com.bakdata.deduplication.classifier.ClassifiedCandidate;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Getter;
import lombok.NonNull;
import lombok.Value;
/**
* Wraps another clustering and keeps clusters together when the wrapped clustering would split them.
* Example: consider a stable marriage-based clustering where A1-B have been previously matched and subsequently clustered.
* If a strong A2-B would replace that pair and thus split the cluster, this consistent clustering returns a cluster [A1, A2, B] instead.
*
* This clustering is similar to {@link TransitiveClosure} but allows the wrapped clustering to split temporary (=not-returned) clusters. Thus, in the example above, we have the following two situations:
* - If A1-B and A2-B are passed in the same invocation of {@link #cluster(List)}, only cluster [A2, B] is returned.
* - If A1-B is passed in a first invocation, this invocation returns [A1, B]. The following invocation with A2-B then returns [A1, A2, B].
*
* It thus trades off clustering accuracy to increase reliability of subsequent data processing.
*
* NOTE(review): in this copy of the file the generic type-argument lists appear to have been stripped
* (e.g. the class header and the {@code List}/{@code Function}/{@code Map} declarations below are truncated).
* The type-parameter descriptions here are derived from how the names are used in the body; confirm them
* against the original declaration.
*
* @param C the type returned by the cluster id generator (see {@link #getClusterIdGenerator()})
* @param T the type of the clustered records
* @param I the type of the record id produced by the id extractor; declared with a {@code Comparable} bound
*/
@Value
@Builder
public class ConsistentClustering, T, I extends Comparable super I>> implements Clustering {
/**
* The wrapped clustering. It performs the actual clustering and also supplies the cluster id generator
* that is handed to the internal closure.
*/
@NonNull
Clustering clustering;
/**
* Extracts the id of a record. It is passed to the internal closure and used to look records up in
* the closure's cluster index.
* NOTE(review): unlike {@code clustering}, this field carries no {@code @NonNull} annotation — confirm whether null is intended to be allowed.
*/
Function idExtractor;
/**
* Internal transitive closure, only reachable through a private, lazily initialized Lombok getter.
* It is built from this instance's id extractor and the wrapped clustering's cluster id generator.
*/
@Getter(lazy = true, value = AccessLevel.PRIVATE)
TransitiveClosure internalClosure = TransitiveClosure.builder()
.idExtractor(this.idExtractor)
.clusterIdGenerator(this.clustering.getClusterIdGenerator())
.build();
/**
* Clusters the classified candidates with the wrapped clustering and merges the result through the
* internal closure.
*
* The result of the wrapped clustering is returned unchanged when it is empty, or when it consists of a
* single cluster none of whose records is present in the internal closure's cluster index.
* Otherwise all records of all returned clusters are linked to one anchor record and passed to the
* internal closure, whose single resulting cluster is returned.
*
* @param classified the classified candidates, forwarded as-is to the wrapped clustering
* @return the clusters of the wrapped clustering, or a one-element list holding the merged cluster
* @throws IllegalStateException if the internal closure does not yield exactly one cluster
*/
@Override
public List> cluster(final List> classified) {
final List> clusters = this.clustering.cluster(classified);
if (clusters.isEmpty()) {
return clusters;
}
// the returned cluster is not affected by this clustering
if (clusters.size() == 1 && this.noRecordInIndex(clusters)) {
return clusters;
}
// Anchor record: presumably the first record of the first cluster (Cluster#get is not visible here).
final T firstElement = clusters.get(0).get(0);
// Pair the anchor with every record of every cluster so that the closure links them all together.
// NOTE(review): if the anchor is itself contained in getElements(), this also creates a self-pair.
final List> candidates = clusters.stream()
.flatMap(cluster -> cluster.getElements().stream().map(record -> new Candidate<>(firstElement, record)))
.collect(Collectors.toList());
final List> transitiveClusters = this.getInternalClosure().clusterDuplicates(candidates);
// All candidates share the anchor record, so anything but a single cluster is treated as an error.
if (transitiveClusters.size() != 1) {
throw new IllegalStateException("Expected exactly one transitive cluster");
}
if (clusters.size() == 1 && clusters.get(0).equals(transitiveClusters.get(0))) {
// a previously split cluster has been re-merged, so we can remove it from our internal closure
this.getInternalClosure().removeCluster(clusters.get(0));
}
return transitiveClusters;
}
/**
* Returns the cluster id generator of the wrapped clustering.
*
* @return the result of {@code getClusterIdGenerator()} on the wrapped clustering
*/
@Override
public Function, C> getClusterIdGenerator() {
return this.clustering.getClusterIdGenerator();
}
/**
* Checks whether none of the records of the given clusters is known to the internal closure.
*
* @param clusters the clusters whose records are looked up
* @return {@code true} if the internal closure's cluster index has no entry for the id of any record
* in any of the clusters (also {@code true} when there are no records at all)
*/
private boolean noRecordInIndex(final Collection> clusters) {
final Map> clusterIndex = this.getInternalClosure().getClusterIndex();
return clusters.stream().flatMap(cluster -> cluster.getElements().stream())
.allMatch(record -> clusterIndex.get(this.idExtractor.apply(record)) == null);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy