
// com.bakdata.deduplication.clustering.OracleClustering Maven / Gradle / Ivy
package com.bakdata.deduplication.clustering;
import com.bakdata.deduplication.classifier.Classification;
import com.bakdata.deduplication.classifier.ClassifiedCandidate;
import java.util.AbstractMap;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.NonNull;
import lombok.Value;
/**
 * Clustering that answers from a fixed collection of gold clusters instead of
 * computing clusters from the classification results.
 *
 * <p>Each incoming candidate is resolved by extracting an id from its old record
 * and looking that id up in an index built from the elements of the gold clusters.
 *
 * <p>NOTE(review): the generic type arguments in this text appear to have been
 * stripped (see the class header {@code OracleClustering, T, I>} and declarations
 * such as {@code Collection> goldClusters}). Restore them from the upstream source
 * before compiling; the comments below describe only what the visible code does.
 */
@Value
public class OracleClustering, T, I> implements Clustering {
// NOTE(review): DUPLICATE and NON_DUPLICATE are not referenced anywhere in the visible class body.
private static final Classification DUPLICATE = Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build();
private static final Classification NON_DUPLICATE = Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build();
// The gold clusters served by this clustering. Not annotated @NonNull: a null value
// only surfaces when the lazy index below is first built (this.goldClusters.stream()).
Collection> goldClusters;
// Maps a record to the id used as key of the lookup index; applied both to the
// elements of the gold clusters and to the old record of each incoming candidate.
@NonNull
Function idExtractor;
// Index from extracted element id to the gold cluster containing that element.
// Lombok's lazy getter builds it on the first getIdToCluster() call and caches it.
// Collectors.toMap is used without a merge function, so it throws
// IllegalStateException if two elements (in the same or in different clusters)
// yield equal ids.
@Getter(lazy = true)
Map> idToCluster = this.goldClusters.stream()
.flatMap(cluster ->
cluster.getElements().stream()
.map(e -> new AbstractMap.SimpleEntry<>(this.idExtractor.apply(e), cluster)))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
/**
 * Returns the gold clusters that contain the old records of the given candidates.
 *
 * <p>Only the old record of each candidate is looked up; the classification attached
 * to the candidate is not consulted. Candidates whose old record id is not present
 * in the index are dropped, and each cluster is returned at most once.
 *
 * @param classified the classified candidates to resolve
 * @return the distinct matching gold clusters, in order of first occurrence
 */
@Override
public List> cluster(final List> classified) {
return classified.stream()
.map(candidate -> this.getIdToCluster().get(this.idExtractor.apply(candidate.getCandidate().getOldRecord())))
// An id missing from the index yields null; skip such candidates.
.filter(Objects::nonNull)
// Several candidates may resolve to the same gold cluster; keep one instance.
.distinct()
.collect(Collectors.toList());
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy