All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.bakdata.deduplication.clustering.OracleClustering Maven / Gradle / Ivy

package com.bakdata.deduplication.clustering;

import com.bakdata.deduplication.classifier.Classification;
import com.bakdata.deduplication.classifier.ClassifiedCandidate;
import java.util.AbstractMap;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.NonNull;
import lombok.Value;

@Value
public class OracleClustering, T, I> implements Clustering {
    private static final Classification DUPLICATE = Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build();
    private static final Classification NON_DUPLICATE = Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build();
    Collection> goldClusters;
    @NonNull
    Function idExtractor;
    @Getter(lazy = true)
    Map> idToCluster = this.goldClusters.stream()
            .flatMap(cluster ->
                cluster.getElements().stream()
                    .map(e -> new AbstractMap.SimpleEntry<>(this.idExtractor.apply(e), cluster)))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));


    @Override
    public List> cluster(final List> classified) {
        return classified.stream()
            .map(candidate -> this.getIdToCluster().get(this.idExtractor.apply(candidate.getCandidate().getOldRecord())))
                .filter(Objects::nonNull)
                .distinct()
                .collect(Collectors.toList());
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy