All downloads are free. The search and download features use the official Maven repository.

com.bakdata.deduplication.classifier.OracleClassifier Maven / Gradle / Ivy

package com.bakdata.deduplication.classifier;

import com.bakdata.deduplication.candidate_selection.Candidate;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import lombok.Getter;
import lombok.NonNull;
import lombok.Value;

@Value
public class OracleClassifier implements Classifier {
    private static final Classification DUPLICATE =
        Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build();
    private static final Classification NON_DUPLICATE =
        Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build();

    @NonNull
    Set> goldDuplicates;
    @Getter(lazy = true)
    Set> symmetricDuplicates = this.calculateSymmetricDuplicates();

    private Set> calculateSymmetricDuplicates() {
        return this.getGoldDuplicates().stream()
            .flatMap(duplicate ->
                Stream.of(duplicate, new Candidate<>(duplicate.getOldRecord(), duplicate.getNewRecord())))
            .collect(Collectors.toSet());
    }

    @Override
    public Classification classify(final Candidate candidate) {
        return this.getSymmetricDuplicates().contains(candidate) ? DUPLICATE : NON_DUPLICATE;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy