
com.bakdata.deduplication.classifier.OracleClassifier Maven / Gradle / Ivy
package com.bakdata.deduplication.classifier;
import com.bakdata.deduplication.candidate_selection.Candidate;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import lombok.Getter;
import lombok.NonNull;
import lombok.Value;
@Value
public class OracleClassifier implements Classifier {
private static final Classification DUPLICATE =
Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build();
private static final Classification NON_DUPLICATE =
Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build();
@NonNull
Set> goldDuplicates;
@Getter(lazy = true)
Set> symmetricDuplicates = this.calculateSymmetricDuplicates();
private Set> calculateSymmetricDuplicates() {
return this.getGoldDuplicates().stream()
.flatMap(duplicate ->
Stream.of(duplicate, new Candidate<>(duplicate.getOldRecord(), duplicate.getNewRecord())))
.collect(Collectors.toSet());
}
@Override
public Classification classify(final Candidate candidate) {
return this.getSymmetricDuplicates().contains(candidate) ? DUPLICATE : NON_DUPLICATE;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy