All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bakdata.deduplication.classifier.RuleBasedClassifier Maven / Gradle / Ivy

/*
 * The MIT License
 *
 * Copyright (c) 2018 bakdata GmbH
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
package com.bakdata.deduplication.classifier;

import com.bakdata.deduplication.candidate_selection.Candidate;
import com.bakdata.deduplication.similarity.SimilarityContext;
import com.bakdata.deduplication.similarity.SimilarityException;
import com.bakdata.deduplication.similarity.SimilarityMeasure;
import java.util.List;
import java.util.Optional;
import java.util.function.BiPredicate;
import lombok.Builder;
import lombok.Singular;
import lombok.Value;

/**
 * Successively applies a list of rules to the record and returns the respective {@link Classification} with the following cases:
 * 
    *
  • If any rule classifies the pair unambiguously as {@link Classification.ClassificationResult#DUPLICATE} or {@link Classification.ClassificationResult#NON_DUPLICATE}, the classification is immediately returned.
  • *
  • If some rule classifies the pair as {@link Classification.ClassificationResult#POSSIBLE_DUPLICATE}, the remaining rules with be evaluated to see if an unambiguous classification will be reached, in which case that classification is returned. If the results are only ambiguous, the last {@code POSSIBLE_DUPLICATE} classification will be returned.
  • *
  • If no rule can be applied, the result is {@link #UNKNOWN}.
  • *
*
* The {@code Classification} will contain a description naming the triggered rule and converts the rule score into a confidence score. * * @param */ @Value @Builder public class RuleBasedClassifier implements Classifier { public static final float DOES_NOT_APPLY = Float.NaN; public static final Classification UNKNOWN = Classification.builder() .confidence(0) .result(Classification.ClassificationResult.UNKNOWN) .build(); @Singular List> rules; @Builder.Default Classification defaultClassification = UNKNOWN; @Override public Classification classify(final Candidate candidate) { final SimilarityContext context = new SimilarityContext(); Classification classification = this.defaultClassification; for (final Rule rule : this.rules) { classification = this.evaluateRule(rule, candidate, context).orElse(classification); if (!classification.getResult().isAmbiguous()) { break; } } if (!context.getExceptions().isEmpty()) { throw this.createException(candidate, context); } return classification.getResult().isAmbiguous() ? this.defaultClassification : classification; } private SimilarityException createException(final Candidate candidate, final SimilarityContext context) { final SimilarityException fusionException = new SimilarityException("Could not classify candidate " + candidate, context.getExceptions().get(0)); context.getExceptions().stream().skip(1).forEach(fusionException::addSuppressed); return fusionException; } private Optional evaluateRule(final Rule rule, final Candidate candidate, final SimilarityContext context) { return context.safeExecute(() -> rule.evaluate(candidate.getNewRecord(), candidate.getOldRecord(), context)).map(score -> { if (Float.isNaN(score)) { return UNKNOWN; } if (score <= -0.0f) { return Classification.builder() .result(Classification.ClassificationResult.NON_DUPLICATE) .confidence(-score) .explanation(rule.getName()) .build(); } else { return Classification.builder() .result(Classification.ClassificationResult.DUPLICATE) .confidence(score) .explanation(rule.getName()) .build(); } }); } @SuppressWarnings({"WeakerAccess", "UnusedReturnValue"}) public static class RuleBasedClassifierBuilder { public RuleBasedClassifierBuilder positiveRule(final String name, final BiPredicate applicablePredicate, final SimilarityMeasure similarityMeasure) { return this.positiveRule(name, (left, right, context) -> applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) : DOES_NOT_APPLY); } public RuleBasedClassifierBuilder positiveRule(final String name, final SimilarityMeasure similarityMeasure) { return this.rule(new Rule<>(name, similarityMeasure.unknownIf(s -> s <= 0))); } public RuleBasedClassifierBuilder negativeRule(final String name, final BiPredicate applicablePredicate, final SimilarityMeasure similarityMeasure) { return this.negativeRule(name, (left, right, context) -> applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) : DOES_NOT_APPLY); } public RuleBasedClassifierBuilder negativeRule(final String name, final SimilarityMeasure similarityMeasure) { final SimilarityMeasure negativeSim = (left, right, context) -> -similarityMeasure.getSimilarity(left, right, context); return this.rule(new Rule<>(name, negativeSim.unknownIf(s -> s >= 0))); } public RuleBasedClassifierBuilder defaultRule(final SimilarityMeasure similarityMeasure) { return this.rule(new Rule<>("default", similarityMeasure)); } } @Value public static class Rule { String name; SimilarityMeasure measure; @SuppressWarnings("SameReturnValue") protected static float doesNotApply() { return DOES_NOT_APPLY; } float evaluate(final T left, final T right, final SimilarityContext context) { return this.measure.getSimilarity(left, right, context); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy