All downloads are free. The search and download functionality uses the official Maven repository.

com.bakdata.dedupe.person.PersonClassifierWithMultipleNames Maven / Gradle / Ivy

The newest version!
/*
 * MIT License
 *
 * Copyright (c) 2019 bakdata GmbH
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package com.bakdata.dedupe.person;

import static com.bakdata.dedupe.similarity.CommonSimilarityMeasures.equality;
import static com.bakdata.dedupe.similarity.CommonSimilarityMeasures.jaroWinkler;
import static com.bakdata.dedupe.similarity.CommonSimilarityMeasures.levenshtein;
import static com.bakdata.dedupe.similarity.CommonSimilarityMeasures.max;
import static com.bakdata.dedupe.similarity.CommonSimilarityMeasures.scaledDifference;
import static com.bakdata.dedupe.similarity.CommonSimilarityMeasures.stableMatching;
import static com.bakdata.dedupe.similarity.CommonTransformations.beiderMorse;
import static com.bakdata.dedupe.similarity.CommonTransformations.words;

import com.bakdata.dedupe.classifier.Classifier;
import com.bakdata.dedupe.classifier.RuleBasedClassifier;
import com.bakdata.dedupe.similarity.CommonSimilarityMeasures;
import com.bakdata.dedupe.similarity.SimilarityMeasure;
import com.bakdata.dedupe.similarity.ValueTransformation;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import lombok.Value;
import lombok.experimental.Delegate;

/**
 * Shows a classifier for persons whose name parts have been incorrectly assigned to the first and last name fields.
 * <p>In this case, we assume that the first and last names have not been correctly split in some pre-processing step
 * and we have the following cases for some person Edgar Allan Poe:</p>
 * <ol>
 * <li>All correct: first=Edgar Allan, last=Poe</li>
 * <li>Abbreviated: first=Edgar A., last=Poe</li>
 * <li>Incorrect split: first=Edgar, last=Allan Poe</li>
 * <li>Swapped: first=Poe, last=Edgar Allan</li>
 * <li>Scrambled: first=Allan Poe, last=Edgar</li>
 * </ol>
 * <p>This example addresses these issues, but please verify during an evaluation that the similarity measure does
 * not get too weak. Usually, a few missed duplicates are not as bad as some wrongly classified non-duplicates!</p>
 */
@Value
@SuppressWarnings("squid:S109")
public class PersonClassifierWithMultipleNames implements Classifier<Person> {
    /** Formatter used to render birth dates as ISO local date strings before comparing them textually. */
    public static final DateTimeFormatter ISO_FORMAT = DateTimeFormatter.ISO_LOCAL_DATE;

    /**
     * Case 1 and case 2: correctly split, maybe abbreviated.
     * <p>First and last name are compared separately and contribute with equal weight.</p>
     * <p>Abbreviation is indirectly handled through Jaro-Winkler. If abbreviation is very common, consider using a
     * custom similarity measure.</p>
     */
    SimilarityMeasure<Person> namesCorrectlySplitSimilarity = CommonSimilarityMeasures.<Person>weightedAverage()
            .add(1, Person::getFirstName, max(levenshtein().cutoff(0.9d), jaroWinkler()))
            .add(1, Person::getLastName,
                    max(equality().of(beiderMorse()), levenshtein().cutoff(0.8d), jaroWinkler()))
            .build();

    /**
     * Case 3: To handle an incorrect split, we concatenate first and last name and perform a holistic match with
     * edit distance.
     * <p>Note that we need to be stricter (=higher threshold) than {@link #namesCorrectlySplitSimilarity} to avoid
     * too many false positives.</p>
     */
    SimilarityMeasure<Person> namesIncorrectlySplitSimilarity = levenshtein().cutoff(0.9d).of(concatNames());

    /**
     * Similarity of two single name parts; combines the strictest version of the first and last name measures.
     * <p>Shared by {@link #namesSwappedSimilarity} and {@link #namesScrambledSimilarity}, hence it must be declared
     * (and thus initialized) before both of them.</p>
     */
    SimilarityMeasure<String> nameSimilarity =
            max(levenshtein().cutoff(0.9d), jaroWinkler(), equality().of(beiderMorse()));

    /**
     * Case 4: Swapped first and last name.
     * <p>stableMatching on the combined names list will match items only once.</p>
     * <p>mongeElkan is not used here because it does not work for names where first and last names are similar:
     * mongeElkan can match the right hand side several times.</p>
     * <p>{@literal Example: John Johnson and John Smith will match 1) John -> John = 1.0; 2) Johnson -> John = 0.91}
     * </p>
     */
    // NOTE(review): List.of rejects null elements, whereas concatNames() filters null names out. Confirm that names
    // are never null here or that the framework deals with the resulting exception.
    SimilarityMeasure<Person> namesSwappedSimilarity = stableMatching(this.nameSimilarity)
            .of(person -> List.of(person.getFirstName(), person.getLastName()));

    /**
     * Case 5: Scrambled names, which first concatenates the names and then splits them into bags of words.
     * <p>stableMatching on the combined names list will match items only once.</p>
     * <p>Currently subsumes {@link #namesSwappedSimilarity} but should be made stricter.</p>
     * <p>mongeElkan is not used here because it does not work for names where first and last names are similar:
     * mongeElkan can match the right hand side several times.</p>
     * <p>{@literal Example: John Johnson and John Smith will match 1) John -> John = 1.0; 2) Johnson -> John = 0.91}
     * </p>
     */
    SimilarityMeasure<Person> namesScrambledSimilarity = stableMatching(this.nameSimilarity)
            .of(concatNames().andThen(words()));

    /** Best score over all five name cases: the maximum of the four name measures above. */
    SimilarityMeasure<Person> overallNameSimilarity = max(this.namesCorrectlySplitSimilarity,
            this.namesIncorrectlySplitSimilarity, this.namesSwappedSimilarity, this.namesScrambledSimilarity);

    /**
     * The actual classifier, to which all {@link Classifier} calls are delegated.
     * <p>It consists of a single positive rule: a weighted average of name (weight 4), gender (weight 1) and birth
     * date (weight 2), scaled with a threshold of 0.9. The birth date is compared both textually (edit distance on
     * the ISO representation) and as a scaled difference parameterized with 2 days; the better score wins.</p>
     */
    @Delegate
    Classifier<Person> classifier = RuleBasedClassifier.<Person>builder()
            .positiveRule("Basic comparison", CommonSimilarityMeasures.<Person>weightedAverage()
                    .add(4, this.overallNameSimilarity)
                    .add(1, Person::getGender, equality())
                    .add(2, Person::getBirthDate,
                            max(levenshtein().of(ISO_FORMAT::format), scaledDifference(2, ChronoUnit.DAYS)))
                    .build()
                    .scaleWithThreshold(0.9d))
            .build();

    /**
     * Creates a transformation that joins the first and last name of a person with a single space.
     * <p>Null name parts are skipped, so a person with only one name yields just that name and a person without any
     * name yields the empty string.</p>
     *
     * @return a transformation from a person to the concatenated full name
     */
    private static ValueTransformation<Person, String> concatNames() {
        return (person, context) -> Stream.of(person.getFirstName(), person.getLastName())
                .filter(Objects::nonNull)
                .collect(Collectors.joining(" "));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy