
// com.bakdata.deduplication.fusion.CommonConflictResolutions (Maven / Gradle / Ivy)
/*
* The MIT License
*
* Copyright (c) 2018 bakdata GmbH
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
package com.bakdata.deduplication.fusion;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import lombok.Value;
import lombok.experimental.UtilityClass;
@UtilityClass
public class CommonConflictResolutions {
private static final ThreadLocalRandom random = ThreadLocalRandom.current();
public static ConflictResolution corresponding(final ResolutionTag> resolutionTag) {
return ((values, context) -> {
if (values.isEmpty()) {
return values;
}
final Set sources = context.retrieveValues(resolutionTag).stream()
.map(AnnotatedValue::getSource).collect(Collectors.toSet());
return values.stream().filter(v -> sources.contains(v.getSource())).collect(Collectors.toList());
});
}
public static ConflictResolution saveAs(final ConflictResolution resolution, final ResolutionTag resolutionTag) {
return new TaggedResolution<>(resolution, resolutionTag);
}
private static > Comparator> comparator() {
return Comparator.comparing(AnnotatedValue::getValue);
}
public static Comparator> comparator(final Comparator comparator) {
return Comparator.comparing(AnnotatedValue::getValue, comparator);
}
public static > ConflictResolution max() {
return ((values, context) -> values.stream().max(comparator())
.map(max -> values.stream().filter(v -> v.getValue().equals(max.getValue())).collect(Collectors.toList()))
.orElse(List.of()));
}
public static TerminalConflictResolution mean() {
return ((values, context) -> values.stream()
.mapToDouble(v -> v.getValue().doubleValue()).average()
// workaround for OptionalDouble not having #map
.stream().boxed().findFirst()
.map(AnnotatedValue::calculated));
}
public static TerminalConflictResolution sum() {
return ((values, context) -> values.stream()
.mapToDouble(v -> v.getValue().doubleValue())
.reduce((agg, v) -> agg + v)
// workaround for OptionalDouble not having #map
.stream().boxed().findFirst()
.map(AnnotatedValue::calculated));
}
public static TerminalConflictResolution random() {
return ((values, context) -> values.isEmpty() ? Optional.empty() : Optional.of(values.get(random.nextInt(values.size()))));
}
public static TerminalConflictResolution first() {
return ((values, context) -> values.stream().findFirst());
}
public static TerminalConflictResolution last() {
return ((values, context) -> values.stream().skip(Math.max(0, values.size() - 1)).findAny());
}
public static ConflictResolution distinct() {
return ((values, context) -> List.copyOf(values.stream()
.collect(Collectors.toMap(AnnotatedValue::getValue,
v -> v,
(v1, v2) -> AnnotatedValue.calculated(v1.getValue()),
LinkedHashMap::new))
.values()));
}
public static > ConflictResolution median() {
return ((values, context) -> {
if (values.isEmpty()) {
return values;
}
final List> sorted = new ArrayList<>(values);
sorted.sort(comparator());
// create copy of list of median value(s), such that original list is not referenced anymore
return List.copyOf(sorted.subList((int) Math.floor(sorted.size() / 2.0), (int) Math.ceil(sorted.size() / 2.0)));
});
}
public static ConflictResolution shortest() {
return ((values, context) -> values.isEmpty() ? values :
values.stream().collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList()))
.firstEntry()
.getValue());
}
public static ConflictResolution longest() {
return ((values, context) -> values.isEmpty() ? values :
values.stream().collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList()))
.lastEntry()
.getValue());
}
public static ConflictResolution mostFrequent() {
return ((values, context) -> values.isEmpty() ? values :
values.stream().collect(Collectors.groupingBy(AnnotatedValue::getValue))
.entrySet()
.stream()
.collect(Collectors.groupingBy(entry -> entry.getValue().size(), TreeMap::new, Collectors.toList()))
.lastEntry()
.getValue()
.stream()
.flatMap(entry -> entry.getValue().stream())
.collect(Collectors.toList()));
}
public static ConflictResolution earliest() {
return ((values, context) -> values.isEmpty() ? values :
values.stream().collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList()))
.firstEntry()
.getValue());
}
public static ConflictResolution latest() {
return ((values, context) -> values.isEmpty() ? values :
values.stream().collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList()))
.lastEntry()
.getValue());
}
public static ConflictResolution vote() {
return ((values, context) -> values.isEmpty() ? values :
values.stream().collect(Collectors.groupingBy(AnnotatedValue::getValue))
.entrySet()
.stream()
.collect(Collectors.groupingBy(
entry -> entry.getValue().stream().mapToDouble(v -> v.getSource().getWeight()).sum(),
TreeMap::new,
Collectors.toList()))
.lastEntry()
.getValue()
.stream()
.flatMap(entry -> entry.getValue().stream())
.collect(Collectors.toList()));
}
public static ConflictResolution preferSource(final Source... sourcePriority) {
return preferSource(List.of(sourcePriority));
}
public static ConflictResolution preferSource(final List sourcePriority) {
return ((values, context) -> values.stream()
.map(AnnotatedValue::getSource)
.min(Comparator.comparingInt(sourcePriority::indexOf))
.map(source -> values.stream().filter(v -> v.getSource().equals(source)).collect(Collectors.toList()))
.orElse(List.of()));
}
public static > TerminalConflictResolution> union() {
return unionAll(HashSet::new);
}
public static > TerminalConflictResolution> unionAll() {
return unionAll(ArrayList::new);
}
public static , R extends Collection> TerminalConflictResolution unionAll(
final Supplier extends R> ctor) {
return (annotatedValues, context) -> {
final R collection = ctor.get();
for (final AnnotatedValue annotatedValue : annotatedValues) {
collection.addAll(annotatedValue.getValue());
}
return Optional.of(AnnotatedValue.calculated(collection));
};
}
public static ConflictResolution assumeEqualValue() {
return (annotatedValues, context) -> annotatedValues;
}
public static ConflictResolution transform(final Function super T, R> transform) {
return (annotatedValues, context) -> annotatedValues.stream()
.map(annotatedValue -> annotatedValue.withValue(transform.apply(annotatedValue.getValue())))
.collect(Collectors.toList());
}
public static > ConflictResolution min() {
return ((values, context) -> values.stream().min(comparator())
.map(min -> values.stream().filter(v -> v.getValue().equals(min.getValue())).collect(Collectors.toList()))
.orElse(List.of()));
}
@Value
static class TaggedResolution implements ConflictResolution {
private final ConflictResolution resolution;
private final ResolutionTag resolutionTag;
@Override
public List> resolvePartially(final List> values, final FusionContext context) {
final List> annotatedValues = this.resolution.resolvePartially(values, context);
context.storeValues(this.resolutionTag, annotatedValues);
return annotatedValues;
}
}
}
// (hosting-site page footer, not part of the source file) © 2015 - 2025 Weber Informatics LLC | Privacy Policy