All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bakdata.deduplication.fusion.CommonConflictResolutions Maven / Gradle / Ivy

/*
 * The MIT License
 *
 * Copyright (c) 2018 bakdata GmbH
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
package com.bakdata.deduplication.fusion;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import lombok.Value;
import lombok.experimental.UtilityClass;

@UtilityClass
public class CommonConflictResolutions {
    private static final ThreadLocalRandom random = ThreadLocalRandom.current();

    public static  ConflictResolution corresponding(final ResolutionTag resolutionTag) {
        return ((values, context) -> {
            if (values.isEmpty()) {
                return values;
            }
            final Set sources = context.retrieveValues(resolutionTag).stream()
                    .map(AnnotatedValue::getSource).collect(Collectors.toSet());
            return values.stream().filter(v -> sources.contains(v.getSource())).collect(Collectors.toList());
        });
    }

    public static  ConflictResolution saveAs(final ConflictResolution resolution, final ResolutionTag resolutionTag) {
        return new TaggedResolution<>(resolution, resolutionTag);
    }

    private static > Comparator> comparator() {
        return Comparator.comparing(AnnotatedValue::getValue);
    }

    public static  Comparator> comparator(final Comparator comparator) {
        return Comparator.comparing(AnnotatedValue::getValue, comparator);
    }

    public static > ConflictResolution max() {
        return ((values, context) -> values.stream().max(comparator())
                .map(max -> values.stream().filter(v -> v.getValue().equals(max.getValue())).collect(Collectors.toList()))
                .orElse(List.of()));
    }

    public static  TerminalConflictResolution mean() {
        return ((values, context) -> values.stream()
                .mapToDouble(v -> v.getValue().doubleValue()).average()
                // workaround for OptionalDouble not having #map
                .stream().boxed().findFirst()
                .map(AnnotatedValue::calculated));
    }

    public static  TerminalConflictResolution sum() {
        return ((values, context) -> values.stream()
                .mapToDouble(v -> v.getValue().doubleValue())
                .reduce((agg, v) -> agg + v)
                // workaround for OptionalDouble not having #map
                .stream().boxed().findFirst()
                .map(AnnotatedValue::calculated));
    }

    public static  TerminalConflictResolution random() {
        return ((values, context) -> values.isEmpty() ? Optional.empty() : Optional.of(values.get(random.nextInt(values.size()))));
    }

    public static  TerminalConflictResolution first() {
        return ((values, context) -> values.stream().findFirst());
    }

    public static  TerminalConflictResolution last() {
        return ((values, context) -> values.stream().skip(Math.max(0, values.size() - 1)).findAny());
    }

    public static  ConflictResolution distinct() {
        return ((values, context) -> List.copyOf(values.stream()
                .collect(Collectors.toMap(AnnotatedValue::getValue,
                        v -> v,
                        (v1, v2) -> AnnotatedValue.calculated(v1.getValue()),
                        LinkedHashMap::new))
                .values()));
    }

    public static > ConflictResolution median() {
        return ((values, context) -> {
            if (values.isEmpty()) {
                return values;
            }
            final List> sorted = new ArrayList<>(values);
            sorted.sort(comparator());
            // create copy of list of median value(s), such that original list is not referenced anymore
            return List.copyOf(sorted.subList((int) Math.floor(sorted.size() / 2.0), (int) Math.ceil(sorted.size() / 2.0)));
        });
    }

    public static  ConflictResolution shortest() {
        return ((values, context) -> values.isEmpty() ? values :
                values.stream().collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList()))
                        .firstEntry()
                        .getValue());
    }

    public static  ConflictResolution longest() {
        return ((values, context) -> values.isEmpty() ? values :
                values.stream().collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList()))
                        .lastEntry()
                        .getValue());
    }

    public static  ConflictResolution mostFrequent() {
        return ((values, context) -> values.isEmpty() ? values :
                values.stream().collect(Collectors.groupingBy(AnnotatedValue::getValue))
                        .entrySet()
                        .stream()
                        .collect(Collectors.groupingBy(entry -> entry.getValue().size(), TreeMap::new, Collectors.toList()))
                        .lastEntry()
                        .getValue()
                        .stream()
                        .flatMap(entry -> entry.getValue().stream())
                        .collect(Collectors.toList()));
    }

    public static  ConflictResolution earliest() {
        return ((values, context) -> values.isEmpty() ? values :
                values.stream().collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList()))
                        .firstEntry()
                        .getValue());
    }

    public static  ConflictResolution latest() {
        return ((values, context) -> values.isEmpty() ? values :
                values.stream().collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList()))
                        .lastEntry()
                        .getValue());
    }

    public static  ConflictResolution vote() {
        return ((values, context) -> values.isEmpty() ? values :
                values.stream().collect(Collectors.groupingBy(AnnotatedValue::getValue))
                        .entrySet()
                        .stream()
                        .collect(Collectors.groupingBy(
                                entry -> entry.getValue().stream().mapToDouble(v -> v.getSource().getWeight()).sum(),
                                TreeMap::new,
                                Collectors.toList()))
                        .lastEntry()
                        .getValue()
                        .stream()
                        .flatMap(entry -> entry.getValue().stream())
                        .collect(Collectors.toList()));
    }

    public static  ConflictResolution preferSource(final Source... sourcePriority) {
        return preferSource(List.of(sourcePriority));
    }

    public static  ConflictResolution preferSource(final List sourcePriority) {
        return ((values, context) -> values.stream()
                .map(AnnotatedValue::getSource)
                .min(Comparator.comparingInt(sourcePriority::indexOf))
                .map(source -> values.stream().filter(v -> v.getSource().equals(source)).collect(Collectors.toList()))
                .orElse(List.of()));
    }

    public static > TerminalConflictResolution> union() {
        return unionAll(HashSet::new);
    }

    public static > TerminalConflictResolution> unionAll() {
        return unionAll(ArrayList::new);
    }

    public static , R extends Collection> TerminalConflictResolution unionAll(
        final Supplier ctor) {
        return (annotatedValues, context) -> {
            final R collection = ctor.get();
            for (final AnnotatedValue annotatedValue : annotatedValues) {
                collection.addAll(annotatedValue.getValue());
            }
            return Optional.of(AnnotatedValue.calculated(collection));
        };
    }

    public static  ConflictResolution assumeEqualValue() {
        return (annotatedValues, context) -> annotatedValues;
    }

    public static  ConflictResolution transform(final Function transform) {
        return (annotatedValues, context) -> annotatedValues.stream()
                .map(annotatedValue -> annotatedValue.withValue(transform.apply(annotatedValue.getValue())))
                .collect(Collectors.toList());
    }

    public static > ConflictResolution min() {
        return ((values, context) -> values.stream().min(comparator())
                .map(min -> values.stream().filter(v -> v.getValue().equals(min.getValue())).collect(Collectors.toList()))
                .orElse(List.of()));
    }

    @Value
    static class TaggedResolution implements ConflictResolution {
        private final ConflictResolution resolution;
        private final ResolutionTag resolutionTag;

        @Override
        public List> resolvePartially(final List> values, final FusionContext context) {
            final List> annotatedValues = this.resolution.resolvePartially(values, context);
            context.storeValues(this.resolutionTag, annotatedValues);
            return annotatedValues;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy