com.bakdata.dedupe.clustering.Cluster Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of core Show documentation
Base interfaces and data structures for defining a deduplication workflow.
The newest version!
/*
 * MIT License
 *
 * Copyright (c) 2019 bakdata GmbH
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package com.bakdata.dedupe.clustering;

import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NonNull;


/**
 * A cluster is a coherent collection of duplicate records.
 * All records inside the cluster are deemed to be pair-wise
 * duplicates. No record outside the cluster is a duplicate with any record inside the cluster.
 * Any dataset can be divided into an exhaustive set of non-overlapping clusters with a {@link
 * com.bakdata.dedupe.classifier.Classifier} and {@link Clustering}. In that sense, the duplicate relation becomes a
 * mathematical partitioning of the dataset.
 * In general, a cluster returned by the {@link Clustering} should always contain at least one element. Temporarily,
 * a cluster may be constructed without elements.
 *
 * @param  the type of the cluster id.
 * @param  the type of the records.
 */
@Data
@AllArgsConstructor
@Builder
public class Cluster, T> {
    /**
     * The identifier of the cluster. Typically, it is a short string or an integral type.
     */
    @NonNull
    C id;
    /**
     * The list of elements. While in general there is no order constraints of the elements, a {@link Clustering}
     * implementation may order the elements for faster access.
     */
    @NonNull
    List elements;

    public Cluster(final @NonNull C id) {
        this(id, new ArrayList<>());
    }

    public void add(final @NonNull T record) {
        this.elements.add(record);
    }

    public int size() {
        return this.elements.size();
    }

    public @NonNull T get(final int index) {
        return this.elements.get(index);
    }

    public boolean contains(final @NonNull T record) {
        return this.elements.contains(record);
    }

    /**
     * Merges this cluster with another cluster into one new cluster.
     *
     * @param idGenerator a generator to create the new id.
     * @param idExtractor A function to extract the id of a record.
     * @param other the other cluster.
     * @return the newly created merged cluster or this iff {@code other == this}.
     */
    public @NonNull  Cluster merge(
            final @NonNull Function, ? extends C> idGenerator,
            final @NonNull Function idExtractor,
            final @NonNull Cluster other) {
        if (other == this) {
            return this;
        }
        final List concatElements = new ArrayList<>(this.elements);
        concatElements.addAll(other.getElements());
        final List ids = concatElements.stream()
                .map(idExtractor)
                .collect(Collectors.toList());
        return new Cluster<>(idGenerator.apply(ids), concatElements);
    }
}