All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.commons.text.similarity.IntersectionSimilarity Maven / Gradle / Ivy

There is a newer version: 1.12.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Function;

/**
 * Measures the intersection of two sets created from a pair of character sequences.
 *
 * 

It is assumed that the type {@code T} correctly conforms to the requirements for storage * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements * {@link Object#equals(Object)} and {@link Object#hashCode()}.

* * @param the type of the elements extracted from the character sequence * @since 1.7 * @see Set * @see HashMap */ public class IntersectionSimilarity implements SimilarityScore { /** * Mutable counter class for storing the count of elements. */ private static final class BagCount { /** Private, mutable but must be used as immutable. */ private static final BagCount ZERO = new BagCount(); /** The count. */ int count; private BagCount() { this.count = 0; } } // The following is adapted from commons-collections for a Bag. // A Bag is a collection that can store the count of the number // of copies of each element. /** * A minimal implementation of a Bag that can store elements and a count. * *

For the intended purpose the Bag does not have to be a {@link Collection}. It does not * even have to know its own size. */ private class TinyBag { /** The backing map. */ private final Map map; /** * Create a new tiny bag. * * @param initialCapacity the initial capacity */ TinyBag(final int initialCapacity) { map = new HashMap<>(initialCapacity); } /** * Adds a new element to the bag, incrementing its count in the underlying map. * * @param object the object to add */ void add(final T object) { map.computeIfAbsent(object, k -> new BagCount()).count++; } /** * Returns a Set view of the mappings contained in this bag. * * @return The Set view */ Set> entrySet() { return map.entrySet(); } /** * Returns the number of occurrence of the given element in this bag by * looking up its count in the underlying map. * * @param object the object to search for * @return The number of occurrences of the object, zero if not found */ int getCount(final Object object) { return map.getOrDefault(object, BagCount.ZERO).count; } /** * Get the number of unique elements in the bag. * * @return The unique element size */ int uniqueElementSize() { return map.size(); } } /** * Computes the intersection between two sets. This is the count of all the elements * that are within both sets. * * @param the type of the elements in the set * @param setA the set A * @param setB the set B * @return The intersection */ private static int getIntersection(final Set setA, final Set setB) { int intersection = 0; for (final T element : setA) { if (setB.contains(element)) { intersection++; } } return intersection; } /** The converter used to create the elements from the characters. */ private final Function> converter; /** * Create a new intersection similarity using the provided converter. * *

* If the converter returns a {@link Set} then the intersection result will * not include duplicates. Any other {@link Collection} is used to produce a result * that will include duplicates in the intersect and union. *

* * @param converter the converter used to create the elements from the characters * @throws IllegalArgumentException if the converter is null */ public IntersectionSimilarity(final Function> converter) { if (converter == null) { throw new IllegalArgumentException("Converter must not be null"); } this.converter = converter; } /** * Calculates the intersection of two character sequences passed as input. * * @param left first character sequence * @param right second character sequence * @return The intersection result * @throws IllegalArgumentException if either input sequence is {@code null} */ @Override public IntersectionResult apply(final CharSequence left, final CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Input cannot be null"); } // Create the elements from the sequences final Collection objectsA = converter.apply(left); final Collection objectsB = converter.apply(right); final int sizeA = objectsA.size(); final int sizeB = objectsB.size(); // Short-cut if either collection is empty if (Math.min(sizeA, sizeB) == 0) { // No intersection return new IntersectionResult(sizeA, sizeB, 0); } // Intersection = count the number of shared elements final int intersection; if (objectsA instanceof Set && objectsB instanceof Set) { // If a Set then the elements will only have a count of 1. // Iterate over the smaller set. intersection = sizeA < sizeB ? getIntersection((Set) objectsA, (Set) objectsB) : getIntersection((Set) objectsB, (Set) objectsA); } else { // Create a bag for each collection final TinyBag bagA = toBag(objectsA); final TinyBag bagB = toBag(objectsB); // Iterate over the smaller number of unique elements intersection = bagA.uniqueElementSize() < bagB.uniqueElementSize() ? getIntersection(bagA, bagB) : getIntersection(bagB, bagA); } return new IntersectionResult(sizeA, sizeB, intersection); } /** * Computes the intersection between two bags. 
This is the sum of the minimum * count of each element that is within both sets. * * @param bagA the bag A * @param bagB the bag B * @return The intersection */ private int getIntersection(final TinyBag bagA, final TinyBag bagB) { int intersection = 0; for (final Entry entry : bagA.entrySet()) { final T element = entry.getKey(); final int count = entry.getValue().count; // The intersection of this entry in both bags is the minimum count intersection += Math.min(count, bagB.getCount(element)); } return intersection; } /** * Converts the collection to a bag. The bag will contain the count of each element * in the collection. * * @param objects the objects * @return The bag */ private TinyBag toBag(final Collection objects) { final TinyBag bag = new TinyBag(objects.size()); objects.forEach(bag::add); return bag; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy