All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.text.similarity.LongestCommonSubsequence Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

/**
 * A similarity algorithm indicating the length of the longest common subsequence between two strings.
 *
 * 

* The Longest common subsequence algorithm returns the length of the longest subsequence that two strings have in * common. Two strings that are entirely different, return a value of 0, and two strings that return a value * of the commonly shared length implies that the strings are completely the same in value and position. * Note. Generally this algorithm is fairly inefficient, as for length m, n of the input * {@code CharSequence}'s {@code left} and {@code right} respectively, the runtime of the * algorithm is O(m*n). *

* *

* As of version 1.10, a more space-efficient of the algorithm is implemented. The new algorithm has linear space * complexity instead of quadratic. However, time complexity is still quadratic in the size of input strings. *

* *

* The implementation is based on Hirschberg's Longest Commons Substring algorithm (cited below). *

* *

For further reading see:

*
    *
  • * Lothaire, M. Applied combinatorics on words. New York: Cambridge U Press, 2005. 12-13 *
  • *
  • * D. S. Hirschberg, "A linear space algorithm for computing maximal common subsequences," CACM, 1975, pp. 341--343. *
  • *
* * @since 1.0 */ public class LongestCommonSubsequence implements SimilarityScore { /** * Singleton instance. */ static final LongestCommonSubsequence INSTANCE = new LongestCommonSubsequence(); /** * An implementation of "ALG B" from Hirschberg's CACM '71 paper. * Assuming the first input sequence is of size {@code m} and the second input sequence is of size * {@code n}, this method returns the last row of the dynamic programming (DP) table when calculating * the LCS of the two sequences in O(m*n) time and O(n) space. * The last element of the returned array, is the size of the LCS of the two input sequences. * * @param left first input sequence. * @param right second input sequence. * @return last row of the dynamic-programming (DP) table for calculating the LCS of {@code left} and {@code right} * @since 1.10.0 */ private static int[] algorithmB(final CharSequence left, final CharSequence right) { final int m = left.length(); final int n = right.length(); // Creating an array for storing two rows of DP table final int[][] dpRows = new int[2][1 + n]; for (int i = 1; i <= m; i++) { // K(0, j) <- K(1, j) [j = 0...n], as per the paper: // Since we have references in Java, we can swap references instead of literal copying. // We could also use a "binary index" using modulus operator, but directly swapping the // two rows helps readability and keeps the code consistent with the algorithm description // in the paper. final int[] temp = dpRows[0]; dpRows[0] = dpRows[1]; dpRows[1] = temp; for (int j = 1; j <= n; j++) { if (left.charAt(i - 1) == right.charAt(j - 1)) { dpRows[1][j] = dpRows[0][j - 1] + 1; } else { dpRows[1][j] = Math.max(dpRows[1][j - 1], dpRows[0][j]); } } } // LL(j) <- K(1, j) [j=0...n], as per the paper: // We don't need literal copying of the array, we can just return the reference return dpRows[1]; } /** * An implementation of "ALG C" from Hirschberg's CACM '71 paper. * Assuming the first input sequence is of size {@code m} and the second input sequence is of size * {@code n}, this method returns the Longest Common Subsequence (LCS) of the two sequences in * O(m*n) time and O(m+n) space. * * @param left first input sequence. * @param right second input sequence. * @return the LCS of {@code left} and {@code right} * @since 1.10.0 */ private static String algorithmC(final CharSequence left, final CharSequence right) { final int m = left.length(); final int n = right.length(); final StringBuilder out = new StringBuilder(); if (m == 1) { // Handle trivial cases, as per the paper final char leftCh = left.charAt(0); for (int j = 0; j < n; j++) { if (leftCh == right.charAt(j)) { out.append(leftCh); break; } } } else if (n > 0 && m > 1) { final int mid = m / 2; // Find the middle point final CharSequence leftFirstPart = left.subSequence(0, mid); final CharSequence leftSecondPart = left.subSequence(mid, m); // Step 3 of the algorithm: two calls to Algorithm B final int[] l1 = algorithmB(leftFirstPart, right); final int[] l2 = algorithmB(reverse(leftSecondPart), reverse(right)); // Find k, as per the Step 4 of the algorithm int k = 0; int t = 0; for (int j = 0; j <= n; j++) { final int s = l1[j] + l2[n - j]; if (t < s) { t = s; k = j; } } // Step 5: solve simpler problems, recursively out.append(algorithmC(leftFirstPart, right.subSequence(0, k))); out.append(algorithmC(leftSecondPart, right.subSequence(k, n))); } return out.toString(); } // An auxiliary method for CharSequence reversal private static String reverse(final CharSequence s) { return new StringBuilder(s).reverse().toString(); } /** * Computes the longest common subsequence similarity score of two {@code CharSequence}'s passed as * input. * *

* This method implements a more efficient version of LCS algorithm which has quadratic time and * linear space complexity. *

* *

* This method is based on newly implemented {@link #algorithmB(CharSequence, CharSequence)}. * An evaluation using JMH revealed that this method is almost two times faster than its previous version. *

* * @param left first character sequence * @param right second character sequence * @return length of the longest common subsequence of {@code left} and {@code right} * @throws IllegalArgumentException if either String input {@code null} */ @Override public Integer apply(final CharSequence left, final CharSequence right) { // Quick return for invalid inputs if (left == null || right == null) { throw new IllegalArgumentException("Inputs must not be null"); } // Find lengths of two strings final int leftSz = left.length(); final int rightSz = right.length(); // Check if we can avoid calling algorithmB which involves heap space allocation if (leftSz == 0 || rightSz == 0) { return 0; } // Check if we can save even more space if (leftSz < rightSz) { return algorithmB(right, left)[leftSz]; } return algorithmB(left, right)[rightSz]; } /** * Computes the longest common subsequence between the two {@code CharSequence}'s passed as input. * *

* Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a * subsequence need not have adjacent characters. *

* *

* For reference, we give the definition of a subsequence for the reader: a subsequence is a sequence that * can be derived from another sequence by deleting some elements without changing the order of the remaining * elements. *

* * @param left first character sequence * @param right second character sequence * @return the longest common subsequence found * @throws IllegalArgumentException if either String input {@code null} * @deprecated Deprecated as of 1.2 due to a typo in the method name. * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead. * This method will be removed in 2.0. */ @Deprecated public CharSequence logestCommonSubsequence(final CharSequence left, final CharSequence right) { return longestCommonSubsequence(left, right); } /** * Computes the longest common subsequence between the two {@code CharSequence}'s passed as * input. * *

* This method implements a more efficient version of LCS algorithm which although has quadratic time, it * has linear space complexity. *

* * *

* Note, a substring and subsequence are not necessarily the same thing. Indeed, {@code abcxyzqrs} and * {@code xyzghfm} have both the same common substring and subsequence, namely {@code xyz}. However, * {@code axbyczqrs} and {@code abcxyzqtv} have the longest common subsequence {@code xyzq} because a * subsequence need not have adjacent characters. *

* *

* For reference, we give the definition of a subsequence for the reader: a subsequence is a sequence that * can be derived from another sequence by deleting some elements without changing the order of the remaining * elements. *

* * @param left first character sequence * @param right second character sequence * @return the longest common subsequence found * @throws IllegalArgumentException if either String input {@code null} * @since 1.2 */ public CharSequence longestCommonSubsequence(final CharSequence left, final CharSequence right) { // Quick return if (left == null || right == null) { throw new IllegalArgumentException("Inputs must not be null"); } // Find lengths of two strings final int leftSz = left.length(); final int rightSz = right.length(); // Check if we can avoid calling algorithmC which involves heap space allocation if (leftSz == 0 || rightSz == 0) { return ""; } // Check if we can save even more space if (leftSz < rightSz) { return algorithmC(right, left); } return algorithmC(left, right); } /** * Computes the lcsLengthArray for the sake of doing the actual lcs calculation. This is the * dynamic programming portion of the algorithm, and is the reason for the runtime complexity being * O(m*n), where m=left.length() and n=right.length(). * * @param left first character sequence * @param right second character sequence * @return lcsLengthArray * @deprecated Deprecated as of 1.10. A more efficient implementation for calculating LCS is now available. * Use {@link #longestCommonSubsequence(CharSequence, CharSequence)} instead to directly calculate the LCS. * This method will be removed in 2.0. */ @Deprecated public int[][] longestCommonSubstringLengthArray(final CharSequence left, final CharSequence right) { final int[][] lcsLengthArray = new int[left.length() + 1][right.length() + 1]; for (int i = 0; i < left.length(); i++) { for (int j = 0; j < right.length(); j++) { if (i == 0) { lcsLengthArray[i][j] = 0; } if (j == 0) { lcsLengthArray[i][j] = 0; } if (left.charAt(i) == right.charAt(j)) { lcsLengthArray[i + 1][j + 1] = lcsLengthArray[i][j] + 1; } else { lcsLengthArray[i + 1][j + 1] = Math.max(lcsLengthArray[i + 1][j], lcsLengthArray[i][j + 1]); } } } return lcsLengthArray; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy