com.datastax.dse.graph.internal.utils.StringUtils Maven / Gradle / Ivy
Show all versions of driver-cql-shaded Show documentation
/*
* Copyright DataStax, Inc.
*
* This software can be used solely with DataStax Enterprise. Please consult the license at
* http://www.datastax.com/terms/datastax-dse-driver-license-terms
*/
package com.datastax.dse.graph.internal.utils;
/** Utilities for string manipulation. */
public class StringUtils {
/**
* Finds the Optimal
* string alignment distance – also referred to as the Damerau-Levenshtein distance – between
* two strings.
*
* This is the number of changes needed to change one string into another (insertions,
* deletions or substitutions of a single character, or transpositions of two adjacent
* characters).
*
*
This implementation is based on the Apache Commons Lang implementation of the Levenshtein
* distance, only adding support for transpositions.
*
*
Note that this is the distance used in Lucene for {@code FuzzyTermsEnum}. Lucene itself has
* an implementation of this algorithm, but it is much less efficient in terms of space (also note
* that Lucene's implementation does not return the distance, but a similarity score based on it).
*
* @param s the first string, must not be {@code null}.
* @param t the second string, must not be {@code null}.
* @return The Optimal string alignment distance between the two strings.
* @throws IllegalArgumentException if either String input is {@code null}.
* @see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
* @see
* LuceneLevenshteinDistance
*/
public static int getOptimalStringAlignmentDistance(String s, String t) {
/*
* Code adapted from https://github.com/apache/commons-lang/blob/LANG_2_6/src/main/java/org/apache/commons/lang/StringUtils.java
* which was originally released under the Apache 2.0 license with the following copyright:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
if (s == null || t == null) {
throw new IllegalArgumentException("Strings must not be null");
}
int n = s.length(); // length of s
int m = t.length(); // length of t
if (n == 0) {
return m;
} else if (m == 0) {
return n;
}
if (n > m) {
// swap the input strings to consume less memory
String tmp = s;
s = t;
t = tmp;
n = m;
m = t.length();
}
// instead of maintaining the full matrix in memory,
// we use a sliding window containing 3 lines:
// the current line being written to, and
// the two previous ones.
int d[] = new int[n + 1]; // current line in the cost matrix
int p1[] = new int[n + 1]; // first line above the current one in the cost matrix
int p2[] = new int[n + 1]; // second line above the current one in the cost matrix
int _d[]; // placeholder to assist in swapping p1, p2 and d
// indexes into strings s and t
int i; // iterates through s
int j; // iterates through t
for (i = 0; i <= n; i++) {
p1[i] = i;
}
for (j = 1; j <= m; j++) {
// jth character of t
char t_j = t.charAt(j - 1);
d[0] = j;
for (i = 1; i <= n; i++) {
char s_i = s.charAt(i - 1);
int cost = s_i == t_j ? 0 : 1;
int deletion = d[i - 1] + 1; // cell to the left + 1
int insertion = p1[i] + 1; // cell to the top + 1
int substitution = p1[i - 1] + cost; // cell diagonally left and up + cost
d[i] = Math.min(Math.min(deletion, insertion), substitution);
// transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i] = Math.min(d[i], p2[i - 2] + cost);
}
}
// swap arrays
_d = p2;
p2 = p1;
p1 = d;
d = _d;
}
// our last action in the above loop was to switch d and p1, so p1 now
// actually has the most recent cost counts
return p1[n];
}
}