All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.datastax.dse.graph.internal.utils.StringUtils Maven / Gradle / Ivy

There is a newer version: 1.9.0
Show newest version
/*
 *      Copyright (C) 2012-2017 DataStax Inc.
 *
 *      This software can be used solely with DataStax Enterprise. Please consult the license at
 *      http://www.datastax.com/terms/datastax-dse-driver-license-terms
 */
package com.datastax.dse.graph.internal.utils;

/**
 * Utilities for string manipulation.
 */
public class StringUtils {

    /**
     * Finds the Optimal string alignment distance &ndash; also referred to as the
     * Damerau-Levenshtein distance &ndash; between two strings.
     * <p>
     * This is the number of changes needed to change one string into
     * another (insertions, deletions or substitutions of a single character,
     * or transpositions of two adjacent characters).
     * <p>
     * This implementation is based on the Apache Commons Lang implementation of
     * the Levenshtein distance, only adding support for transpositions.
     * <p>
     * Note that this is the distance used in Lucene for {@code FuzzyTermsEnum}.
     * Lucene itself has an implementation of this algorithm,
     * but it is much less efficient in terms of space (also note that Lucene's
     * implementation does not return the distance, but a similarity score based on it).
     * <p>
     * Only three rows of the cost matrix are kept at any time, each sized after the
     * shorter of the two inputs, so memory use is proportional to the shorter string
     * while the running time is proportional to the product of both lengths.
     *
     * @param s the first string, must not be {@code null}.
     * @param t the second string, must not be {@code null}.
     * @return The Optimal string alignment distance between the two strings.
     * @throws IllegalArgumentException if either String input is {@code null}.
     * @see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
     * @see LuceneLevenshteinDistance
     */
    public static int getOptimalStringAlignmentDistance(String s, String t) {

        /*
         * Code adapted from https://github.com/apache/commons-lang/blob/LANG_2_6/src/main/java/org/apache/commons/lang/StringUtils.java
         * which was originally released under the Apache 2.0 license with the following copyright:
         *
         * Licensed to the Apache Software Foundation (ASF) under one or more
         * contributor license agreements.  See the NOTICE file distributed with
         * this work for additional information regarding copyright ownership.
         * The ASF licenses this file to You under the Apache License, Version 2.0
         * (the "License"); you may not use this file except in compliance with
         * the License.  You may obtain a copy of the License at
         *
         *      http://www.apache.org/licenses/LICENSE-2.0
         *
         * Unless required by applicable law or agreed to in writing, software
         * distributed under the License is distributed on an "AS IS" BASIS,
         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         * See the License for the specific language governing permissions and
         * limitations under the License.
         */

        if (s == null || t == null) {
            throw new IllegalArgumentException("Strings must not be null");
        }

        int n = s.length(); // length of s
        int m = t.length(); // length of t

        // the distance to or from an empty string is the length of the other string
        if (n == 0) {
            return m;
        } else if (m == 0) {
            return n;
        }

        if (n > m) {
            // swap the input strings to consume less memory:
            // the rows below are sized after s, so s must be the shorter one
            String tmp = s;
            s = t;
            t = tmp;
            n = m;
            m = t.length();
        }

        // instead of maintaining the full matrix in memory,
        // we use a sliding window containing 3 lines:
        // the current line being written to, and
        // the two previous ones.

        int[] d = new int[n + 1];  // current line in the cost matrix
        int[] p1 = new int[n + 1]; // first line above the current one in the cost matrix
        int[] p2 = new int[n + 1]; // second line above the current one in the cost matrix
        int[] swap;                // placeholder to assist in swapping p1, p2 and d

        // indexes into strings s and t
        int i; // iterates through s
        int j; // iterates through t

        // first line of the matrix: turning a prefix of s into the empty string
        // takes as many deletions as the prefix has characters
        for (i = 0; i <= n; i++) {
            p1[i] = i;
        }

        for (j = 1; j <= m; j++) {

            // jth character of t
            char t_j = t.charAt(j - 1);
            d[0] = j;

            for (i = 1; i <= n; i++) {

                char s_i = s.charAt(i - 1);
                int cost = s_i == t_j ? 0 : 1;

                int deletion = d[i - 1] + 1;          // cell to the left + 1
                int insertion = p1[i] + 1;            // cell to the top + 1
                int substitution = p1[i - 1] + cost;  // cell diagonally left and up + cost

                d[i] = Math.min(Math.min(deletion, insertion), substitution);

                // transposition: the last two characters of both prefixes are
                // each other's mirror image, so look two lines up and two cells left
                if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
                    d[i] = Math.min(d[i], p2[i - 2] + cost);
                }
            }

            // rotate the window: the oldest line (p2) is recycled as the next current line
            swap = p2;
            p2 = p1;
            p1 = d;
            d = swap;
        }

        // our last action in the above loop was to switch d and p1, so p1 now
        // actually has the most recent cost counts
        return p1[n];
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy