
cc.mallet.util.Strings Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.util;
import java.util.Arrays;
/**
 * Static utility methods for Strings: common-prefix computation, character
 * counting, and a normalized Levenshtein (edit) distance.
 */
final public class Strings {

    /**
     * Returns the length of the longest prefix shared by every string in the
     * array.
     *
     * @param strings the strings to compare; elements must not be null
     * @return the number of leading characters common to all elements, or 0
     *         if the array is empty
     */
    public static int commonPrefixIndex (String[] strings)
    {
        // An empty array has no strings and therefore no common prefix.
        if (strings.length == 0) {
            return 0;
        }
        String first = strings[0];
        int prefixLen = first.length();
        for (int i = 1; i < strings.length; i++) {
            String current = strings[i];
            // The common prefix can never be longer than the shortest string.
            prefixLen = Math.min (prefixLen, current.length());
            if (prefixLen == 0) {
                return 0;
            }
            // Every string seen so far agrees with the first one on
            // [0, prefixLen), so comparing against the first is sufficient.
            for (int j = 0; j < prefixLen; j++) {
                if (first.charAt (j) != current.charAt (j)) {
                    prefixLen = j;
                    break;
                }
            }
        }
        return prefixLen;
    }

    /**
     * Returns the longest prefix shared by every string in the array.
     *
     * @param strings the strings to compare; elements must not be null
     * @return the common prefix, or the empty string if the array is empty
     *         or the strings share no leading characters
     */
    public static String commonPrefix (String[] strings)
    {
        if (strings.length == 0) {
            return "";
        }
        return strings[0].substring (0, commonPrefixIndex (strings));
    }

    /**
     * Counts the occurrences of a character in a string.
     *
     * @param string the string to search
     * @param ch the character to look for
     * @return the number of positions in {@code string} holding {@code ch}
     */
    public static int count (String string, char ch)
    {
        int count = 0;
        for (int idx = string.indexOf (ch); idx >= 0; idx = string.indexOf (ch, idx + 1)) {
            count++;
        }
        return count;
    }

    /**
     * Computes the Levenshtein (edit) distance between two strings, divided
     * by the length of the longer string so that the result lies in [0, 1].
     * Characters are compared as individual {@code char} values.
     *
     * @param s the first string
     * @param t the second string
     * @return 0.0 when the strings are equal (including when both are empty),
     *         1.0 when exactly one of them is empty, and otherwise the edit
     *         distance divided by the longer length
     */
    public static double levenshteinDistance (String s, String t) {
        int n = s.length();
        int m = t.length();
        int longer = Math.max (n, m);

        // Two empty strings are identical; returning here also avoids the
        // 0/0 division in the normalization step.
        if (longer == 0) {
            return 0.0;
        }
        // Exactly one string is empty: every character of the other must be
        // inserted, so the normalized distance is the maximum.
        if (n == 0 || m == 0) {
            return 1.0;
        }

        // Only the previous row of the edit-distance table is ever read, so
        // two rolling rows replace the full (n+1) x (m+1) matrix.
        int[] previous = new int[m + 1];
        int[] current = new int[m + 1];
        for (int j = 0; j <= m; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= n; i++) {
            char s_i = s.charAt (i - 1);
            current[0] = i;
            for (int j = 1; j <= m; j++) {
                int cost = (s_i == t.charAt (j - 1)) ? 0 : 1;
                current[j] = minimum (previous[j] + 1,          // deletion
                                      current[j - 1] + 1,       // insertion
                                      previous[j - 1] + cost);  // substitution or match
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }

        // After the final swap, "previous" holds the row for i == n.
        return (double) previous[m] / longer; // Normalize to 0-1.
    }

    /**
     * Returns the smallest of three ints.
     */
    private static int minimum (int a, int b, int c) {
        return Math.min (a, Math.min (b, c));
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy