All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cc.mallet.util.Strings Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12
Show newest version
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.util;

import java.util.Arrays;

/**
 *  Static utility methods for Strings
 */
final public class Strings {

	public static int commonPrefixIndex (String[] strings)
	{
		int prefixLen = strings[0].length();
		for (int i = 1; i < strings.length; i++) {
			if (strings[i].length() < prefixLen)
				prefixLen = strings[i].length();
			int j = 0;
			if (prefixLen == 0)
				return 0;
			while (j < prefixLen) {
				if (strings[i-1].charAt(j) != strings[i].charAt(j)) {
					prefixLen = j;
					break;
				}
				j++;
			}
		}
		return prefixLen;
	}

	public static String commonPrefix (String[] strings)
	{
		return strings[0].substring (0, commonPrefixIndex(strings));
	}

  public static int count (String string, char ch)
  {
    int idx = -1;
    int count = 0;
    while ((idx = string.indexOf (ch, idx+1)) >= 0) { count++; };
    return count;
  }

	public static double levenshteinDistance (String s, String t) {
		int n = s.length();
		int m = t.length();
		int d[][]; // matrix
		int i; // iterates through s
		int j; // iterates through t
		char s_i; // ith character of s
		char t_j; // jth character of t
		int cost; // cost
		
    if (n == 0) 
      return 1.0;
    if (m == 0) 
      return 1.0;

    d = new int[n+1][m+1];

    for (i = 0; i <= n; i++)
      d[i][0] = i;

    for (j = 0; j <= m; j++) 
      d[0][j] = j;

    for (i = 1; i <= n; i++) {
      s_i = s.charAt (i - 1);

      for (j = 1; j <= m; j++) {
        t_j = t.charAt (j - 1);

				cost = (s_i == t_j) ? 0 : 1;

        d[i][j] = minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost);
      }
    }

		int longer = (n > m) ? n : m;
    return (double)d[n][m] / longer; // Normalize to 0-1.
	}

	private static int minimum (int a, int b, int c) {
		int mi = a;		
    if (b < mi) {
      mi = b;
    }
    if (c < mi) {
      mi = c;
    }
    return mi;		
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy