/*
 * Source listing of edu.berkeley.nlp.util.StringUtils from the berkeleyparser
 * artifact, as shown by a Maven / Gradle / Ivy repository browser (newest version
 * at the time of retrieval).
 *
 * The Berkeley parser analyzes the grammatical structure of natural language
 * using probabilistic context-free grammars (PCFGs).
 */
package edu.berkeley.nlp.util;
import edu.berkeley.nlp.math.SloppyMath;
import java.io.*;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * StringUtils is a collection of static helpers for miscellaneous String
 * tasks: regular-expression convenience calls, reading text from files,
 * Readers and URLs, joining and splitting, padding and trimming, command-line
 * argument parsing, and simple string-similarity measures.
 *
 * @author Dan Klein
 * @author Christopher Manning
 * @author Tim Grow
 * @author Chris Cox
 * @version 2003/02/03
 */
public class StringUtils {

  /** Size, in chars, of the buffer used when reading text in bulk. */
  private static final int SLURPBUFFSIZE = 16000;

  /** Read timeout, in milliseconds, used by {@link #slurpURL(URL, String)}. */
  private static final int URL_READ_TIMEOUT_MILLIS = 30000;

  /** Name (without hyphen) of the flag that makes argsToProperties load a file. */
  private static final String PROP = "prop";

  // Penn Treebank tag groups, compiled once instead of on every call.
  private static final Pattern NOUN_TAGS = Pattern.compile("NN|NNP|NNS|NNPS");
  private static final Pattern VERB_TAGS = Pattern.compile("VB|VBD|VBG|VBN|VBZ|VBP|MD");
  private static final Pattern ADJECTIVE_TAGS = Pattern.compile("JJ|JJR|JJS|CD");
  private static final Pattern ADVERB_TAGS = Pattern.compile("RB|RBR|RBS|RP|WRB");

  /**
   * Don't let anyone instantiate this class.
   */
  private StringUtils() {
  }

  /**
   * Says whether the regular expression can be found anywhere inside the
   * String (the Perl-style "contains a match" test).
   *
   * @param str String to search for a match in
   * @param regex String to compile as the regular expression
   * @return Whether the regex can be found in str
   */
  public static boolean find(String str, String regex) {
    return Pattern.compile(regex).matcher(str).find();
  }

  /**
   * Says whether the regular expression can be found at the beginning of
   * the String.
   *
   * @param str String to search for a match at the start of
   * @param regex String to compile as the regular expression
   * @return Whether the regex can be found at the start of str
   */
  public static boolean lookingAt(String str, String regex) {
    return Pattern.compile(regex).matcher(str).lookingAt();
  }

  /**
   * Says whether the regular expression matches the whole String. This is
   * the same as {@code String.matches()} and exists only to parallel the
   * other static regex methods in this class.
   *
   * @param str String to match
   * @param regex String to compile as the regular expression
   * @return Whether the regex matches the whole of str
   */
  public static boolean matches(String str, String regex) {
    return Pattern.compile(regex).matcher(str).matches();
  }

  /**
   * Returns all the text in the given File, decoded with the platform
   * default charset.
   *
   * @throws IOException if the file cannot be opened
   */
  public static String slurpFile(File file) throws IOException {
    return slurpReader(new FileReader(file));
  }

  /**
   * Returns all the text in the named file, decoded as GB18030.
   * Failures are reported as an unchecked {@link RuntimeException}.
   */
  public static String slurpGBFileNoExceptions(String filename) {
    return slurpFileNoExceptions(filename, "GB18030");
  }

  /**
   * Returns all the text in the given file with the given encoding.
   *
   * @throws IOException if the file cannot be opened or the encoding is
   *         not supported
   */
  public static String slurpFile(String filename, String encoding) throws IOException {
    InputStream in = new FileInputStream(filename);
    Reader r = null;
    try {
      r = new InputStreamReader(in, encoding);
    } finally {
      // If the reader could not be created nobody else will close the stream.
      if (r == null) {
        closeQuietly(in);
      }
    }
    return slurpReader(r);
  }

  /**
   * Returns all the text in the given file with the given encoding without
   * declaring a checked exception.
   *
   * @throws RuntimeException wrapping the underlying failure if the file
   *         cannot be read (non-existent, unsupported encoding, etc.)
   */
  public static String slurpFileNoExceptions(String filename, String encoding) {
    try {
      return slurpFile(filename, encoding);
    } catch (Exception e) {
      throw new RuntimeException("Could not read file " + filename + " with encoding "
          + encoding, e);
    }
  }

  /**
   * Returns all the text in the named file, decoded as GB18030.
   */
  public static String slurpGBFile(String filename) throws IOException {
    return slurpFile(filename, "GB18030");
  }

  /**
   * Returns all the text from the given Reader. The Reader is always closed,
   * whether or not reading succeeds.
   *
   * @return The text read from the Reader.
   * @throws RuntimeException wrapping the IOException if reading fails
   */
  public static String slurpReader(Reader reader) {
    BufferedReader r = new BufferedReader(reader);
    StringBuilder buff = new StringBuilder();
    try {
      char[] chars = new char[SLURPBUFFSIZE];
      int amountRead;
      while ((amountRead = r.read(chars, 0, SLURPBUFFSIZE)) >= 0) {
        buff.append(chars, 0, amountRead);
      }
    } catch (IOException e) {
      throw new RuntimeException("Error while reading text", e);
    } finally {
      closeQuietly(r);
    }
    return buff.toString();
  }

  /**
   * Returns all the text in the given file, decoded with the platform
   * default charset.
   *
   * @return The text in the file.
   */
  public static String slurpFile(String filename) throws IOException {
    return slurpReader(new FileReader(filename));
  }

  /**
   * Returns all the text in the given File.
   *
   * @return The text in the file. May be an empty string if the file
   *         is empty. If the file cannot be read (non-existent, etc.),
   *         then and only then the method returns {@code null}.
   */
  public static String slurpFileNoExceptions(File file) {
    try {
      return slurpReader(new FileReader(file));
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Returns all the text in the given file.
   *
   * @return The text in the file. May be an empty string if the file
   *         is empty. If the file cannot be read (non-existent, etc.),
   *         then and only then the method returns {@code null}.
   */
  public static String slurpFileNoExceptions(String filename) {
    try {
      return slurpFile(filename);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Returns all the text at the given URL, decoded as GB18030.
   */
  public static String slurpGBURL(URL u) throws IOException {
    return slurpURL(u, "GB18030");
  }

  /**
   * Returns all the text at the given URL, decoded as GB18030, or
   * {@code null} (after printing the stack trace) if it cannot be read.
   */
  public static String slurpGBURLNoExceptions(URL u) {
    try {
      return slurpGBURL(u);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Returns all the text at the given URL in the given encoding, or
   * {@code null} (after printing the stack trace) if it cannot be read.
   */
  public static String slurpURLNoExceptions(URL u, String encoding) {
    try {
      return slurpURL(u, encoding);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Returns all the text at the given URL, decoded with the given encoding.
   * Line terminators are normalized to the platform line separator and one
   * is appended after the last line. A read timeout of 30 seconds is set on
   * the connection; if opening the stream times out, a message is printed to
   * standard error and the empty string is returned.
   *
   * @param u the URL to read
   * @param encoding charset name; {@code null} selects the platform default
   */
  public static String slurpURL(URL u, String encoding) throws IOException {
    URLConnection uc = u.openConnection();
    uc.setReadTimeout(URL_READ_TIMEOUT_MILLIS);
    InputStream is;
    try {
      is = uc.getInputStream();
    } catch (SocketTimeoutException e) {
      System.err.println("Time out. Return empty string");
      return "";
    }
    return slurpLines(is, encoding);
  }

  /**
   * Returns all the text at the given URL, decoded with the platform default
   * charset. Line terminators are normalized to the platform line separator
   * and one is appended after the last line.
   */
  public static String slurpURL(URL u) throws IOException {
    URLConnection uc = u.openConnection();
    return slurpLines(uc.getInputStream(), null);
  }

  /**
   * Returns all the text at the given URL, or {@code null} (after printing
   * the stack trace) if it cannot be read.
   */
  public static String slurpURLNoExceptions(URL u) {
    try {
      return slurpURL(u);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Returns all the text at the URL given as a String.
   */
  public static String slurpURL(String path) throws Exception {
    return slurpURL(new URL(path));
  }

  /**
   * Returns all the text at the given URL. If it cannot be read
   * (non-existent, etc.), then and only then the method returns
   * {@code null}.
   */
  public static String slurpURLNoExceptions(String path) {
    try {
      return slurpURL(path);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Joins each element of the Iterable with the given glue. For example,
   * given a list of Integers, you can create a comma-separated list by
   * calling {@code join(numbers, ", ")}. A {@code null} element is rendered
   * as the text "null".
   */
  public static String join(Iterable<?> l, String glue) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    for (Object o : l) {
      if (!first) {
        sb.append(glue);
      }
      sb.append(o); // append(Object) renders null as "null" instead of throwing
      first = false;
    }
    return sb.toString();
  }

  /**
   * Joins each element of the List with the given glue. For example, given a
   * list of Integers, you can create a comma-separated list by calling
   * {@code join(numbers, ", ")}.
   */
  public static String join(List<?> l, String glue) {
    return join((Iterable<?>) l, glue);
  }

  /**
   * Joins each element of the array with the given glue. For example, given
   * an array of Integers, you can create a comma-separated list by calling
   * {@code join(numbers, ", ")}.
   */
  public static String join(Object[] elements, String glue) {
    return join(Arrays.asList(elements), glue);
  }

  /**
   * Joins elements with a space.
   */
  public static String join(List<?> l) {
    return join(l, " ");
  }

  /**
   * Joins elements with a space.
   */
  public static String join(Object[] elements) {
    return join(elements, " ");
  }

  /**
   * Splits on whitespace (\\s+).
   */
  public static List<String> split(String s) {
    return split(s, "\\s+");
  }

  /**
   * Splits the given string using the given regex as delimiter. This is the
   * same as {@code String.split()} except that the result is a fixed-size
   * List; it exists to parallel the other static regex methods in this class.
   *
   * @param str String to split up
   * @param regex String to compile as the regular expression
   * @return List of Strings resulting from splitting on the regex
   */
  public static List<String> split(String str, String regex) {
    return Arrays.asList(str.split(regex));
  }

  /**
   * Returns a String of at least totalChars characters by padding the input
   * on the right with spaces. If str is already that long or longer it is
   * returned unchanged. A {@code null} str is treated as the text "null".
   */
  public static String pad(String str, int totalChars) {
    if (str == null) {
      str = "null";
    }
    StringBuilder sb = new StringBuilder(str);
    for (int i = str.length(); i < totalChars; i++) {
      sb.append(' ');
    }
    return sb.toString();
  }

  /**
   * Pads the toString value of the given Object ("null" for {@code null}).
   */
  public static String pad(Object obj, int totalChars) {
    return pad(String.valueOf(obj), totalChars);
  }

  /**
   * Pads with spaces or truncates so as to produce a string of exactly the
   * given length. A {@code null} str is treated as the text "null".
   *
   * @param str The String to be padded or truncated
   * @param num The desired length
   */
  public static String padOrTrim(String str, int num) {
    if (str == null) {
      str = "null";
    }
    if (str.length() > num) {
      return str.substring(0, num);
    }
    return pad(str, num);
  }

  /**
   * Pads or trims the toString value of the given Object ("null" for
   * {@code null}).
   */
  public static String padOrTrim(Object obj, int totalChars) {
    return padOrTrim(String.valueOf(obj), totalChars);
  }

  /**
   * Pads the given String on the left with spaces to ensure that it is at
   * least totalChars long. A {@code null} str is treated as the text "null".
   */
  public static String padLeft(String str, int totalChars) {
    if (str == null) {
      str = "null";
    }
    StringBuilder sb = new StringBuilder();
    for (int i = str.length(); i < totalChars; i++) {
      sb.append(' ');
    }
    sb.append(str);
    return sb.toString();
  }

  /**
   * Left-pads the toString value of the given Object ("null" for
   * {@code null}).
   */
  public static String padLeft(Object obj, int totalChars) {
    return padLeft(String.valueOf(obj), totalChars);
  }

  /**
   * Left-pads the decimal representation of the given int.
   */
  public static String padLeft(int i, int totalChars) {
    return padLeft(Integer.toString(i), totalChars);
  }

  /**
   * Left-pads the String representation of the given double.
   */
  public static String padLeft(double d, int totalChars) {
    return padLeft(Double.toString(d), totalChars);
  }

  /**
   * Returns s if it is at most maxWidth chars long, otherwise chops the
   * right side to fit.
   */
  public static String trim(String s, int maxWidth) {
    if (s.length() <= maxWidth) {
      return s;
    }
    return s.substring(0, maxWidth);
  }

  /**
   * Trims the toString value of the given Object ("null" for {@code null}).
   */
  public static String trim(Object obj, int maxWidth) {
    return trim(String.valueOf(obj), maxWidth);
  }

  /**
   * Returns a "clean" version of the given filename: ASCII letters, digits
   * and underscores are kept, spaces and dashes become underscores, and every
   * other character is replaced by {@code x<code>x}, where {@code <code>} is
   * the decimal value of the character (for example '.' becomes "x46x").
   */
  public static String fileNameClean(String s) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      boolean keep = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
          || (c >= '0' && c <= '9') || c == '_';
      if (keep) {
        sb.append(c);
      } else if (c == ' ' || c == '-') {
        sb.append('_');
      } else {
        sb.append('x').append((int) c).append('x');
      }
    }
    return sb.toString();
  }

  /**
   * Returns the index of the nth occurrence of ch in s (n = 1 is the first
   * occurrence), or -1 if there are fewer than n occurrences or n is not
   * positive.
   */
  public static int nthIndex(String s, char ch, int n) {
    // Start just before the string so an occurrence at position 0 counts.
    int index = -1;
    for (int i = 0; i < n; i++) {
      index = s.indexOf(ch, index + 1);
      if (index == -1) {
        return -1;
      }
    }
    return index;
  }

  /**
   * Returns the decimal digits of n from position smallestDigit up to
   * position biggestDigit, most significant first. Positions are counted from
   * the right, the units digit is position 1, and both limits are inclusive;
   * positions beyond the length of n yield '0'. For example
   * {@code truncate(12345, 2, 4)} is "234". Intended for non-negative n with
   * smallestDigit not greater than biggestDigit.
   */
  public static String truncate(int n, int smallestDigit, int biggestDigit) {
    int numDigits = biggestDigit - smallestDigit + 1;
    char[] result = new char[numDigits];
    // Drop the digits below the requested range.
    for (int j = 1; j < smallestDigit; j++) {
      n = n / 10;
    }
    // Fill from the right so the most significant digit ends up first.
    for (int j = numDigits - 1; j >= 0; j--) {
      result[j] = Character.forDigit(n % 10, 10);
      n = n / 10;
    }
    return new String(result);
  }

  /**
   * Parses command line arguments into a Map. Arguments of the form
   *
   * <pre>-flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n</pre>
   *
   * will be parsed so that the flag is a key in the Map (including the
   * hyphen) and its value will be a {@code String[]} containing the optional
   * arguments (if present). The non-flag values not captured as flag
   * arguments are collected into a {@code String[]} and returned as the value
   * of the {@code null} key. In this invocation flags cannot take arguments,
   * so all the array values other than the one for {@code null} will be
   * zero-length.
   *
   * @param args the argument array to be parsed
   * @return a Map of flag names to flag argument arrays
   */
  public static Map<String, String[]> argsToMap(String[] args) {
    return argsToMap(args, new HashMap<String, Integer>());
  }

  /**
   * Parses command line arguments into a Map. Arguments of the form
   *
   * <pre>-flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n</pre>
   *
   * will be parsed so that the flag is a key in the Map (including the
   * hyphen) and its value will be a {@code String[]} containing the optional
   * arguments (if present). The non-flag values not captured as flag
   * arguments are collected into a {@code String[]} and returned as the value
   * of the {@code null} key. The maximum number of arguments for each flag is
   * given by the Integer stored under that flag in {@code flagsToNumArgs};
   * flags absent from that map take no arguments. When a flag is given more
   * than once, the arguments of the later occurrences are appended to those
   * of the earlier ones.
   *
   * <p>Example of usage:
   *
   * <pre>
   * Map&lt;String, Integer&gt; flagsToNumArgs = new HashMap&lt;String, Integer&gt;();
   * flagsToNumArgs.put("-x", Integer.valueOf(2));
   * flagsToNumArgs.put("-d", Integer.valueOf(1));
   * Map&lt;String, String[]&gt; result = argsToMap(args, flagsToNumArgs);
   * </pre>
   *
   * @param args the argument array to be parsed
   * @param flagsToNumArgs a Map of flag names to Integer values specifying
   *        the maximum number of allowed arguments for that flag (default 0)
   * @return a Map of flag names to flag argument arrays
   */
  public static Map<String, String[]> argsToMap(String[] args,
      Map<String, Integer> flagsToNumArgs) {
    Map<String, String[]> result = new HashMap<String, String[]>();
    List<String> remainingArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
      String key = args[i];
      if (!isFlag(key)) {
        remainingArgs.add(key);
        continue;
      }
      Integer maxFlagArgs = flagsToNumArgs.get(key);
      int max = maxFlagArgs == null ? 0 : maxFlagArgs.intValue();
      List<String> flagArgs = new ArrayList<String>();
      // Consume up to max following tokens, stopping at the next flag.
      for (int j = 0; j < max && i + 1 < args.length && !isFlag(args[i + 1]); i++, j++) {
        flagArgs.add(args[i + 1]);
      }
      String[] previous = result.get(key);
      if (previous == null) {
        result.put(key, flagArgs.toArray(new String[flagArgs.size()]));
      } else {
        // Repeated flag: append the new arguments to the earlier ones.
        String[] merged = new String[previous.length + flagArgs.size()];
        System.arraycopy(previous, 0, merged, 0, previous.length);
        for (int j = 0; j < flagArgs.size(); j++) {
          merged[previous.length + j] = flagArgs.get(j);
        }
        result.put(key, merged);
      }
    }
    result.put(null, remainingArgs.toArray(new String[remainingArgs.size()]));
    return result;
  }

  /**
   * Same as {@link #argsToProperties(String[], Map)} with every flag taking
   * the default of at most one argument.
   */
  public static Properties argsToProperties(String[] args) {
    return argsToProperties(args, new HashMap<String, Integer>());
  }

  /**
   * Analogous to {@link #argsToMap}. However, there are several key
   * differences between this method and {@link #argsToMap}:
   * <ul>
   * <li>Hyphens are stripped from flag names.</li>
   * <li>Since Properties objects are String to String mappings, the default
   * number of arguments to a flag is assumed to be 1 and not 0; several
   * arguments are joined with single spaces.</li>
   * <li>The list of arguments not bound to a flag is mapped to the ""
   * property, not {@code null}.</li>
   * <li>The special flag "-prop" (case-insensitive) will load the property
   * file specified by its argument; a failure to read that file is printed
   * and otherwise ignored.</li>
   * <li>The value for flags without arguments is set to "true".</li>
   * </ul>
   *
   * @param args the argument array to be parsed
   * @param flagsToNumArgs a Map from flag names (without hyphen) to Integer
   *        maximum argument counts
   */
  public static Properties argsToProperties(String[] args, Map<?, ?> flagsToNumArgs) {
    Properties result = new Properties();
    List<String> remainingArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
      String key = args[i];
      if (!isFlag(key)) {
        remainingArgs.add(key);
        continue;
      }
      key = key.substring(1); // strip off the hyphen
      Object maxFlagArgs = flagsToNumArgs.get(key);
      int max = maxFlagArgs == null ? 1 : ((Integer) maxFlagArgs).intValue();
      List<String> flagArgs = new ArrayList<String>();
      for (int j = 0; j < max && i + 1 < args.length && !isFlag(args[i + 1]); i++, j++) {
        flagArgs.add(args[i + 1]);
      }
      if (flagArgs.size() == 0) {
        result.setProperty(key, "true");
      } else {
        String value = join(flagArgs, " ");
        result.setProperty(key, value);
        if (key.equalsIgnoreCase(PROP)) {
          // Use the value just parsed; looking it up under "prop" would miss
          // a differently-cased flag such as -PROP.
          loadPropertiesFile(result, value);
        }
      }
    }
    result.setProperty("", join(remainingArgs, " "));
    return result;
  }

  /**
   * Converts a comma-separated String (with whitespace optionally allowed
   * after the comma) representing properties to a Properties object. Each
   * property is "property=value". The value for properties without an
   * explicitly given value is set to "true".
   */
  public static Properties stringToProperties(String str) {
    Properties result = new Properties();
    String[] props = str.trim().split(",\\s*");
    for (int i = 0; i < props.length; i++) {
      String term = props[i];
      int divLoc = term.indexOf('=');
      if (divLoc >= 0) {
        result.setProperty(term.substring(0, divLoc), term.substring(divLoc + 1));
      } else {
        result.setProperty(term, "true");
      }
    }
    return result;
  }

  /**
   * Prints to a file. If the file already exists, appends if
   * {@code append=true}, and overwrites if {@code append=false}. Failures
   * are reported on standard output and otherwise ignored.
   */
  public static void printToFile(File file, String message, boolean append) {
    PrintWriter pw = null;
    try {
      pw = new PrintWriter(new FileWriter(file, append));
      pw.print(message);
    } catch (Exception e) {
      System.out.println("Exception: in printToFile " + file.getAbsolutePath() + " "
          + message);
      e.printStackTrace();
    } finally {
      if (pw != null) {
        pw.close();
      }
    }
  }

  /**
   * Prints to a file, replacing any existing content; does not append.
   */
  public static void printToFile(File file, String message) {
    printToFile(file, message, false);
  }

  /**
   * Prints to a file. If the file already exists, appends if
   * {@code append=true}, and overwrites if {@code append=false}.
   */
  public static void printToFile(String filename, String message, boolean append) {
    printToFile(new File(filename), message, append);
  }

  /**
   * Prints to a file, replacing any existing content; does not append.
   */
  public static void printToFile(String filename, String message) {
    printToFile(new File(filename), message, false);
  }

  /**
   * A simpler form of command line argument parsing. Arguments of the form
   *
   * <pre>-flag1 arg1 -flag2 -flag3 arg3</pre>
   *
   * will be parsed so that the flag is a key in the Map (including the
   * hyphen) and the optional argument will be its value (if present).
   * Tokens that are neither a flag nor the argument of a flag are ignored.
   *
   * @param args the argument array to be parsed
   * @return A Map from keys to possible values (String or null)
   */
  public static Map<String, String> parseCommandLineArguments(String[] args) {
    Map<String, String> result = new HashMap<String, String>();
    for (int i = 0; i < args.length; i++) {
      String key = args[i];
      if (!isFlag(key)) {
        continue;
      }
      if (i + 1 < args.length && !isFlag(args[i + 1])) {
        result.put(key, args[i + 1]);
        i++; // the value has been consumed
      } else {
        result.put(key, null);
      }
    }
    return result;
  }

  /**
   * Returns a copy of orig containing only its ASCII letters and digits.
   */
  public static String stripNonAlphaNumerics(String orig) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < orig.length(); i++) {
      char c = orig.charAt(i);
      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
        sb.append(c);
      }
    }
    return sb.toString();
  }

  /**
   * Prints each char of s on its own line of standard output, as its numeric
   * value followed by the char in single quotes.
   */
  public static void printStringOneCharPerLine(String s) {
    for (int i = 0; i < s.length(); i++) {
      int c = s.charAt(i);
      System.out.println(c + " \'" + (char) c + "\' ");
    }
  }

  /**
   * Returns a copy of s in which every occurrence of escapeChar and of any
   * char in charsToEscape is preceded by escapeChar.
   */
  public static String escapeString(String s, char[] charsToEscape, char escapeChar) {
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      if (c == escapeChar) {
        result.append(escapeChar);
      } else {
        for (int j = 0; j < charsToEscape.length; j++) {
          if (c == charsToEscape[j]) {
            result.append(escapeChar);
            break;
          }
        }
      }
      result.append(c);
    }
    return result.toString();
  }

  /**
   * Splits the String s into multiple Strings using the splitChar, with a
   * quoting facility: text between two quoteChars is taken literally, split
   * characters included. Inside a quoted expression the escapeChar makes the
   * following character literal, so a quoteChar must be prefaced by the
   * escapeChar there. An escapeChar that is the very last character of s is
   * kept as is. Empty pieces are dropped.
   *
   * @param s The String to split
   * @param splitChar the delimiter
   * @param quoteChar the character that opens and closes a quoted expression
   * @param escapeChar the escape character honoured inside quotes
   * @return An array of Strings that s is split into
   */
  public static String[] splitOnCharWithQuoting(String s, char splitChar, char quoteChar,
      char escapeChar) {
    List<String> result = new ArrayList<String>();
    int i = 0;
    int length = s.length();
    StringBuilder b = new StringBuilder();
    while (i < length) {
      char curr = s.charAt(i);
      if (curr == splitChar) {
        // A delimiter ends the current piece (if there is one).
        if (b.length() > 0) {
          result.add(b.toString());
          b = new StringBuilder();
        }
        i++;
      } else if (curr == quoteChar) {
        // Copy up to the matching quoteChar, honouring escapes.
        i++;
        while (i < length) {
          curr = s.charAt(i);
          if (curr == escapeChar) {
            if (i + 1 < length) {
              b.append(s.charAt(i + 1));
              i += 2;
            } else {
              // Dangling escape at the end of the input: keep it literally.
              b.append(curr);
              i++;
            }
          } else if (curr == quoteChar) {
            i++;
            break;
          } else {
            b.append(curr);
            i++;
          }
        }
      } else {
        b.append(curr);
        i++;
      }
    }
    if (b.length() > 0) {
      result.add(b.toString());
    }
    return result.toArray(new String[result.size()]);
  }

  /**
   * Computes the length of the longest common subsequence of s and t (the
   * method name says "substring" for historical reasons). That is the longest
   * run of characters that appear in order inside both s and t; both may have
   * other extraneous characters along the way. This is like edit distance but
   * with no substitution, and a higher number means more similar. For
   * example, the result for "abcD" and "aXbc" is 3 (abc).
   */
  public static int longestCommonSubstring(String s, String t) {
    int n = s.length();
    int m = t.length();
    if (n == 0 || m == 0) {
      return 0;
    }
    // d[i][j] is the answer for the first i chars of s and first j chars of t;
    // row 0 and column 0 stay at their default of 0.
    int[][] d = new int[n + 1][m + 1];
    for (int i = 1; i <= n; i++) {
      char si = s.charAt(i - 1);
      for (int j = 1; j <= m; j++) {
        int best = Math.max(d[i - 1][j], d[i][j - 1]);
        // If the chars match you can get an extra point; otherwise you have
        // to skip an insertion or deletion (no substitutions).
        if (si == t.charAt(j - 1)) {
          best = Math.max(best, d[i - 1][j - 1] + 1);
        }
        d[i][j] = best;
      }
    }
    return d[n][m];
  }

  /**
   * Computes the Levenshtein (edit) distance of the two given Strings.
   */
  public static int editDistance(String s, String t) {
    int n = s.length();
    int m = t.length();
    if (n == 0) {
      return m;
    }
    if (m == 0) {
      return n;
    }
    // d[i][j] is the distance between the first i chars of s and the first j
    // chars of t.
    int[][] d = new int[n + 1][m + 1];
    for (int i = 0; i <= n; i++) {
      d[i][0] = i;
    }
    for (int j = 0; j <= m; j++) {
      d[0][j] = j;
    }
    for (int i = 1; i <= n; i++) {
      char si = s.charAt(i - 1);
      for (int j = 1; j <= m; j++) {
        int cost = (si == t.charAt(j - 1)) ? 0 : 1;
        int deletion = d[i - 1][j] + 1;
        int insertion = d[i][j - 1] + 1;
        int substitution = d[i - 1][j - 1] + cost;
        d[i][j] = Math.min(Math.min(deletion, insertion), substitution);
      }
    }
    return d[n][m];
  }

  /**
   * Computes the WordNet 2.0 POS tag corresponding to the PTB POS tag s.
   *
   * @param s a Penn TreeBank POS tag.
   * @return "noun", "verb", "adjective" or "adverb", or {@code null} if the
   *         tag belongs to none of these groups
   */
  public static String pennPOSToWordnetPOS(String s) {
    if (NOUN_TAGS.matcher(s).matches()) {
      return "noun";
    }
    if (VERB_TAGS.matcher(s).matches()) {
      return "verb";
    }
    if (ADJECTIVE_TAGS.matcher(s).matches()) {
      return "adjective";
    }
    if (ADVERB_TAGS.matcher(s).matches()) {
      return "adverb";
    }
    return null;
  }

  /**
   * Uppercases the first character of a string.
   *
   * @param s a string to capitalize
   * @return a capitalized version of the string; s itself if it is
   *         {@code null}, empty, or its first character has no distinct
   *         uppercase form
   */
  public static String capitalize(String s) {
    if (s == null || s.length() == 0) {
      return s;
    }
    char first = s.charAt(0);
    char upper = Character.toUpperCase(first);
    if (upper == first) {
      return s;
    }
    return upper + s.substring(1);
  }

  /**
   * Finds successive matches of regex in str. After each match the search
   * continues on the text following that match, so the offsets reported by
   * each returned Matcher are relative to the remainder of the string that
   * was being searched at that point, not to the original str. An empty
   * match advances the search by one character so that it always terminates.
   *
   * @return one Matcher, positioned on its match, per match found
   */
  public static List<Matcher> allMatches(String str, String regex) {
    Pattern p = Pattern.compile(regex);
    List<Matcher> matches = new ArrayList<Matcher>();
    while (true) {
      Matcher m = p.matcher(str);
      if (!m.find()) {
        break;
      }
      matches.add(m);
      int next = m.end();
      if (next == 0) {
        // Empty match at the very start: without this the loop never ends.
        if (str.length() == 0) {
          break;
        }
        next = 1;
      }
      str = str.substring(next);
    }
    return matches;
  }

  /**
   * Small demo: prints the edit distance and the longest-common-subsequence
   * length for every pair of a few sample sentences.
   */
  public static void main(String[] args) throws IOException {
    String[] s = { "there once was a man", "this one is a manic", "hey there",
        "there once was a mane", "once in a manger.", "where is one match?" };
    for (int i = 0; i < s.length; i++) {
      for (int j = 0; j < s.length; j++) {
        System.out.println("s1: " + s[i]);
        System.out.println("s2: " + s[j]);
        System.out.println("edit distance: " + editDistance(s[i], s[j]));
        System.out.println("LCS: " + longestCommonSubstring(s[i], s[j]));
        System.out.println();
      }
    }
  }

  /** Says whether a command-line token is a flag, i.e. starts with a hyphen. */
  private static boolean isFlag(String arg) {
    return arg != null && arg.length() > 0 && arg.charAt(0) == '-';
  }

  /** Closes the resource, ignoring a failure to do so; null is allowed. */
  private static void closeQuietly(Closeable c) {
    if (c == null) {
      return;
    }
    try {
      c.close();
    } catch (IOException ignored) {
      // Best effort: nothing useful can be done if close itself fails.
    }
  }

  /**
   * Reads the stream line by line, ending every line with the platform line
   * separator, and always closes the stream. A null encoding selects the
   * platform default charset.
   */
  private static String slurpLines(InputStream is, String encoding) throws IOException {
    String lineSeparator = System.getProperty("line.separator");
    BufferedReader br = null;
    try {
      Reader decoder = (encoding == null)
          ? new InputStreamReader(is)
          : new InputStreamReader(is, encoding);
      br = new BufferedReader(decoder);
      StringBuilder buff = new StringBuilder(SLURPBUFFSIZE); // make biggish
      String line;
      while ((line = br.readLine()) != null) {
        buff.append(line).append(lineSeparator);
      }
      return buff.toString();
    } finally {
      if (br != null) {
        closeQuietly(br);
      } else {
        closeQuietly(is);
      }
    }
  }

  /** Loads the named properties file into target; prints and ignores I/O failures. */
  private static void loadPropertiesFile(Properties target, String path) {
    InputStream in = null;
    try {
      in = new BufferedInputStream(new FileInputStream(path));
      target.load(in);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      closeQuietly(in);
    }
  }
}
// Page footer of the hosting site: © 2015 - 2025 Weber Informatics LLC | Privacy Policy