org.deeplearning4j.berkeley.StringUtils Maven / Gradle / Ivy
/*-
*
* * Copyright 2015 Skymind,Inc.
* *
* * Licensed under the Apache License, Version 2.0 (the "License");
* * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*
*/
package org.deeplearning4j.berkeley;
import java.io.*;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* StringUtils is a class for random String things.
*
* @author Dan Klein
* @author Christopher Manning
* @author Tim Grow ([email protected])
* @author Chris Cox
* @version 2003/02/03
*/
public class StringUtils {
/**
* Don't let anyone instantiate this class.
*/
private StringUtils() {}
/**
* Say whether this regular expression can be found inside
* this String. This method provides one of the two "missing"
* convenience methods for regular expressions in the String class
* in JDK1.4. This is the one you'll want to use all the time if
* you're used to Perl. What were they smoking?
*
* @param str String to search for match in
* @param regex String to compile as the regular expression
* @return Whether the regex can be found in str
*/
public static boolean find(String str, String regex) {
return Pattern.compile(regex).matcher(str).find();
}
/**
* Say whether this regular expression can be found at the beginning of
* this String. This method provides one of the two "missing"
* convenience methods for regular expressions in the String class
* in JDK1.4.
*
* @param str String to search for match at start of
* @param regex String to compile as the regular expression
* @return Whether the regex can be found at the start of str
*/
public static boolean lookingAt(String str, String regex) {
return Pattern.compile(regex).matcher(str).lookingAt();
}
/**
* Say whether this regular expression matches
* this String. This method is the same as the String.matches() method,
* and is included just to give a call that is parallel to the other
* static regex methods in this class.
*
* @param str String to search for match at start of
* @param regex String to compile as the regular expression
* @return Whether the regex matches the whole of this str
*/
public static boolean matches(String str, String regex) {
return Pattern.compile(regex).matcher(str).matches();
}
private static final int SLURPBUFFSIZE = 16000;
/**
* Returns all the text in the given File.
*/
public static String slurpFile(File file) throws IOException {
Reader r = new FileReader(file);
return slurpReader(r);
}
public static String slurpGBFileNoExceptions(String filename) {
return slurpFileNoExceptions(filename, "GB18030");
}
/**
* Returns all the text in the given file with the given encoding.
*/
public static String slurpFile(String filename, String encoding) throws IOException {
Reader r = new InputStreamReader(new FileInputStream(filename), encoding);
return slurpReader(r);
}
/**
* Returns all the text in the given file with the given encoding.
* If the file cannot be read (non-existent, etc.),
* then and only then the method returns null
.
*/
public static String slurpFileNoExceptions(String filename, String encoding) {
try {
return slurpFile(filename, encoding);
} catch (Exception e) {
throw new RuntimeException();
}
}
public static String slurpGBFile(String filename) throws IOException {
return slurpFile(filename, "GB18030");
}
/**
* Returns all the text from the given Reader.
*
* @return The text in the file.
*/
public static String slurpReader(Reader reader) {
BufferedReader r = new BufferedReader(reader);
StringBuilder buff = new StringBuilder();
try {
char[] chars = new char[SLURPBUFFSIZE];
while (true) {
int amountRead = r.read(chars, 0, SLURPBUFFSIZE);
if (amountRead < 0) {
break;
}
buff.append(chars, 0, amountRead);
}
r.close();
} catch (Exception e) {
throw new RuntimeException();
}
return buff.toString();
}
/**
* Returns all the text in the given file
*
* @return The text in the file.
*/
public static String slurpFile(String filename) throws IOException {
return slurpReader(new FileReader(filename));
}
/**
* Returns all the text in the given File.
*
* @return The text in the file. May be an empty string if the file
* is empty. If the file cannot be read (non-existent, etc.),
* then and only then the method returns null
.
*/
public static String slurpFileNoExceptions(File file) {
try {
return slurpReader(new FileReader(file));
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
* Returns all the text in the given File.
*
* @return The text in the file. May be an empty string if the file
* is empty. If the file cannot be read (non-existent, etc.),
* then and only then the method returns null
.
*/
public static String slurpFileNoExceptions(String filename) {
try {
return slurpFile(filename);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
* Returns all the text at the given URL.
*/
public static String slurpGBURL(URL u) throws IOException {
return slurpURL(u, "GB18030");
}
/**
* Returns all the text at the given URL.
*/
public static String slurpGBURLNoExceptions(URL u) {
try {
return slurpGBURL(u);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
* Returns all the text at the given URL.
*/
public static String slurpURLNoExceptions(URL u, String encoding) {
try {
return slurpURL(u, encoding);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
* Returns all the text at the given URL.
*/
public static String slurpURL(URL u, String encoding) throws IOException {
String lineSeparator = System.getProperty("line.separator");
URLConnection uc = u.openConnection();
uc.setReadTimeout(30000);
InputStream is;
try {
is = uc.getInputStream();
} catch (SocketTimeoutException e) {
//e.printStackTrace();
System.err.println("Time out. Return empty string");
return "";
}
BufferedReader br = new BufferedReader(new InputStreamReader(is, encoding));
String temp;
StringBuilder buff = new StringBuilder(16000); // make biggish
while ((temp = br.readLine()) != null) {
buff.append(temp);
buff.append(lineSeparator);
}
br.close();
return buff.toString();
}
/**
* Returns all the text at the given URL.
*/
public static String slurpURL(URL u) throws IOException {
String lineSeparator = System.getProperty("line.separator");
URLConnection uc = u.openConnection();
InputStream is = uc.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String temp;
StringBuilder buff = new StringBuilder(16000); // make biggish
while ((temp = br.readLine()) != null) {
buff.append(temp);
buff.append(lineSeparator);
}
br.close();
return buff.toString();
}
/**
* Returns all the text at the given URL.
*/
public static String slurpURLNoExceptions(URL u) {
try {
return slurpURL(u);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
* Returns all the text at the given URL.
*/
public static String slurpURL(String path) throws Exception {
return slurpURL(new URL(path));
}
/**
* Returns all the text at the given URL. If the file cannot be read (non-existent, etc.),
* then and only then the method returns null
.
*/
public static String slurpURLNoExceptions(String path) {
try {
return slurpURL(path);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
/**
* Joins each elem in the Collection with the given glue. For example, given a
* list
* of Integers, you can createComplex a comma-separated list by calling
* join(numbers, ", ").
*/
public static String join(Iterable l, String glue) {
StringBuilder sb = new StringBuilder();
boolean first = true;
for (Object o : l) {
if (!first) {
sb.append(glue);
}
sb.append(o.toString());
first = false;
}
return sb.toString();
}
/**
* Joins each elem in the List with the given glue. For example, given a
* list
* of Integers, you can createComplex a comma-separated list by calling
* join(numbers, ", ").
*/
public static String join(List> l, String glue) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < l.size(); i++) {
if (i > 0) {
sb.append(glue);
}
Object x = l.get(i);
sb.append(x.toString());
}
return sb.toString();
}
/**
* Joins each elem in the array with the given glue. For example, given a list
* of ints, you can createComplex a comma-separated list by calling
* join(numbers, ", ").
*/
public static String join(Object[] elements, String glue) {
return (join(Arrays.asList(elements), glue));
}
/**
* Joins elems with a space.
*/
public static String join(List l) {
return join(l, " ");
}
/**
* Joins elems with a space.
*/
public static String join(Object[] elements) {
return (join(elements, " "));
}
/**
* Splits on whitespace (\\s+).
*/
public static List split(String s) {
return (split(s, "\\s+"));
}
/**
* Splits the given string using the given regex as delimiters.
* This method is the same as the String.split() method (except it throws
* the results in a List),
* and is included just to give a call that is parallel to the other
* static regex methods in this class.
*
* @param str String to split up
* @param regex String to compile as the regular expression
* @return List of Strings resulting from splitting on the regex
*/
public static List split(String str, String regex) {
return (Arrays.asList(str.split(regex)));
}
/**
* Return a String of length a minimum of totalChars characters by
* padding the input String str with spaces. If str is already longer
* than totalChars, it is returned unchanged.
*/
public static String pad(String str, int totalChars) {
if (str == null)
str = "null";
int slen = str.length();
StringBuilder sb = new StringBuilder(str);
for (int i = 0; i < totalChars - slen; i++) {
sb.append(" ");
}
return sb.toString();
}
/**
* Pads the toString value of the given Object.
*/
public static String pad(Object obj, int totalChars) {
return pad(obj.toString(), totalChars);
}
/**
* Pad or trim so as to produce a string of exactly a certain length.
*
* @param str The String to be padded or truncated
* @param num The desired length
*/
public static String padOrTrim(String str, int num) {
if (str == null)
str = "null";
int leng = str.length();
if (leng < num) {
StringBuilder sb = new StringBuilder(str);
for (int i = 0; i < num - leng; i++) {
sb.append(" ");
}
return sb.toString();
} else if (leng > num) {
return str.substring(0, num);
} else {
return str;
}
}
/**
* Pad or trim the toString value of the given Object.
*/
public static String padOrTrim(Object obj, int totalChars) {
return padOrTrim(obj.toString(), totalChars);
}
/**
* Pads the given String to the left with spaces to ensure that it's
* at least totalChars long.
*/
public static String padLeft(String str, int totalChars) {
if (str == null)
str = "null";
StringBuilder sb = new StringBuilder();
for (int i = 0; i < totalChars - str.length(); i++) {
sb.append(" ");
}
sb.append(str);
return sb.toString();
}
public static String padLeft(Object obj, int totalChars) {
return padLeft(obj.toString(), totalChars);
}
public static String padLeft(int i, int totalChars) {
return padLeft(new Integer(i), totalChars);
}
public static String padLeft(double d, int totalChars) {
return padLeft(new Double(d), totalChars);
}
/**
* Returns s if it's at most maxWidth chars, otherwise chops right side to fit.
*/
public static String trim(String s, int maxWidth) {
if (s.length() <= maxWidth) {
return (s);
}
return (s.substring(0, maxWidth));
}
public static String trim(Object obj, int maxWidth) {
return trim(obj.toString(), maxWidth);
}
/**
* Returns a "clean" version of the given filename in which spaces have
* been converted to dashes and all non-alphaneumeric chars are underscores.
*/
public static String fileNameClean(String s) {
char[] chars = s.toCharArray();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < chars.length; i++) {
char c = chars[i];
if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || (c == '_')) {
sb.append(c);
} else {
if (c == ' ' || c == '-') {
sb.append('_');
} else {
sb.append("x" + (int) c + "x");
}
}
}
return sb.toString();
}
/**
* Returns the index of the nth occurrence of ch in s, or -1
* if there are less than n occurrences of ch.
*/
public static int nthIndex(String s, char ch, int n) {
int index = 0;
for (int i = 0; i < n; i++) {
// if we're already at the end of the string,
// and we need to find another ch, return -1
if (index == s.length() - 1) {
return -1;
}
index = s.indexOf(ch, index + 1);
if (index == -1) {
return (-1);
}
}
return index;
}
/**
* This returns a string from decimal digit smallestDigit to decimal digit
* biggest digit. Smallest digit is labeled 1, and the limits are
* inclusive.
*/
public static String truncate(int n, int smallestDigit, int biggestDigit) {
int numDigits = biggestDigit - smallestDigit + 1;
char[] result = new char[numDigits];
for (int j = 1; j < smallestDigit; j++) {
n = n / 10;
}
for (int j = numDigits - 1; j >= 0; j--) {
result[j] = Character.forDigit(n % 10, 10);
n = n / 10;
}
return new String(result);
}
/**
* Parses command line arguments into a Map. Arguments of the form
*
* -flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n
*
* will be parsed so that the flag is a key in the Map (including
* the hyphen) and its value will be a {@link String[] } containing
* the optional arguments (if present). The non-flag values not
* captured as flag arguments are collected into a String[] array
* and returned as the value of null
in the Map. In
* this invocation, flags cannot take arguments, so all the {@link
* String} array values other than the value for null
* will be zero-length.
*
* @param args
* @return a {@link Map} of flag names to flag argument {@link
* String[]} arrays.
*/
public static Map argsToMap(String[] args) {
return argsToMap(args, new HashMap());
}
/**
* Parses command line arguments into a Map. Arguments of the form
*
* -flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n
*
* will be parsed so that the flag is a key in the Map (including
* the hyphen) and its value will be a {@link String[] } containing
* the optional arguments (if present). The non-flag values not
* captured as flag arguments are collected into a String[] array
* and returned as the value of null
in the Map. In
* this invocation, the maximum number of arguments for each flag
* can be specified as an {@link Integer} value of the appropriate
* flag key in the flagsToNumArgs
{@link Map}
* argument. (By default, flags cannot take arguments.)
*
* Example of usage:
*
*
* Map flagsToNumArgs = new HashMap();
* flagsToNumArgs.put("-x",new Integer(2));
* flagsToNumArgs.put("-d",new Integer(1));
* Map result = argsToMap(args,flagsToNumArgs);
*
*
* @param args the argument array to be parsed
* @param flagsToNumArgs a {@link Map} of flag names to {@link
* Integer} values specifying the maximum number of allowed
* arguments for that flag (default 0).
* @return a {@link Map} of flag names to flag argument {@link
* String[]} arrays.
*/
public static Map argsToMap(String[] args, Map flagsToNumArgs) {
Map result = new HashMap<>();
List remainingArgs = new ArrayList<>();
String key;
for (int i = 0; i < args.length; i++) {
key = args[i];
if (key.charAt(0) == '-') { // found a flag
Integer maxFlagArgs = flagsToNumArgs.get(key);
int max = maxFlagArgs == null ? 0 : maxFlagArgs.intValue();
List flagArgs = new ArrayList<>();
for (int j = 0; j < max && i + 1 < args.length && args[i + 1].charAt(0) != '-'; i++, j++) {
flagArgs.add(args[i + 1]);
}
if (result.containsKey(key)) { // append the second specification into the args.
String[] newFlagArg = new String[result.get(key).length + flagsToNumArgs.get(key)];
int oldNumArgs = result.get(key).length;
System.arraycopy(result.get(key), 0, newFlagArg, 0, oldNumArgs);
for (int j = 0; j < flagArgs.size(); j++) {
newFlagArg[j + oldNumArgs] = flagArgs.get(j);
}
} else
result.put(key, flagArgs.toArray(new String[] {}));
} else {
remainingArgs.add(args[i]);
}
}
result.put(null, remainingArgs.toArray(new String[] {}));
return result;
}
private static final String PROP = "prop";
public static Properties argsToProperties(String[] args) {
return argsToProperties(args, new HashMap());
}
/**
* Analagous to {@link #argsToMap}. However, there are several key differences between this method and {@link #argsToMap}:
*
* - Hyphens are stripped from flag names
* - Since Properties objects are String to String mappings, the default number of arguments to a flag is
* assumed to be 1 and not 0.
* - Furthermore, the list of arguments not bound to a flag is mapped to the "" property, not null
* - The special flag "-prop" will load the property file specified by it's argument.
* - The value for flags without arguments is applyTransformToDestination to "true"
*/
public static Properties argsToProperties(String[] args, Map flagsToNumArgs) {
Properties result = new Properties();
List remainingArgs = new ArrayList<>();
String key;
for (int i = 0; i < args.length; i++) {
key = args[i];
if (key.charAt(0) == '-') { // found a flag
key = key.substring(1); // strip off the hyphen
Integer maxFlagArgs = (Integer) flagsToNumArgs.get(key);
int max = maxFlagArgs == null ? 1 : maxFlagArgs.intValue();
List flagArgs = new ArrayList<>();
for (int j = 0; j < max && i + 1 < args.length && args[i + 1].charAt(0) != '-'; i++, j++) {
flagArgs.add(args[i + 1]);
}
if (flagArgs.isEmpty()) {
result.setProperty(key, "true");
} else {
result.setProperty(key, join(flagArgs, " "));
if (key.equalsIgnoreCase(PROP)) {
try {
result.load(new BufferedInputStream(new FileInputStream(result.getProperty(PROP))));
} catch (IOException e) {
e.printStackTrace();
}
}
}
} else {
remainingArgs.add(args[i]);
}
}
result.setProperty("", join(remainingArgs, " "));
return result;
}
/**
* This method converts a comma-separated String (with whitespace
* optionally allowed after the comma) representing properties
* to a Properties object. Each property is "property=value". The value
* for properties without an explicitly given value is applyTransformToDestination to "true".
*/
public static Properties stringToProperties(String str) {
Properties result = new Properties();
String[] props = str.trim().split(",\\s*");
for (int i = 0; i < props.length; i++) {
String term = props[i];
int divLoc = term.indexOf('=');
String key;
String value;
if (divLoc >= 0) {
key = term.substring(0, divLoc);
value = term.substring(divLoc + 1);
} else {
key = term;
value = "true";
}
result.setProperty(key, value);
}
return result;
}
/**
* Prints to a file. If the file already exists, appends if
* append=true
, and overwrites if append=false
*/
public static void printToFile(File file, String message, boolean append) {
FileWriter fw = null;
PrintWriter pw = null;
try {
fw = new FileWriter(file, append);
pw = new PrintWriter(fw);
pw.print(message);
} catch (Exception e) {
System.out.println("Exception: in printToFile " + file.getAbsolutePath() + " " + message);
e.printStackTrace();
} finally {
if (pw != null) {
pw.close();
}
}
}
/**
* Prints to a file. If the file does not exist, rewrites the file;
* does not append.
*/
public static void printToFile(File file, String message) {
printToFile(file, message, false);
}
/**
* Prints to a file. If the file already exists, appends if
* append=true
, and overwrites if append=false
*/
public static void printToFile(String filename, String message, boolean append) {
printToFile(new File(filename), message, append);
}
/**
* Prints to a file. If the file does not exist, rewrites the file;
* does not append.
*/
public static void printToFile(String filename, String message) {
printToFile(new File(filename), message, false);
}
/**
* A simpler form of command line argument parsing.
* Dan thinks this is highly superior to the overly complexified code that
* comes before it.
* Parses command line arguments into a Map. Arguments of the form
* -flag1 arg1 -flag2 -flag3 arg3
* will be parsed so that the flag is a key in the Map (including the hyphen)
* and the
* optional argument will be its value (if present).
*
* @param args
* @return A Map from keys to possible values (String or null)
*/
public static Map parseCommandLineArguments(String[] args) {
Map result = new HashMap<>();
String key, value;
for (int i = 0; i < args.length; i++) {
key = args[i];
if (key.charAt(0) == '-') {
if (i + 1 < args.length) {
value = args[i + 1];
if (value.charAt(0) != '-') {
result.put(key, value);
i++;
} else {
result.put(key, null);
}
} else {
result.put(key, null);
}
}
}
return result;
}
public static String stripNonAlphaNumerics(String orig) {
StringBuilder sb = new StringBuilder();
char c;
for (int i = 0; i < orig.length(); i++) {
c = orig.charAt(i);
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
sb.append(c);
}
}
return sb.toString();
}
public static void printStringOneCharPerLine(String s) {
for (int i = 0; i < s.length(); i++) {
int c = s.charAt(i);
System.out.println(c + " \'" + (char) c + "\' ");
}
}
public static String escapeString(String s, char[] charsToEscape, char escapeChar) {
StringBuilder result = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
if (c == escapeChar) {
result.append(escapeChar);
} else {
for (int j = 0; j < charsToEscape.length; j++) {
if (c == charsToEscape[j]) {
result.append(escapeChar);
break;
}
}
}
result.append(c);
}
return result.toString();
}
/**
* This function splits the String s into multiple Strings using the
* splitChar. However, it provides an quoting facility: it is possible to
* quote strings with the quoteChar.
* If the quoteChar occurs within the quotedExpression, it must be prefaced
* by the escapeChar
*
* @param s The String to split
* @param splitChar
* @param quoteChar
* @return An array of Strings that s is split into
*/
public static String[] splitOnCharWithQuoting(String s, char splitChar, char quoteChar, char escapeChar) {
List result = new ArrayList<>();
int i = 0;
int length = s.length();
StringBuilder b = new StringBuilder();
while (i < length) {
char curr = s.charAt(i);
if (curr == splitChar) {
// add last buffer
if (b.length() > 0) {
result.add(b.toString());
b = new StringBuilder();
}
i++;
} else if (curr == quoteChar) {
// find next instance of quoteChar
i++;
while (i < length) {
curr = s.charAt(i);
if (curr == escapeChar) {
b.append(s.charAt(i + 1));
i += 2;
} else if (curr == quoteChar) {
i++;
break; // break this loop
} else {
b.append(s.charAt(i));
i++;
}
}
} else {
b.append(curr);
i++;
}
}
if (b.length() > 0) {
result.add(b.toString());
}
return result.toArray(new String[0]);
}
/**
* Computes the longest common substring of s and t.
* The longest common substring of a and b is the longest run of
* characters that appear in order inside both a and b. Both a and b
* may have other extraneous characters along the way. This is like
* edit distance but with no substitution and a higher number means
* more similar. For example, the LCS of "abcD" and "aXbc" is 3 (abc).
*/
public static int longestCommonSubstring(String s, String t) {
int d[][]; // matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
// Step 1
n = s.length();
m = t.length();
if (n == 0) {
return 0;
}
if (m == 0) {
return 0;
}
d = new int[n + 1][m + 1];
// Step 2
for (i = 0; i <= n; i++) {
d[i][0] = 0;
}
for (j = 0; j <= m; j++) {
d[0][j] = 0;
}
// Step 3
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
// Step 4
for (j = 1; j <= m; j++) {
t_j = t.charAt(j - 1);
// Step 5
// js: if the chars match, you can getFromOrigin an extra point
// otherwise you have to skip an insertion or deletion (no subs)
if (s_i == t_j) {
d[i][j] = SloppyMath.max(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1] + 1);
} else {
d[i][j] = Math.max(d[i - 1][j], d[i][j - 1]);
}
}
}
if (false) {
// num chars needed to display longest num
int numChars = (int) Math.ceil(Math.log(d[n][m]) / Math.log(10));
for (i = 0; i < numChars + 3; i++) {
System.err.print(' ');
}
for (j = 0; j < m; j++) {
System.err.print("" + t.charAt(j) + " ");
}
System.err.println();
for (i = 0; i <= n; i++) {
System.err.print((i == 0 ? ' ' : s.charAt(i - 1)) + " ");
for (j = 0; j <= m; j++) {
System.err.print("" + d[i][j] + " ");
}
System.err.println();
}
}
// Step 7
return d[n][m];
}
/**
* Computes the Levenshtein (edit) distance of the two given Strings.
*/
public static int editDistance(String s, String t) {
int d[][]; // matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
int cost; // cost
// Step 1
n = s.length();
m = t.length();
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
d = new int[n + 1][m + 1];
// Step 2
for (i = 0; i <= n; i++) {
d[i][0] = i;
}
for (j = 0; j <= m; j++) {
d[0][j] = j;
}
// Step 3
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
// Step 4
for (j = 1; j <= m; j++) {
t_j = t.charAt(j - 1);
// Step 5
if (s_i == t_j) {
cost = 0;
} else {
cost = 1;
}
// Step 6
d[i][j] = SloppyMath.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
}
}
// Step 7
return d[n][m];
}
/**
* Computes the WordNet 2.0 POS tag corresponding to the PTB POS tag s.
*
* @param s a Penn TreeBank POS tag.
*/
public static String pennPOSToWordnetPOS(String s) {
if (s.matches("NN|NNP|NNS|NNPS")) {
return "noun";
}
if (s.matches("VB|VBD|VBG|VBN|VBZ|VBP|MD")) {
return "verb";
}
if (s.matches("JJ|JJR|JJS|CD")) {
return "adjective";
}
if (s.matches("RB|RBR|RBS|RP|WRB")) {
return "adverb";
}
return null;
}
/**
* Uppercases the first character of a string.
*
* @param s a string to capitalize
* @return a capitalized version of the string
*/
public static String capitalize(String s) {
if (s.charAt(0) >= 'a') {
return (char) (s.charAt(0) + ('A' - 'a')) + s.substring(1);
} else {
return s;
}
}
public static List allMatches(String str, String regex) {
Pattern p = Pattern.compile(regex);
List matches = new ArrayList<>();
while (true) {
Matcher m = p.matcher(str);
if (!m.find())
break;
matches.add(m);
str = str.substring(m.end());
}
return matches;
}
public static void main(String[] args) throws IOException {
String[] s = {"there once was a man", "this one is a manic", "hey there", "there once was a mane",
"once in a manger.", "where is one match?"};
for (int i = 0; i < 6; i++) {
for (int j = 0; j < 6; j++) {
System.out.println("s1: " + s[i]);
System.out.println("s2: " + s[j]);
System.out.println("edit distance: " + editDistance(s[i], s[j]));
System.out.println("LCS: " + longestCommonSubstring(s[i], s[j]));
System.out.println();
}
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy