Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
edu.stanford.nlp.util.StringUtils Maven / Gradle / Ivy
Go to download
Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.
package edu.stanford.nlp.util;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasOffset;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.math.SloppyMath;
import java.io.*;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.text.Normalizer;
import java.util.*;
import java.util.Map.Entry;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
 * StringUtils is a class for random String things, including output formatting and command line argument parsing.
 * <p>
 * Many of these methods will be familiar to Perl users: {@link #join(Iterable)}, {@link #split(String, String)}, {@link
 * #trim(String, int)}, {@link #find(String, String)}, {@link #lookingAt(String, String)}, and {@link #matches(String,
 * String)}.
 * <p>
 * There are also useful methods for padding Strings/Objects with spaces on the right or left for printing even-width
 * table columns: {@link #padLeft(int, int)}, {@link #pad(String, int)}.
 * <p>
 * Example: print a comma-separated list of numbers:
 * <pre>{@code
 * System.out.println(StringUtils.join(nums, ", "));
 * }</pre>
 * Example: print a 2D array of numbers with 8-char cells:
 * <pre>{@code
 * for (int i = 0; i < nums.length; i++) {
 *   for (int j = 0; j < nums[i].length; j++) {
 *     System.out.print(StringUtils.padLeft(nums[i][j], 8));
 *   }
 *   System.out.println();
 * }
 * }</pre>
 *
 * @author Dan Klein
 * @author Christopher Manning
 * @author Tim Grow ([email protected] )
 * @author Chris Cox
 * @version 2006/02/03
 */
public class StringUtils {
/**
 * Don't let anyone instantiate this class: the constructor is private and does nothing.
 */
private StringUtils() {}
/** A shared zero-length String array. */
public static final String[] EMPTY_STRING_ARRAY = new String[0];
// Flag names (hyphens already stripped, compared case-insensitively) that
// argsToProperties treats as "load the properties file named by my argument".
private static final String PROP = "prop";
private static final String PROPS = "props";
private static final String PROPERTIES = "properties";
private static final String ARGS = "args";
private static final String ARGUMENTS = "arguments";
/**
 * Reports whether the regular expression occurs anywhere inside the string
 * (Perl-style "contains a match", as opposed to matching the whole string).
 *
 * @param str String to search in
 * @param regex String to compile as the regular expression
 * @return true if some substring of str matches regex
 */
public static boolean find(String str, String regex) {
  Pattern compiled = Pattern.compile(regex);
  Matcher searcher = compiled.matcher(str);
  return searcher.find();
}
/**
 * Convenience method: a case-insensitive variant of Collection.contains.
 * A null element only matches a null {@code s}.
 *
 * @param c Collection of Strings to look through
 * @param s String to look for
 * @return true if s case-insensitively matches a string in c
 */
public static boolean containsIgnoreCase(Collection<String> c, String s) {
  for (String candidate : c) {
    // Guard against null elements instead of throwing a NullPointerException.
    boolean same = (candidate == null) ? (s == null) : candidate.equalsIgnoreCase(s);
    if (same) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether the regular expression matches a prefix of the string,
 * i.e. a match that starts at index 0 but need not reach the end.
 *
 * @param str String to test at its start
 * @param regex String to compile as the regular expression
 * @return true if regex matches at the beginning of str
 */
public static boolean lookingAt(String str, String regex) {
  Pattern compiled = Pattern.compile(regex);
  Matcher prefixMatcher = compiled.matcher(str);
  return prefixMatcher.lookingAt();
}
/**
 * Inverts a textual "name=index" listing into an array addressed by index.
 * The input has the form "x1=y1,x2=y2,..." (entries separated by ',' or ';'),
 * where each y is an integer; the result s satisfies s[yn] = xn. Slots that
 * no entry names stay null. The last '=' in an entry separates name and index.
 *
 * @param map A string of the form "x1=y1,x2=y2,..."
 * @return A String[] s such that s[yn] = xn, of length (largest index + 1)
 */
public static String[] mapStringToArray(String map) {
  String[] entries = map.split("[,;]");
  String[] names = new String[entries.length];
  int[] slots = new int[entries.length];
  int highest = 0;
  int pos = 0;
  for (String entry : entries) {
    int eq = entry.lastIndexOf('=');
    names[pos] = entry.substring(0, eq);
    slots[pos] = Integer.parseInt(entry.substring(eq + 1));
    highest = Math.max(highest, slots[pos]);
    pos++;
  }
  // A freshly allocated array is already all-null, so unused slots need no explicit fill.
  String[] byIndex = new String[highest + 1];
  for (int k = 0; k < entries.length; k++) {
    byIndex[slots[k]] = names[k];
  }
  return byIndex;
}
/**
 * Takes a string of the form "x1=y1,x2=y2,..." (entries separated by ',' or ';')
 * and returns the corresponding Map. Keys and values are trimmed; the last '='
 * in an entry separates key from value. An entry without '=' makes
 * {@code substring} throw a StringIndexOutOfBoundsException.
 *
 * @param map A string of the form "x1=y1,x2=y2,..."
 * @return A Map m such that m.get(xn) = yn
 */
public static Map<String, String> mapStringToMap(String map) {
  String[] m = map.split("[,;]");
  Map<String, String> res = Generics.newHashMap();
  for (String str : m) {
    int index = str.lastIndexOf('=');
    String key = str.substring(0, index);
    String val = str.substring(index + 1);
    res.put(key.trim(), val.trim());
  }
  return res;
}
/**
 * Compiles each regular expression string into a {@link Pattern}.
 *
 * @param regexes The regular expression strings to compile
 * @return The compiled patterns, in iteration order
 */
public static List<Pattern> regexesToPatterns(Iterable<String> regexes) {
  List<Pattern> patterns = new ArrayList<>();
  for (String regex : regexes) {
    patterns.add(Pattern.compile(regex));
  }
  return patterns;
}
/**
 * Given a pattern and a string, returns a list with the values of the
 * captured groups in the pattern (group 1 onwards; an unmatched optional
 * group contributes a null element). Note that this uses Matcher.find()
 * rather than Matcher.matches().
 *
 * @param regex The compiled pattern to search with
 * @param str The String to search in; may be null
 * @return The captured group values, or null if str is null or the pattern is not found
 */
public static List<String> regexGroups(Pattern regex, String str) {
  if (str == null) {
    return null;
  }
  Matcher matcher = regex.matcher(str);
  if (!matcher.find()) {
    return null;
  }
  List<String> groups = new ArrayList<>();
  for (int index = 1; index <= matcher.groupCount(); index++) {
    groups.add(matcher.group(index));
  }
  return groups;
}
/**
 * Reports whether the regular expression matches the entire string.
 * Equivalent to String.matches(); provided so that whole-string matching
 * has a call parallel to the other static regex helpers in this class.
 *
 * @param str String to match in full
 * @param regex String to compile as the regular expression
 * @return true if regex matches the whole of str
 */
public static boolean matches(String str, String regex) {
  return Pattern.matches(regex, str);
}
/**
 * Splits a string on the given delimiter and collects the trimmed pieces into a Set.
 *
 * @param str The String to split; may be null
 * @param delimiter The delimiter, interpreted as a regular expression by String.split()
 * @return The set of trimmed fields, or null if str is null
 */
public static Set<String> stringToSet(String str, String delimiter) {
  Set<String> ret = null;
  if (str != null) {
    String[] fields = str.split(delimiter);
    ret = Generics.newHashSet(fields.length);
    for (String field : fields) {
      ret.add(field.trim());
    }
  }
  return ret;
}
public static String joinWords(Iterable extends HasWord> l, String glue) {
StringBuilder sb = new StringBuilder(l instanceof Collection ? ((Collection) l).size() : 64);
boolean first = true;
for (HasWord o : l) {
if ( ! first) {
sb.append(glue);
} else {
first = false;
}
sb.append(o.word());
}
return sb.toString();
}
public static String join(List extends E> l, String glue, Function toStringFunc, int start, int end) {
StringBuilder sb = new StringBuilder();
boolean first = true;
start = Math.max(start, 0);
end = Math.min(end, l.size());
for (int i = start; i < end; i++) {
if ( ! first) {
sb.append(glue);
} else {
first = false;
}
sb.append(toStringFunc.apply(l.get(i)));
}
return sb.toString();
}
public static String joinWords(List extends HasWord> l, String glue, int start, int end) {
return join(l, glue, in -> in.word(), start, end);
}
/** Default element-to-String conversion: calls {@code toString()} on the (non-null) argument. */
public static final Function<Object, String> DEFAULT_TOSTRING = Object::toString;
public static String joinFields(List extends CoreMap> l, final Class field, final String defaultFieldValue,
String glue, int start, int end, final Function toStringFunc) {
return join(l, glue, new Function() {
public String apply(CoreMap in) {
Object val = in.get(field);
return (val != null)? toStringFunc.apply(val):defaultFieldValue;
}
}, start, end);
}
public static String joinFields(List extends CoreMap> l, final Class field, final String defaultFieldValue,
String glue, int start, int end) {
return joinFields(l, field, defaultFieldValue, glue, start, end, DEFAULT_TOSTRING);
}
public static String joinFields(List extends CoreMap> l, final Class field, final Function toStringFunc) {
return joinFields(l, field, "-", " ", 0, l.size(), toStringFunc);
}
public static String joinFields(List extends CoreMap> l, final Class field) {
return joinFields(l, field, "-", " ", 0, l.size());
}
public static String joinMultipleFields(List extends CoreMap> l, final Class[] fields, final String defaultFieldValue,
final String fieldGlue, String glue, int start, int end, final Function toStringFunc) {
return join(l, glue, new Function() {
public String apply(CoreMap in) {
StringBuilder sb = new StringBuilder();
for (Class field: fields) {
if (sb.length() > 0) {
sb.append(fieldGlue);
}
Object val = in.get(field);
String str = (val != null)? toStringFunc.apply(val):defaultFieldValue;
sb.append(str);
}
return sb.toString();
}
}, start, end);
}
public static String joinMultipleFields(List extends CoreMap> l, final Class[] fields, final Function toStringFunc) {
return joinMultipleFields(l, fields, "-", "/", " ", 0, l.size(), toStringFunc);
}
public static String joinMultipleFields(List extends CoreMap> l, final Class[] fields, final String defaultFieldValue,
final String fieldGlue, String glue, int start, int end) {
return joinMultipleFields(l, fields, defaultFieldValue, fieldGlue, glue, start, end, DEFAULT_TOSTRING);
}
public static String joinMultipleFields(List extends CoreMap> l, final Class[] fields) {
return joinMultipleFields(l, fields, "-", "/", " ", 0, l.size());
}
/**
 * Joins all the tokens together (more or less) according to their original whitespace.
 * It assumes all whitespace was " ": the gap between the end offset of one
 * token and the begin offset of the next is filled with that many spaces
 * (a negative gap is treated as zero).
 *
 * @param tokens list of tokens carrying character offsets and words
 * @return a string of the tokens with the appropriate amount of spacing; "" for an empty list
 */
public static String joinWithOriginalWhiteSpace(List<CoreLabel> tokens) {
  if (tokens.isEmpty()) {
    return "";
  }
  CoreLabel lastToken = tokens.get(0);
  StringBuilder buffer = new StringBuilder(lastToken.word());
  for (int i = 1; i < tokens.size(); i++) {
    CoreLabel currentToken = tokens.get(i);
    // Overlapping or out-of-order offsets would give a negative gap; clamp to zero.
    int numSpaces = Math.max(0, currentToken.beginPosition() - lastToken.endPosition());
    buffer.append(repeat(' ', numSpaces)).append(currentToken.word());
    lastToken = currentToken;
  }
  return buffer.toString();
}
/**
 * Joins each elem in the {@link Iterable} with the given glue.
 * For example, given a list of {@code Integers}, you can create
 * a comma-separated list by calling {@code join(numbers, ", ")}.
 * Elements are appended via {@code StringBuilder.append(Object)}, so a
 * null element appears as "null".
 *
 * @param l The elements to join
 * @param glue The separator placed between consecutive elements
 * @return The joined string (empty if l has no elements)
 * @see StringUtils#join(Stream, String)
 */
public static <X> String join(Iterable<X> l, String glue) {
  StringBuilder sb = new StringBuilder();
  boolean first = true;
  for (X o : l) {
    if ( ! first) {
      sb.append(glue);
    } else {
      first = false;
    }
    sb.append(o);
  }
  return sb.toString();
}
/**
 * Joins each elem in the {@link Stream} with the given glue.
 * For example, given a stream of {@code Integers}, you can create
 * a comma-separated list by calling {@code join(numbers, ", ")}.
 * The stream is consumed through its iterator.
 *
 * @param l The stream of elements to join
 * @param glue The separator placed between consecutive elements
 * @return The joined string (empty if the stream has no elements)
 * @see StringUtils#join(Iterable, String)
 */
public static <X> String join(Stream<X> l, String glue) {
  StringBuilder sb = new StringBuilder();
  boolean first = true;
  Iterator<X> iter = l.iterator();
  while (iter.hasNext()) {
    if ( ! first) {
      sb.append(glue);
    } else {
      first = false;
    }
    sb.append(iter.next());
  }
  return sb.toString();
}
// Omitted; I'm pretty sure this is redundant with the above
// /**
// * Joins each elem in the List with the given glue. For example, given a
// * list
// * of Integers, you can create a comma-separated list by calling
// * join(numbers, ", ") .
// */
// public static String join(List l, String glue) {
// StringBuilder sb = new StringBuilder();
// for (int i = 0, sz = l.size(); i < sz; i++) {
// if (i > 0) {
// sb.append(glue);
// }
// sb.append(l.get(i).toString());
// }
// return sb.toString();
// }
/**
 * Joins each elem in the array with the given glue. For example, given an
 * array of Integers, you can create a comma-separated list by calling
 * {@code join(numbers, ", ")}.
 *
 * @param elements The elements to join
 * @param glue The separator placed between consecutive elements
 * @return The joined string
 */
public static String join(Object[] elements, String glue) {
  List<Object> asList = Arrays.asList(elements);
  return join(asList, glue);
}
/**
 * Joins the {@code toString()} of the array elements in the span [start, end).
 *
 * @param elements The elements to join.
 * @param start The start index to join from.
 * @param end The end (non-inclusive) to join until.
 * @param glue The glue to hold together the elements.
 * @return The string form of the sub-array, joined on the given glue.
 */
public static String join(Object[] elements, int start, int end, String glue) {
  StringBuilder joined = new StringBuilder(127);
  for (int idx = start; idx < end; idx++) {
    // Every element except the first of the span is preceded by the glue.
    if (idx > start) {
      joined.append(glue);
    }
    joined.append(elements[idx].toString());
  }
  return joined.toString();
}
/**
* Joins elems with a space.
*/
public static String join(Iterable> l) {
return join(l, " ");
}
/**
 * Joins the array elements with a single space between them.
 *
 * @param elements The elements to join
 * @return The space-separated string
 */
public static String join(Object[] elements) {
  final String space = " ";
  return join(elements, space);
}
/**
 * Splits on whitespace (\\s+).
 *
 * @param s String to split
 * @return List of split strings
 */
public static List<String> split(String s) {
  return split(s, "\\s+");
}
/**
 * Splits the given string using the given regex as delimiters.
 * This method is the same as the String.split() method (except it returns
 * the results in a List), and is included just to give a call that is
 * parallel to the other static regex methods in this class.
 * The returned list is the fixed-size view produced by Arrays.asList().
 *
 * @param str String to split up
 * @param regex String to compile as the regular expression
 * @return List of Strings resulting from splitting on the regex
 */
public static List<String> split(String str, String regex) {
  return Arrays.asList(str.split(regex));
}
/**
 * Splits a string at every occurrence of one character.
 * Empty pieces are kept: adjacent delimiters, or a delimiter at either end
 * of the input, yield empty strings, and an empty input yields a single
 * empty string.
 *
 * @param input The input to split.
 * @param delimiter The character to split on.
 * @return An array of Strings corresponding to the original input split on the delimiter character.
 */
public static String[] splitOnChar(String input, char delimiter) {
  List<String> pieces = new ArrayList<>();
  int tokenStart = 0;
  int hit = input.indexOf(delimiter, tokenStart);
  while (hit >= 0) {
    pieces.add(input.substring(tokenStart, hit));
    tokenStart = hit + 1;
    hit = input.indexOf(delimiter, tokenStart);
  }
  // Whatever follows the last delimiter (possibly nothing) is the final piece.
  pieces.add(input.substring(tokenStart));
  return pieces.toArray(new String[0]);
}
/**
* Splits a string into whitespace tokenized fields based on a delimiter. For example,
* "aa bb | bb cc | ccc ddd" would be split into "[aa,bb],[bb,cc],[ccc,ddd]" based on
* the delimiter "|". This method uses the old StringTokenizer class, which is up to
* 3x faster than the regex-based "split()" methods.
*
* @param delimiter
* @return
*/
public static List> splitFieldsFast(String str, String delimiter) {
List> fields = Generics.newArrayList();
StringTokenizer tokenizer = new StringTokenizer(str.trim());
List currentField = Generics.newArrayList();
while(tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
if (token.equals(delimiter)) {
fields.add(currentField);
currentField = Generics.newArrayList();
} else {
currentField.add(token.trim());
}
}
if (currentField.size() > 0) {
fields.add(currentField);
}
return fields;
}
/**
 * Split a string into tokens. Because there is a tokenRegex as well as a
 * separatorRegex (unlike for the conventional split), you can do things
 * like correctly split quoted strings or parenthesized arguments.
 * However, it doesn't do the unquoting of quoted Strings for you.
 * An empty String argument is returned at the beginning, if valueRegex
 * accepts the empty String and str begins with separatorRegex.
 * But str can end with either valueRegex or separatorRegex and this does
 * not generate an empty String at the end (indeed, valueRegex need not
 * even accept the empty String in this case). However, if it does accept
 * the empty String and there are multiple trailing separators, then
 * empty values will be returned.
 *
 * @param str The String to split
 * @param valueRegex Must match a token. You may wish to let it match the empty String
 * @param separatorRegex Must match a separator
 * @return The List of tokens
 * @throws IllegalArgumentException if str cannot be tokenized by the two regex,
 *     including when both match only the empty String so that no input is consumed
 */
public static List<String> valueSplit(String str, String valueRegex, String separatorRegex) {
  Pattern vPat = Pattern.compile(valueRegex);
  Pattern sPat = Pattern.compile(separatorRegex);
  List<String> ret = new ArrayList<>();
  while (str.length() > 0) {
    int lengthBefore = str.length();
    Matcher vm = vPat.matcher(str);
    if (vm.lookingAt()) {
      ret.add(vm.group());
      str = str.substring(vm.end());
    } else {
      throw new IllegalArgumentException("valueSplit: " + valueRegex + " doesn't match " + str);
    }
    if (str.length() > 0) {
      Matcher sm = sPat.matcher(str);
      if (sm.lookingAt()) {
        str = str.substring(sm.end());
      } else {
        throw new IllegalArgumentException("valueSplit: " + separatorRegex + " doesn't match " + str);
      }
    }
    // If neither regex consumed a character the loop could never terminate; fail instead.
    if (str.length() == lengthBefore) {
      throw new IllegalArgumentException("valueSplit: neither " + valueRegex + " nor " + separatorRegex
          + " consumes any input of " + str);
    }
  } // end while
  return ret;
}
/**
 * Right-pads a String with spaces until it is at least totalChars long.
 * A String that is already that long (or longer) comes back unchanged;
 * a null String is treated as the text "null".
 *
 * @param str The String to pad; may be null
 * @param totalChars The minimum length of the result
 * @return The padded String
 */
public static String pad(String str, int totalChars) {
  String text = (str == null) ? "null" : str;
  int missing = totalChars - text.length();
  StringBuilder out = new StringBuilder(text);
  while (missing-- > 0) {
    out.append(' ');
  }
  return out.toString();
}
/**
 * Right-pads the toString value of the given Object with spaces.
 *
 * @param obj The Object whose String form is padded
 * @param totalChars The minimum length of the result
 * @return The padded String
 */
public static String pad(Object obj, int totalChars) {
  String text = obj.toString();
  return pad(text, totalChars);
}
/**
 * Produces a String of exactly the requested length: shorter input is
 * right-padded with spaces, longer input loses its tail. A null String is
 * treated as the text "null".
 *
 * @param str The String to be padded or truncated
 * @param num The desired length
 * @return The String padded or cut to num characters
 */
public static String padOrTrim(String str, int num) {
  String text = (str == null) ? "null" : str;
  int len = text.length();
  if (len == num) {
    return text;
  }
  if (len > num) {
    return text.substring(0, num);
  }
  StringBuilder out = new StringBuilder(text);
  for (int added = len; added < num; added++) {
    out.append(' ');
  }
  return out.toString();
}
/**
 * Produces a String of exactly the requested length, working from the
 * right: shorter input is left-padded with spaces, longer input keeps only
 * its last num characters. A null String is treated as the text "null".
 *
 * @param str The String to be padded or truncated
 * @param num The desired length
 * @return The String left-padded or cut to its last num characters
 */
public static String padLeftOrTrim(String str, int num) {
  String text = (str == null) ? "null" : str;
  int len = text.length();
  if (len == num) {
    return text;
  }
  if (len > num) {
    return text.substring(text.length() - num);
  }
  StringBuilder out = new StringBuilder();
  for (int added = len; added < num; added++) {
    out.append(' ');
  }
  return out.append(text).toString();
}
/**
 * Pads or trims the toString value of the given Object to an exact length.
 *
 * @param obj The Object whose String form is padded or truncated
 * @param totalChars The desired length
 * @return The String form padded or cut to totalChars characters
 */
public static String padOrTrim(Object obj, int totalChars) {
  String text = obj.toString();
  return padOrTrim(text, totalChars);
}
/**
 * Left-pads a String with the given character until it is at least
 * totalChars long. Longer input is returned unchanged; a null String is
 * treated as the text "null".
 *
 * @param str The String to pad; may be null
 * @param totalChars The minimum length of the result
 * @param ch The padding character
 * @return The padded String
 */
public static String padLeft(String str, int totalChars, char ch) {
  String text = (str == null) ? "null" : str;
  int missing = totalChars - text.length();
  StringBuilder out = new StringBuilder();
  while (missing-- > 0) {
    out.append(ch);
  }
  return out.append(text).toString();
}
/**
 * Left-pads a String with spaces until it is at least totalChars long.
 *
 * @param str The String to pad
 * @param totalChars The minimum length of the result
 * @return The padded String
 */
public static String padLeft(String str, int totalChars) {
  final char space = ' ';
  return padLeft(str, totalChars, space);
}
/**
 * Left-pads the toString value of the given Object with spaces.
 *
 * @param obj The Object whose String form is padded
 * @param totalChars The minimum length of the result
 * @return The padded String
 */
public static String padLeft(Object obj, int totalChars) {
  String text = obj.toString();
  return padLeft(text, totalChars);
}
/**
 * Left-pads the decimal form of an int with spaces.
 *
 * @param i The number to pad
 * @param totalChars The minimum length of the result
 * @return The padded String
 */
public static String padLeft(int i, int totalChars) {
  Integer boxed = i;
  return padLeft(boxed, totalChars);
}
/**
 * Left-pads the String form of a double with spaces.
 *
 * @param d The number to pad
 * @param totalChars The minimum length of the result
 * @return The padded String
 */
public static String padLeft(double d, int totalChars) {
  // Double.valueOf replaces the deprecated Double constructor; the boxed value is the same.
  return padLeft(Double.valueOf(d), totalChars);
}
/**
 * Cuts a String down to at most maxWidth characters by dropping its tail;
 * a String that already fits is returned as is.
 *
 * @param s The String to shorten
 * @param maxWidth The maximum length of the result
 * @return s itself, or its first maxWidth characters
 */
public static String trim(String s, int maxWidth) {
  return (s.length() > maxWidth) ? s.substring(0, maxWidth) : s;
}
/**
 * Cuts the toString value of the given Object down to at most maxWidth characters.
 *
 * @param obj The Object whose String form is shortened
 * @param maxWidth The maximum length of the result
 * @return The String form, truncated on the right if necessary
 */
public static String trim(Object obj, int maxWidth) {
  String text = obj.toString();
  return trim(text, maxWidth);
}
/**
 * Concatenates the given String with itself.
 *
 * @param s The String to repeat
 * @param times How many copies to concatenate
 * @return s repeated times times; "" when times is 0
 */
public static String repeat(String s, int times) {
  if (times == 0) {
    return "";
  }
  StringBuilder out = new StringBuilder(times * s.length());
  int remaining = times;
  while (remaining-- > 0) {
    out.append(s);
  }
  return out.toString();
}
/**
 * Builds a String consisting of one character repeated.
 *
 * @param ch The character to repeat
 * @param times How many copies to produce
 * @return A String of times copies of ch; "" when times is 0
 */
public static String repeat(char ch, int times) {
  if (times == 0) {
    return "";
  }
  StringBuilder out = new StringBuilder(times);
  int remaining = times;
  while (remaining-- > 0) {
    out.append(ch);
  }
  return out.toString();
}
/**
 * Returns a "clean" version of the given filename built only from ASCII
 * letters, digits and underscores: those characters are kept, spaces and
 * hyphens become underscores, and every other character is replaced by its
 * decimal char code wrapped in 'x' (e.g. '.' becomes "x46x").
 *
 * @param s The filename to clean
 * @return The cleaned filename
 */
public static String fileNameClean(String s) {
  StringBuilder cleaned = new StringBuilder();
  for (int i = 0; i < s.length(); i++) {
    char c = s.charAt(i);
    boolean upper = c >= 'A' && c <= 'Z';
    boolean lower = c >= 'a' && c <= 'z';
    boolean digit = c >= '0' && c <= '9';
    if (upper || lower || digit || c == '_') {
      cleaned.append(c);
    } else if (c == ' ' || c == '-') {
      cleaned.append('_');
    } else {
      cleaned.append('x').append((int) c).append('x');
    }
  }
  return cleaned.toString();
}
/**
 * Returns the index of the n-th occurrence of ch in s (counting from 1), or -1
 * if there are fewer than n occurrences of ch.
 * An occurrence at index 0 is counted. For n &lt;= 0 the historical
 * result of 0 is kept.
 *
 * @param s The String to search
 * @param ch The character to look for
 * @param n Which occurrence to locate, 1 for the first
 * @return The index of the n-th occurrence, or -1 if there is none
 */
public static int nthIndex(String s, char ch, int n) {
  if (n <= 0) {
    // No occurrence is requested; preserve the value this method has always returned.
    return 0;
  }
  // Start just before the string so that the first search begins at index 0.
  int index = -1;
  for (int i = 0; i < n; i++) {
    index = s.indexOf(ch, index + 1);
    if (index == -1) {
      return -1;
    }
  }
  return index;
}
/**
 * Extracts a run of decimal digits from a number as a String.
 * Digits are numbered from the right starting at 1 (the ones digit); the
 * result covers digits smallestDigit through biggestDigit inclusive, most
 * significant first, with leading zeros where the number has no such digit.
 *
 * @param n The number to take digits from
 * @param smallestDigit Position of the least significant digit to keep (1 = ones)
 * @param biggestDigit Position of the most significant digit to keep
 * @return The selected digits as a String
 */
public static String truncate(int n, int smallestDigit, int biggestDigit) {
  int width = biggestDigit - smallestDigit + 1;
  char[] digits = new char[width];
  int shifted = n;
  // Drop the digits below smallestDigit.
  for (int skip = smallestDigit - 1; skip > 0; skip--) {
    shifted /= 10;
  }
  // Fill from the right so the most significant digit ends up first.
  int pos = width;
  while (pos-- > 0) {
    digits[pos] = Character.forDigit(shifted % 10, 10);
    shifted /= 10;
  }
  return new String(digits);
}
/**
 * Parses command line arguments into a Map. Arguments of the form
 * <p>
 * {@code -flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n}
 * <p>
 * will be parsed so that the flag is a key in the Map (including
 * the hyphen) and its value will be a {@link String}[] containing
 * the optional arguments (if present). The non-flag values not
 * captured as flag arguments are collected into a String[] array
 * and returned as the value of {@code null} in the Map.
 * This overload delegates to {@link #argsToMap(String[], Map)} with an
 * empty flag-arity map.
 *
 * @param args A command-line arguments array
 * @return a {@link Map} of flag names to flag argument {@link
 *         String} arrays.
 */
public static Map<String, String[]> argsToMap(String[] args) {
  return argsToMap(args, Collections.<String, Integer>emptyMap());
}
/**
 * Parses command line arguments into a Map. Arguments of the form
 * <p>
 * {@code -flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n}
 * <p>
 * will be parsed so that the flag is a key in the Map (including
 * the hyphen) and its value will be a {@link String}[] containing
 * the optional arguments (if present). The non-flag values not
 * captured as flag arguments are collected into a String[] array
 * and returned as the value of {@code null} in the Map. In
 * this invocation, the number of arguments for each flag
 * can be specified as an {@link Integer} value of the appropriate
 * flag key in the {@code flagsToNumArgs} {@link Map} argument.
 * A flag without an entry takes at most one argument, and only if the
 * next token is empty or does not start with '-'.
 * <p>
 * Example of usage:
 * <pre>{@code
 * Map<String, Integer> flagsToNumArgs = new HashMap<>();
 * flagsToNumArgs.put("-x", 2);
 * flagsToNumArgs.put("-d", 1);
 * Map<String, String[]> result = argsToMap(args, flagsToNumArgs);
 * }</pre>
 * If a given flag appears more than once, the extra args are appended to
 * the String[] value for that flag. An empty-string argument is treated as
 * a non-flag value.
 *
 * @param args the argument array to be parsed
 * @param flagsToNumArgs a {@link Map} of flag names to {@link Integer}
 *                       values specifying the number of arguments
 *                       for that flag (default min 0, max 1).
 * @return a {@link Map} of flag names to flag argument {@link String} arrays
 */
public static Map<String, String[]> argsToMap(String[] args, Map<String, Integer> flagsToNumArgs) {
  Map<String, String[]> result = Generics.newHashMap();
  List<String> remainingArgs = new ArrayList<>();
  for (int i = 0; i < args.length; i++) {
    String key = args[i];
    if (!key.isEmpty() && key.charAt(0) == '-') { // found a flag
      Integer numFlagArgs = flagsToNumArgs.get(key);
      int max = numFlagArgs == null ? 1 : numFlagArgs;
      int min = numFlagArgs == null ? 0 : numFlagArgs;
      List<String> flagArgs = new ArrayList<>();
      // Consume up to max following tokens; tokens beyond min are taken only if they do not look like flags.
      for (int j = 0; j < max && i + 1 < args.length
          && (j < min || args[i + 1].isEmpty() || args[i + 1].charAt(0) != '-'); i++, j++) {
        flagArgs.add(args[i + 1]);
      }
      String[] previous = result.get(key);
      if (previous != null) { // append the second specification into the args.
        // Size by what was actually consumed, not by the declared arity (which may be absent).
        String[] merged = Arrays.copyOf(previous, previous.length + flagArgs.size());
        for (int j = 0; j < flagArgs.size(); j++) {
          merged[previous.length + j] = flagArgs.get(j);
        }
        result.put(key, merged);
      } else {
        result.put(key, flagArgs.toArray(new String[0]));
      }
    } else {
      remainingArgs.add(args[i]);
    }
  }
  result.put(null, remainingArgs.toArray(new String[0]));
  return result;
}
/**
 * Converts command line arguments to Properties where every flag takes
 * zero or one argument: a flag gets an argument when it is followed by
 * something that does not begin with '-'. See
 * {@link #argsToProperties(String[], Map)} for full documentation.
 *
 * @param args Command line arguments
 * @return A Properties object representing the arguments.
 */
public static Properties argsToProperties(String... args) {
  Map<String, Integer> noDeclaredArities = Collections.emptyMap();
  return argsToProperties(args, noDeclaredArities);
}
/**
 * Analogous to {@link #argsToMap}. However, there are several key differences between this method and {@link #argsToMap}:
 * <ul>
 * <li> Hyphens are stripped from flag names (one or two leading hyphens) </li>
 * <li> Since Properties objects are String to String mappings, the default number of arguments to a flag is
 * assumed to be 1 and not 0. </li>
 * <li> Furthermore, the list of arguments not bound to a flag is mapped to the "" property, not null </li>
 * <li> The special flags "-prop", "-props", "-properties", "-args", or "-arguments" will load the property file
 * specified by its argument. </li>
 * <li> The value for flags without arguments is set to "true" </li>
 * <li> If a flag has multiple arguments, the value of the property is all
 * of the arguments joined together with a space (" ") character between
 * them. </li>
 * <li> The value strings are trimmed so trailing spaces do not stop you from loading a file </li>
 * </ul>
 *
 * @param args Command line arguments
 * @param flagsToNumArgs Map of how many arguments flags should have. The keys are without the minus signs.
 * @return A Properties object representing the arguments.
 * @throws RuntimeIOException if a properties file named by one of the special flags cannot be read
 */
public static Properties argsToProperties(String[] args, Map<String, Integer> flagsToNumArgs) {
  Properties result = new Properties();
  List<String> remainingArgs = new ArrayList<>();
  for (int i = 0; i < args.length; i++) {
    String key = args[i];
    if (key.length() > 0 && key.charAt(0) == '-') { // found a flag
      if (key.length() > 1 && key.charAt(1) == '-') {
        key = key.substring(2); // strip off 2 hyphens
      } else {
        key = key.substring(1); // strip off the hyphen
      }
      Integer maxFlagArgs = flagsToNumArgs.get(key);
      int max = maxFlagArgs == null ? 1 : maxFlagArgs;
      int min = maxFlagArgs == null ? 0 : maxFlagArgs;
      if (maxFlagArgs != null && maxFlagArgs == 0 && i < args.length - 1 &&
          ("true".equalsIgnoreCase(args[i + 1]) || "false".equalsIgnoreCase(args[i + 1]))) {
        max = 1; // case: we're reading a boolean flag. TODO(gabor) there's gotta be a better way...
      }
      List<String> flagArgs = new ArrayList<>();
      // cdm oct 2007: add length check to allow for empty string argument!
      for (int j = 0; j < max && i + 1 < args.length
          && (j < min || args[i + 1].isEmpty() || args[i + 1].charAt(0) != '-'); i++, j++) {
        flagArgs.add(args[i + 1]);
      }
      if (flagArgs.isEmpty()) {
        result.setProperty(key, "true");
      } else {
        result.setProperty(key, join(flagArgs, " "));
        if (key.equalsIgnoreCase(PROP) || key.equalsIgnoreCase(PROPS) || key.equalsIgnoreCase(PROPERTIES)
            || key.equalsIgnoreCase(ARGUMENTS) || key.equalsIgnoreCase(ARGS)) {
          // Remember the file name now: the property holding it is removed before loading.
          final String propsFile = result.getProperty(key);
          try (BufferedReader reader = IOUtils.readerFromString(propsFile)) {
            // location of this line is critical: presumably the flag must be gone before load()
            // so that a same-named key defined inside the file is not deleted afterwards.
            result.remove(key);
            result.load(reader);
            // trim all values
            for (String propKey : result.stringPropertyNames()) {
              String newVal = result.getProperty(propKey);
              result.setProperty(propKey, newVal.trim());
            }
          } catch (IOException e) {
            result.remove(key);
            throw new RuntimeIOException("argsToProperties could not read properties file: " + propsFile, e);
          }
        }
      }
    } else {
      remainingArgs.add(args[i]);
    }
  }
  if (!remainingArgs.isEmpty()) {
    result.setProperty("", join(remainingArgs, " "));
  }
  // A "prop" entry still present here (e.g. defined inside a loaded file) names a further
  // properties file; its values are added without overriding what is already set.
  if (result.containsKey(PROP)) {
    String file = result.getProperty(PROP);
    result.remove(PROP);
    Properties toAdd = argsToProperties("-prop", file);
    for (String key : toAdd.stringPropertyNames()) {
      String val = toAdd.getProperty(key);
      if ( ! result.containsKey(key)) {
        result.setProperty(key, val);
      }
    }
  }
  return result;
}
/**
 * This method reads in properties listed in a file in the format prop=value, one property per line.
 * Although {@code Properties.load(InputStream)} exists, this method additionally trims the values,
 * something not done by {@code load()}.
 *
 * @param filename A properties file to read
 * @return The corresponding Properties object
 * @throws RuntimeIOException if the file cannot be opened or read
 */
public static Properties propFileToProperties(String filename) {
  Properties result = new Properties();
  // try-with-resources closes the stream even when load() fails.
  try (InputStream is = new BufferedInputStream(new FileInputStream(filename))) {
    result.load(is);
    // trim all values
    for (String propKey : result.stringPropertyNames()) {
      String newVal = result.getProperty(propKey);
      result.setProperty(propKey, newVal.trim());
    }
    return result;
  } catch (IOException e) {
    throw new RuntimeIOException("propFileToProperties could not read properties file: " + filename, e);
  }
}
/**
 * Converts a comma-separated String (with whitespace optionally allowed
 * after the comma) into a new Properties object. Each item is
 * "property=value"; an item without an explicit value is set to "true".
 * This can be used for a 2nd level of properties, for example, when you
 * have a commandline argument like "-outputOptions style=xml,tags".
 *
 * @param str The comma-separated property specification
 * @return A new Properties object holding the parsed properties
 */
public static Properties stringToProperties(String str) {
  return stringToProperties(str, new Properties());
}
/**
 * Updates a Properties object from a comma-separated String (with
 * whitespace optionally allowed after the comma). Each item is
 * "property=value", split at the first '='; an item without '=' is stored
 * with the value "true". Keys and values are trimmed.
 *
 * @param str The comma-separated property specification
 * @param props The Properties object to update
 * @return The same props object, after the update
 */
public static Properties stringToProperties(String str, Properties props) {
  for (String item : str.trim().split(",\\s*")) {
    int eq = item.indexOf('=');
    if (eq < 0) {
      props.setProperty(item.trim(), "true");
    } else {
      String name = item.substring(0, eq).trim();
      String setting = item.substring(eq + 1).trim();
      props.setProperty(name, setting);
    }
  }
  return props;
}
/**
 * Checks that every required property is present.
 *
 * @param props The Properties to inspect
 * @param requiredProps The names of the properties that must be present
 * @return The name of the first required property that is missing, or null if none is missing
 */
public static String checkRequiredProperties(Properties props,
    String ... requiredProps) {
  for (int k = 0; k < requiredProps.length; k++) {
    String name = requiredProps[k];
    if (props.getProperty(name) == null) {
      return name;
    }
  }
  return null;
}
/**
 * Prints a message to a file. If the file already exists, the message is
 * appended when {@code append} is true and the file is overwritten when
 * {@code append} is false. Failures are reported on System.err and are not
 * propagated to the caller.
 *
 * @param file The file to write to
 * @param message The text to write
 * @param append Whether to append to an existing file instead of overwriting it
 * @param printLn Whether to terminate the message with a line separator
 * @param encoding The character encoding name, or null for the platform default
 */
public static void printToFile(File file, String message, boolean append,
                               boolean printLn, String encoding) {
  // Opening the stream as its own resource guarantees it is closed even when
  // the encoding name is rejected while the writer is being constructed.
  try (OutputStream os = new FileOutputStream(file, append);
       Writer fw = (encoding != null) ? new OutputStreamWriter(os, encoding) : new OutputStreamWriter(os);
       PrintWriter pw = new PrintWriter(fw)) {
    if (printLn) {
      pw.println(message);
    } else {
      pw.print(message);
    }
  } catch (Exception e) {
    // deliberate best effort: report and carry on
    System.err.println("Exception: in printToFile " + file.getAbsolutePath());
    e.printStackTrace();
  }
}
/**
 * Prints a message followed by a line separator to a file. If the file
 * already exists, the line is appended when {@code append} is true and the
 * file is overwritten when {@code append} is false. Failures are reported on
 * System.err and are not propagated.
 *
 * @param file The file to write to
 * @param message The text to write
 * @param append Whether to append to an existing file instead of overwriting it
 */
public static void printToFileLn(File file, String message, boolean append) {
  try (PrintWriter out = new PrintWriter(new FileWriter(file, append))) {
    out.println(message);
  } catch (Exception e) {
    System.err.println("Exception: in printToFileLn " + file.getAbsolutePath() + ' ' + message);
    e.printStackTrace();
  }
}
/**
 * Prints a message to a file without adding a line separator. If the file
 * already exists, the message is appended when {@code append} is true and the
 * file is overwritten when {@code append} is false. Failures are reported on
 * System.err and are not propagated.
 *
 * @param file The file to write to
 * @param message The text to write
 * @param append Whether to append to an existing file instead of overwriting it
 */
public static void printToFile(File file, String message, boolean append) {
  try (PrintWriter out = new PrintWriter(new FileWriter(file, append))) {
    out.print(message);
  } catch (Exception e) {
    System.err.println("Exception: in printToFile " + file.getAbsolutePath());
    e.printStackTrace();
  }
}
/**
 * Prints a message to a file, replacing any existing content (never appends).
 *
 * @param file The file to write to
 * @param message The text to write
 */
public static void printToFile(File file, String message) {
  final boolean append = false;
  printToFile(file, message, append);
}
/**
 * Prints a message to the named file. If the file already exists, the message
 * is appended when {@code append} is true and the file is overwritten when
 * {@code append} is false.
 *
 * @param filename Path of the file to write to
 * @param message The text to write
 * @param append Whether to append to an existing file instead of overwriting it
 */
public static void printToFile(String filename, String message, boolean append) {
  File target = new File(filename);
  printToFile(target, message, append);
}
/**
 * Prints a message followed by a line separator to the named file. If the
 * file already exists, the line is appended when {@code append} is true and
 * the file is overwritten when {@code append} is false.
 *
 * @param filename Path of the file to write to
 * @param message The text to write
 * @param append Whether to append to an existing file instead of overwriting it
 */
public static void printToFileLn(String filename, String message, boolean append) {
  File target = new File(filename);
  printToFileLn(target, message, append);
}
/**
 * Prints a message to the named file, replacing any existing content
 * (never appends).
 *
 * @param filename Path of the file to write to
 * @param message The text to write
 */
public static void printToFile(String filename, String message) {
  File target = new File(filename);
  printToFile(target, message, false);
}
/**
 * A simple form of command line argument parsing.
 * Arguments of the form
 * {@code -flag1 arg1 -flag2 -flag3 arg3}
 * are turned into a Map in which each flag (including its hyphen) is a key
 * and the optional argument following it is the value, or null when the flag
 * has no argument. Values are always kept as Strings.
 *
 * @param args The command line arguments
 * @return A Map from flags to their values (String or null)
 */
@SuppressWarnings("unchecked")
public static Map parseCommandLineArguments(String[] args) {
  final boolean parseNumbers = false;
  Map parsed = (Map) parseCommandLineArguments(args, parseNumbers);
  return parsed;
}
/**
 * A simple form of command line argument parsing.
 * Arguments of the form
 * {@code -flag1 arg1 -flag2 -flag3 arg3}
 * are turned into a Map in which each flag (including its hyphen) is a key
 * and the optional argument following it is the value, or null when the flag
 * has no argument. Arguments that do not start with a hyphen and do not
 * follow a flag are ignored. An empty-string argument is accepted as a value.
 * Note that a value starting with a hyphen (for example a negative number)
 * is treated as the next flag.
 *
 * @param args The command line arguments
 * @param parseNumbers If true, values that parse as a number are stored as
 *     Double instead of String
 * @return A Map from flags to their values (String, Double or null)
 */
public static Map parseCommandLineArguments(String[] args, boolean parseNumbers) {
  Map<String, Object> result = Generics.newHashMap();
  for (int i = 0; i < args.length; i++) {
    String key = args[i];
    // isEmpty() guard: charAt(0) would throw on an empty-string argument
    if (key.isEmpty() || key.charAt(0) != '-') {
      continue; // not a flag
    }
    Object value = null;
    if (i + 1 < args.length) {
      String next = args[i + 1];
      if (next.isEmpty() || next.charAt(0) != '-') {
        value = next;
        if (parseNumbers) {
          try {
            value = Double.parseDouble(next);
          } catch (NumberFormatException ignored) {
            // not numeric: keep the String form
          }
        }
        i++; // the value has been consumed
      }
    }
    result.put(key, value);
  }
  return result;
}
/**
 * Removes every character that is not an ASCII letter or ASCII digit.
 *
 * @param orig The String to filter
 * @return The String with only the characters a-z, A-Z and 0-9 retained
 */
public static String stripNonAlphaNumerics(String orig) {
  StringBuilder kept = new StringBuilder();
  for (char ch : orig.toCharArray()) {
    boolean lower = 'a' <= ch && ch <= 'z';
    boolean upper = 'A' <= ch && ch <= 'Z';
    boolean digit = '0' <= ch && ch <= '9';
    if (lower || upper || digit) {
      kept.append(ch);
    }
  }
  return kept.toString();
}
/**
 * Removes everything that looks like an SGML/XML tag, that is, each shortest
 * run of text from a '<' to the next '>' (tags may span lines).
 *
 * @param orig The String to strip tags from
 * @return The String with all such tags deleted
 */
public static String stripSGML(String orig) {
  return Pattern.compile("<.*?>", Pattern.DOTALL).matcher(orig).replaceAll("");
}
/**
 * Prints each char of the String on its own line of System.out, as the
 * numeric char value followed by the char itself in single quotes.
 *
 * @param s The String to print
 */
public static void printStringOneCharPerLine(String s) {
  for (char ch : s.toCharArray()) {
    int code = ch;
    System.out.println(code + " \'" + ch + "\' ");
  }
}
/**
 * Escapes characters in a String by putting the escape character in front of
 * them. The escape character itself is always escaped, as is every character
 * listed in {@code charsToEscape}.
 *
 * @param s The String to escape
 * @param charsToEscape The characters that need an escape in front of them
 * @param escapeChar The character to insert as the escape
 * @return The escaped String
 */
public static String escapeString(String s, char[] charsToEscape, char escapeChar) {
  StringBuilder escaped = new StringBuilder();
  for (int pos = 0; pos < s.length(); pos++) {
    char ch = s.charAt(pos);
    boolean needsEscape = (ch == escapeChar);
    for (int k = 0; !needsEscape && k < charsToEscape.length; k++) {
      needsEscape = (ch == charsToEscape[k]);
    }
    if (needsEscape) {
      escaped.append(escapeChar);
    }
    escaped.append(ch);
  }
  return escaped.toString();
}
/**
 * Splits the String s into multiple Strings using the splitChar, with a
 * quoting facility: a field (or part of one) may be wrapped in the quoteChar,
 * in which case splitChars inside it do not split. If the quoteChar occurs
 * within a quoted expression, it must be prefaced by the escapeChar.
 * Empty fields between two splitChars are kept; a trailing empty field after
 * a final splitChar is not produced.
 * This routine can be useful for processing a line of a CSV file.
 *
 * @param s The String to split into fields. Cannot be null.
 * @param splitChar The character to split on
 * @param quoteChar The character to quote items with
 * @param escapeChar The character to escape the quoteChar with
 * @return An array of Strings that s is split into
 */
public static String[] splitOnCharWithQuoting(String s, char splitChar, char quoteChar, char escapeChar) {
  List<String> result = new ArrayList<>();
  int i = 0;
  int length = s.length();
  StringBuilder b = new StringBuilder();
  while (i < length) {
    char curr = s.charAt(i);
    if (curr == splitChar) {
      // end of a field; cdm 2014: emit it even if the field is empty
      result.add(b.toString());
      b.setLength(0);
      i++;
    } else if (curr == quoteChar) {
      // copy everything up to the matching (unescaped) quoteChar
      i++;
      while (i < length) {
        curr = s.charAt(i);
        // the escapeChar is only special when directly followed by a quoteChar
        if ((curr == escapeChar) && (i + 1 < length) && (s.charAt(i + 1) == quoteChar)) {
          b.append(s.charAt(i + 1));
          i += 2;
        } else if (curr == quoteChar) {
          i++;
          break; // closing quote
        } else {
          b.append(curr);
          i++;
        }
      }
    } else {
      b.append(curr);
      i++;
    }
  }
  // RFC 4180 disallows final comma. At any rate, don't produce a field after it unless non-empty
  if (b.length() > 0) {
    result.add(b.toString());
  }
  return result.toArray(new String[0]);
}
/**
 * Computes the length of the longest common subsequence of s and t.
 * Despite the method name, the characters need not be contiguous: the result
 * is the longest run of characters that appear in the same order inside both
 * s and t, where both may have other extraneous characters along the way.
 * This is like edit distance but with no substitution, and a higher number
 * means more similar. For example, the result for "abcD" and "aXbc" is 3
 * (abc). For contiguous matching see
 * {@link #longestCommonContiguousSubstring(String, String)}.
 *
 * @param s The first String
 * @param t The second String
 * @return The length of the longest common subsequence (0 if either is empty)
 */
public static int longestCommonSubstring(String s, String t) {
  int n = s.length();
  int m = t.length();
  if (n == 0 || m == 0) {
    return 0;
  }
  // d[i][j] = result for the first i chars of s and the first j chars of t;
  // row 0 and column 0 (empty prefixes) stay at Java's default of 0
  int[][] d = new int[n + 1][m + 1];
  for (int i = 1; i <= n; i++) {
    char s_i = s.charAt(i - 1); // ith character of s
    for (int j = 1; j <= m; j++) {
      char t_j = t.charAt(j - 1); // jth character of t
      // js: if the chars match, you can get an extra point
      // otherwise you have to skip an insertion or deletion (no subs)
      int skipOne = Math.max(d[i - 1][j], d[i][j - 1]);
      if (s_i == t_j) {
        d[i][j] = Math.max(skipOne, d[i - 1][j - 1] + 1);
      } else {
        d[i][j] = skipOne;
      }
    }
  }
  return d[n][m];
}
/**
 * Computes the length of the longest common contiguous substring of s and t,
 * that is, the longest run of characters that appear consecutively in both.
 * For instance, the result for "color" and "colour" is 4, because of "colo".
 *
 * @param s The first String
 * @param t The second String
 * @return The length of the longest shared run (0 if either String is empty)
 */
public static int longestCommonContiguousSubstring(String s, String t) {
  if (s.isEmpty() || t.isEmpty()) {
    return 0;
  }
  final int rows = s.length();
  final int cols = t.length();
  // run[r][c] = length of the common run ending at s[r-1] and t[c-1];
  // the zero row/column come from Java's default array initialization
  final int[][] run = new int[rows + 1][cols + 1];
  int best = 0;
  for (int r = 1; r <= rows; r++) {
    final char fromS = s.charAt(r - 1);
    for (int c = 1; c <= cols; c++) {
      run[r][c] = (fromS == t.charAt(c - 1)) ? run[r - 1][c - 1] + 1 : 0;
      best = Math.max(best, run[r][c]);
    }
  }
  return best;
}
/**
 * Computes the Levenshtein (edit) distance of the two given Strings.
 * Transposition is not an operation here, so one character transposed between
 * the two strings costs 2 (one insertion, one deletion).
 * The EditDistance class also implements the Levenshtein distance, but does allow transposition.
 *
 * @param s The first String
 * @param t The second String
 * @return The number of single-character insertions, deletions and
 *     substitutions needed to turn s into t
 */
public static int editDistance(String s, String t) {
  final int rows = s.length();
  final int cols = t.length();
  // against an empty String the distance is the other String's length
  if (rows == 0) {
    return cols;
  }
  if (cols == 0) {
    return rows;
  }
  final int[][] dist = new int[rows + 1][cols + 1];
  // turning a prefix into the empty String costs one deletion per character
  for (int r = 0; r <= rows; r++) {
    dist[r][0] = r;
  }
  for (int c = 0; c <= cols; c++) {
    dist[0][c] = c;
  }
  for (int r = 1; r <= rows; r++) {
    final char fromS = s.charAt(r - 1);
    for (int c = 1; c <= cols; c++) {
      final int substitution = (fromS == t.charAt(c - 1)) ? 0 : 1;
      dist[r][c] = SloppyMath.min(dist[r - 1][c] + 1, dist[r][c - 1] + 1, dist[r - 1][c - 1] + substitution);
    }
  }
  return dist[rows][cols];
}
/**
 * Maps a Penn Treebank POS tag to the corresponding WordNet 2.0 POS name.
 *
 * @param s a Penn TreeBank POS tag.
 * @return "noun", "verb", "adjective" or "adverb", or null if the tag does
 *     not belong to one of those categories
 */
public static String pennPOSToWordnetPOS(String s) {
  switch (s) {
    case "NN":
    case "NNP":
    case "NNS":
    case "NNPS":
      return "noun";
    case "VB":
    case "VBD":
    case "VBG":
    case "VBN":
    case "VBZ":
    case "VBP":
    case "MD":
      return "verb";
    case "JJ":
    case "JJR":
    case "JJS":
    case "CD":
      return "adjective";
    case "RB":
    case "RBR":
    case "RBS":
    case "RP":
    case "WRB":
      return "adverb";
    default:
      return null;
  }
}
/**
 * Returns a short class name for an object: the name reported by
 * {@code getClass().getName()} with everything up to the last '.' removed.
 *
 * @param o The object whose class is named; may be null
 * @return The class name minus its package name, for example
 *     ArrayList, or the String "null" if o is null
 */
public static String getShortClassName(Object o) {
  if (o == null) {
    return "null";
  }
  String qualified = o.getClass().getName();
  // lastIndexOf gives -1 when there is no package, so +1 keeps the whole name
  return qualified.substring(qualified.lastIndexOf('.') + 1);
}
/**
 * Converts a delimited string into an object with the given fields.
 * Compiles the delimiter regular expression and delegates to the
 * Pattern-based overload, which sets each field directly or through a
 * setXxx method.
 *
 * @param objClass Class of object to be created
 * @param str string to convert
 * @param delimiterRegex delimiter regular expression
 * @param fieldNames fieldnames
 * @param <T> type to return
 * @return Object created from string
 */
public static <T> T columnStringToObject(Class<?> objClass, String str, String delimiterRegex, String[] fieldNames)
        throws InstantiationException, IllegalAccessException, NoSuchFieldException, NoSuchMethodException, InvocationTargetException
{
  Pattern delimiterPattern = Pattern.compile(delimiterRegex);
  return StringUtils.columnStringToObject(objClass, str, delimiterPattern, fieldNames);
}
/**
* Converts a tab delimited string into an object with given fields
* Requires the object has public access for the specified fields
*
* @param objClass Class of object to be created
* @param str string to convert
* @param delimiterPattern delimiter
* @param fieldNames fieldnames
* @param type to return
* @return Object created from string
*/
public static T columnStringToObject(Class> objClass, String str, Pattern delimiterPattern, String[] fieldNames)
throws InstantiationException, IllegalAccessException, NoSuchMethodException, NoSuchFieldException, InvocationTargetException
{
String[] fields = delimiterPattern.split(str);
T item = ErasureUtils.uncheckedCast(objClass.newInstance());
for (int i = 0; i < fields.length; i++) {
try {
Field field = objClass.getDeclaredField(fieldNames[i]);
field.set(item, fields[i]);
} catch (IllegalAccessException ex) {
Method method = objClass.getDeclaredMethod("set" + StringUtils.capitalize(fieldNames[i]), String.class);
method.invoke(item, fields[i]);
}
}
return item;
}
/**
 * Converts an object into a delimited string with the given fields.
 * Each field is read directly when it is accessible; otherwise a declared
 * {@code getXxx()} method is invoked instead. Exactly one delimiter is
 * written between consecutive columns, including when a column is empty.
 *
 * @param object Object to convert
 * @param delimiter delimiter
 * @param fieldNames fieldnames
 * @return String representing object
 */
public static String objectToColumnString(Object object, String delimiter, String[] fieldNames)
        throws IllegalAccessException, NoSuchFieldException, NoSuchMethodException, InvocationTargetException
{
  StringBuilder sb = new StringBuilder();
  // Track the position explicitly: testing sb.length() would drop the
  // delimiter after leading columns whose value renders as the empty string.
  boolean first = true;
  for (String fieldName : fieldNames) {
    if (!first) {
      sb.append(delimiter);
    }
    first = false;
    try {
      Field field = object.getClass().getDeclaredField(fieldName);
      sb.append(field.get(object));
    } catch (IllegalAccessException ex) {
      // field is not accessible from here: fall back to its getter
      Method method = object.getClass().getDeclaredMethod("get" + StringUtils.capitalize(fieldName));
      sb.append(method.invoke(object));
    }
  }
  return sb.toString();
}
/**
 * Uppercases the first character of a string.
 *
 * @param s a string to capitalize
 * @return a capitalized version of the string; the string itself if it is
 *     empty or does not start with a lowercase character
 */
public static String capitalize(String s) {
  // the isEmpty() guard avoids StringIndexOutOfBoundsException on ""
  if (!s.isEmpty() && Character.isLowerCase(s.charAt(0))) {
    return Character.toUpperCase(s.charAt(0)) + s.substring(1);
  } else {
    return s;
  }
}
/**
 * Check if a string begins with an uppercase character.
 *
 * @param s a string
 * @return true if the string is capitalized;
 *     false otherwise, including for the empty string
 */
public static boolean isCapitalized(String s) {
  // the isEmpty() guard avoids StringIndexOutOfBoundsException on ""
  return !s.isEmpty() && Character.isUpperCase(s.charAt(0));
}
/**
 * Replaces every occurrence of the literal text {@code from} in {@code text}
 * with the literal text {@code to}. Neither argument is interpreted as a
 * regular expression, so characters such as '+', '(' or '$' are safe in both.
 *
 * @param text The String to search
 * @param from The literal text to look for
 * @param to The literal replacement text
 * @return The String with all occurrences replaced
 */
public static String searchAndReplace(String text, String from, String to) {
  // String.replace does literal matching; the previous hand-escaped regex
  // missed most metacharacters and treated '$' and '\' in 'to' specially
  return text.replace(from, to);
}
/**
* Returns an HTML table containing the matrix of Strings passed in.
* The first dimension of the matrix should represent the rows, and the
* second dimension the columns.
*/
public static String makeHTMLTable(String[][] table, String[] rowLabels, String[] colLabels) {
StringBuilder buff = new StringBuilder();
buff.append("\n");
// top row
buff.append("\n");
buff.append(" \n"); // the top left cell
for (int j = 0; j < table[0].length; j++) { // assume table is a rectangular matrix
buff.append("").append(colLabels[j]).append(" \n");
}
buff.append(" \n");
// all other rows
for (int i = 0; i < table.length; i++) {
// one row
buff.append("\n");
buff.append("").append(rowLabels[i]).append(" \n");
for (int j = 0; j < table[i].length; j++) {
buff.append("");
buff.append(((table[i][j] != null) ? table[i][j] : ""));
buff.append(" \n");
}
buff.append(" \n");
}
buff.append("
");
return buff.toString();
}
/**
 * Returns a text table containing the matrix of objects passed in.
 * The first dimension of the matrix should represent the rows, and the
 * second dimension the columns. Each object is printed in a cell with toString().
 * Each cell may be padded with spaces on the left and then on the right so
 * that its String form is at least padLeft or padRight characters long.
 * If tsv is true, a tab is put between columns.
 *
 * @param table The cell contents, indexed by row and then column
 * @param rowLabels One label per row, or null for no row labels
 * @param colLabels One label per column, or null for no header row
 * @param padLeft Minimum cell width after left padding (ignored if not positive)
 * @param padRight Minimum cell width after right padding (ignored if not positive)
 * @param tsv Whether to separate columns with a tab
 * @return A String form of the table
 */
public static String makeTextTable(Object[][] table, Object[] rowLabels, Object[] colLabels, int padLeft, int padRight, boolean tsv) {
  StringBuilder out = new StringBuilder();
  if (colLabels != null) {
    // header row, starting with the empty top left cell
    out.append(makeAsciiTableCell("", padLeft, padRight, tsv));
    int headerCols = table[0].length; // assume table is a rectangular matrix
    for (int col = 0; col < headerCols; col++) {
      boolean tabAfter = (col != headerCols - 1) && tsv;
      out.append(makeAsciiTableCell(colLabels[col], padLeft, padRight, tabAfter));
    }
    out.append('\n');
  }
  for (int row = 0; row < table.length; row++) {
    if (rowLabels != null) {
      out.append(makeAsciiTableCell(rowLabels[row], padLeft, padRight, tsv));
    }
    Object[] cells = table[row];
    for (int col = 0; col < cells.length; col++) {
      // the last column is judged by the width of the first row
      boolean tabAfter = (col != table[0].length - 1) && tsv;
      out.append(makeAsciiTableCell(cells[col], padLeft, padRight, tabAfter));
    }
    out.append('\n');
  }
  return out.toString();
}
/**
 * Builds the text of one table cell from the object's toString() form.
 * The text is padded on the left if padLeft is greater than 0, then padded
 * on the right if padRight is greater than 0, and finally followed by a tab
 * if tsv is true.
 */
private static String makeAsciiTableCell(Object obj, int padLeft, int padRight, boolean tsv) {
  String cell = obj.toString();
  cell = (padLeft > 0) ? padLeft(cell, padLeft) : cell;
  cell = (padRight > 0) ? pad(cell, padRight) : cell;
  return tsv ? cell + '\t' : cell;
}
/**
 * Tests the string edit distance function: prints the edit distance, LCS and
 * LCCS for every ordered pair of a fixed set of sample strings.
 */
public static void main(String[] args) {
  String[] s = {"there once was a man", "this one is a manic", "hey there", "there once was a mane", "once in a manger.", "where is one match?", "Jo3seph Smarr!", "Joseph R Smarr"};
  for (String first : s) {
    for (String second : s) {
      System.out.println("s1: " + first);
      System.out.println("s2: " + second);
      System.out.println("edit distance: " + editDistance(first, second));
      System.out.println("LCS: " + longestCommonSubstring(first, second));
      System.out.println("LCCS: " + longestCommonContiguousSubstring(first, second));
      System.out.println();
    }
  }
}
/**
 * Converts a String to ASCII by replacing each char above 127 with an ASCII
 * approximation: accented Latin-1 letters lose their accent, typographic
 * quotes and dashes become plain ones, a few currency signs become "$" and
 * the ellipsis becomes ".". Any other non-ASCII char becomes "?".
 *
 * @param s The String to convert
 * @return A String containing only ASCII characters
 */
public static String toAscii(String s) {
  StringBuilder b = new StringBuilder();
  for (int i = 0; i < s.length(); i++) {
    char c = s.charAt(i);
    if (c > 127) {
      String result = "?";
      if (c >= 0x00c0 && c <= 0x00c5) {
        result = "A";
      } else if (c == 0x00c6) {
        result = "AE";
      } else if (c == 0x00c7) {
        result = "C";
      } else if (c >= 0x00c8 && c <= 0x00cb) {
        result = "E";
      } else if (c >= 0x00cc && c <= 0x00cf) {
        result = "I"; // accented capital I (was wrongly mapped to "F")
      } else if (c == 0x00d0) {
        result = "D";
      } else if (c == 0x00d1) {
        result = "N";
      } else if (c >= 0x00d2 && c <= 0x00d6) {
        result = "O";
      } else if (c == 0x00d7) {
        result = "x";
      } else if (c == 0x00d8) {
        result = "O";
      } else if (c >= 0x00d9 && c <= 0x00dc) {
        result = "U";
      } else if (c == 0x00dd) {
        result = "Y";
      } else if (c >= 0x00e0 && c <= 0x00e5) {
        result = "a";
      } else if (c == 0x00e6) {
        result = "ae";
      } else if (c == 0x00e7) {
        result = "c";
      } else if (c >= 0x00e8 && c <= 0x00eb) {
        result = "e";
      } else if (c >= 0x00ec && c <= 0x00ef) {
        result = "i";
      } else if (c == 0x00f1) {
        result = "n";
      } else if (c >= 0x00f2 && c <= 0x00f8) {
        result = "o";
      } else if (c >= 0x00f9 && c <= 0x00fc) {
        result = "u";
      } else if (c >= 0x00fd && c <= 0x00ff) {
        result = "y";
      } else if (c >= 0x2018 && c <= 0x2019) {
        result = "\'";
      } else if (c >= 0x201c && c <= 0x201e) {
        result = "\"";
      } else if (c >= 0x2013 && c <= 0x2014) {
        // en dash and em dash only; the lower bound used to be 0x0213, which
        // turned almost every non-Latin-1 char into "-"
        result = "-";
      } else if (c >= 0x00A2 && c <= 0x00A5) {
        result = "$";
      } else if (c == 0x2026) {
        result = ".";
      }
      b.append(result);
    } else {
      b.append(c);
    }
  }
  return b.toString();
}
/**
 * Builds one line of CSV from the given fields. Every field is wrapped in
 * double quotes, double quotes inside a field are doubled, and fields are
 * separated by commas.
 *
 * @param fields The field values
 * @return The CSV form of the fields
 */
public static String toCSVString(String[] fields) {
  final char[] quoteOnly = {'\"'};
  StringBuilder line = new StringBuilder();
  for (int idx = 0; idx < fields.length; idx++) {
    // every earlier field wrote at least its two quotes, so a non-empty
    // buffer means this is not the first field
    if (line.length() > 0) {
      line.append(',');
    }
    String quoted = escapeString(fields[idx], quoteOnly, '\"'); // escape quotes with double quotes
    line.append('\"');
    line.append(quoted);
    line.append('\"');
  }
  return line.toString();
}
/**
 * Swaps every occurrence in the input String of a character from the
 * {@code from} String with the character at the same position in the
 * {@code to} String. This works like Perl tr, for example,
 * tr("chris", "irs", "mop").equals("chomp"), except that it does not
 * support regular expression character ranges.
 *
 * Note: no objects are allocated when nothing needs replacing; the input
 * String itself is returned in that case.
 *
 * @param input The String to translate
 * @param from The characters to replace
 * @param to The replacement characters, same length as {@code from}
 * @return The translated String
 */
public static String tr(String input, String from, String to) {
  assert from.length() == to.length();
  char[] translated = null; // created on the first replacement only
  final int len = input.length();
  for (int pos = 0; pos < len; pos++) {
    final int mapIndex = from.indexOf(input.charAt(pos));
    if (mapIndex < 0) {
      continue;
    }
    if (translated == null) {
      translated = input.toCharArray();
    }
    translated[pos] = to.charAt(mapIndex);
  }
  return (translated == null) ? input : new String(translated);
}
/**
 * Returns the supplied string with a single trailing '\n' removed, if there
 * is one.
 *
 * @param s The String to trim
 * @return The String without its final newline character
 */
public static String chomp(String s) {
  return s.endsWith("\n") ? s.substring(0, s.length() - 1) : s;
}
/**
 * Returns the result of calling toString() on the supplied Object, with a
 * single trailing '\n' removed, if there is one.
 *
 * @param o The object to stringify
 * @return The object's String form without its final newline character
 */
public static String chomp(Object o) {
  String text = o.toString();
  return chomp(text);
}
/**
 * Prints to System.err a line recording that the given class was invoked,
 * when, and with which arguments.
 *
 * @param cls The name of the invoked class
 * @param args The command line arguments it was invoked with
 */
public static void printErrInvocationString(String cls, String[] args) {
  String invocation = toInvocationString(cls, args);
  System.err.println(invocation);
}
/**
 * Builds a String recording that the given class was invoked, the current
 * date and time, and the arguments it was invoked with (space-separated on a
 * second line).
 *
 * @param cls The name of the invoked class
 * @param args The command line arguments it was invoked with
 * @return The invocation description
 */
public static String toInvocationString(String cls, String[] args) {
  StringBuilder text = new StringBuilder();
  text.append(cls);
  text.append(" invoked on ");
  text.append(new Date());
  text.append(" with arguments:\n ");
  for (int idx = 0; idx < args.length; idx++) {
    text.append(' ');
    text.append(args[idx]);
  }
  return text.toString();
}
/**
 * Strips the directory part from a filename. Like Unix 'basename'.
 *
 * Example: getBaseName("/u/wcmac/foo.txt") ==> "foo.txt"
 *
 * @param fileName The path to shorten
 * @return The last path element
 */
public static String getBaseName(String fileName) {
  final String noSuffix = "";
  return getBaseName(fileName, noSuffix);
}
/**
 * Strips the directory part and a given suffix from a filename. Like Unix
 * 'basename'. Path elements are separated by '/'; trailing slashes are
 * ignored. A path made up only of slashes has no base name and gives "".
 *
 * Example: getBaseName("/u/wcmac/foo.txt", "") ==> "foo.txt"
 * Example: getBaseName("/u/wcmac/foo.txt", ".txt") ==> "foo"
 * Example: getBaseName("/u/wcmac/foo.txt", ".pdf") ==> "foo.txt"
 *
 * @param fileName The path to shorten
 * @param suffix The suffix to remove from the last path element if present
 * @return The last path element without the suffix
 */
public static String getBaseName(String fileName, String suffix) {
  String[] elts = fileName.split("/");
  if (elts.length == 0) {
    // split() drops trailing empty strings, so "/" or "//" leaves nothing
    return "";
  }
  String lastElt = elts[elts.length - 1];
  if (lastElt.endsWith(suffix)) {
    lastElt = lastElt.substring(0, lastElt.length() - suffix.length());
  }
  return lastElt;
}
/**
 * Checks with a regex whether the String is non-empty and consists only of
 * ASCII letters and whitespace.
 *
 * @param s a String to check using regex
 * @return true if the String is valid
 */
public static boolean isAlpha(String s){
  return Pattern.matches("^[\\p{Alpha}\\s]+$", s);
}
/**
 * Checks with a regex whether the String is non-empty and consists only of
 * ASCII digits, whitespace and periods.
 *
 * @param s a String to check using regex
 * @return true if the String is valid
 */
public static boolean isNumeric(String s){
  return Pattern.matches("^[\\p{Digit}\\s\\.]+$", s);
}
/**
 * Checks with a regex whether the String is non-empty and consists only of
 * ASCII letters, ASCII digits, whitespace and periods.
 *
 * @param s a String to check using regex
 * @return true if the String is valid
 */
public static boolean isAlphanumeric(String s){
  return Pattern.matches("^[\\p{Alnum}\\s\\.]+$", s);
}
/**
 * Checks with a regex whether the String is non-empty and consists only of
 * ASCII punctuation characters.
 *
 * @param s a String to check using regex
 * @return true if the String is valid
 */
public static boolean isPunct(String s){
  return Pattern.matches("^[\\p{Punct}]+$", s);
}
/**
 * Checks with a regex whether the String looks like an acronym, that is,
 * whether it is non-empty and consists only of ASCII uppercase letters.
 *
 * @param s a String to check using regex
 * @return true if the String is valid
 */
public static boolean isAcronym(String s){
  return Pattern.matches("^[\\p{Upper}]+$", s);
}
/**
 * Replaces a null String with the empty String.
 *
 * @param s A String, possibly null
 * @return The empty String if s is null, otherwise s itself
 */
public static String getNotNullString(String s) {
  return (s != null) ? s : "";
}
/**
 * Resolves variable references of the form ${VAR_NAME} in a String.
 * If the variable is a key of {@code props}, the reference is replaced with
 * the value from props; otherwise the environment variable of that name is
 * used. If the variable is found in neither place, the reference is replaced
 * with the empty string. Values are inserted literally, so they may contain
 * '$' and '\' characters. The bare form $VAR_NAME is not recognized.
 *
 * @param str The String to resolve; may be null
 * @param props Map from variable names to String values
 * @return The String with all references substituted, or null if str is null
 */
public static String resolveVars(String str, Map props) {
  if (str == null) {
    return null;
  }
  // ${VAR_NAME}
  Pattern p = Pattern.compile("\\$\\{(\\w+)\\}");
  Matcher m = p.matcher(str);
  StringBuffer sb = new StringBuffer();
  while (m.find()) {
    String varName = m.group(1);
    String vrValue;
    // either in the props file
    if (props.containsKey(varName)) {
      vrValue = ((String) props.get(varName));
    } else {
      // or as the environment variable
      vrValue = System.getenv(varName);
    }
    // quoteReplacement: appendReplacement would otherwise treat '$' and '\'
    // in the value as group references and escapes
    m.appendReplacement(sb, null == vrValue ? "" : Matcher.quoteReplacement(vrValue));
  }
  m.appendTail(sb);
  return sb.toString();
}
/**
 * Converts command line args to properties with variable references resolved.
 * Each flag (one or two leading hyphens are stripped) becomes a key. Its
 * value is the following argument, unless that argument starts with a hyphen
 * or is missing, in which case the value is "true". A props/args file flag
 * loads the named properties file instead, layered on top of the flags seen
 * so far. Finally every value containing a ${VAR} reference is resolved,
 * first against the collected properties and then against the environment
 * variables; a variable that is not found is replaced with the empty string.
 *
 * @param args The command line arguments
 * @return The resulting Properties
 */
public static Properties argsToPropertiesWithResolve(String[] args) {
  LinkedHashMap<String, String> result = new LinkedHashMap<>();
  Map<String, String> existingArgs = new LinkedHashMap<>();
  for (int i = 0; i < args.length; i++) {
    String key = args[i];
    if (key.length() > 0 && key.charAt(0) == '-') { // found a flag
      if (key.length() > 1 && key.charAt(1) == '-')
        key = key.substring(2); // strip off 2 hyphens
      else
        key = key.substring(1); // strip off the hyphen
      int max = 1;
      int min = 0;
      List<String> flagArgs = new ArrayList<>();
      // cdm oct 2007: add length check to allow for empty string argument!
      for (int j = 0; j < max && i + 1 < args.length && (j < min || args[i + 1].length() == 0 || args[i + 1].charAt(0) != '-'); i++, j++) {
        flagArgs.add(args[i + 1]);
      }
      if (flagArgs.isEmpty()) {
        existingArgs.put(key, "true");
      } else {
        if (key.equalsIgnoreCase(PROP) || key.equalsIgnoreCase(PROPS) || key.equalsIgnoreCase(PROPERTIES) || key.equalsIgnoreCase(ARGUMENTS) || key.equalsIgnoreCase(ARGS)) {
          for (String flagArg : flagArgs)
            result.putAll(propFileToLinkedHashMap(flagArg, existingArgs));
          // the flags seen so far were folded into result together with the file
          existingArgs.clear();
        } else
          existingArgs.put(key, join(flagArgs, " "));
      }
    }
  }
  result.putAll(existingArgs);
  for (Entry<String, String> o : result.entrySet()) {
    // setValue updates in place; the key set is not modified during iteration
    o.setValue(resolveVars(o.getValue(), result));
  }
  Properties props = new Properties();
  props.putAll(result);
  return props;
}
/**
 * Reads properties listed in a file in the format prop=value, one property
 * per line, into a LinkedHashMap (insertion order preserving). The map is
 * seeded with {@code existingArgs}, so entries from the file override them.
 * Blank lines and lines starting with '#' are skipped. A line without '=' is
 * a flag and is set to "true".
 *
 * @param filename A properties file to read
 * @param existingArgs Entries to put into the map before the file is read
 * @return The corresponding LinkedHashMap where the ordering is the same as in the
 *     props file
 */
public static LinkedHashMap<String, String> propFileToLinkedHashMap(String filename, Map<String, String> existingArgs) {
  LinkedHashMap<String, String> result = new LinkedHashMap<>();
  result.putAll(existingArgs);
  for (String l : IOUtils.readLines(filename)) {
    l = l.trim();
    if (l.isEmpty() || l.startsWith("#"))
      continue;
    int index = l.indexOf('=');
    if (index == -1)
      result.put(l, "true");
    else
      result.put(l.substring(0, index).trim(), l.substring(index + 1).trim());
  }
  return result;
}
/**
 * Builds the n-grams of an already tokenized string. The words of each
 * n-gram are joined with a single space.
 *
 * @param words The tokens
 * @param minSize The smallest n-gram size to produce
 * @param maxSize The largest n-gram size to produce
 * @return The n-grams as Strings
 */
public static Collection<String> getNgrams(List<String> words, int minSize, int maxSize){
  List<List<String>> ng = CollectionUtils.getNGrams(words, minSize, maxSize);
  Collection<String> ngrams = new ArrayList<>();
  for (List<String> n : ng)
    ngrams.add(StringUtils.join(n, " "));
  return ngrams;
}
/**
 * Builds the n-grams over the words of an already tokenized string. The
 * words of each n-gram are joined with a single space.
 *
 * @param words The tokens; the word() of each is used
 * @param minSize The smallest n-gram size to produce
 * @param maxSize The largest n-gram size to produce
 * @return The n-grams as Strings
 */
public static Collection<String> getNgramsFromTokens(List<CoreLabel> words, int minSize, int maxSize){
  List<String> wordsStr = new ArrayList<>();
  for (CoreLabel l : words)
    wordsStr.add(l.word());
  List<List<String>> ng = CollectionUtils.getNGrams(wordsStr, minSize, maxSize);
  Collection<String> ngrams = new ArrayList<>();
  for (List<String> n : ng)
    ngrams.add(StringUtils.join(n, " "));
  return ngrams;
}
/**
 * Builds the n-grams of a string. The string is split on whitespace and the
 * words of each n-gram are joined with a single space.
 *
 * @param s The text to split into words
 * @param minSize The smallest n-gram size to produce
 * @param maxSize The largest n-gram size to produce
 * @return The n-grams as Strings
 */
public static Collection<String> getNgramsString(String s, int minSize, int maxSize){
  return getNgrams(Arrays.asList(s.split("\\s+")), minSize, maxSize);
}
/**
 * Builds a list of character-based n-grams from the given string. For each
 * start position in turn, the n-grams of every size from minSize to maxSize
 * that still fit in the string are added, shortest first. Nothing is
 * produced when minSize is not positive.
 *
 * @param s The String to take n-grams from
 * @param minSize The smallest n-gram size to produce
 * @param maxSize The largest n-gram size to produce
 * @return The character n-grams
 */
public static Collection<String> getCharacterNgrams(String s, int minSize, int maxSize) {
  Collection<String> ngrams = new ArrayList<>();
  int len = s.length();
  for (int i = 0; i < len; i++) {
    for (int ngramSize = minSize;
         ngramSize > 0 && ngramSize <= maxSize && i + ngramSize <= len;
         ngramSize++) {
      ngrams.add(s.substring(i, i + ngramSize));
    }
  }
  return ngrams;
}
/** Matches one character of the Unicode Combining Diacritical Marks block. Compiled once and shared. */
private static final Pattern diacriticalMarksPattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}");
/**
 * Normalizes a string and strips diacritics (maps to ascii) by
 * 1. taking the NFKD (compatibility decomposition -
 *    in compatibility equivalence, formatting such as subscripting is lost -
 *    see http://unicode.org/reports/tr15/)
 * 2. removing diacriticals
 * 3. recombining into NFKC form (compatibility composition).
 * This process may be slow.
 *
 * The main purpose of the function is to remove diacritics for asciis,
 * but it may normalize other stuff as well.
 * A more conservative approach is to do explicit folding just for ascii character
 * (see RuleBasedNameMatcher.normalize)
 *
 * @param s The String to normalize
 * @return The normalized String without combining diacritical marks
 */
public static String normalize(String s) {
  String d = Normalizer.normalize(s, Normalizer.Form.NFKD);
  d = diacriticalMarksPattern.matcher(d).replaceAll("");
  return Normalizer.normalize(d, Normalizer.Form.NFKC);
}
/**
 * Convert a list of labels into a string, by simply joining their words with spaces.
 * @param words The words to join.
 * @return A string representation of the sentence, tokenized by a single space.
 */
public static String toString(List<CoreLabel> words) {
  return join(words.stream().map(CoreLabel::word), " ");
}
/**
 * Convert a CoreMap representing a sentence into a string, by joining the
 * words of its TokensAnnotation with spaces.
 * @param sentence The sentence to stringify.
 * @return A string representation of the sentence, tokenized by a single space.
 */
public static String toString(CoreMap sentence) {
  return toString(sentence.get(CoreAnnotations.TokensAnnotation.class));
}
/**
 * Computes the case-insensitive Levenshtein distance between two Strings.
 * Both Strings are lowercased (with the locale-independent root locale, so
 * the result does not depend on the JVM's default locale) before being
 * compared.
 * I shamefully stole this from: http://rosettacode.org/wiki/Levenshtein_distance#Java --Gabor
 *
 * @param s1 The first String
 * @param s2 The second String
 * @return The number of single-character insertions, deletions and
 *     substitutions needed to turn one lowercased String into the other
 */
public static int levenshteinDistance(String s1, String s2) {
  s1 = s1.toLowerCase(Locale.ROOT);
  s2 = s2.toLowerCase(Locale.ROOT);
  // costs[j] holds the distance between the current prefix of s1 and the
  // first j chars of s2; only one row of the DP table is kept
  int[] costs = new int[s2.length() + 1];
  for (int i = 0; i <= s1.length(); i++) {
    int lastValue = i;
    for (int j = 0; j <= s2.length(); j++) {
      if (i == 0)
        costs[j] = j;
      else {
        if (j > 0) {
          int newValue = costs[j - 1];
          if (s1.charAt(i - 1) != s2.charAt(j - 1))
            newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
          costs[j - 1] = lastValue;
          lastValue = newValue;
        }
      }
    }
    if (i > 0)
      costs[s2.length()] = lastValue;
  }
  return costs[s2.length()];
}
/**
 * Computes the Levenshtein distance between two arrays, treating each
 * element as one symbol. Elements are compared with equals(); two null
 * elements count as equal.
 * I shamefully stole this from: http://rosettacode.org/wiki/Levenshtein_distance#Java --Gabor
 *
 * @param s1 The first sequence
 * @param s2 The second sequence
 * @param <E> The element type
 * @return The number of single-element insertions, deletions and
 *     substitutions needed to turn s1 into s2
 */
public static <E> int levenshteinDistance(E[] s1, E[] s2) {
  // costs[j] holds the distance between the current prefix of s1 and the
  // first j elements of s2; only one row of the DP table is kept
  int[] costs = new int[s2.length + 1];
  for (int i = 0; i <= s1.length; i++) {
    int lastValue = i;
    for (int j = 0; j <= s2.length; j++) {
      if (i == 0)
        costs[j] = j;
      else {
        if (j > 0) {
          int newValue = costs[j - 1];
          // Objects.equals tolerates null elements
          if (!Objects.equals(s1[i - 1], s2[j - 1]))
            newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
          costs[j - 1] = lastValue;
          lastValue = newValue;
        }
      }
    }
    if (i > 0)
      costs[s2.length] = lastValue;
  }
  return costs[s2.length];
}
/**
 * Unescape an HTML string: replaces numeric escapes (decimal and hexadecimal
 * character references) and the named escapes found in htmlUnescapeLookupMap
 * with the characters they stand for. Anything that does not parse as an
 * escape is left in the text unchanged.
 * Taken from: http://stackoverflow.com/questions/994331/java-how-to-decode-html-character-entities-in-java-like-httputility-htmldecode
 * @param input The string to unescape
 * @return The unescaped String; the input object itself if nothing was replaced
 */
public static String unescapeHtml3(final String input) {
// created lazily on the first successful replacement; stays null otherwise
StringWriter writer = null;
int len = input.length();
// i is kept one position past the char under test, so that after a match it
// points at the first char following the '&'
int i = 1;
// start of the part of input that has not been copied to writer yet
int st = 0;
while (true) {
// look for '&'
while (i < len && input.charAt(i-1) != '&')
i++;
if (i >= len)
break;
// found '&', look for ';'
// the scan stops after at most 7 chars, so an escape body can be at most 6 chars long
int j = i;
while (j < len && j < i + 6 + 1 && input.charAt(j) != ';')
j++;
// not an escape: no ';' before the end of input, a body shorter than
// 2 chars, or no ';' within the length limit
if (j == len || j < i + 2 || j == i + 6 + 1) {
i++;
continue;
}
// found escape
if (input.charAt(i) == '#') {
// numeric escape
int k = i + 1;
int radix = 10;
// a leading 'x' or 'X' marks a hexadecimal value
final char firstChar = input.charAt(k);
if (firstChar == 'x' || firstChar == 'X') {
k++;
radix = 16;
}
try {
// parsed before the writer is touched, so a malformed number leaves no partial output
int entityValue = Integer.parseInt(input.substring(k, j), radix);
if (writer == null)
writer = new StringWriter(input.length());
// copy the pending text up to, but not including, the '&'
writer.append(input.substring(st, i - 1));
if (entityValue > 0xFFFF) {
// a value beyond the 16-bit char range is written as two chars
// NOTE(review): Character.toChars rejects values that are not valid code
// points with an IllegalArgumentException, which is not caught here - confirm
final char[] chrs = Character.toChars(entityValue);
writer.write(chrs[0]);
writer.write(chrs[1]);
} else {
writer.write(entityValue);
}
} catch (NumberFormatException ex) {
// not a number: treat the '&' as ordinary text
i++;
continue;
}
}
else {
// named escape
// the lookup key is the text between '&' and ';'
CharSequence value = htmlUnescapeLookupMap.get(input.substring(i, j));
if (value == null) {
// unknown name: treat the '&' as ordinary text
i++;
continue;
}
if (writer == null)
writer = new StringWriter(input.length());
// copy the pending text up to, but not including, the '&'
writer.append(input.substring(st, i - 1));
writer.append(value);
}
// skip escape
// resume right after the ';'
st = j + 1;
i = st;
}
if (writer != null) {
// flush the text after the last escape
writer.append(input.substring(st, len));
return writer.toString();
}
return input;
}
/**
 * Table of HTML escapes, one {@code {text, entityName}} pair per row: the second element is the
 * entity name as written between '&' and ';', the first element is the text that entity stands for.
 * The unescape lookup map is built from this table, keyed by entity name.
 */
private static final String[][] HTML_ESCAPES = {
{"\"", "quot"}, // " - double-quote
{"&", "amp"}, // & - ampersand
{"<", "lt"}, // < - less-than
{">", "gt"}, // > - greater-than
{"-", "ndash"}, // - - dash; NOTE(review): mapped to the ASCII hyphen-minus rather than U+2013 - presumably deliberate, confirm
// Mapping to escape ISO-8859-1 characters to their named HTML 3.x equivalents.
{"\u00A0", "nbsp"}, // non-breaking space
{"\u00A1", "iexcl"}, // inverted exclamation mark
{"\u00A2", "cent"}, // cent sign
{"\u00A3", "pound"}, // pound sign
{"\u00A4", "curren"}, // currency sign
{"\u00A5", "yen"}, // yen sign = yuan sign
{"\u00A6", "brvbar"}, // broken bar = broken vertical bar
{"\u00A7", "sect"}, // section sign
{"\u00A8", "uml"}, // diaeresis = spacing diaeresis
{"\u00A9", "copy"}, // © - copyright sign
{"\u00AA", "ordf"}, // feminine ordinal indicator
{"\u00AB", "laquo"}, // left-pointing double angle quotation mark = left pointing guillemet
{"\u00AC", "not"}, // not sign
{"\u00AD", "shy"}, // soft hyphen = discretionary hyphen
{"\u00AE", "reg"}, // ® - registered trademark sign
{"\u00AF", "macr"}, // macron = spacing macron = overline = APL overbar
{"\u00B0", "deg"}, // degree sign
{"\u00B1", "plusmn"}, // plus-minus sign = plus-or-minus sign
{"\u00B2", "sup2"}, // superscript two = superscript digit two = squared
{"\u00B3", "sup3"}, // superscript three = superscript digit three = cubed
{"\u00B4", "acute"}, // acute accent = spacing acute
{"\u00B5", "micro"}, // micro sign
{"\u00B6", "para"}, // pilcrow sign = paragraph sign
{"\u00B7", "middot"}, // middle dot = Georgian comma = Greek middle dot
{"\u00B8", "cedil"}, // cedilla = spacing cedilla
{"\u00B9", "sup1"}, // superscript one = superscript digit one
{"\u00BA", "ordm"}, // masculine ordinal indicator
{"\u00BB", "raquo"}, // right-pointing double angle quotation mark = right pointing guillemet
{"\u00BC", "frac14"}, // vulgar fraction one quarter = fraction one quarter
{"\u00BD", "frac12"}, // vulgar fraction one half = fraction one half
{"\u00BE", "frac34"}, // vulgar fraction three quarters = fraction three quarters
{"\u00BF", "iquest"}, // inverted question mark = turned question mark
{"\u00C0", "Agrave"}, // À - uppercase A, grave accent
{"\u00C1", "Aacute"}, // Á - uppercase A, acute accent
{"\u00C2", "Acirc"}, // Â - uppercase A, circumflex accent
{"\u00C3", "Atilde"}, // Ã - uppercase A, tilde
{"\u00C4", "Auml"}, // Ä - uppercase A, umlaut
{"\u00C5", "Aring"}, // Å - uppercase A, ring
{"\u00C6", "AElig"}, // Æ - uppercase AE
{"\u00C7", "Ccedil"}, // Ç - uppercase C, cedilla
{"\u00C8", "Egrave"}, // È - uppercase E, grave accent
{"\u00C9", "Eacute"}, // É - uppercase E, acute accent
{"\u00CA", "Ecirc"}, // Ê - uppercase E, circumflex accent
{"\u00CB", "Euml"}, // Ë - uppercase E, umlaut
{"\u00CC", "Igrave"}, // Ì - uppercase I, grave accent
{"\u00CD", "Iacute"}, // Í - uppercase I, acute accent
{"\u00CE", "Icirc"}, // Î - uppercase I, circumflex accent
{"\u00CF", "Iuml"}, // Ï - uppercase I, umlaut
{"\u00D0", "ETH"}, // Ð - uppercase Eth, Icelandic
{"\u00D1", "Ntilde"}, // Ñ - uppercase N, tilde
{"\u00D2", "Ograve"}, // Ò - uppercase O, grave accent
{"\u00D3", "Oacute"}, // Ó - uppercase O, acute accent
{"\u00D4", "Ocirc"}, // Ô - uppercase O, circumflex accent
{"\u00D5", "Otilde"}, // Õ - uppercase O, tilde
{"\u00D6", "Ouml"}, // Ö - uppercase O, umlaut
{"\u00D7", "times"}, // multiplication sign
{"\u00D8", "Oslash"}, // Ø - uppercase O, slash
{"\u00D9", "Ugrave"}, // Ù - uppercase U, grave accent
{"\u00DA", "Uacute"}, // Ú - uppercase U, acute accent
{"\u00DB", "Ucirc"}, // Û - uppercase U, circumflex accent
{"\u00DC", "Uuml"}, // Ü - uppercase U, umlaut
{"\u00DD", "Yacute"}, // Ý - uppercase Y, acute accent
{"\u00DE", "THORN"}, // Þ - uppercase THORN, Icelandic
{"\u00DF", "szlig"}, // ß - lowercase sharp s, German
{"\u00E0", "agrave"}, // à - lowercase a, grave accent
{"\u00E1", "aacute"}, // á - lowercase a, acute accent
{"\u00E2", "acirc"}, // â - lowercase a, circumflex accent
{"\u00E3", "atilde"}, // ã - lowercase a, tilde
{"\u00E4", "auml"}, // ä - lowercase a, umlaut
{"\u00E5", "aring"}, // å - lowercase a, ring
{"\u00E6", "aelig"}, // æ - lowercase ae
{"\u00E7", "ccedil"}, // ç - lowercase c, cedilla
{"\u00E8", "egrave"}, // è - lowercase e, grave accent
{"\u00E9", "eacute"}, // é - lowercase e, acute accent
{"\u00EA", "ecirc"}, // ê - lowercase e, circumflex accent
{"\u00EB", "euml"}, // ë - lowercase e, umlaut
{"\u00EC", "igrave"}, // ì - lowercase i, grave accent
{"\u00ED", "iacute"}, // í - lowercase i, acute accent
{"\u00EE", "icirc"}, // î - lowercase i, circumflex accent
{"\u00EF", "iuml"}, // ï - lowercase i, umlaut
{"\u00F0", "eth"}, // ð - lowercase eth, Icelandic
{"\u00F1", "ntilde"}, // ñ - lowercase n, tilde
{"\u00F2", "ograve"}, // ò - lowercase o, grave accent
{"\u00F3", "oacute"}, // ó - lowercase o, acute accent
{"\u00F4", "ocirc"}, // ô - lowercase o, circumflex accent
{"\u00F5", "otilde"}, // õ - lowercase o, tilde
{"\u00F6", "ouml"}, // ö - lowercase o, umlaut
{"\u00F7", "divide"}, // division sign
{"\u00F8", "oslash"}, // ø - lowercase o, slash
{"\u00F9", "ugrave"}, // ù - lowercase u, grave accent
{"\u00FA", "uacute"}, // ú - lowercase u, acute accent
{"\u00FB", "ucirc"}, // û - lowercase u, circumflex accent
{"\u00FC", "uuml"}, // ü - lowercase u, umlaut
{"\u00FD", "yacute"}, // ý - lowercase y, acute accent
{"\u00FE", "thorn"}, // þ - lowercase thorn, Icelandic
{"\u00FF", "yuml"}, // ÿ - lowercase y, umlaut
};
/** Lookup from HTML entity name (the text between '&' and ';') to the text it stands for; built once from HTML_ESCAPES. */
private static final Map<String, CharSequence> htmlUnescapeLookupMap;
static {
  htmlUnescapeLookupMap = new HashMap<>();
  // Each row of the table is {text, entityName}; the map is keyed by entity name.
  for (final String[] pair : HTML_ESCAPES) {
    htmlUnescapeLookupMap.put(pair[1], pair[0]);
  }
}
/**
 * Decode an array encoded as a String. This entails a comma separated value enclosed in brackets
 * or parentheses.
 * <p>
 * The input is trimmed first. If it opens with {@code '('}, {@code '['} or <code>'{'</code>, that character
 * and the final character are excluded from parsing, and the final character must be the matching closer.
 * Every decoded term is trimmed of surrounding whitespace before it is returned.
 *
 * @param encoded The String encoded array
 * @return A String array corresponding to the encoded array; an empty array if {@code encoded} is the empty String
 * @throws IllegalArgumentException if the trimmed input opens with a paren, bracket or brace but does not end with the matching closing character
 */
public static String[] decodeArray(String encoded){
if (encoded.length() == 0) return new String[]{};
// NOTE(review): a whitespace-only input passes the length check above but trims to an empty
// array, so the chars[0] reads below throw ArrayIndexOutOfBoundsException.
char[] chars = encoded.trim().toCharArray();
//--Parse the String
//(state)
// presumably the quote character that closes the currently open quoted section, 0 when outside quotes - confirm against the full loop body
char quoteCloseChar = (char) 0;
// completed terms, in the order they were finished
List terms = new LinkedList<>();
// the term currently being accumulated
StringBuilder current = new StringBuilder();
//(start/stop overhead)
// [start, end) is the range to parse once an enclosing delimiter pair has been excluded
int start = 0; int end = chars.length;
if(chars[0] == '('){ start += 1; end -= 1; if(chars[end] != ')') throw new IllegalArgumentException("Unclosed paren in encoded array: " + encoded); }
if(chars[0] == '['){ start += 1; end -= 1; if(chars[end] != ']') throw new IllegalArgumentException("Unclosed bracket in encoded array: " + encoded); }
if(chars[0] == '{'){ start += 1; end -= 1; if(chars[end] != '}') throw new IllegalArgumentException("Unclosed bracket in encoded array: " + encoded); }
//(finite state automata)
// NOTE(review): the loop header below looks truncated in this copy of the file (its bound and the
// first part of its body appear to be missing) - restore it from version control before compiling.
for(int i=start; i 0) {
terms.add(current);
}
current = new StringBuilder();
}else{
current.append(chars[i]);
}
}
}
//--Return
// flush the last term, which has no trailing separator
if(current.length() > 0) terms.add(current);
String[] rtn = new String[terms.size()];
int i=0;
for(StringBuilder b : terms){
rtn[i] = b.toString().trim();
i += 1;
}
return rtn;
}
/**
 * Decode a map encoded as a string.
 * <p>
 * The input is trimmed first. If it opens with {@code '('}, {@code '['} or <code>'{'</code>, that character
 * and the final character are excluded from parsing, and the final character must be the matching closer.
 * Within an entry, the key is terminated by {@code "->"}, {@code "=>"} or {@code ':'}; keys and values
 * are trimmed of surrounding whitespace.
 *
 * @param encoded The String encoded map
 * @return A String map corresponding to the encoded map; an empty map if {@code encoded} is the empty String
 * @throws IllegalArgumentException if the trimmed input opens with a paren, bracket or brace but does not end
 *         with the matching closing character, or if a key terminator is met while a value is being read
 */
public static Map decodeMap(String encoded){
if (encoded.length() == 0) return new HashMap<>();
// NOTE(review): a whitespace-only input passes the length check above but trims to an empty
// array, so the chars[0] reads below throw ArrayIndexOutOfBoundsException.
char[] chars = encoded.trim().toCharArray();
//--Parse the String
//(state)
// presumably the quote character that closes the currently open quoted section, 0 when outside quotes - confirm against the full loop body
char quoteCloseChar = (char) 0;
Map map = new HashMap<>();
// key and value of the entry being read; note that key is only overwritten when non-empty text was collected
String key = "";
String value = "";
// true while reading a key, false while reading the value that follows it
boolean onKey = true;
StringBuilder current = new StringBuilder();
//(start/stop overhead)
// [start, end) is the range to parse once an enclosing delimiter pair has been excluded
int start = 0; int end = chars.length;
if(chars[0] == '('){ start += 1; end -= 1; if(chars[end] != ')') throw new IllegalArgumentException("Unclosed paren in encoded map: " + encoded); }
if(chars[0] == '['){ start += 1; end -= 1; if(chars[end] != ']') throw new IllegalArgumentException("Unclosed bracket in encoded map: " + encoded); }
if(chars[0] == '{'){ start += 1; end -= 1; if(chars[end] != '}') throw new IllegalArgumentException("Unclosed bracket in encoded map: " + encoded); }
//(finite state automata)
// NOTE(review): the loop header below looks truncated in this copy of the file (its bound and the
// first part of its body appear to be missing) - restore it from version control before compiling.
for(int i=start; i 0) {
value = current.toString().trim();
}
current = new StringBuilder();
onKey = true;
map.put(key, value); // <- add value
} else if((chars[i] == '-' || chars[i] == '=') && (i < chars.length - 1 && chars[i + 1] == '>')) {
// case: end a key
if (!onKey) {
throw new IllegalArgumentException("Encountered a value without a key");
}
// an empty key leaves the previously stored key in place
if (current.length() > 0) {
key = current.toString().trim();
}
current = new StringBuilder();
onKey = false;
i += 1; // skip '>' character
} else if (chars[i] == ':') {
// case: end a key
if (!onKey) {
throw new IllegalArgumentException("Encountered a value without a key");
}
// an empty key leaves the previously stored key in place
if (current.length() > 0) {
key = current.toString().trim();
}
current = new StringBuilder();
onKey = false;
} else {
current.append(chars[i]);
}
}
}
//--Return
// flush the final entry: it is stored only if a non-blank value is pending
if(current.toString().trim().length() > 0 && !onKey) {
map.put(key.trim(), current.toString().trim());
}
return map;
}
/** Matches a bash-style variable reference, {@code $NAME} or {@code ${NAME}}; group 1 is the variable name. */
private static final Pattern ENVIRONMENT_VARIABLE_REFERENCE =
    Pattern.compile("\\$\\{?([a-zA-Z_]+[a-zA-Z0-9_]*)\\}?");

/**
 * Takes an input String, and replaces any bash-style variables (e.g., $VAR_NAME or ${VAR_NAME})
 * with its actual environment variable from the passed environment specification.
 * Variables that are not present in the environment are replaced with the empty String.
 * <p>
 * The input is processed in a single left-to-right pass and every value is inserted literally:
 * a {@code '$'} or {@code '\'} inside a value is copied as-is, values are not expanded again, and
 * expanding one variable never touches a longer variable name that merely shares its prefix.
 *
 * @param raw The raw String to replace variables in.
 * @param env The environment specification; e.g., {@link System#getenv()}.
 * @return The input String, but with all variables replaced.
 */
public static String expandEnvironmentVariables(String raw, Map<String, String> env) {
  Matcher matcher = ENVIRONMENT_VARIABLE_REFERENCE.matcher(raw);
  // StringBuffer rather than StringBuilder: Matcher.appendReplacement only accepts a StringBuilder from Java 9 on.
  StringBuffer expanded = new StringBuffer(raw.length());
  while (matcher.find()) {
    String envValue = env.get(matcher.group(1));
    if (envValue == null) {
      envValue = "";
    }
    // quoteReplacement stops '$' and '\' in the value from being read as group references / escapes.
    matcher.appendReplacement(expanded, Matcher.quoteReplacement(envValue));
  }
  matcher.appendTail(expanded);
  return expanded.toString();
}
/**
 * Expands bash-style variable references (e.g., $VAR_NAME) in the input, looking each
 * variable up in the environment of the running process as reported by {@link System#getenv()}.
 *
 * @param raw The raw String to replace variables in.
 * @return The input String, but with all variables replaced.
 */
public static String expandEnvironmentVariables(String raw) {
  final Map<String, String> processEnvironment = System.getenv();
  return expandEnvironmentVariables(raw, processEnvironment);
}
}