All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.deeplearning4j.berkeley.StringUtils Maven / Gradle / Ivy

There is a newer version: 1.0.0-M2.1
Show newest version
/*-
 *
 *  * Copyright 2015 Skymind,Inc.
 *  *
 *  *    Licensed under the Apache License, Version 2.0 (the "License");
 *  *    you may not use this file except in compliance with the License.
 *  *    You may obtain a copy of the License at
 *  *
 *  *        http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  *    Unless required by applicable law or agreed to in writing, software
 *  *    distributed under the License is distributed on an "AS IS" BASIS,
 *  *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  *    See the License for the specific language governing permissions and
 *  *    limitations under the License.
 *
 */

package org.deeplearning4j.berkeley;


import java.io.*;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * StringUtils is a collection of static helpers for common String chores:
 * regex convenience calls, reading ("slurping") whole files, readers and URLs,
 * joining/splitting/padding, simple command-line parsing, and string
 * similarity measures.
 *
 * <p>The class holds no state and cannot be instantiated.
 *
 * @author Dan Klein
 * @author Christopher Manning
 * @author Tim Grow
 * @author Chris Cox
 * @version 2003/02/03
 */
public class StringUtils {

    /** Size of the character buffer (and initial builder capacity) used when slurping text. */
    private static final int SLURPBUFFSIZE = 16000;

    /** Name of the special flag that makes {@link #argsToProperties} load a properties file. */
    private static final String PROP = "prop";

    /**
     * Don't let anyone instantiate this class.
     */
    private StringUtils() {}

    /**
     * Says whether the regular expression can be found anywhere inside the String
     * (the Perl-style "contains a match" test that {@code String} itself lacks).
     *
     * @param str   String to search for a match in
     * @param regex String to compile as the regular expression
     * @return whether the regex can be found in str
     */
    public static boolean find(String str, String regex) {
        return Pattern.compile(regex).matcher(str).find();
    }

    /**
     * Says whether the regular expression can be found at the beginning of the String.
     *
     * @param str   String to search for a match at the start of
     * @param regex String to compile as the regular expression
     * @return whether the regex matches a prefix of str
     */
    public static boolean lookingAt(String str, String regex) {
        return Pattern.compile(regex).matcher(str).lookingAt();
    }

    /**
     * Says whether the regular expression matches the whole String. Equivalent to
     * {@code String.matches()}; included to parallel the other static regex methods here.
     *
     * @param str   String to match
     * @param regex String to compile as the regular expression
     * @return whether the regex matches the whole of str
     */
    public static boolean matches(String str, String regex) {
        return Pattern.compile(regex).matcher(str).matches();
    }

    /**
     * Returns all the text in the given File, decoded with the platform default charset.
     *
     * @param file the file to read
     * @return the text in the file
     * @throws IOException if the file cannot be opened
     */
    public static String slurpFile(File file) throws IOException {
        return slurpReader(new FileReader(file));
    }

    /**
     * Returns all the text in the given GB18030-encoded file.
     *
     * @param filename path of the file to read
     * @return the text in the file
     * @throws RuntimeException if the file cannot be read
     */
    public static String slurpGBFileNoExceptions(String filename) {
        return slurpFileNoExceptions(filename, "GB18030");
    }

    /**
     * Returns all the text in the given file with the given encoding.
     *
     * @param filename path of the file to read
     * @param encoding name of the charset used to decode the file
     * @return the text in the file
     * @throws IOException if the file cannot be opened or the encoding is unsupported
     */
    public static String slurpFile(String filename, String encoding) throws IOException {
        // Own the stream here so it is released even when the encoding name is rejected.
        try (InputStream in = new FileInputStream(filename)) {
            return slurpReader(new InputStreamReader(in, encoding));
        }
    }

    /**
     * Returns all the text in the given file with the given encoding, converting
     * any checked failure into an unchecked one.
     *
     * <p>Note: unlike the other {@code ...NoExceptions} methods, this one does not
     * return null on failure; it throws.
     *
     * @param filename path of the file to read
     * @param encoding name of the charset used to decode the file
     * @return the text in the file
     * @throws RuntimeException if the file cannot be read; the original failure is the cause
     */
    public static String slurpFileNoExceptions(String filename, String encoding) {
        try {
            return slurpFile(filename, encoding);
        } catch (Exception e) {
            throw new RuntimeException("Unable to read file " + filename + " with encoding " + encoding, e);
        }
    }

    /**
     * Returns all the text in the given GB18030-encoded file.
     *
     * @param filename path of the file to read
     * @return the text in the file
     * @throws IOException if the file cannot be opened
     */
    public static String slurpGBFile(String filename) throws IOException {
        return slurpFile(filename, "GB18030");
    }

    /**
     * Returns all the text from the given Reader and closes it, whether or not
     * reading succeeds.
     *
     * @param reader the source of characters
     * @return the text read
     * @throws RuntimeException if reading or closing fails; the I/O failure is the cause
     */
    public static String slurpReader(Reader reader) {
        StringBuilder buff = new StringBuilder();
        try (BufferedReader r = new BufferedReader(reader)) {
            char[] chars = new char[SLURPBUFFSIZE];
            int amountRead;
            while ((amountRead = r.read(chars, 0, chars.length)) >= 0) {
                buff.append(chars, 0, amountRead);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to read all text from reader", e);
        }
        return buff.toString();
    }

    /**
     * Returns all the text in the given file, decoded with the platform default charset.
     *
     * @param filename path of the file to read
     * @return the text in the file
     * @throws IOException if the file cannot be opened
     */
    public static String slurpFile(String filename) throws IOException {
        return slurpReader(new FileReader(filename));
    }

    /**
     * Returns all the text in the given File.
     *
     * @param file the file to read
     * @return the text in the file (possibly empty), or null if and only if the
     *         file cannot be read (non-existent, etc.); the failure is printed to stderr
     */
    public static String slurpFileNoExceptions(File file) {
        try {
            return slurpFile(file);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns all the text in the given file.
     *
     * @param filename path of the file to read
     * @return the text in the file (possibly empty), or null if and only if the
     *         file cannot be read (non-existent, etc.); the failure is printed to stderr
     */
    public static String slurpFileNoExceptions(String filename) {
        try {
            return slurpFile(filename);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns all the text at the given URL, decoded as GB18030.
     *
     * @param u the URL to read
     * @return the text at the URL
     * @throws IOException if the URL cannot be read
     */
    public static String slurpGBURL(URL u) throws IOException {
        return slurpURL(u, "GB18030");
    }

    /**
     * Returns all the text at the given URL, decoded as GB18030.
     *
     * @param u the URL to read
     * @return the text at the URL, or null on failure (the failure is printed to stderr)
     */
    public static String slurpGBURLNoExceptions(URL u) {
        try {
            return slurpGBURL(u);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns all the text at the given URL, decoded with the given encoding.
     *
     * @param u        the URL to read
     * @param encoding name of the charset used to decode the content
     * @return the text at the URL, or null on failure (the failure is printed to stderr)
     */
    public static String slurpURLNoExceptions(URL u, String encoding) {
        try {
            return slurpURL(u, encoding);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns all the text at the given URL, decoded with the given encoding.
     * Every line of the result is terminated with the platform line separator.
     * A 30 second read timeout is applied.
     *
     * @param u        the URL to read
     * @param encoding name of the charset used to decode the content
     * @return the text at the URL, or the empty string if opening the stream timed out
     * @throws IOException if the URL cannot be read or the encoding is unsupported
     */
    public static String slurpURL(URL u, String encoding) throws IOException {
        URLConnection uc = u.openConnection();
        uc.setReadTimeout(30000);
        InputStream is;
        try {
            is = uc.getInputStream();
        } catch (SocketTimeoutException e) {
            System.err.println("Time out. Return empty string");
            return "";
        }
        // Own the raw stream so it is closed even if the reader cannot be created.
        try (InputStream in = is) {
            return slurpLines(new InputStreamReader(in, encoding));
        }
    }

    /**
     * Returns all the text at the given URL, decoded with the platform default charset.
     * Every line of the result is terminated with the platform line separator.
     *
     * @param u the URL to read
     * @return the text at the URL
     * @throws IOException if the URL cannot be read
     */
    public static String slurpURL(URL u) throws IOException {
        try (InputStream in = u.openConnection().getInputStream()) {
            return slurpLines(new InputStreamReader(in));
        }
    }

    /** Reads the reader line by line, re-terminating each line with the platform separator; closes the reader. */
    private static String slurpLines(Reader reader) throws IOException {
        String lineSeparator = System.getProperty("line.separator");
        StringBuilder buff = new StringBuilder(SLURPBUFFSIZE); // make biggish
        try (BufferedReader br = new BufferedReader(reader)) {
            String line;
            while ((line = br.readLine()) != null) {
                buff.append(line).append(lineSeparator);
            }
        }
        return buff.toString();
    }

    /**
     * Returns all the text at the given URL.
     *
     * @param u the URL to read
     * @return the text at the URL, or null on failure (the failure is printed to stderr)
     */
    public static String slurpURLNoExceptions(URL u) {
        try {
            return slurpURL(u);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns all the text at the given URL.
     *
     * @param path the URL, as a String
     * @return the text at the URL
     * @throws Exception if the path is not a valid URL or the URL cannot be read
     */
    public static String slurpURL(String path) throws Exception {
        return slurpURL(new URL(path));
    }

    /**
     * Returns all the text at the given URL.
     *
     * @param path the URL, as a String
     * @return the text at the URL, or null if and only if it cannot be read
     *         (the failure is printed to stderr)
     */
    public static String slurpURLNoExceptions(String path) {
        try {
            return slurpURL(path);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Joins each element of the Iterable with the given glue. For example, given a
     * list of Integers, you can create a comma-separated list by calling
     * {@code join(numbers, ", ")}. Null elements are rendered as {@code "null"}.
     *
     * @param l    the elements to join
     * @param glue the separator placed between consecutive elements
     * @return the joined String (empty if there are no elements)
     */
    public static String join(Iterable<?> l, String glue) {
        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (Object o : l) {
            if (!first) {
                sb.append(glue);
            }
            sb.append(o); // append(Object) is null-safe
            first = false;
        }
        return sb.toString();
    }

    /**
     * Joins each element of the List with the given glue. For example, given a
     * list of Integers, you can create a comma-separated list by calling
     * {@code join(numbers, ", ")}. Null elements are rendered as {@code "null"}.
     *
     * @param l    the elements to join
     * @param glue the separator placed between consecutive elements
     * @return the joined String (empty if the list is empty)
     */
    public static String join(List<?> l, String glue) {
        // Iterate rather than index so that linked lists are not traversed quadratically.
        return join((Iterable<?>) l, glue);
    }

    /**
     * Joins each element of the array with the given glue. For example, given an
     * array of numbers, you can create a comma-separated list by calling
     * {@code join(numbers, ", ")}.
     *
     * @param elements the elements to join
     * @param glue     the separator placed between consecutive elements
     * @return the joined String
     */
    public static String join(Object[] elements, String glue) {
        return join(Arrays.asList(elements), glue);
    }

    /**
     * Joins the elements with a single space.
     *
     * @param l the elements to join
     * @return the joined String
     */
    public static String join(List<?> l) {
        return join(l, " ");
    }

    /**
     * Joins the elements with a single space.
     *
     * @param elements the elements to join
     * @return the joined String
     */
    public static String join(Object[] elements) {
        return join(elements, " ");
    }

    /**
     * Splits on whitespace (\\s+).
     *
     * @param s String to split up
     * @return fixed-size List of the pieces
     */
    public static List<String> split(String s) {
        return split(s, "\\s+");
    }

    /**
     * Splits the given string using the given regex as delimiter. Same as
     * {@code String.split()} except that the results come back in a List;
     * included to parallel the other static regex methods in this class.
     *
     * @param str   String to split up
     * @param regex String to compile as the regular expression
     * @return fixed-size List of Strings resulting from splitting on the regex
     */
    public static List<String> split(String str, String regex) {
        return Arrays.asList(str.split(regex));
    }

    /** Appends {@code count} spaces to the builder; does nothing when count is not positive. */
    private static void appendSpaces(StringBuilder sb, int count) {
        for (int i = 0; i < count; i++) {
            sb.append(' ');
        }
    }

    /**
     * Returns a String of at least totalChars characters by padding str on the
     * right with spaces. If str is already that long or longer it is returned
     * unchanged. A null str is treated as the text {@code "null"}.
     *
     * @param str        the String to pad
     * @param totalChars the minimum length of the result
     * @return the padded String
     */
    public static String pad(String str, int totalChars) {
        String text = (str == null) ? "null" : str;
        StringBuilder sb = new StringBuilder(text);
        appendSpaces(sb, totalChars - text.length());
        return sb.toString();
    }

    /**
     * Pads the toString value of the given Object (null is rendered as {@code "null"}).
     *
     * @param obj        the Object whose text is padded
     * @param totalChars the minimum length of the result
     * @return the padded String
     */
    public static String pad(Object obj, int totalChars) {
        return pad(String.valueOf(obj), totalChars);
    }

    /**
     * Pads (on the right, with spaces) or trims (on the right) so as to produce
     * a string of exactly the given length. A null str is treated as {@code "null"}.
     *
     * @param str The String to be padded or truncated
     * @param num The desired length
     * @return a String of exactly num characters
     */
    public static String padOrTrim(String str, int num) {
        String text = (str == null) ? "null" : str;
        int leng = text.length();
        if (leng > num) {
            return text.substring(0, num);
        }
        if (leng == num) {
            return text;
        }
        StringBuilder sb = new StringBuilder(text);
        appendSpaces(sb, num - leng);
        return sb.toString();
    }

    /**
     * Pads or trims the toString value of the given Object (null is rendered as {@code "null"}).
     *
     * @param obj        the Object whose text is padded or trimmed
     * @param totalChars the desired length
     * @return a String of exactly totalChars characters
     */
    public static String padOrTrim(Object obj, int totalChars) {
        return padOrTrim(String.valueOf(obj), totalChars);
    }

    /**
     * Pads the given String on the left with spaces so that it is at least
     * totalChars long. A null str is treated as {@code "null"}.
     *
     * @param str        the String to pad
     * @param totalChars the minimum length of the result
     * @return the left-padded String
     */
    public static String padLeft(String str, int totalChars) {
        String text = (str == null) ? "null" : str;
        StringBuilder sb = new StringBuilder();
        appendSpaces(sb, totalChars - text.length());
        sb.append(text);
        return sb.toString();
    }

    /**
     * Left-pads the toString value of the given Object (null is rendered as {@code "null"}).
     *
     * @param obj        the Object whose text is padded
     * @param totalChars the minimum length of the result
     * @return the left-padded String
     */
    public static String padLeft(Object obj, int totalChars) {
        return padLeft(String.valueOf(obj), totalChars);
    }

    /**
     * Left-pads the decimal representation of an int.
     *
     * @param i          the number to render
     * @param totalChars the minimum length of the result
     * @return the left-padded String
     */
    public static String padLeft(int i, int totalChars) {
        return padLeft(String.valueOf(i), totalChars);
    }

    /**
     * Left-pads the {@code Double.toString} representation of a double.
     *
     * @param d          the number to render
     * @param totalChars the minimum length of the result
     * @return the left-padded String
     */
    public static String padLeft(double d, int totalChars) {
        return padLeft(String.valueOf(d), totalChars);
    }

    /**
     * Returns s if it's at most maxWidth chars, otherwise chops the right side to fit.
     *
     * @param s        the String to shorten
     * @param maxWidth the maximum length of the result
     * @return s, or its first maxWidth characters
     */
    public static String trim(String s, int maxWidth) {
        return (s.length() <= maxWidth) ? s : s.substring(0, maxWidth);
    }

    /**
     * Trims the toString value of the given Object to at most maxWidth chars.
     *
     * @param obj      the Object whose text is shortened
     * @param maxWidth the maximum length of the result
     * @return the (possibly shortened) text
     */
    public static String trim(Object obj, int maxWidth) {
        return trim(obj.toString(), maxWidth);
    }

    /** Says whether c is one of the ASCII characters A-Z, a-z or 0-9. */
    private static boolean isAsciiAlphaNumeric(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }

    /**
     * Returns a "clean" version of the given filename: ASCII letters, digits and
     * underscores are kept, spaces and dashes become underscores, and every other
     * character is replaced by {@code x<code>x}, where {@code <code>} is the
     * character's decimal UTF-16 value (for example {@code '.'} becomes {@code x46x}).
     *
     * @param s the filename to clean
     * @return the cleaned filename
     */
    public static String fileNameClean(String s) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (isAsciiAlphaNumeric(c) || c == '_') {
                sb.append(c);
            } else if (c == ' ' || c == '-') {
                sb.append('_');
            } else {
                sb.append('x').append((int) c).append('x');
            }
        }
        return sb.toString();
    }

    /**
     * Returns the index of the nth occurrence (counting from 1) of ch in s, or -1
     * if there are fewer than n occurrences or n is not positive.
     *
     * @param s  the String to search
     * @param ch the character to look for
     * @param n  which occurrence to find; 1 is the first
     * @return the index of that occurrence, or -1
     */
    public static int nthIndex(String s, char ch, int n) {
        if (n <= 0) {
            return -1;
        }
        // Start before the first character so an occurrence at index 0 is counted.
        int index = -1;
        for (int i = 0; i < n; i++) {
            index = s.indexOf(ch, index + 1);
            if (index == -1) {
                return -1;
            }
        }
        return index;
    }

    /**
     * Returns the decimal digits of n from position smallestDigit to position
     * biggestDigit, inclusive. Positions count from the least significant digit,
     * which is labeled 1. Missing high-order digits are rendered as '0' and the
     * sign of n is ignored.
     *
     * @param n             the number to take digits from
     * @param smallestDigit position of the lowest digit to keep (1 = units)
     * @param biggestDigit  position of the highest digit to keep
     * @return the selected digits, most significant first
     */
    public static String truncate(int n, int smallestDigit, int biggestDigit) {
        int numDigits = biggestDigit - smallestDigit + 1;
        char[] result = new char[numDigits];
        for (int j = 1; j < smallestDigit; j++) {
            n = n / 10;
        }
        for (int j = numDigits - 1; j >= 0; j--) {
            // n % 10 is in [-9, 9]; abs keeps negative inputs from producing NUL characters.
            result[j] = Character.forDigit(Math.abs(n % 10), 10);
            n = n / 10;
        }
        return new String(result);
    }

    /** Says whether a command-line token is a flag, i.e. starts with a hyphen. */
    private static boolean isFlag(String arg) {
        return arg.startsWith("-");
    }

    /**
     * Parses command line arguments into a Map. Arguments of the form
     * <pre>
     * -flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n
     * </pre>
     * will be parsed so that the flag is a key in the Map (including the hyphen)
     * and its value will be a {@code String[]} containing the optional arguments
     * (if present). The non-flag values not captured as flag arguments are
     * collected into a {@code String[]} and returned as the value of
     * {@code null} in the Map. In this invocation, flags cannot take arguments,
     * so all the array values other than the value for {@code null} will be
     * zero-length.
     *
     * @param args the argument array to be parsed
     * @return a {@link Map} of flag names to flag argument {@code String[]} arrays
     */
    public static Map<String, String[]> argsToMap(String[] args) {
        return argsToMap(args, new HashMap<String, Integer>());
    }

    /**
     * Parses command line arguments into a Map. Arguments of the form
     * <pre>
     * -flag1 arg1a arg1b ... arg1m -flag2 -flag3 arg3a ... arg3n
     * </pre>
     * will be parsed so that the flag is a key in the Map (including the hyphen)
     * and its value will be a {@code String[]} containing the optional arguments
     * (if present). The non-flag values not captured as flag arguments are
     * collected into a {@code String[]} and returned as the value of
     * {@code null} in the Map. The maximum number of arguments for each flag can
     * be specified as an {@link Integer} value of the appropriate flag key in
     * {@code flagsToNumArgs}. (By default, flags cannot take arguments.) When a
     * flag is given more than once, the arguments of later occurrences are
     * appended to those of earlier ones.
     *
     * <p>Example of usage:
     * <pre>
     * Map&lt;String, Integer&gt; flagsToNumArgs = new HashMap&lt;&gt;();
     * flagsToNumArgs.put("-x", 2);
     * flagsToNumArgs.put("-d", 1);
     * Map&lt;String, String[]&gt; result = argsToMap(args, flagsToNumArgs);
     * </pre>
     *
     * @param args           the argument array to be parsed
     * @param flagsToNumArgs a {@link Map} of flag names to {@link Integer} values
     *                       specifying the maximum number of allowed arguments
     *                       for that flag (default 0)
     * @return a {@link Map} of flag names to flag argument {@code String[]} arrays
     */
    public static Map<String, String[]> argsToMap(String[] args, Map<String, Integer> flagsToNumArgs) {
        Map<String, String[]> result = new HashMap<>();
        List<String> remainingArgs = new ArrayList<>();
        for (int i = 0; i < args.length; i++) {
            String key = args[i];
            if (!isFlag(key)) {
                remainingArgs.add(key);
                continue;
            }
            // found a flag
            Integer maxFlagArgs = flagsToNumArgs.get(key);
            int max = (maxFlagArgs == null) ? 0 : maxFlagArgs.intValue();
            List<String> flagArgs = new ArrayList<>();
            // The loop advances i itself so consumed arguments are not re-read by the outer loop.
            for (int j = 0; j < max && i + 1 < args.length && !isFlag(args[i + 1]); i++, j++) {
                flagArgs.add(args[i + 1]);
            }
            String[] previous = result.get(key);
            if (previous != null) {
                // append the second specification onto the earlier arguments
                List<String> merged = new ArrayList<>(Arrays.asList(previous));
                merged.addAll(flagArgs);
                flagArgs = merged;
            }
            result.put(key, flagArgs.toArray(new String[0]));
        }
        result.put(null, remainingArgs.toArray(new String[0]));
        return result;
    }

    /**
     * Parses command line arguments into Properties, with every flag taking at
     * most one argument. See {@link #argsToProperties(String[], Map)}.
     *
     * @param args the argument array to be parsed
     * @return the parsed Properties
     */
    public static Properties argsToProperties(String[] args) {
        return argsToProperties(args, new HashMap<String, Integer>());
    }

    /**
     * Analogous to {@link #argsToMap}. However, there are several key differences
     * between this method and {@link #argsToMap}:
     * <ul>
     * <li>Hyphens are stripped from flag names (also when looking them up in
     *     {@code flagsToNumArgs}).</li>
     * <li>Since Properties objects are String to String mappings, the default
     *     number of arguments to a flag is assumed to be 1 and not 0; multiple
     *     arguments are joined with a space.</li>
     * <li>Furthermore, the list of arguments not bound to a flag is mapped to the
     *     "" property, not null.</li>
     * <li>The special flag "-prop" (matched case-insensitively) will load the
     *     property file specified by its argument. A failure to read that file
     *     is printed to stderr and otherwise ignored.</li>
     * <li>The value for flags without arguments is set to "true".</li>
     * </ul>
     *
     * @param args           the argument array to be parsed
     * @param flagsToNumArgs a Map of hyphen-less flag names to {@link Integer}
     *                       maximum argument counts (default 1)
     * @return the parsed Properties
     */
    public static Properties argsToProperties(String[] args, Map<?, ?> flagsToNumArgs) {
        Properties result = new Properties();
        List<String> remainingArgs = new ArrayList<>();
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (!isFlag(arg)) {
                remainingArgs.add(arg);
                continue;
            }
            // found a flag
            String key = arg.substring(1); // strip off the hyphen
            Integer maxFlagArgs = (Integer) flagsToNumArgs.get(key);
            int max = (maxFlagArgs == null) ? 1 : maxFlagArgs.intValue();
            List<String> flagArgs = new ArrayList<>();
            for (int j = 0; j < max && i + 1 < args.length && !isFlag(args[i + 1]); i++, j++) {
                flagArgs.add(args[i + 1]);
            }
            if (flagArgs.isEmpty()) {
                result.setProperty(key, "true");
                continue;
            }
            String value = join(flagArgs, " ");
            result.setProperty(key, value);
            if (key.equalsIgnoreCase(PROP)) {
                // Use the value just parsed: the flag may have been spelled in a different case.
                try (InputStream in = new BufferedInputStream(new FileInputStream(value))) {
                    result.load(in);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        result.setProperty("", join(remainingArgs, " "));
        return result;
    }

    /**
     * Converts a comma-separated String (with whitespace optionally allowed
     * after the comma) representing properties to a Properties object. Each
     * property is "property=value". The value for properties without an
     * explicitly given value is set to "true".
     *
     * @param str the comma-separated property list
     * @return the parsed Properties
     */
    public static Properties stringToProperties(String str) {
        Properties result = new Properties();
        for (String term : str.trim().split(",\\s*")) {
            int divLoc = term.indexOf('=');
            if (divLoc >= 0) {
                result.setProperty(term.substring(0, divLoc), term.substring(divLoc + 1));
            } else {
                result.setProperty(term, "true");
            }
        }
        return result;
    }

    /**
     * Prints to a file. If the file already exists, appends if append=true, and
     * overwrites if append=false. Failures are reported on stdout/stderr rather
     * than thrown.
     *
     * @param file    the file to write to
     * @param message the text to write
     * @param append  whether to append to existing content
     */
    public static void printToFile(File file, String message, boolean append) {
        try (PrintWriter pw = new PrintWriter(new FileWriter(file, append))) {
            pw.print(message);
        } catch (Exception e) {
            System.out.println("Exception: in printToFile " + file.getAbsolutePath() + " " + message);
            e.printStackTrace();
        }
    }

    /**
     * Prints to a file, replacing any existing content (does not append).
     *
     * @param file    the file to write to
     * @param message the text to write
     */
    public static void printToFile(File file, String message) {
        printToFile(file, message, false);
    }

    /**
     * Prints to a file. If the file already exists, appends if append=true, and
     * overwrites if append=false.
     *
     * @param filename path of the file to write to
     * @param message  the text to write
     * @param append   whether to append to existing content
     */
    public static void printToFile(String filename, String message, boolean append) {
        printToFile(new File(filename), message, append);
    }

    /**
     * Prints to a file, replacing any existing content (does not append).
     *
     * @param filename path of the file to write to
     * @param message  the text to write
     */
    public static void printToFile(String filename, String message) {
        printToFile(new File(filename), message, false);
    }

    /**
     * A simpler form of command line argument parsing.
     * Dan thinks this is highly superior to the overly complexified code that
     * comes before it.
     * Parses command line arguments into a Map. Arguments of the form
     * {@code -flag1 arg1 -flag2 -flag3 arg3}
     * will be parsed so that the flag is a key in the Map (including the hyphen)
     * and the optional argument will be its value (if present). Tokens that are
     * neither a flag nor the argument of a flag are ignored.
     *
     * @param args the argument array to be parsed
     * @return A Map from keys to possible values (String or null)
     */
    public static Map<String, String> parseCommandLineArguments(String[] args) {
        Map<String, String> result = new HashMap<>();
        for (int i = 0; i < args.length; i++) {
            String key = args[i];
            if (!isFlag(key)) {
                continue;
            }
            String value = null;
            if (i + 1 < args.length && !isFlag(args[i + 1])) {
                value = args[i + 1];
                i++; // the value has been consumed
            }
            result.put(key, value);
        }
        return result;
    }

    /**
     * Removes every character that is not an ASCII letter or digit.
     *
     * @param orig the String to filter
     * @return orig with only the characters A-Z, a-z and 0-9 retained
     */
    public static String stripNonAlphaNumerics(String orig) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < orig.length(); i++) {
            char c = orig.charAt(i);
            if (isAsciiAlphaNumeric(c)) {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Prints each character of s on its own line of stdout, as its numeric
     * value followed by the character in single quotes.
     *
     * @param s the String to dump
     */
    public static void printStringOneCharPerLine(String s) {
        for (int i = 0; i < s.length(); i++) {
            int c = s.charAt(i);
            System.out.println(c + " \'" + (char) c + "\' ");
        }
    }

    /**
     * Prefixes every occurrence of the escape character itself, and of any of
     * the given characters, with the escape character.
     *
     * @param s             the String to escape
     * @param charsToEscape the characters that need escaping
     * @param escapeChar    the character used as the escape prefix
     * @return the escaped String
     */
    public static String escapeString(String s, char[] charsToEscape, char escapeChar) {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == escapeChar) {
                result.append(escapeChar);
            } else {
                for (int j = 0; j < charsToEscape.length; j++) {
                    if (c == charsToEscape[j]) {
                        result.append(escapeChar);
                        break;
                    }
                }
            }
            result.append(c);
        }
        return result.toString();
    }

    /**
     * Splits the String s into multiple Strings using the splitChar. However, it
     * provides a quoting facility: it is possible to quote strings with the
     * quoteChar. If the quoteChar occurs within the quoted expression, it must
     * be prefaced by the escapeChar. Empty pieces are dropped. An escapeChar
     * that is the very last character of s has nothing to escape and is treated
     * as an ordinary character.
     *
     * @param s          The String to split
     * @param splitChar  the delimiter between pieces
     * @param quoteChar  the character that opens and closes a quoted expression
     * @param escapeChar the character that escapes the next character inside a quoted expression
     * @return An array of Strings that s is split into
     */
    public static String[] splitOnCharWithQuoting(String s, char splitChar, char quoteChar, char escapeChar) {
        List<String> result = new ArrayList<>();
        int i = 0;
        int length = s.length();
        StringBuilder b = new StringBuilder();
        while (i < length) {
            char curr = s.charAt(i);
            if (curr == splitChar) {
                // add last buffer
                if (b.length() > 0) {
                    result.add(b.toString());
                    b = new StringBuilder();
                }
                i++;
            } else if (curr == quoteChar) {
                // find next instance of quoteChar
                i++;
                while (i < length) {
                    curr = s.charAt(i);
                    if (curr == escapeChar && i + 1 < length) {
                        b.append(s.charAt(i + 1));
                        i += 2;
                    } else if (curr == quoteChar) {
                        i++;
                        break; // break this loop
                    } else {
                        b.append(curr);
                        i++;
                    }
                }
            } else {
                b.append(curr);
                i++;
            }
        }
        if (b.length() > 0) {
            result.add(b.toString());
        }
        return result.toArray(new String[0]);
    }

    /**
     * Computes the length of the longest common subsequence of s and t (the
     * method name says "substring" for historical reasons). That is the longest
     * run of characters that appear in order inside both s and t; both may have
     * other extraneous characters along the way. This is like edit distance but
     * with no substitution, and a higher number means more similar. For example,
     * the LCS of "abcD" and "aXbc" is 3 (abc).
     *
     * @param s the first String
     * @param t the second String
     * @return the length of the longest common subsequence
     */
    public static int longestCommonSubstring(String s, String t) {
        int n = s.length();
        int m = t.length();
        if (n == 0 || m == 0) {
            return 0;
        }
        // d[i][j] = LCS length of the first i chars of s and the first j chars of t;
        // row 0 and column 0 stay at their default of 0.
        int[][] d = new int[n + 1][m + 1];
        for (int i = 1; i <= n; i++) {
            char sI = s.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                // Skipping a char of either string is always allowed (no substitutions)...
                int best = Math.max(d[i - 1][j], d[i][j - 1]);
                // ...and if the chars match, you can get an extra point.
                if (sI == t.charAt(j - 1)) {
                    best = Math.max(best, d[i - 1][j - 1] + 1);
                }
                d[i][j] = best;
            }
        }
        return d[n][m];
    }

    /**
     * Computes the Levenshtein (edit) distance of the two given Strings.
     *
     * @param s the first String
     * @param t the second String
     * @return the minimum number of single-character insertions, deletions and
     *         substitutions that turn s into t
     */
    public static int editDistance(String s, String t) {
        int n = s.length();
        int m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        // d[i][j] = distance between the first i chars of s and the first j chars of t
        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) {
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) {
            d[0][j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char sI = s.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                int cost = (sI == t.charAt(j - 1)) ? 0 : 1;
                int deletion = d[i - 1][j] + 1;
                int insertion = d[i][j - 1] + 1;
                int substitution = d[i - 1][j - 1] + cost;
                d[i][j] = Math.min(deletion, Math.min(insertion, substitution));
            }
        }
        return d[n][m];
    }

    /**
     * Computes the WordNet 2.0 POS tag corresponding to the PTB POS tag s.
     *
     * @param s a Penn TreeBank POS tag.
     * @return "noun", "verb", "adjective" or "adverb", or null if the tag maps to none of them
     */
    public static String pennPOSToWordnetPOS(String s) {
        switch (s) {
            case "NN":
            case "NNP":
            case "NNS":
            case "NNPS":
                return "noun";
            case "VB":
            case "VBD":
            case "VBG":
            case "VBN":
            case "VBZ":
            case "VBP":
            case "MD":
                return "verb";
            case "JJ":
            case "JJR":
            case "JJS":
            case "CD":
                return "adjective";
            case "RB":
            case "RBR":
            case "RBS":
            case "RP":
            case "WRB":
                return "adverb";
            default:
                return null;
        }
    }

    /**
     * Uppercases the first character of a string.
     *
     * @param s a string to capitalize
     * @return a capitalized version of the string; s itself when it is empty or
     *         does not start with a lowercase letter
     */
    public static String capitalize(String s) {
        if (s.isEmpty()) {
            return s;
        }
        char first = s.charAt(0);
        if (!Character.isLowerCase(first)) {
            return s;
        }
        return Character.toUpperCase(first) + s.substring(1);
    }

    /**
     * Finds successive matches of the regex in str. After each match the search
     * restarts on the text following that match, so every returned Matcher is
     * positioned on its match and reports indices relative to the remainder of
     * str it was run on, not to str itself.
     *
     * @param str   String to search
     * @param regex String to compile as the regular expression
     * @return the Matchers, one per match, in order of occurrence
     */
    public static List<Matcher> allMatches(String str, String regex) {
        Pattern p = Pattern.compile(regex);
        List<Matcher> matches = new ArrayList<>();
        while (true) {
            Matcher m = p.matcher(str);
            if (!m.find()) {
                break;
            }
            matches.add(m);
            int next = m.end();
            if (next == 0) {
                // Empty match at the very start: step over one character (or stop at
                // the end of input) so the loop always makes progress.
                if (str.isEmpty()) {
                    break;
                }
                next = 1;
            }
            str = str.substring(next);
        }
        return matches;
    }

    /**
     * Small demonstration: prints the edit distance and LCS length for every
     * ordered pair of a handful of sample sentences.
     *
     * @param args ignored
     * @throws IOException never thrown by the current implementation; kept for compatibility
     */
    public static void main(String[] args) throws IOException {
        String[] s = {"there once was a man", "this one is a manic", "hey there", "there once was a mane",
                        "once in a manger.", "where is one match?"};
        for (String first : s) {
            for (String second : s) {
                System.out.println("s1: " + first);
                System.out.println("s2: " + second);
                System.out.println("edit distance: " + editDistance(first, second));
                System.out.println("LCS: " + longestCommonSubstring(first, second));
                System.out.println();
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy