All Downloads are FREE. Search and download functionalities are using the official Maven repository.

sunlabs.brazil.util.regexp.Regexp Maven / Gradle / Ivy

The newest version!
/*
 * Regexp.java
 *
 * Brazil project web application toolkit,
 * export version: 2.3 
 * Copyright (c) 1999-2004 Sun Microsystems, Inc.
 *
 * Sun Public License Notice
 *
 * The contents of this file are subject to the Sun Public License Version 
 * 1.0 (the "License"). You may not use this file except in compliance with 
 * the License. A copy of the License is included as the file "license.terms",
 * and also available at http://www.sun.com/
 * 
 * The Original Code is from:
 *    Brazil project web application toolkit release 2.3.
 * The Initial Developer of the Original Code is: cstevens.
 * Portions created by cstevens are Copyright (C) Sun Microsystems, Inc.
 * All Rights Reserved.
 * 
 * Contributor(s): cstevens, suhler.
 *
 * Version:  2.3
 * Created by cstevens on 99/08/10
 * Last modified by suhler on 04/12/30 12:42:56
 *
 * Version Histories:
 *
 * 2.3 04/12/30-12:42:56 (suhler)
 *   add toString()
 *
 * 2.2 04/11/30-15:19:46 (suhler)
 *   fixed sccs version string
 *
 * 2.1 02/10/01-16:37:03 (suhler)
 *   version change
 *
 * 1.10 00/11/06-10:45:53 (suhler)
 *   make serializable
 *
 * 1.9 00/05/31-13:52:58 (suhler)
 *   docs
 *
 * 1.8 99/11/17-10:20:17 (suhler)
 *   fixed wildcarded imports
 *
 * 1.7 99/10/14-13:04:11 (cstevens)
 *   Documentation for Regexp.Filter
 *
 * 1.6 99/10/07-13:19:37 (cstevens)
 *   javadoc lint.
 *
 * 1.5 99/09/03-11:34:10 (cstevens)
 *   Change Regexp.sub(String, String) so that if there were 0 matches it returns
 *   null.
 *   Regexp.match with a pattern that had '$' at the end was broken.
 *
 * 1.4 99/08/27-16:18:05 (cstevens)
 *   "\\", "\&", "\" followed by anything not a digit should be a literal char in
 *   subspec, not an error.
 *
 * 1.3 99/08/27-13:12:23 (cstevens)
 *   Consolidate RegexpFilter into Regexp, making Regexp.Filter a public inner
 *   interface.
 *
 * 1.2 99/08/27-12:32:52 (cstevens)
 *   Passes tcl test suite.
 *   Added support for case-insensitive match
 *   Put RegexpFilter back in.
 *   Fixed exceptions revealed by test suite.
 *
 * 1.1.1.1 99/08/18-08:41:04 (suhler)
 *   lint
 *
 * 1.2 99/08/10-16:14:33 (Codemgr)
 *   SunPro Code Manager data about conflicts, renames, etc...
 *   Name history : 1 0 util/regexp/Regexp.java
 *
 * 1.1 99/08/10-16:14:32 (cstevens)
 *   date and time created 99/08/10 16:14:32 by cstevens
 *
 */

package sunlabs.brazil.util.regexp;

/**
 * The Regexp class can be used to match a pattern against a
 * string and optionally replace the matched parts with new strings.
 * 

* Regular expressions were implemented by translating Henry Spencer's * regular expression package for tcl8.0. * Much of the description below is copied verbatim from the tcl8.0 regsub * manual entry. *


* REGULAR EXPRESSIONS *

* A regular expression is zero or more branches, separated by * "|". It matches anything that matches one of the branches. *

* A branch is zero or more pieces, concatenated. * It matches a match for the first piece, followed by a match for the * second piece, etc. *

* A piece is an atom, possibly followed by "*", "+", or * "?".

    *
  • An atom followed by "*" matches a sequence of 0 or more matches of * the atom. *
  • An atom followed by "+" matches a sequence of 1 or more matches of * the atom. *
  • An atom followed by "?" matches either 0 or 1 matches of the atom. *
*

* An atom is

    *
  • a regular expression in parentheses (matching a match for the * regular expression) *
  • a range (see below) *
  • "." (matching any single character) *
  • "^" (matching the null string at the beginning of the input string) *
  • "$" (matching the null string at the end of the input string) *
  • a "\" followed by a single character (matching that character) *
  • a single character with no other significance (matching that * character). *
*

* A range is a sequence of characters enclosed in "[]". * The range normally matches any single character from the sequence. * If the sequence begins with "^", the range matches any single character * not from the rest of the sequence. * If two characters in the sequence are separated by "-", this is shorthand * for the full list of characters between them (e.g. "[0-9]" matches any * decimal digit). To include a literal "]" in the sequence, make it the * first character (following a possible "^"). To include a literal "-", * make it the first or last character. *

* In general there may be more than one way to match a regular expression * to an input string. For example, consider the command *

 * String[] match = new String[2];
 * Regexp.match("(a*)b*", "aabaaabb", match);
 * 
* Considering only the rules given so far, match[0] and * match[1] could end up with the values
    *
  • "aabb" and "aa" *
  • "aaab" and "aaa" *
  • "ab" and "a" *
* or any of several other combinations. To resolve this potential ambiguity, * Regexp chooses among alternatives using the rule "first then longest". * In other words, it considers the possible matches in order working * from left to right across the input string and the pattern, and it * attempts to match longer pieces of the input string before shorter * ones. More specifically, the following rules apply in decreasing * order of priority:
    *
  1. If a regular expression could match two different parts of an input * string then it will match the one that begins earliest. *
  2. If a regular expression contains "|" operators then the * leftmost matching sub-expression is chosen. *
  3. In "*", "+", and "?" constructs, longer matches are chosen in * preference to shorter ones. *
  4. * In sequences of expression components the components are considered * from left to right. *
*

* In the example from above, "(a*)b*" therefore matches exactly "aab"; the * "(a*)" portion of the pattern is matched first and it consumes the leading * "aa", then the "b*" portion of the pattern consumes the next "b". Or, * consider the following example: *

 * String[] match = new String[3];
 * Regexp.match("(ab|a)(b*)c", "abc", match);
 * 
* After this command, match[0] will be "abc", * match[1] will be "ab", and match[2] will be an * empty string. * Rule 4 specifies that the "(ab|a)" component gets first shot at the input * string and Rule 2 specifies that the "ab" sub-expression * is checked before the "a" sub-expression. * Thus the "b" has already been claimed before the "(b*)" * component is checked and therefore "(b*)" must match an empty string. *
* * REGULAR EXPRESSION SUBSTITUTION *

* Regular expression substitution matches a string against a regular * expression, transforming the string by replacing the matched region(s) * with new substring(s). *

* What gets substituted into the result is controlled by a * subspec. The subspec is a formatting string that specifies * what portions of the matched region should be substituted into the * result. *

    *
  • "&" or "\0" is replaced with a copy of the entire matched region. *
  • "\n", where n is a digit from 1 to 9, * is replaced with a copy of the nth subexpression. *
  • "\&" or "\\" are replaced with just "&" or "\" to escape their * special meaning. *
  • any other character is passed through. *
* In the above, a string like "\2" represents the two characters * backslash and "2", not the Unicode character 0002. *
* Here is an example of how to use Regexp *
 *
 *    public static void
 *    main(String[] args)
 *	throws Exception
 *    {
 *	Regexp re;
 *	String[] matches;
 *	String s;
 *
 *	/*
 *	 * A regular expression to match the first line of a HTTP request.
 *	 *
 *	 * 1. ^               - starting at the beginning of the line
 *	 * 2. ([A-Z]+)        - match and remember some upper case characters
 *	 * 3. [ \t]+          - skip blank space
 *	 * 4. ([^ \t]+)       - match and remember up to the next blank space
 *	 * 5. [ \t]+          - skip more blank space
 *	 * 6. (HTTP/1\\.[01]) - match and remember HTTP/1.0 or HTTP/1.1
 *	 * 7. $		      - end of string - no chars left.
 *	 */
 *
 *	s = "GET http://a.b.com:1234/index.html HTTP/1.1";
 *
 *	re = new Regexp("^([A-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\\.[01])$");
 *	matches = new String[4];
 *	if (re.match(s, matches)) {
 *	    System.out.println("METHOD  " + matches[1]);
 *	    System.out.println("URL     " + matches[2]);
 *	    System.out.println("VERSION " + matches[3]);
 *	}
 *
 *	/*
 *	 * A regular expression to extract some simple comma-separated data,
 *	 * reorder some of the columns, and discard column 2.
 *	 */
 *
 *	s = "abc,def,ghi,klm,nop,pqr";
 *
 *	re = new Regexp("^([^,]+),([^,]+),([^,]+),(.*)");
 *	System.out.println(re.sub(s, "\\3,\\1,\\4"));
 *    }
 * 
* * @author Colin Stevens ([email protected]) * @version 2.3 * @see Regsub */ public class Regexp implements java.io.Serializable { public static void main(String[] args) throws Exception { if ((args.length == 2) && (args[0].equals("compile"))) { System.out.println(new Regexp(args[1])); } else if ((args.length == 3) && (args[0].equals("match"))) { Regexp r = new Regexp(args[1]); String[] substrs = new String[r.subspecs()]; boolean match = r.match(args[2], substrs); System.out.println("match:\t" + match); for (int i = 0; i < substrs.length; i++) { System.out.println((i + 1) + ":\t" + substrs[i]); } } else if ((args.length == 4) && (args[0].equals("sub"))) { Regexp r = new Regexp(args[1]); System.out.println(r.subAll(args[2], args[3])); } else { System.out.println("usage:"); System.out.println("\tRegexp match "); System.out.println("\tRegexp sub "); System.out.println("\tRegexp compile "); } } /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or * "railroad normal form" in parsing technology). Each node is an opcode * plus a "next" pointer, possibly plus an operand. "Next" pointers of * all nodes except BRANCH implement concatenation; a "next" pointer with * a BRANCH on both ends of it is connecting two alternatives. (Here we * have one of the subtle syntax dependencies: an individual BRANCH (as * opposed to a collection of them) is never concatenated with anything * because of operator precedence.) The operand of some types of node is * a literal string; for others, it is a node leading into a sub-FSM. In * particular, the operand of a BRANCH node is the first node of the branch. * (NB this is *not* a tree structure: the tail of the branch connects * to the thing following the set of BRANCHes.) The opcodes are: */ static final int NSUBEXP = 100; /* definition number opnd? meaning */ static final char END = 0; /* no End of program. 
*/
    static final char BOL = 1;          /* no   Match "" at beginning of line. */
    static final char EOL = 2;          /* no   Match "" at end of line. */
    static final char ANY = 3;          /* no   Match any one character. */
    static final char ANYOF = 4;        /* str  Match any character in this string. */
    static final char ANYBUT = 5;       /* str  Match any character not in this string. */
    static final char BRANCH = 6;       /* node Match this alternative, or the next... */
    static final char BACK = 7;         /* no   Match "", "next" ptr points backward. */
    static final char EXACTLY = 8;      /* str  Match this string. */
    static final char NOTHING = 9;      /* no   Match empty string. */
    static final char STAR = 10;        /* node Match this (simple) thing 0 or more times. */
    static final char PLUS = 11;        /* node Match this (simple) thing 1 or more times. */
    static final char OPEN = 20;        /* no   Mark this point in input as start of #n. */
                                        /*      OPEN+1 is number 1, etc. */
    static final char CLOSE = (char) (OPEN+NSUBEXP);
                                        /* no   Analogous to OPEN. */

    /*
     * Printable names for the opcodes END through PLUS, indexed by opcode
     * value.  toString() consults this table only for opcodes below OPEN;
     * OPEN and CLOSE nodes are printed with their group number instead.
     */
    static final String[] opnames = {
        "END", "BOL", "EOL", "ANY", "ANYOF", "ANYBUT", "BRANCH", "BACK",
        "EXACTLY", "NOTHING", "STAR", "PLUS"
    };

    /*
     * A node is one char of opcode followed by one char of "next" pointer.
     * The value is a positive offset from the opcode of the node containing
     * it.  An operand, if any, simply follows the node.  (Note that much of
     * the code generation knows about this implicit relationship.)
     *
     * Opcode notes:
     *
     * BRANCH   The set of branches constituting a single choice are hooked
     *          together with their "next" pointers, since precedence prevents
     *          anything being concatenated to any individual branch.  The
     *          "next" pointer of the last BRANCH in a choice points to the
     *          thing following the whole choice.  This is also where the
     *          final "next" pointer of each individual branch points; each
     *          branch starts with the operand node of a BRANCH node.
     *
     * ANYOF, ANYBUT, EXACTLY
     *          The format of a string operand is one char of length
     *          followed by the characters making up the string.
     *
     * BACK     Normal "next" pointers all implicitly point forward; BACK
     *          exists to make loop structures possible.
     *
     * STAR, PLUS
     *          '?', and complex '*' and '+' are implemented as circular
     *          BRANCH structures using BACK.  Simple cases (one character
     *          per match) are implemented with STAR and PLUS for speed
     *          and to minimize recursive plunges.
     *
     * OPENn, CLOSEn
     *          are numbered at compile time.
     */

    /**
     * The bytecodes making up the regexp program.
     */
    char[] program;

    /**
     * Whether the regexp matching should be case insensitive.
     */
    boolean ignoreCase;

    /**
     * The number of parenthesized subexpressions in the regexp pattern,
     * plus 1 for the match of the whole pattern itself.
     */
    int npar;

    /**
     * <code>true</code> if the pattern must match the beginning of the
     * string, so we don't have to waste time matching against all possible
     * starting locations in the string.
     */
    boolean anchored;

    /**
     * The character every match has to begin with, or <code>-1</code> when
     * no such character is known.  compile() sets it when the program is a
     * single alternative whose first node is EXACTLY, and exec() then uses
     * <code>String.indexOf</code> to visit only offsets holding that
     * character.
     */
    int startChar;

    /**
     * Reserved for a literal substring that every match must contain (see
     * the disabled optimization in compile()).
     * NOTE(review): no active code visible in this part of the file assigns
     * it; toString() only prints it.
     */
    String must;

    /**
     * Compiles a new Regexp object from the given regular expression
     * pattern.
     *

* It takes a certain amount of time to parse and validate a regular * expression pattern before it can be used to perform matches * or substitutions. If the caller caches the new Regexp object, that * parsing time will be saved because the same Regexp can be used with * respect to many different strings. * * @param pat * The string holding the regular expression pattern. * * @throws IllegalArgumentException if the pattern is malformed. * The detail message for the exception will be set to a * string indicating how the pattern was malformed. */ public Regexp(String pat) throws IllegalArgumentException { compile(pat); } /** * Compiles a new Regexp object from the given regular expression * pattern. * * @param pat * The string holding the regular expression pattern. * * @param ignoreCase * If true then this regular expression will * do case-insensitive matching. If false, then * the matches are case-sensitive. Regular expressions * generated by Regexp(String) are case-sensitive. * * @throws IllegalArgumentException if the pattern is malformed. * The detail message for the exception will be set to a * string indicating how the pattern was malformed. */ public Regexp(String pat, boolean ignoreCase) throws IllegalArgumentException { this.ignoreCase = ignoreCase; if (ignoreCase) { pat = pat.toLowerCase(); } compile(pat); } /** * Returns the number of parenthesized subexpressions in this regular * expression, plus one more for this expression itself. * * @return The number. */ public int subspecs() { return npar; } /** * Matches the given string against this regular expression. * * @param str * The string to match. * * @return The substring of str that matched the entire * regular expression, or null if the string did not * match this regular expression. 
*/ public String match(String str) { Match m = exec(str, 0, 0); if (m == null) { return null; } return str.substring(m.indices[0], m.indices[1]); } /** * Matches the given string against this regular expression, and computes * the set of substrings that matched the parenthesized subexpressions. *

* substrs[0] is set to the range of str * that matched the entire regular expression. *

* substrs[1] is set to the range of str * that matched the first (leftmost) parenthesized subexpression. * substrs[n] is set to the range that matched the * nth subexpression, and so on. *

* If subexpression n did not match, then * substrs[n] is set to null. Not to * be confused with "", which is a valid value for a * subexpression that matched 0 characters. *

* The length that the caller should use when allocating the * substr array is the return value of * Regexp.subspecs. The array * can be shorter (in which case not all the information will * be returned), or longer (in which case the remainder of the * elements are initialized to null), or * null (to ignore the subexpressions). * * @param str * The string to match. * * @param substrs * An array of strings allocated by the caller, and filled in * with information about the portions of str that * matched the regular expression. May be null. * * @return true if str that matched this * regular expression, false otherwise. * If false is returned, then the contents of * substrs are unchanged. * * @see #subspecs */ public boolean match(String str, String[] substrs) { Match m = exec(str, 0, 0); if (m == null) { return false; } if (substrs != null) { int max = Math.min(substrs.length, npar); int i; int j = 0; for (i = 0; i < max; i++) { int start = m.indices[j++]; int end = m.indices[j++]; if (start < 0) { substrs[i] = null; } else { substrs[i] = str.substring(start, end); } } for ( ; i < substrs.length; i++) { substrs[i] = null; } } return true; } /** * Matches the given string against this regular expression, and computes * the set of substrings that matched the parenthesized subexpressions. *

* For the indices specified below, the range extends from the character * at the starting index up to, but not including, the character at the * ending index. *

* indices[0] and indices[1] are set to * starting and ending indices of the range of str * that matched the entire regular expression. *

* indices[2] and indices[3] are set to the * starting and ending indices of the range of str that * matched the first (leftmost) parenthesized subexpression. * indices[n * 2] and indices[n * 2 + 1] * are set to the range that matched the nth * subexpression, and so on. *

* If subexpression n did not match, then * indices[n * 2] and indices[n * 2 + 1] * are both set to -1. *

* The length that the caller should use when allocating the * indices array is twice the return value of * Regexp.subspecs. The array * can be shorter (in which case not all the information will * be returned), or longer (in which case the remainder of the * elements are initialized to -1), or * null (to ignore the subexpressions). * * @param str * The string to match. * * @param indices * An array of integers allocated by the caller, and filled in * with information about the portions of str that * matched all the parts of the regular expression. * May be null. * * @return true if the string matched the regular expression, * false otherwise. If false is * returned, then the contents of indices are * unchanged. * * @see #subspecs */ public boolean match(String str, int[] indices) { Match m = exec(str, 0, 0); if (m == null) { return false; } if (indices != null) { int max = Math.min(indices.length, npar * 2); System.arraycopy(m.indices, 0, indices, 0, max); for (int i = max; i < indices.length; i++) { indices[i] = -1; } } return true; } /** * Matches a string against a regular expression and replaces the first * match with the string generated from the substitution parameter. * * @param str * The string to match against this regular expression. * * @param subspec * The substitution parameter, described in * REGULAR EXPRESSION SUBSTITUTION. * * @return The string formed by replacing the first match in * str with the string generated from * subspec. If no matches were found, then * the return value is null. */ public String sub(String str, String subspec) { Regsub rs = new Regsub(this, str); if (rs.nextMatch()) { StringBuffer sb = new StringBuffer(rs.skipped()); applySubspec(rs, subspec, sb); sb.append(rs.rest()); return sb.toString(); } else { return null; } } /** * Matches a string against a regular expression and replaces all * matches with the string generated from the substitution parameter. 
* After each substutition is done, the portions of the string already * examined, including the newly substituted region, are not checked * again for new matches -- only the rest of the string is examined. * * @param str * The string to match against this regular expression. * * @param subspec * The substitution parameter, described in * REGULAR EXPRESSION SUBSTITUTION. * * @return The string formed by replacing all the matches in * str with the strings generated from * subspec. If no matches were found, then * the return value is a copy of str. */ public String subAll(String str, String subspec) { return sub(str, new SubspecFilter(subspec, true)); } /** * Utility method to give access to the standard substitution algorithm * used by sub and subAll. Appends to the * string buffer the string generated by applying the substitution * parameter to the matched region. * * @param rs * Information about the matched region. * * @param subspec * The substitution parameter. * * @param sb * StringBuffer to which the generated string is appended. */ public static void applySubspec(Regsub rs, String subspec, StringBuffer sb) { try { int len = subspec.length(); for (int i = 0; i < len; i++) { char ch = subspec.charAt(i); switch (ch) { case '&': { sb.append(rs.matched()); break; } case '\\': { i++; ch = subspec.charAt(i); if ((ch >= '0') && (ch <= '9')) { String match = rs.submatch(ch - '0'); if (match != null) { sb.append(match); } break; } // fall through. } default: { sb.append(ch); } } } } catch (IndexOutOfBoundsException e) { /* * Ignore malformed substitution pattern. * Return string matched so far. 
*/ } } public String sub(String str, Filter rf) { Regsub rs = new Regsub(this, str); if (rs.nextMatch() == false) { return str; } StringBuffer sb = new StringBuffer(); do { sb.append(rs.skipped()); if (rf.filter(rs, sb) == false) { break; } } while (rs.nextMatch()); sb.append(rs.rest()); return sb.toString(); } /** * This interface is used by the Regexp class to generate * the replacement string for each pattern match found in the source * string. * * @author Colin Stevens ([email protected]) * @version 2.3, 04/12/30 */ public interface Filter { /** * Given the current state of the match, generate the replacement * string. This method will be called for each match found in * the source string, unless this filter decides not to handle any * more matches. *

* The implementation can use whatever rules it chooses * to generate the replacement string. For example, here is an * example of a filter that replaces the first 5 * occurrences of "%XX" in a string with the ASCII character * represented by the hex digits "XX": *

	 * String str = ...;
	 *
	 * Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]");
	 *
	 * Regexp.Filter rf = new Regexp.Filter() {
	 *     int count = 5;
	 *     public boolean filter(Regsub rs, StringBuffer sb) {
	 *         String match = rs.matched();
	 *         int hi = Character.digit(match.charAt(1), 16);
	 *         int lo = Character.digit(match.charAt(2), 16);
	 *         sb.append((char) ((hi << 4) | lo));
	 *         return (--count > 0);
	 *     }
	 * };
	 *
	 * String result = re.sub(str, rf);
	 * 
* * @param rs * Regsub containing the state of the current * match. * * @param sb * The string buffer that this filter should append the * generated string to. This string buffer actually * contains the results the calling Regexp has * generated up to this point. * * @return false if no further matches should be * considered in this string, true to allow * Regexp to continue looking for further * matches. */ public boolean filter(Regsub rs, StringBuffer sb); } private static class SubspecFilter implements Filter { String subspec; boolean all; public SubspecFilter(String subspec, boolean all) { this.subspec = subspec; this.all = all; } public boolean filter(Regsub rs, StringBuffer sb) { applySubspec(rs, subspec, sb); return all; } } /** * Returns a string representation of this compiled regular * expression. The format of the string representation is a * symbolic dump of the bytecodes. * * @return A string representation of this regular expression. */ public String toString() { StringBuffer sb = new StringBuffer(); sb.append("# subs: " + npar + "\n"); sb.append("anchor: " + anchored + "\n"); sb.append("start: " + (char) startChar + "\n"); sb.append("must: " + must + "\n"); for (int i = 0; i < program.length; ) { sb.append(i + ":\t"); int op = program[i]; if (op >= CLOSE) { sb.append("CLOSE" + (op - CLOSE)); } else if (op >= OPEN) { sb.append("OPEN" + (op - OPEN)); } else { sb.append(opnames[op]); } int line; int offset = (int) program[i + 1]; if (offset == 0) { sb.append('\t'); } else if (op == BACK) { sb.append("\t-" + offset + "," + (i - offset)); } else { sb.append("\t+" + offset + "," + (i + offset)); } if ((op == ANYOF) || (op == ANYBUT) || (op == EXACTLY)) { sb.append("\t'"); sb.append(program, i + 3, program[i + 2]); sb.append("'"); i += 3 + program[i + 2]; } else { i += 2; } sb.append('\n'); } return sb.toString(); } private void compile(String exp) throws IllegalArgumentException { Compiler rcstate = new Compiler(); rcstate.parse = exp.toCharArray(); 
rcstate.off = 0; rcstate.npar = 1; rcstate.code = new StringBuffer(); rcstate.reg(false); program = rcstate.code.toString().toCharArray(); npar = rcstate.npar; startChar = -1; /* optimize */ if (program[rcstate.regnext(0)] == END) { if (program[2] == BOL) { anchored = true; } else if (program[2] == EXACTLY) { startChar = (int) program[5]; } } /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. */ /* if ((rcstate.flagp & Compiler.SPSTART) != 0) { int index = -1; int longest = 0; for (scan = 0; scan < program.length; ) { switch (program[scan]) { case EXACTLY: int length = program[scan + 2]; if (length > longest) { index = scan; longest = length; } // fall through; case ANYOF: case ANYBUT: scan += 3 + program[scan + 2]; break; default: scan += 2; break; } } if (longest > 0) { must = new String(program, index + 3, longest); } } */ } Match exec(String str, int start, int off) { if (ignoreCase) { str = str.toLowerCase(); } Match match = new Match(); match.program = program; /* Mark beginning of line for ^ . */ match.str = str; match.bol = start; match.length = str.length(); match.indices = new int[npar * 2]; if (anchored) { /* Simplest case: anchored match need be tried only once. */ if (match.regtry(off)) { return match; } } else if (startChar >= 0) { /* We know what char it must start with. */ while (off < match.length) { off = str.indexOf(startChar, off); if (off < 0) { break; } if (match.regtry(off)) { return match; } off++; } } else { /* Messy cases: unanchored match. 
*/
            /*
             * Try each starting offset in turn.  The post-increment in the
             * loop test means the offset equal to the string length is
             * tried as well, so a pattern that can match the empty string
             * can match at the very end.
             */
            do {
                if (match.regtry(off)) {
                    return match;
                }
            } while (off++ < match.length);
        }
        return null;
    }

    /*
     * Recursive-descent translator from a pattern to the node program.
     *
     * The caller (compile) fills in parse, off, npar and code and then
     * calls reg(false).  Each of reg, regbranch, regpiece and regatom
     * returns the index in "code" of the first node it emitted and leaves
     * a summary of what it parsed in flagp.
     */
    static class Compiler {
        char[] parse;           /* The pattern being compiled. */
        int off;                /* Index of the next unread char in parse. */
        int npar;               /* Number to give the next "(" group; starts at 1. */
        StringBuffer code;      /* The program emitted so far. */
        int flagp;              /* Flags describing the construct parsed last. */

        /* Characters that end a run of ordinary characters in regatom. */
        static final String META = "^$.[()|?+*\\";
        /* The repetition operators (isMult tests for the same three). */
        static final String MULT = "*+?";

        static final int WORST = 00;    /* Worst case. */
        static final int HASWIDTH = 01; /* Known never to match null string. */
        static final int SIMPLE = 02;   /* Simple enough to be STAR/PLUS operand. */
        static final int SPSTART = 04;  /* Starts with * or +. */

        /*
         - reg - regular expression, i.e. main body or parenthesized thing
         *
         * Caller must absorb opening parenthesis.
         *
         * Combining parenthesis handling with the base level of regular expression
         * is a trifle forced, but the need to tie the tails of the branches to what
         * follows makes it hard to avoid.
         *
         * Returns the index of the first node emitted (the OPEN node when
         * paren is true, otherwise the first BRANCH).  Throws
         * IllegalArgumentException for too many groups or unbalanced
         * parentheses.
         */
        int reg(boolean paren) throws IllegalArgumentException {
            int netFlags = HASWIDTH;
            int parno = 0;
            int ret = -1;

            if (paren) {
                parno = npar++;
                if (npar >= NSUBEXP) {
                    throw new IllegalArgumentException("too many ()");
                }
                ret = regnode((char) (OPEN + parno));
            }

            /* Pick up the branches, linking them together. */
            int br = regbranch();
            if (ret >= 0) {
                regtail(ret, br);
            } else {
                ret = br;
            }

            /* The whole thing has width only if every alternative has. */
            if ((flagp & HASWIDTH) == 0) {
                netFlags &= ~HASWIDTH;
            }
            netFlags |= (flagp & SPSTART);
            while ((off < parse.length) && (parse[off] == '|')) {
                off++;
                br = regbranch();
                regtail(ret, br);
                if ((flagp & HASWIDTH) == 0) {
                    netFlags &= ~HASWIDTH;
                }
                netFlags |= (flagp & SPSTART);
            }

            /* Make a closing node, and hook it on the end. */
            int ender = regnode((paren) ? (char) (CLOSE + parno) : END);
            regtail(ret, ender);

            /* Hook the tails of the branches to the closing node. */
            for (br = ret; br >= 0; br = regnext(br)) {
                regoptail(br, ender);
            }

            /* Check for proper termination. */
            if (paren && ((off >= parse.length) || (parse[off++] != ')'))) {
                throw new IllegalArgumentException("missing )");
            } else if ((paren == false) && (off < parse.length)) {
                throw new IllegalArgumentException("unexpected )");
            }

            flagp = netFlags;
            return ret;
        }

        /*
         - regbranch - one alternative of an | operator
         *
         * Implements the concatenation operator.
         *
         * Returns the index of the BRANCH node heading the alternative.
         */
        int regbranch() throws IllegalArgumentException {
            int netFlags = WORST;       /* Tentatively. */

            int ret = regnode(BRANCH);
            int chain = -1;
            while ((off < parse.length) && (parse[off] != '|')
                    && (parse[off] != ')')) {
                int latest = regpiece();
                netFlags |= flagp & HASWIDTH;
                if (chain < 0) {        /* First piece. */
                    netFlags |= (flagp & SPSTART);
                } else {
                    regtail(chain, latest);
                }
                chain = latest;
            }
            if (chain < 0) {            /* Loop ran zero times. */
                regnode(NOTHING);
            }

            flagp = netFlags;
            return ret;
        }

        /*
         - regpiece - something followed by possible [*+?]
         *
         * Note that the branching code sequences used for ? and the general cases
         * of * and + are somewhat optimized:  they use the same NOTHING node as
         * both the endmarker for their branch list and the body of the last branch.
         * It might seem that this node could be dispensed with entirely, but the
         * endmarker role is not redundant.
         *
         * Returns the index of the first node of the piece.  Throws
         * IllegalArgumentException when * or + is applied to an operand
         * that may match the empty string, or when repetition operators
         * are stacked.
         */
        int regpiece() throws IllegalArgumentException {
            int netFlags;

            int ret = regatom();

            /* No repetition operator follows: the atom is the whole piece. */
            if ((off >= parse.length) || (isMult(parse[off]) == false)) {
                return ret;
            }
            char op = parse[off];

            if (((flagp & HASWIDTH) == 0) && (op != '?')) {
                throw new IllegalArgumentException("*+ operand could be empty");
            }
            netFlags = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);

            if ((op == '*') && ((flagp & SIMPLE) != 0)) {
                reginsert(STAR, ret);
            } else if (op == '*') {
                /* Emit x* as (x&|), where & means "self". */
                reginsert(BRANCH, ret);                 /* Either x */
                regoptail(ret, regnode(BACK));          /* and loop */
                regoptail(ret, ret);                    /* back */
                regtail(ret, regnode(BRANCH));          /* or */
                regtail(ret, regnode(NOTHING));         /* null. */
            } else if ((op == '+') && ((flagp & SIMPLE) != 0)) {
                reginsert(PLUS, ret);
            } else if (op == '+') {
                /* Emit x+ as x(&|), where & means "self". */
                int next = regnode(BRANCH);             /* Either */
                regtail(ret, next);
                regtail(regnode(BACK), ret);            /* loop back */
                regtail(next, regnode(BRANCH));         /* or */
                regtail(ret, regnode(NOTHING));         /* null. */
            } else if (op == '?') {
                /* Emit x? as (x|) */
                reginsert(BRANCH, ret);                 /* Either x */
                regtail(ret, regnode(BRANCH));          /* or */
                int next = regnode(NOTHING);            /* null. */
                regtail(ret, next);
                regoptail(ret, next);
            }
            off++;
            if ((off < parse.length) && isMult(parse[off])) {
                throw new IllegalArgumentException("nested *?+");
            }

            flagp = netFlags;
            return ret;
        }

        /*
         - regatom - the lowest level
         *
         * Optimization:  gobbles an entire sequence of ordinary characters so that
         * it can turn them into a single node, which is smaller to store and
         * faster to run.  Backslashed characters are exceptions, each becoming a
         * separate node; the code is simpler that way and it's not worth fixing.
         *
         * Returns the index of the node emitted for the atom.
         */
        int regatom() throws IllegalArgumentException {
            int netFlags = WORST;       /* Tentatively. */
            int ret;

            switch (parse[off++]) {
                case '^':
                    ret = regnode(BOL);
                    break;
                case '$':
                    ret = regnode(EOL);
                    break;
                case '.':
                    ret = regnode(ANY);
                    netFlags |= (HASWIDTH | SIMPLE);
                    break;
                case '[': {
                    /*
                     * Running off the end of the pattern anywhere in here
                     * means the closing bracket is missing; the resulting
                     * ArrayIndexOutOfBoundsException is turned into the
                     * "missing ]" error below.
                     */
                    try {
                        if (parse[off] == '^') {
                            ret = regnode(ANYBUT);
                            off++;
                        } else {
                            ret = regnode(ANYOF);
                        }

                        /* Reserve the length char; it is patched by regset below. */
                        int pos = reglen();
                        regc('\0');

                        /* A leading ']' or '-' is an ordinary member of the set. */
                        if ((parse[off] == ']') || (parse[off] == '-')) {
                            regc(parse[off++]);
                        }
                        while (parse[off] != ']') {
                            if (parse[off] == '-') {
                                off++;
                                if (parse[off] == ']') {
                                    /* A trailing '-' is an ordinary member too. */
                                    regc('-');
                                } else {
                                    /*
                                     * The low end of the range was already
                                     * emitted as an ordinary character, so
                                     * emission starts one past it.
                                     */
                                    int start = parse[off - 2];
                                    int end = parse[off++];
                                    if (start > end) {
                                        throw new IllegalArgumentException(
                                                "invalid [] range");
                                    }
                                    for (int i = start + 1; i <= end; i++) {
                                        regc((char) i);
                                    }
                                }
                            } else {
                                regc(parse[off++]);
                            }
                        }
                        regset(pos, (char) (reglen() - pos - 1));
                        off++;
                        netFlags |= HASWIDTH | SIMPLE;
                    } catch (ArrayIndexOutOfBoundsException e) {
                        throw new IllegalArgumentException("missing ]");
                    }
                    break;
                }
                case '(':
                    ret = reg(true);
                    netFlags |= (flagp & (HASWIDTH | SPSTART));
                    break;
                case '|':
                case ')':
                    /* regbranch stops before these, so they are not expected here. */
                    throw new IllegalArgumentException("internal urp");
                case '?':
                case '+':
                case '*':
                    throw new IllegalArgumentException("?+* follows nothing");
                case '\\':
                    if (off >= parse.length) {
                        throw new IllegalArgumentException("trailing \\");
                    }
                    /* One-character EXACTLY node holding the escaped char. */
                    ret = regnode(EXACTLY);
                    regc((char) 1);
                    regc(parse[off++]);
                    netFlags |= HASWIDTH | SIMPLE;
                    break;
                default: {
                    /* Un-read the character and collect the run of ordinary chars. */
                    off--;
                    int end;
                    for (end = off; end < parse.length; end++) {
                        if (META.indexOf(parse[end]) >= 0) {
                            break;
                        }
                    }
                    if ((end > off + 1) && (end < parse.length)
                            && isMult(parse[end])) {
                        end--;          /* Back off clear of ?+* operand. */
                    }
                    netFlags |= HASWIDTH;
                    if (end == off + 1) {
                        netFlags |= SIMPLE;
                    }
                    ret = regnode(EXACTLY);
                    regc((char) (end - off));
                    for ( ; off < end; off++) {
                        regc(parse[off]);
                    }
                }
                break;
            }

            flagp = netFlags;
            return ret;
        }

        /*
         - regnode - emit a node
         *
         * Appends the opcode and a zero "next" pointer; returns the index
         * of the new node.
         */
        int regnode(char op) {
            int ret = code.length();
            code.append(op);
            code.append('\0');

            return ret;
        }

        /*
         - regc - emit (if appropriate) a byte of code
         */
        void regc(char b) {
            code.append(b);
        }

        /* Current length of the emitted program, i.e. the index of the next char. */
        int reglen() {
            return code.length();
        }

        /* Overwrites the already-emitted char at pos. */
        void regset(int pos, char ch) {
            code.setCharAt(pos, ch);
        }

        /*
         - reginsert - insert an operator in front of already-emitted operand
         *
         * Means relocating the operand.
         */
        void reginsert(char op, int pos) {
            char[] tmp = new char[] {op, '\0'};
            code.insert(pos, tmp);
        }

        /*
         - regtail - set the next-pointer at the end of a node chain
         *
         * Follows the chain starting at pos to its last node and points
         * that node at val.  The stored value is the distance between the
         * two nodes, measured backward for a BACK node.
         */
        void regtail(int pos, int val) {
            /* Find last node. */

            int scan = pos;
            while (true) {
                int tmp = regnext(scan);
                if (tmp < 0) {
                    break;
                }
                scan = tmp;
            }

            int offset = (code.charAt(scan) == BACK) ? scan - val : val - scan;
            code.setCharAt(scan + 1, (char) offset);
        }

        /*
         - regoptail - regtail on operand of first argument; nop if operandless
         */
        void regoptail(int pos, int val) {
            if ((pos < 0) || (code.charAt(pos) != BRANCH)) {
                return;
            }
            /* The operand of a BRANCH starts right after its two-char header. */
            regtail(pos + 2, val);
        }

        /*
         - regnext - dig the "next" pointer out of a node
         *
         * Returns the index of the following node, or -1 if the pointer
         * has not been set (offset zero).
         */
        int regnext(int pos) {
            int offset = code.charAt(pos + 1);
            if (offset == 0) {
                return -1;
            }
            if (code.charAt(pos) == BACK) {
                return pos - offset;
            } else {
                return pos + offset;
            }
        }

        /* True for the three repetition operators. */
        static boolean isMult(char ch) {
            return (ch == '*') || (ch == '+') || (ch == '?');
        }
    }

    /*
     * State of one match attempt.  exec() fills in program, str, bol,
     * length and indices before calling regtry().
     */
    static class Match {
        char[] program;         /* The compiled program being run. */

        String str;             /* The subject string (lower-cased by exec when ignoring case). */
        int bol;                /* Index at which BOL ("^") matches. */
        int input;              /* Current position in str. */
        int length;             /* Length of str. */

        int[] indices;          /* Start/end pairs per sub-expression; -1 when unset. */

        /*
         * Attempts a match starting exactly at off.  All index pairs are
         * reset first; on success pair 0 receives the range of the whole
         * match.
         */
        boolean regtry(int off) {
            this.input = off;

            for (int i = 0; i < indices.length; i++) {
                indices[i] = -1;
            }

            if (regmatch(0)) {
                indices[0] = off;
                indices[1] = input;
                return true;
            } else {
                return false;
            }
        }

        /*
         - regmatch - main matching routine
         *
         * Conceptually the strategy is simple:  check to see whether the current
         * node
matches, call self recursively to see whether the rest matches, * and then act accordingly. In practice we make some effort to avoid * recursion, in particular by going through "ordinary" nodes (that don't * need to know whether the rest of the match failed) by a loop instead of * by recursion. */ boolean regmatch(int scan) { while (true) { int next = regnext(scan); int op = program[scan]; switch (op) { case BOL: if (input != bol) { return false; } break; case EOL: if (input != length) { return false; } break; case ANY: if (input >= length) { return false; } input++; break; case EXACTLY: { if (compare(scan) == false) { return false; } break; } case ANYOF: if (input >= length) { return false; } if (present(scan) == false) { return false; } input++; break; case ANYBUT: if (input >= length) { return false; } if (present(scan)) { return false; } input++; break; case NOTHING: case BACK: break; case BRANCH: { if (program[next] != BRANCH) { next = scan + 2; } else { do { int save = input; if (regmatch(scan + 2)) { return true; } input = save; scan = regnext(scan); } while ((scan >= 0) && (program[scan] == BRANCH)); return false; } break; } case STAR: case PLUS: { /* * Lookahead to avoid useless match attempts * when we know what character comes next. */ int ch = -1; if (program[next] == EXACTLY) { ch = program[next + 3]; } int min = (op == STAR) ? 0 : 1; int save = input; int no = regrepeat(scan + 2); while (no >= min) { /* If it could work, try it. */ if ((ch < 0) || ((input < length) && (str.charAt(input) == ch))) { if (regmatch(next)) { return true; } } /* Couldn't or didn't -- back up. */ no--; input = save + no; } return false; } case END: return true; default: if (op >= CLOSE) { int no = op - CLOSE; int save = input; if (regmatch(next)) { /* * Don't set endp if some later * invocation of the same parentheses * already has. 
*/ if (indices[no * 2 + 1] <= 0) { indices[no * 2 + 1] = save; } return true; } } else if (op >= OPEN) { int no = op - OPEN; int save = input; if (regmatch(next)) { /* * Don't set startp if some later invocation of the * same parentheses already has. */ if (indices[no * 2] <= 0) { indices[no * 2] = save; } return true; } } return false; } scan = next; } } boolean compare(int scan) { int count = program[scan + 2]; if (input + count > length) { return false; } int start = scan + 3; int end = start + count; for (int i = start; i < end; i++) { if (str.charAt(input++) != program[i]) { return false; } } return true; } boolean present(int scan) { char ch = str.charAt(input); int count = program[scan + 2]; int start = scan + 3; int end = start + count; for (int i = start; i < end; i++) { if (program[i] == ch) { return true; } } return false; } /* - regrepeat - repeatedly match something simple, report how many */ int regrepeat(int scan) { int op = program[scan]; int count = 0; switch (op) { case ANY: // '.*' matches all the way to the end. count = length - input; input = length; break; case EXACTLY: { // 'g*' matches all the following 'g' characters. char ch = program[scan + 3]; while ((input < length) && (str.charAt(input) == ch)) { input++; count++; } break; } case ANYOF: // [abc]* while ((input < length) && present(scan)) { input++; count++; } break; case ANYBUT: while ((input < length) && !present(scan)) { input++; count++; } break; } return count; } /* - regnext - dig the "next" pointer out of a node */ int regnext(int scan) { int offset = program[scan + 1]; if (program[scan] == BACK) { return scan - offset; } else { return scan + offset; } } public String toString() { String result = "Match: str=" + str + " "; for (int i=0;i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy