// org.apache.lucene.util.automaton.RegExp Maven / Gradle / Ivy
//
// Go to download
/*
 * dk.brics.automaton
 * 
 * Copyright (c) 2001-2009 Anders Moeller
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.apache.lucene.util.automaton;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Regular Expression extension to Automaton.
 * 
 * Regular expressions are built from the following abstract syntax:
 * 
 * 
 * <pre>
 * regexp       ::= unionexp
 *                |
 * unionexp     ::= interexp | unionexp  (union)
 *                | interexp
 * interexp     ::= concatexp &amp; interexp  (intersection)  [OPTIONAL]
 *                | concatexp
 * concatexp    ::= repeatexp concatexp  (concatenation)
 *                | repeatexp
 * repeatexp    ::= repeatexp ?  (zero or one occurrence)
 *                | repeatexp *  (zero or more occurrences)
 *                | repeatexp +  (one or more occurrences)
 *                | repeatexp {n}  (n occurrences)
 *                | repeatexp {n,}  (n or more occurrences)
 *                | repeatexp {n,m}  (n to m occurrences, including both)
 *                | complexp
 * complexp     ::= ~ complexp  (complement)  [OPTIONAL]
 *                | charclassexp
 * charclassexp ::= [ charclasses ]  (character class)
 *                | [^ charclasses ]  (negated character class)
 *                | simpleexp
 * charclasses  ::= charclass charclasses
 *                | charclass
 * charclass    ::= charexp - charexp  (character range, including end-points)
 *                | charexp
 * simpleexp    ::= charexp
 *                | .  (any single character)
 *                | #  (the empty language)  [OPTIONAL]
 *                | @  (any string)  [OPTIONAL]
 *                | " &lt;Unicode string without double-quotes&gt; "  (a string)
 *                | ( )  (the empty string)
 *                | ( unionexp )  (precedence override)
 *                | &lt; &lt;identifier&gt; &gt;  (named automaton)  [OPTIONAL]
 *                | &lt;n-m&gt;  (numerical interval)  [OPTIONAL]
 * charexp      ::= &lt;Unicode character&gt;  (a single non-reserved character)
 *                | \ &lt;Unicode character&gt;  (a single character)
 * </pre>
 * 
 * The productions marked [OPTIONAL] are only allowed if
 * specified by the syntax flags passed to the RegExp constructor.
 * The reserved characters used in the (enabled) syntax must be escaped with
 * backslash (\) or double-quotes ("..."). (In
 * contrast to other regexp syntaxes, this is required also in character
 * classes.) Be aware that dash (-) has a special meaning in
 * charclass expressions. An identifier is a string not containing right
 * angle bracket (>) or dash (-). Numerical
 * intervals are specified by non-negative decimal integers and include both end
 * points, and if n and m have the same number
 * of digits, then the conforming strings must have that length (i.e. prefixed
 * by 0's).
 * 
 * @lucene.experimental
 */
public class RegExp {
  
  /**
   * The type of expression represented by a RegExp node.
   * <p>
   * Binary kinds use both {@code exp1} and {@code exp2}; unary kinds use only
   * {@code exp1}; leaf kinds use the scalar fields ({@code s}, {@code c},
   * {@code min}/{@code max}/{@code digits}, {@code from}/{@code to}).
   */
  public enum Kind {
    /** The union of two expressions */
    REGEXP_UNION, 
    /** A sequence of two expressions */
    REGEXP_CONCATENATION,
    /** The intersection of two expressions */
    REGEXP_INTERSECTION,
    /** An optional expression (zero or one occurrence) */
    REGEXP_OPTIONAL,
    /** An expression that repeats zero or more times */
    REGEXP_REPEAT,
    /** An expression that repeats a minimum number of times */
    REGEXP_REPEAT_MIN,
    /** An expression that repeats a minimum and maximum number of times */
    REGEXP_REPEAT_MINMAX,
    /** The complement of an expression */
    REGEXP_COMPLEMENT,
    /** A single character (code point) */
    REGEXP_CHAR,
    /** A character range, including both end points */
    REGEXP_CHAR_RANGE,
    /** Any single character allowed */
    REGEXP_ANYCHAR,
    /** The empty language */
    REGEXP_EMPTY,
    /** A string expression */
    REGEXP_STRING,
    /** Any string allowed */
    REGEXP_ANYSTRING,
    /** A named automaton expression */
    REGEXP_AUTOMATON,
    /** A numerical interval expression */
    REGEXP_INTERVAL,
    // NOTE(review): a Javadoc comment "An expression for a pre-defined class e.g. \w"
    // used to sit here without any constant following it. No such kind exists in this
    // enum, so it is kept only as a plain note - confirm whether a constant is missing.
  }
  
  //-----  Syntax flags ( <= 0xff )  ------
  // Each syntax flag is a single bit; the parser tests them with check(flag).

  /**
   * Syntax flag, enables intersection ({@code &}).
   */
  public static final int INTERSECTION = 0x0001;
  
  /**
   * Syntax flag, enables complement ({@code ~}).
   */
  public static final int COMPLEMENT = 0x0002;
  
  /**
   * Syntax flag, enables empty language ({@code #}).
   */
  public static final int EMPTY = 0x0004;
  
  /**
   * Syntax flag, enables anystring ({@code @}).
   */
  public static final int ANYSTRING = 0x0008;
  
  /**
   * Syntax flag, enables named automata ({@code <identifier>}).
   */
  public static final int AUTOMATON = 0x0010;
  
  /**
   * Syntax flag, enables numerical intervals ({@code <n-m>}).
   */
  public static final int INTERVAL = 0x0020;
  
  /**
   * Syntax flag, enables all optional regexp syntax. Also the upper bound the
   * constructor accepts for the syntax flags argument.
   */
  public static final int ALL = 0xff;
  
  /**
   * Syntax flag, enables no optional regexp syntax.
   */
  public static final int NONE = 0x0000;

  //-----  Matching flags ( > 0xff )  ------
  // Matching flags live above the syntax-flag range so both sets can be OR-ed
  // into the single 'flags' field.

  /**
   * Allows case insensitive matching of ASCII characters.
   */
  public static final int ASCII_CASE_INSENSITIVE = 0x0100;   
  
  
  //Immutable parsed state
  /**
   * The type of expression
   */
  public final Kind kind;
  /**
   * Child expressions held by a container type expression. Unary kinds only
   * set {@code exp1}; leaf kinds leave both {@code null}.
   */
  public final RegExp exp1, exp2;
  /**
   * String expression: the literal text for {@link Kind#REGEXP_STRING}, or the
   * identifier for {@link Kind#REGEXP_AUTOMATON}.
   */
  public final String s;
  /**
   *  Character expression (a Unicode code point) for {@link Kind#REGEXP_CHAR}
   */
  public final int c;
  /**
   * Limits for repeatable type expressions. {@code min}/{@code max} are also the
   * bounds of a {@link Kind#REGEXP_INTERVAL} node, for which {@code digits} is
   * the zero-padded width used when printing and building the interval.
   */
  public final int min, max, digits;
  /**
   * Extents for range type expressions (code points, both inclusive)
   */
  public final int from, to;

  // Parser variables
  // The text being parsed; null for nodes created through the internal constructor.
  private final String originalString;
  // Syntax flags OR-ed with match flags.
  final int flags;
  // Current parse offset (in chars) into originalString.
  int pos;
    
  /**
   * Constructs new RegExp from a string. Same as
   * {@code RegExp(s, ALL)}, i.e. every optional syntax construct is enabled.
   * 
   * @param s regexp string
   * @exception IllegalArgumentException if an error occurred while parsing the
   *              regular expression
   */
  public RegExp(String s) throws IllegalArgumentException {
    this(s, ALL);
  }
  
  /**
   * Constructs new RegExp from a string, with no match flags
   * (same as {@code RegExp(s, syntax_flags, 0)}).
   * 
   * @param s regexp string
   * @param syntax_flags boolean 'or' of optional syntax constructs to be
   *          enabled
   * @exception IllegalArgumentException if an error occurred while parsing the
   *              regular expression
   */
  public RegExp(String s, int syntax_flags) throws IllegalArgumentException {
    this(s, syntax_flags, 0);
  }
  
  /**
   * Parses {@code s} into a RegExp tree and copies the root node's state into
   * this instance.
   * <p>
   * An empty input yields a {@link Kind#REGEXP_STRING} node for the empty string.
   * Otherwise the whole input must be consumed by the parser.
   *
   * @param s regexp string
   * @param syntax_flags boolean 'or' of optional syntax constructs to be
   *          enabled; values greater than {@link #ALL} are rejected
   * @param match_flags boolean 'or' of match behavior options such as case insensitivity;
   *          positive values not greater than {@link #ALL} are rejected because that
   *          range is reserved for syntax flags
   * @exception IllegalArgumentException if a flag argument is rejected or an error
   *              occurred while parsing the regular expression
   */
  public RegExp(String s, int syntax_flags, int match_flags) throws IllegalArgumentException {
    this.originalString = s;

    final boolean syntaxTooLarge = syntax_flags > ALL;
    if (syntaxTooLarge) {
      throw new IllegalArgumentException("Illegal syntax flag");
    }
    // Equivalent to: match_flags > 0 && match_flags <= ALL
    final boolean matchInSyntaxRange = !(match_flags <= 0 || match_flags > ALL);
    if (matchInSyntaxRange) {
      throw new IllegalArgumentException("Illegal match flag");
    }
    // Must be set before parsing: the parse methods consult 'flags'.
    this.flags = syntax_flags | match_flags;

    final RegExp root;
    if (s.length() != 0) {
      root = parseUnionExp();
      // Anything left over means the input was not a complete expression.
      if (pos < originalString.length()) {
        throw new IllegalArgumentException("end-of-string expected at position " + pos);
      }
    } else {
      root = makeString(flags, "");
    }

    // Adopt the parsed root's state as our own.
    this.kind = root.kind;
    this.exp1 = root.exp1;
    this.exp2 = root.exp2;
    this.s = root.s;
    this.c = root.c;
    this.min = root.min;
    this.max = root.max;
    this.digits = root.digits;
    this.from = root.from;
    this.to = root.to;
  }
  
  /**
   * Internal constructor that stores every field verbatim. Nodes built this way
   * have no original string ({@code originalString} is {@code null}).
   */
  RegExp(int flags, Kind kind, RegExp exp1, RegExp exp2, String s, int c, int min, int max, int digits, int from, int to){
    // Node identity
    this.flags = flags;
    this.kind = kind;
    this.originalString = null;

    // Children (container kinds)
    this.exp1 = exp1;
    this.exp2 = exp2;

    // Leaf payload
    this.s = s;
    this.c = c;

    // Repeat / interval limits
    this.min = min;
    this.max = max;
    this.digits = digits;

    // Character range extents
    this.from = from;
    this.to = to;
  }

  /**
   * Simplified construction of container nodes: only the kind and the two
   * children are set, every scalar field is 0 and {@code s} is {@code null}.
   */
  static RegExp newContainerNode(int flags, Kind kind, RegExp exp1, RegExp exp2) {
    return new RegExp(flags, kind, exp1, exp2, null, 0, 0, 0, 0, 0, 0);
  }

  /**
   * Simplified construction of repeating nodes: a single child in {@code exp1}
   * plus the {@code min}/{@code max} repetition bounds; all other fields are
   * {@code null} or 0.
   */
  static RegExp newRepeatingNode(int flags, Kind kind, RegExp exp,  int min, int max) {
    return new RegExp(flags, kind, exp, null, null, 0, min, max, 0, 0, 0);
  }  
  
  
  /**
   * Simplified construction of leaf nodes: both children are {@code null} and
   * the scalar payload fields are passed through unchanged.
   */
  static RegExp newLeafNode(int flags, Kind kind, String s, int c, int min, int max, int digits, int from, int to) {
    return new RegExp(flags, kind, null, null, s, c, min, max, digits, from, to);
  }  

  /**
   * Constructs new Automaton from this RegExp, using no automaton map, no
   * automaton provider and {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT}
   * as the work limit.
   */
  public Automaton toAutomaton() {
    return toAutomaton(null, null, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
  }

  /**
   * Constructs new Automaton from this RegExp. The
   * constructed automaton is minimal and deterministic and has no transitions
   * to dead states. No automaton map and no automaton provider are used, so
   * named identifiers cannot be resolved.
   * 
   * @param determinizeWorkLimit maximum effort to spend while determinizing the automata. If
   *     determinizing the automata would require more than this effort,
   *     TooComplexToDeterminizeException is thrown. Higher numbers require more space but can
   *     process more complex regexes. Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a
   *     decent default if you don't otherwise know what to specify.
   * @exception IllegalArgumentException if this regular expression uses a named
   *     identifier that is not available from the automaton provider
   * @exception TooComplexToDeterminizeException if determinizing this regexp requires more effort
   *     than determinizeWorkLimit states
   */
  public Automaton toAutomaton(int determinizeWorkLimit)
      throws IllegalArgumentException, TooComplexToDeterminizeException {
    return toAutomaton(null, null, determinizeWorkLimit);
  }

  /**
   * Constructs new Automaton from this RegExp. The
   * constructed automaton is minimal and deterministic and has no transitions
   * to dead states. Named identifiers are resolved through the given provider
   * only (no automaton map is used).
   * 
   * @param automaton_provider provider of automata for named identifiers
   * @param determinizeWorkLimit maximum effort to spend while determinizing the automata. If
   *     determinizing the automata would require more than this effort,
   *     TooComplexToDeterminizeException is thrown. Higher numbers require more space but can
   *     process more complex regexes. Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a
   *     decent default if you don't otherwise know what to specify.
   * @exception IllegalArgumentException if this regular expression uses a named
   *   identifier that is not available from the automaton provider
   * @exception TooComplexToDeterminizeException if determinizing this regexp requires more effort
   *     than determinizeWorkLimit states
   */
  public Automaton toAutomaton(AutomatonProvider automaton_provider,
      int determinizeWorkLimit) throws IllegalArgumentException,
      TooComplexToDeterminizeException {
    return toAutomaton(null, automaton_provider, determinizeWorkLimit);
  }
  
  /**
   * Constructs new Automaton from this RegExp. The
   * constructed automaton is minimal and deterministic and has no transitions
   * to dead states. Named identifiers are resolved through the given map only
   * (no automaton provider is used).
   * 
   * @param automata a map from automaton identifiers to automata (of type
   *          Automaton).
   * @param determinizeWorkLimit maximum effort to spend while determinizing the automata. If
   *     determinizing the automata would require more than this effort,
   *     TooComplexToDeterminizeException is thrown. Higher numbers require more space but can
   *     process more complex regexes.
   * @exception IllegalArgumentException if this regular expression uses a named
   *   identifier that does not occur in the automaton map
   * @exception TooComplexToDeterminizeException if determinizing this regexp requires more effort
   *     than determinizeWorkLimit states
   */
  public Automaton toAutomaton(Map<String,Automaton> automata,
      int determinizeWorkLimit) throws IllegalArgumentException,
      TooComplexToDeterminizeException {
    // The map is typed so callers get compile-time checking of keys and values;
    // raw-typed callers still compile (with an unchecked warning).
    return toAutomaton(automata, null, determinizeWorkLimit);
  }

  /**
   * Common entry point behind the public {@code toAutomaton} overloads.
   * Delegates to {@code toAutomatonInternal} and, if that fails with a
   * {@link TooComplexToDeterminizeException}, throws a new exception of the
   * same type constructed from this RegExp and the original exception.
   *
   * @param automata map from identifiers to automata, may be {@code null}
   * @param automaton_provider provider for named automata, may be {@code null}
   * @param determinizeWorkLimit maximum effort to spend while determinizing
   */
  private Automaton toAutomaton(Map<String,Automaton> automata,
      AutomatonProvider automaton_provider, int determinizeWorkLimit)
      throws IllegalArgumentException, TooComplexToDeterminizeException {
    try {
      return toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit);
    } catch (TooComplexToDeterminizeException e) {
      // Re-throw with this (top-level) RegExp attached, keeping the cause.
      throw new TooComplexToDeterminizeException(this, e);
    }
  }

  /**
   * Recursively converts this node (and its children) into an Automaton.
   * <p>
   * Union and concatenation nodes are flattened with {@code findLeaves} so that
   * a chain of the same operator is combined in one call. Most composite
   * results are minimized; note that {@link Kind#REGEXP_REPEAT_MINMAX} and the
   * plain leaf kinds are returned without a minimize step.
   *
   * @param automata map from identifiers to automata, may be {@code null}
   * @param automaton_provider fallback source for named automata, may be {@code null}
   * @param determinizeWorkLimit maximum effort to spend while determinizing
   * @return the automaton for this node ({@code null} only if {@code kind}
   *     matched no case, which cannot happen for the kinds declared in this file)
   * @exception IllegalArgumentException if a named automaton cannot be resolved
   *     or the provider fails with an IOException
   */
  private Automaton toAutomatonInternal(Map<String,Automaton> automata,
      AutomatonProvider automaton_provider, int determinizeWorkLimit)
      throws IllegalArgumentException {
    List<Automaton> list;
    Automaton a = null;
    switch (kind) {
      case REGEXP_UNION:
        list = new ArrayList<>();
        findLeaves(exp1, Kind.REGEXP_UNION, list, automata, automaton_provider,
          determinizeWorkLimit);
        findLeaves(exp2, Kind.REGEXP_UNION, list, automata, automaton_provider,
          determinizeWorkLimit);
        a = Operations.union(list);
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_CONCATENATION:
        list = new ArrayList<>();
        findLeaves(exp1, Kind.REGEXP_CONCATENATION, list, automata,
            automaton_provider, determinizeWorkLimit);
        findLeaves(exp2, Kind.REGEXP_CONCATENATION, list, automata,
            automaton_provider, determinizeWorkLimit);
        a = Operations.concatenate(list);
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_INTERSECTION:
        a = Operations.intersection(
            exp1.toAutomatonInternal(
              automata, automaton_provider, determinizeWorkLimit),
            exp2.toAutomatonInternal(
              automata, automaton_provider, determinizeWorkLimit));
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_OPTIONAL:
        a = Operations.optional(exp1.toAutomatonInternal(automata,
          automaton_provider, determinizeWorkLimit));
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_REPEAT:
        a = Operations.repeat(exp1.toAutomatonInternal(
          automata, automaton_provider, determinizeWorkLimit));
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_REPEAT_MIN:
        a = exp1.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit);
        // Refuse before expanding: repeating 'min' times copies the automaton 'min' times.
        checkRepeatedStateCount(a, min, determinizeWorkLimit);
        a = Operations.repeat(a, min);
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_REPEAT_MINMAX:
        a = exp1.toAutomatonInternal(automata, automaton_provider, determinizeWorkLimit);
        checkRepeatedStateCount(a, max, determinizeWorkLimit);
        a = Operations.repeat(a, min, max);
        break;
      case REGEXP_COMPLEMENT:
        a = Operations.complement(
          exp1.toAutomatonInternal(automata, automaton_provider,
            determinizeWorkLimit),
          determinizeWorkLimit);
        a = MinimizationOperations.minimize(a, determinizeWorkLimit);
        break;
      case REGEXP_CHAR:
        if (check(ASCII_CASE_INSENSITIVE)) {
          a = toCaseInsensitiveChar(c, determinizeWorkLimit);
        } else {
          a = Automata.makeChar(c);
        }
        break;
      case REGEXP_CHAR_RANGE:
        a = Automata.makeCharRange(from, to);
        break;
      case REGEXP_ANYCHAR:
        a = Automata.makeAnyChar();
        break;
      case REGEXP_EMPTY:
        a = Automata.makeEmpty();
        break;
      case REGEXP_STRING:
        if (check(ASCII_CASE_INSENSITIVE)) {
          a = toCaseInsensitiveString(determinizeWorkLimit);
        } else {
          a = Automata.makeString(s);
        }
        break;
      case REGEXP_ANYSTRING:
        a = Automata.makeAnyString();
        break;
      case REGEXP_AUTOMATON:
        // Resolution order: explicit map first, then the provider.
        Automaton aa = null;
        if (automata != null) {
          aa = automata.get(s);
        }
        if (aa == null && automaton_provider != null) {
          try {
            aa = automaton_provider.getAutomaton(s);
          } catch (IOException e) {
            throw new IllegalArgumentException(e);
          }
        }
        if (aa == null) {
          throw new IllegalArgumentException("'" + s + "' not found");
        }
        a = aa;
        break;
      case REGEXP_INTERVAL:
        a = Automata.makeDecimalInterval(min, max, digits);
        break;
    }
    return a;
  }

  /**
   * Rejects a bounded repetition whose estimated state count,
   * {@code (numStates - 1) * repetitions}, exceeds the work limit. The product
   * is computed in {@code long} so that large repetition counts cannot wrap
   * around to a small or negative {@code int} and slip past the check.
   */
  private static void checkRepeatedStateCount(Automaton a, int repetitions,
      int determinizeWorkLimit) {
    final long estimatedStates = (long) (a.getNumStates() - 1) * repetitions;
    if (estimatedStates > determinizeWorkLimit) {
      // The exception takes an int; clamp estimates beyond the int range.
      throw new TooComplexToDeterminizeException(
          a, (int) Math.min(estimatedStates, (long) Integer.MAX_VALUE));
    }
  }
  
  
  /**
   * Builds an automaton accepting {@code codepoint} and, for code points up to
   * 128 that have a different other-case form, that other-case form as well.
   *
   * @param codepoint the code point to match
   * @param determinizeWorkLimit passed to the minimize step when a union is built
   * @return a single-character automaton, or the minimized union of both cases
   */
  private Automaton toCaseInsensitiveChar(int codepoint, int determinizeWorkLimit) {
    final Automaton exact = Automata.makeChar(codepoint);
    // For now we only work with ASCII characters
    // (128 itself passes this guard but has no case variant, so it falls out below).
    if (codepoint > 128) {
      return exact;
    }

    final int otherCase;
    if (Character.isLowerCase(codepoint)) {
      otherCase = Character.toUpperCase(codepoint);
    } else {
      otherCase = Character.toLowerCase(codepoint);
    }

    // Caseless characters (digits, punctuation, ...) need no alternative.
    if (otherCase == codepoint) {
      return exact;
    }
    final Automaton eitherCase = Operations.union(exact, Automata.makeChar(otherCase));
    return MinimizationOperations.minimize(eitherCase, determinizeWorkLimit);
  }
  
  /**
   * Builds an automaton for the string {@code s} in which every code point is
   * matched through {@code toCaseInsensitiveChar}; the per-character automata
   * are concatenated and the result is minimized.
   *
   * @param determinizeWorkLimit passed to the per-character and final minimize steps
   */
  private Automaton toCaseInsensitiveString(int determinizeWorkLimit) {
    final List<Automaton> perCharacter = new ArrayList<>();

    // Walk the string by code point (not by char) so surrogate pairs stay intact;
    // iterating with primitive ints also avoids handing an untyped Object to
    // toCaseInsensitiveChar, which a raw Iterator would do.
    for (int i = 0; i < s.length(); ) {
      final int codePoint = s.codePointAt(i);
      perCharacter.add(toCaseInsensitiveChar(codePoint, determinizeWorkLimit));
      i += Character.charCount(codePoint);
    }

    final Automaton concatenated = Operations.concatenate(perCharacter);
    return MinimizationOperations.minimize(concatenated, determinizeWorkLimit);
  }    
  
  /**
   * Flattens a chain of nodes of the same {@code kind}: descends through nodes
   * whose kind equals {@code kind} and appends the automaton of every other
   * node to {@code list}, in left-to-right order.
   *
   * @param exp the subtree to flatten
   * @param kind the operator kind being flattened
   * @param list receives the automata of the operands
   * @param automata map from identifiers to automata, may be {@code null}
   * @param automaton_provider provider for named automata, may be {@code null}
   * @param determinizeWorkLimit maximum effort to spend while determinizing
   */
  private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
      Map<String,Automaton> automata, AutomatonProvider automaton_provider,
      int determinizeWorkLimit) {
    if (exp.kind != kind) {
      // A different operator (or a leaf): convert it as a unit.
      list.add(exp.toAutomatonInternal(automata, automaton_provider,
        determinizeWorkLimit));
      return;
    }
    findLeaves(exp.exp1, kind, list, automata, automaton_provider,
      determinizeWorkLimit);
    findLeaves(exp.exp2, kind, list, automata, automaton_provider,
      determinizeWorkLimit);
  }

  /**
   * The string that was used to construct the regex.  Compare to toString.
   * Returns {@code null} for nodes created through the internal constructor,
   * which never records an original string.
   */
  public String getOriginalString() {
    return originalString;
  }

  /**
   * Constructs string from parsed regular expression. The text is rebuilt from
   * the node tree by {@code toStringBuilder}, so it may differ from the
   * original input string.
   */
  @Override
  public String toString() {
    StringBuilder b = new StringBuilder();
    toStringBuilder(b);
    return b.toString();
  }
  
  /**
   * Appends the regexp text of this node (and, recursively, its children) to
   * {@code b}. Binary and unary operators are always parenthesized, characters
   * are always written with a leading backslash, and strings are written in
   * double quotes.
   *
   * @param b the builder to append to
   */
  void toStringBuilder(StringBuilder b) {
    switch (kind) {
      case REGEXP_UNION:
      case REGEXP_INTERSECTION: {
        final String operator = (kind == Kind.REGEXP_UNION) ? "|" : "&";
        b.append("(");
        exp1.toStringBuilder(b);
        b.append(operator);
        exp2.toStringBuilder(b);
        b.append(")");
        break;
      }
      case REGEXP_CONCATENATION:
        exp1.toStringBuilder(b);
        exp2.toStringBuilder(b);
        break;
      case REGEXP_OPTIONAL:
      case REGEXP_REPEAT:
      case REGEXP_REPEAT_MIN:
      case REGEXP_REPEAT_MINMAX: {
        // Shared shape: "(" operand ")" followed by the repetition suffix.
        b.append("(");
        exp1.toStringBuilder(b);
        b.append(")");
        if (kind == Kind.REGEXP_OPTIONAL) {
          b.append("?");
        } else if (kind == Kind.REGEXP_REPEAT) {
          b.append("*");
        } else if (kind == Kind.REGEXP_REPEAT_MIN) {
          b.append("{").append(min).append(",}");
        } else {
          b.append("{").append(min).append(",").append(max).append("}");
        }
        break;
      }
      case REGEXP_COMPLEMENT:
        b.append("~(");
        exp1.toStringBuilder(b);
        b.append(")");
        break;
      case REGEXP_CHAR:
        b.append("\\");
        b.appendCodePoint(c);
        break;
      case REGEXP_CHAR_RANGE:
        b.append("[\\");
        b.appendCodePoint(from);
        b.append("-\\");
        b.appendCodePoint(to);
        b.append("]");
        break;
      case REGEXP_ANYCHAR:
        b.append(".");
        break;
      case REGEXP_EMPTY:
        b.append("#");
        break;
      case REGEXP_STRING:
        b.append("\"");
        b.append(s);
        b.append("\"");
        break;
      case REGEXP_ANYSTRING:
        b.append("@");
        break;
      case REGEXP_AUTOMATON:
        b.append("<");
        b.append(s);
        b.append(">");
        break;
      case REGEXP_INTERVAL: {
        final String lower = Integer.toString(min);
        final String upper = Integer.toString(max);
        b.append("<");
        // Left-pad each bound with zeros up to 'digits' characters.
        if (digits > 0) {
          for (int pad = digits - lower.length(); pad > 0; pad--) {
            b.append('0');
          }
        }
        b.append(lower).append("-");
        if (digits > 0) {
          for (int pad = digits - upper.length(); pad > 0; pad--) {
            b.append('0');
          }
        }
        b.append(upper).append(">");
        break;
      }
    }
  }

  /**
   * Like toString, but more verbose (shows the hierarchy more clearly): one
   * node per line, with children indented below their parent.
   */
  public String toStringTree() {
    StringBuilder b = new StringBuilder();
    toStringTree(b, "");
    return b.toString();
  }

  /**
   * Appends a one-node-per-line dump of this subtree to {@code b}. Every line
   * starts with {@code indent} and the node kind; children are written with two
   * extra spaces of indentation.
   *
   * @param b the builder to append to
   * @param indent the prefix for this node's line
   */
  void toStringTree(StringBuilder b, String indent) {
    switch (kind) {
      // binary
      case REGEXP_UNION:
      case REGEXP_CONCATENATION:
      case REGEXP_INTERSECTION: {
        b.append(indent).append(kind).append('\n');
        final String childIndent = indent + "  ";
        exp1.toStringTree(b, childIndent);
        exp2.toStringTree(b, childIndent);
        break;
      }
      // unary
      case REGEXP_OPTIONAL:
      case REGEXP_REPEAT:
      case REGEXP_COMPLEMENT:
        b.append(indent).append(kind).append('\n');
        exp1.toStringTree(b, indent + "  ");
        break;
      // unary with repetition bounds
      case REGEXP_REPEAT_MIN:
      case REGEXP_REPEAT_MINMAX:
        b.append(indent).append(kind).append(" min=").append(min);
        if (kind == Kind.REGEXP_REPEAT_MINMAX) {
          b.append(" max=").append(max);
        }
        b.append('\n');
        exp1.toStringTree(b, indent + "  ");
        break;
      case REGEXP_CHAR:
        b.append(indent).append(kind).append(" char=");
        b.appendCodePoint(c);
        b.append('\n');
        break;
      case REGEXP_CHAR_RANGE:
        b.append(indent).append(kind).append(" from=");
        b.appendCodePoint(from);
        b.append(" to=");
        b.appendCodePoint(to);
        b.append('\n');
        break;
      case REGEXP_STRING:
        b.append(indent).append(kind).append(" string=").append(s).append('\n');
        break;
      // leaves printed by kind only
      case REGEXP_ANYCHAR:
      case REGEXP_EMPTY:
      case REGEXP_ANYSTRING:
      case REGEXP_AUTOMATON:
        b.append(indent).append(kind).append('\n');
        break;
      case REGEXP_INTERVAL: {
        b.append(indent).append(kind);
        final String lower = Integer.toString(min);
        final String upper = Integer.toString(max);
        b.append("<");
        // Left-pad each bound with zeros up to 'digits' characters.
        if (digits > 0) {
          for (int pad = digits - lower.length(); pad > 0; pad--) {
            b.append('0');
          }
        }
        b.append(lower).append("-");
        if (digits > 0) {
          for (int pad = digits - upper.length(); pad > 0; pad--) {
            b.append('0');
          }
        }
        b.append(upper).append(">").append('\n');
        break;
      }
    }
  }

  /**
   * Returns set of automaton identifiers that occur in this regular expression.
   *
   * @return a new, mutable set holding the identifier of every
   *     {@link Kind#REGEXP_AUTOMATON} node in this tree; empty if there are none
   */
  public Set<String> getIdentifiers() {
    // Typed so callers can iterate the identifiers as Strings without casting.
    final Set<String> identifiers = new HashSet<>();
    getIdentifiers(identifiers);
    return identifiers;
  }
  
  /**
   * Collects the identifiers of all {@link Kind#REGEXP_AUTOMATON} nodes in this
   * subtree into {@code set}. Leaf kinds other than REGEXP_AUTOMATON add nothing.
   *
   * @param set receives the identifiers
   */
  void getIdentifiers(Set<String> set) {
    switch (kind) {
      // binary: visit both children
      case REGEXP_UNION:
      case REGEXP_CONCATENATION:
      case REGEXP_INTERSECTION:
        exp1.getIdentifiers(set);
        exp2.getIdentifiers(set);
        break;
      // unary: visit the single child
      case REGEXP_OPTIONAL:
      case REGEXP_REPEAT:
      case REGEXP_REPEAT_MIN:
      case REGEXP_REPEAT_MINMAX:
      case REGEXP_COMPLEMENT:
        exp1.getIdentifiers(set);
        break;
      case REGEXP_AUTOMATON:
        set.add(s);
        break;
      default:
        // remaining leaf kinds reference no named automaton
        break;
    }
  }
  
  /** Creates a {@link Kind#REGEXP_UNION} node over the two given expressions. */
  static RegExp makeUnion(int flags, RegExp exp1, RegExp exp2) {
    return newContainerNode(flags, Kind.REGEXP_UNION, exp1, exp2);
  }
  
  /**
   * Creates the concatenation of two expressions, merging adjacent literals
   * (REGEXP_CHAR / REGEXP_STRING nodes) into a single REGEXP_STRING where
   * possible:
   * <ul>
   * <li>literal + literal becomes one string node;</li>
   * <li>(x . literal) + literal becomes x . (merged string);</li>
   * <li>literal + (literal . y) becomes (merged string) . y;</li>
   * <li>anything else becomes a plain concatenation node.</li>
   * </ul>
   */
  static RegExp makeConcatenation(int flags, RegExp exp1, RegExp exp2) {
    final boolean leftIsLiteral =
        exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING;

    if (leftIsLiteral) {
      if (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING) {
        return makeString(flags, exp1, exp2);
      }
      if (exp2.kind == Kind.REGEXP_CONCATENATION
          && (exp2.exp1.kind == Kind.REGEXP_CHAR || exp2.exp1.kind == Kind.REGEXP_STRING)) {
        // Fold our literal into the literal heading the right-hand chain.
        return newContainerNode(flags, Kind.REGEXP_CONCATENATION,
            makeString(flags, exp1, exp2.exp1), exp2.exp2);
      }
    } else if (exp1.kind == Kind.REGEXP_CONCATENATION
        && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
        && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
      // Fold the right-hand literal into the literal ending the left-hand chain.
      return newContainerNode(flags, Kind.REGEXP_CONCATENATION,
          exp1.exp1, makeString(flags, exp1.exp2, exp2));
    }

    return newContainerNode(flags, Kind.REGEXP_CONCATENATION, exp1, exp2);
  }
  
  /**
   * Merges two literal nodes into one REGEXP_STRING node. A node of kind
   * REGEXP_STRING contributes its string {@code s}; any other node contributes
   * its code point {@code c}.
   */
  static private RegExp makeString(int flags, RegExp exp1, RegExp exp2) {
    final StringBuilder merged = new StringBuilder();
    for (RegExp part : new RegExp[] {exp1, exp2}) {
      if (part.kind == Kind.REGEXP_STRING) {
        merged.append(part.s);
      } else {
        merged.appendCodePoint(part.c);
      }
    }
    return makeString(flags, merged.toString());
  }
  
  /** Creates a {@link Kind#REGEXP_INTERSECTION} node over the two given expressions. */
  static RegExp makeIntersection(int flags, RegExp exp1, RegExp exp2) {
    return newContainerNode(flags, Kind.REGEXP_INTERSECTION, exp1, exp2);
  }
  
  /** Creates a {@link Kind#REGEXP_OPTIONAL} node; {@code exp} is stored as {@code exp1}. */
  static RegExp makeOptional(int flags, RegExp exp) {
    return newContainerNode(flags, Kind.REGEXP_OPTIONAL, exp, null);
  }
  
  /** Creates a {@link Kind#REGEXP_REPEAT} node (the parser uses it for {@code *}). */
  static RegExp makeRepeat(int flags, RegExp exp) {
    return newContainerNode(flags, Kind.REGEXP_REPEAT, exp, null);
  }
  
  /**
   * Creates a {@link Kind#REGEXP_REPEAT_MIN} node ({@code min} or more
   * occurrences). {@code max} is stored as 0.
   */
  static RegExp makeRepeat(int flags, RegExp exp, int min) {
    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MIN, exp, min, 0);
  }
  
  /**
   * Creates a {@link Kind#REGEXP_REPEAT_MINMAX} node ({@code min} to
   * {@code max} occurrences). The bounds are stored as given, without validation.
   */
  static RegExp makeRepeat(int flags, RegExp exp, int min, int max) {
    return newRepeatingNode(flags, Kind.REGEXP_REPEAT_MINMAX, exp, min, max);
  }
  
  /** Creates a {@link Kind#REGEXP_COMPLEMENT} node; {@code exp} is stored as {@code exp1}. */
  static RegExp makeComplement(int flags, RegExp exp) {
    return newContainerNode(flags, Kind.REGEXP_COMPLEMENT, exp, null);
  }
  
  /** Creates a {@link Kind#REGEXP_CHAR} leaf for the code point {@code c}. */
  static RegExp makeChar(int flags, int c) {
    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
  }
  
  /**
   * Creates a {@link Kind#REGEXP_CHAR_RANGE} leaf for the code points
   * {@code from} through {@code to}.
   *
   * @throws IllegalArgumentException if {@code from} is greater than {@code to}
   */
  static RegExp makeCharRange(int flags, int from, int to) {
    if (from > to) 
      throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
    return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to);
  }
  
  /** Creates a {@link Kind#REGEXP_ANYCHAR} node (no children, no payload). */
  static RegExp makeAnyChar(int flags) {
    return newContainerNode(flags, Kind.REGEXP_ANYCHAR, null, null);
  }
  
  /** Creates a {@link Kind#REGEXP_EMPTY} node (no children, no payload). */
  static RegExp makeEmpty(int flags) {
    return newContainerNode(flags, Kind.REGEXP_EMPTY, null, null);
  }
  
  /** Creates a {@link Kind#REGEXP_STRING} leaf holding the literal string {@code s}. */
  static RegExp makeString(int flags, String s) {
    return newLeafNode(flags, Kind.REGEXP_STRING, s, 0, 0, 0, 0, 0, 0);
  }
  
  /** Creates a {@link Kind#REGEXP_ANYSTRING} node (no children, no payload). */
  static RegExp makeAnyString(int flags) {
    return newContainerNode(flags, Kind.REGEXP_ANYSTRING, null, null);
  }
  
  /**
   * Creates a {@link Kind#REGEXP_AUTOMATON} leaf; {@code s} is the identifier
   * that is looked up when the automaton is built.
   */
  static RegExp makeAutomaton(int flags, String s) {
    return newLeafNode(flags, Kind.REGEXP_AUTOMATON, s, 0, 0, 0, 0, 0, 0);
  }
  
  /**
   * Creates a {@link Kind#REGEXP_INTERVAL} leaf for the numeric interval
   * {@code min}..{@code max}, with {@code digits} as the zero-padded width.
   */
  static RegExp makeInterval(int flags, int min, int max, int digits) {
    final RegExp interval =
        newLeafNode(flags, Kind.REGEXP_INTERVAL, null, 0, min, max, digits, 0, 0);
    return interval;
  }
  
  /**
   * Tests, without consuming anything, whether the next code point of the
   * input is one of the characters in {@code s}.
   *
   * @return {@code false} at end of input or when the next code point is not in {@code s}
   */
  private boolean peek(String s) {
    if (!more()) {
      return false;
    }
    final int upcoming = originalString.codePointAt(pos);
    return s.indexOf(upcoming) >= 0;
  }
  
  /**
   * Consumes the next code point if it equals {@code c}.
   *
   * @param c the expected code point
   * @return {@code true} if the code point was present and {@code pos} was
   *     advanced past it; {@code false} (with {@code pos} unchanged) otherwise
   */
  private boolean match(int c) {
    final boolean matched =
        pos < originalString.length() && originalString.codePointAt(pos) == c;
    if (matched) {
      // Supplementary code points occupy two chars.
      pos += Character.charCount(c);
    }
    return matched;
  }
  
  /** Returns {@code true} while unread input remains. */
  private boolean more() {
    return pos < originalString.length();
  }
  
  /**
   * Consumes and returns the next code point of the input.
   *
   * @throws IllegalArgumentException if the input is exhausted
   */
  private int next() throws IllegalArgumentException {
    if (pos >= originalString.length()) {
      throw new IllegalArgumentException("unexpected end-of-string");
    }
    final int codePoint = originalString.codePointAt(pos);
    pos += Character.charCount(codePoint);
    return codePoint;
  }
  
  /** Returns {@code true} if any bit of {@code flag} is set in this node's {@code flags}. */
  private boolean check(int flag) {
    return (flags & flag) != 0;
  }
  
  /**
   * Parses {@code unionexp ::= interexp | unionexp}. The right-hand side is
   * parsed recursively, so a chain of unions nests to the right.
   */
  final RegExp parseUnionExp() throws IllegalArgumentException {
    final RegExp left = parseInterExp();
    if (!match('|')) {
      return left;
    }
    return makeUnion(flags, left, parseUnionExp());
  }
  
  /**
   * Parses {@code interexp ::= concatexp & interexp}. The {@code &} operator is
   * only recognized when the INTERSECTION syntax flag is set.
   */
  final RegExp parseInterExp() throws IllegalArgumentException {
    final RegExp left = parseConcatExp();
    // match() is only attempted (and can only consume '&') when the flag is on.
    final boolean intersects = check(INTERSECTION) && match('&');
    if (!intersects) {
      return left;
    }
    return makeIntersection(flags, left, parseInterExp());
  }
  
  /**
   * Parses {@code concatexp ::= repeatexp concatexp}. Concatenation continues
   * until the end of input, a {@code )} or {@code |}, or (when intersection is
   * enabled) an {@code &}.
   */
  final RegExp parseConcatExp() throws IllegalArgumentException {
    final RegExp head = parseRepeatExp();
    if (!more() || peek(")|")) {
      return head;
    }
    if (check(INTERSECTION) && peek("&")) {
      return head;
    }
    return makeConcatenation(flags, head, parseConcatExp());
  }
  
  /**
   * Parses {@code repeatexp}: a {@code complexp} followed by any number of
   * postfix repetition operators ({@code ?}, {@code *}, {@code +},
   * {@code {n}}, {@code {n,}}, {@code {n,m}}). Operators are applied left to
   * right, each wrapping the expression built so far.
   *
   * @throws IllegalArgumentException if a brace expression has no leading
   *     integer or is not closed by {@code }}
   */
  final RegExp parseRepeatExp() throws IllegalArgumentException {
    RegExp e = parseComplExp();
    while (peek("?*+{")) {
      if (match('?')) e = makeOptional(flags, e);
      else if (match('*')) e = makeRepeat(flags, e);
      // '+' is "one or more": a minimum-repeat with min = 1
      else if (match('+')) e = makeRepeat(flags, e, 1);
      else if (match('{')) {
        // Scan the digits of n; 'start' marks where they begin.
        int start = pos;
        while (peek("0123456789"))
          next();
        if (start == pos) throw new IllegalArgumentException(
            "integer expected at position " + pos);
        int n = Integer.parseInt(originalString.substring(start, pos));
        // m == -1 means "no upper bound" ({n,}).
        int m = -1;
        if (match(',')) {
          start = pos;
          while (peek("0123456789"))
            next();
          // Digits after the comma give the upper bound; none leaves m at -1.
          if (start != pos) m = Integer.parseInt(
            originalString.substring(start, pos));
        } else m = n; // {n}: exactly n occurrences
        if (!match('}')) throw new IllegalArgumentException(
            "expected '}' at position " + pos);
        if (m == -1) e = makeRepeat(flags, e, n);
        else e = makeRepeat(flags, e, n, m);
      }
    }
    return e;
  }
  
  /**
   * Parses a complement expression: zero or more {@code ~} prefixes in front of
   * a character class expression. The {@code ~} operator is only recognised
   * when the {@code COMPLEMENT} flag is set.
   *
   * @return a complement node for each consumed {@code ~}, wrapping the parsed
   *     character class expression
   * @throws IllegalArgumentException if a nested parse step rejects the input
   */
  final RegExp parseComplExp() throws IllegalArgumentException {
    if (!check(COMPLEMENT) || !match('~')) {
      return parseCharClassExp();
    }
    return makeComplement(flags, parseComplExp());
  }
  
  /**
   * Parses a bracketed character class ({@code [...]} or {@code [^...]}), or
   * falls through to a simple expression when the input does not start with
   * {@code [}.
   *
   * <p>A negated class is built as "any character" intersected with the
   * complement of the listed classes.
   *
   * @return the parsed node
   * @throws IllegalArgumentException if the closing {@code ]} is missing or a
   *     nested parse step rejects the input
   */
  final RegExp parseCharClassExp() throws IllegalArgumentException {
    if (!match('[')) {
      return parseSimpleExp();
    }
    final boolean negated = match('^');
    RegExp charClass = parseCharClasses();
    if (negated) {
      charClass =
          makeIntersection(flags, makeAnyChar(flags), makeComplement(flags, charClass));
    }
    if (match(']')) {
      return charClass;
    }
    throw new IllegalArgumentException("expected ']' at position " + pos);
  }
  
  /**
   * Parses one or more character classes, stopping at {@code ]} or at end of
   * input, and combines them into a left-nested chain of union nodes.
   *
   * @return the single class, or the union of all classes that were read
   * @throws IllegalArgumentException if a nested parse step rejects the input
   */
  final RegExp parseCharClasses() throws IllegalArgumentException {
    RegExp classes = parseCharClass();
    while (true) {
      if (!more() || peek("]")) {
        return classes;
      }
      classes = makeUnion(flags, classes, parseCharClass());
    }
  }
  
  /**
   * Parses a single character class member: either one character, or a range
   * written as two characters separated by {@code -}.
   *
   * @return a character node, or a character-range node when a {@code -}
   *     followed the first character
   * @throws IllegalArgumentException if the input ends where a character is
   *     required
   */
  final RegExp parseCharClass() throws IllegalArgumentException {
    final int first = parseCharExp();
    if (!match('-')) {
      return makeChar(flags, first);
    }
    final int last = parseCharExp();
    return makeCharRange(flags, first, last);
  }
  
  /**
   * Parses a simple expression. Alternatives are tried in this order:
   *
   * <ul>
   *   <li>{@code .} &ndash; any single character;
   *   <li>{@code #} &ndash; the empty language (only with the {@code EMPTY} flag);
   *   <li>{@code @} &ndash; any string (only with the {@code ANYSTRING} flag);
   *   <li>{@code "..."} &ndash; a literal string running up to the next double quote;
   *   <li>{@code ()} &ndash; the empty string, or {@code (...)} &ndash; a
   *       parenthesised union expression;
   *   <li>{@code <...>} &ndash; only when {@code AUTOMATON} or {@code INTERVAL} is
   *       set: content without {@code -} is a named automaton (requires
   *       {@code AUTOMATON}); content with {@code -} is a numeric interval
   *       {@code <n-m>} (requires {@code INTERVAL});
   *   <li>otherwise a single, possibly backslash-escaped, character.
   * </ul>
   *
   * <p>For intervals the bounds are swapped when written in descending order,
   * and the digit count passed on is the common length of both bounds, or 0
   * when their lengths differ.
   *
   * @return the parsed node
   * @throws IllegalArgumentException if a closing {@code "}, {@code )} or
   *     {@code >} is missing, if the content of {@code <...>} is not valid for
   *     the enabled flags, or if a nested parse step rejects the input
   */
  final RegExp parseSimpleExp() throws IllegalArgumentException {
    if (match('.')) {
      return makeAnyChar(flags);
    } else if (check(EMPTY) && match('#')) {
      return makeEmpty(flags);
    } else if (check(ANYSTRING) && match('@')) {
      return makeAnyString(flags);
    } else if (match('"')) {
      int start = pos;
      while (more() && !peek("\"")) {
        next();
      }
      if (!match('"')) {
        throw new IllegalArgumentException("expected '\"' at position " + pos);
      }
      // pos is now just past the closing quote; exclude the quote itself.
      return makeString(flags, originalString.substring(start, pos - 1));
    } else if (match('(')) {
      if (match(')')) {
        return makeString(flags, "");
      }
      RegExp e = parseUnionExp();
      if (!match(')')) {
        throw new IllegalArgumentException("expected ')' at position " + pos);
      }
      return e;
    } else if ((check(AUTOMATON) || check(INTERVAL)) && match('<')) {
      int start = pos;
      while (more() && !peek(">")) {
        next();
      }
      if (!match('>')) {
        throw new IllegalArgumentException("expected '>' at position " + pos);
      }
      String s = originalString.substring(start, pos - 1);
      int i = s.indexOf('-');
      if (i == -1) {
        // No dash: can only be a named automaton.
        if (!check(AUTOMATON)) {
          throw new IllegalArgumentException(
              "interval syntax error at position " + (pos - 1));
        }
        return makeAutomaton(flags, s);
      }
      // A dash is present: can only be a numeric interval.
      if (!check(INTERVAL)) {
        throw new IllegalArgumentException(
            "illegal identifier at position " + (pos - 1));
      }
      // Require exactly one dash with text on both sides of it.
      if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-')) {
        throw new IllegalArgumentException(
            "interval syntax error at position " + (pos - 1));
      }
      String smin = s.substring(0, i);
      String smax = s.substring(i + 1);
      int imin;
      int imax;
      try {
        imin = Integer.parseInt(smin);
        imax = Integer.parseInt(smax);
      } catch (NumberFormatException nfe) {
        // Keep the underlying parse failure as the cause for diagnosis.
        throw new IllegalArgumentException(
            "interval syntax error at position " + (pos - 1), nfe);
      }
      int digits = smin.length() == smax.length() ? smin.length() : 0;
      if (imin > imax) {
        int t = imin;
        imin = imax;
        imax = t;
      }
      return makeInterval(flags, imin, imax, digits);
    } else {
      return makeChar(flags, parseCharExp());
    }
  }
  
  /**
   * Parses a single character, optionally preceded by a backslash. A leading
   * backslash is consumed and discarded, and the code point after it is
   * returned as is.
   *
   * @return the code point that was read
   * @throws IllegalArgumentException if the input ends before a character is
   *     available
   */
  final int parseCharExp() throws IllegalArgumentException {
    match('\\'); // result ignored on purpose: the backslash is optional
    final int literal = next();
    return literal;
  }
}
regexp	::=	unionexp
	\|		(the empty expression)
unionexp	::=	interexp `\|` unionexp	(union)
	\|	interexp
interexp	::=	concatexp `&` interexp	(intersection)	[OPTIONAL]
	\|	concatexp
concatexp	::=	repeatexp concatexp	(concatenation)
	\|	repeatexp
repeatexp	::=	repeatexp `?`	(zero or one occurrence)
	\|	repeatexp `*`	(zero or more occurrences)
	\|	repeatexp `+`	(one or more occurrences)
	\|	repeatexp `{n}`	(`n` occurrences)
	\|	repeatexp `{n,}`	(`n` or more occurrences)
	\|	repeatexp `{n,m}`	(`n` to `m` occurrences, including both)
	\|	complexp
complexp	::=	`~` complexp	(complement)	[OPTIONAL]
	\|	charclassexp
charclassexp	::=	`[` charclasses `]`	(character class)
	\|	`[^` charclasses `]`	(negated character class)
	\|	simpleexp
charclasses	::=	charclass charclasses
	\|	charclass
charclass	::=	charexp `-` charexp	(character range, including end-points)
	\|	charexp
simpleexp	::=	charexp
	\|	`.`	(any single character)
	\|	`#`	(the empty language)	[OPTIONAL]
	\|	`@`	(any string)	[OPTIONAL]
	\|	`"` <Unicode string without double-quotes> `"`	(a string)
	\|	`(` `)`	(the empty string)
	\|	`(` unionexp `)`	(precedence override)
	\|	`<` <identifier> `>`	(named automaton)	[OPTIONAL]
	\|	`<n-m>`	(numerical interval)	[OPTIONAL]
charexp	::=	<Unicode character>	(a single non-reserved character)
	\|	`\` <Unicode character>	(a single character)