All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.javascript.jscomp.regex.RegExpTree Maven / Gradle / Ivy

There is a newer version: 9.0.8
Show newest version
/*
 * Copyright 2011 The Closure Compiler Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.javascript.jscomp.regex;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Math.min;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
 * An AST for JavaScript regular expressions.
 */
public abstract class RegExpTree {

  /**
   * Returns a simpler regular expression that is semantically the same assuming
   * the given flags.
   * @param flags Regular expression flags, e.g. {@code "igm"}.
   */
  public abstract RegExpTree simplify(String flags);

  /**
   * True if the presence or absence of an {@code "i"} flag would change the
   * meaning of this regular expression.
   */
  public abstract boolean isCaseSensitive();

  /**
   * True if the regular expression contains an anchor : {@code ^} or {@code $}.
   */
  public abstract boolean containsAnchor();

  /**
   * True if the regular expression contains capturing groups.
   */
  public final boolean hasCapturingGroup() {
    return numCapturingGroups() != 0;
  }

  /**
   * The number of capturing groups.
   */
  public abstract int numCapturingGroups();

  /**
   * The children of this node.
   */
  public abstract List children();

  /**
   * Appends this regular expression source to the given buffer.
   */
  protected abstract void appendSourceCode(StringBuilder sb);

  protected abstract void appendDebugInfo(StringBuilder sb);

  @Override
  public final String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append('/');
    appendSourceCode(sb);
    // Don't emit a regular expression that looks like a line comment start.
    if (sb.length() == 1) {
      sb.append("(?:)");
    }
    sb.append('/');
    return sb.toString();
  }

  @Override
  public abstract boolean equals(Object o);

  @Override
  public abstract int hashCode();

  private enum ParentheticalType {
    CAPTURING,
    NONCAPTURING,
    POSITIVE_LOOKAHEAD,
    NEGATIVE_LOOKAHEAD,
    POSITIVE_LOOKBEHIND,
    NEGATIVE_LOOKBEHIND,
    NAMED_GROUPS,
  }

  /**
   * Parses a regular expression to an AST.
   *
   * @param pattern The {@code foo} From {@code /foo/i}.
   * @param flags The {@code i} From {@code /foo/i}.
   */
  public static RegExpTree parseRegExp(
      final String pattern, final String flags) {

    /** A recursive descent parser that closes over pattern and flags above. */
    class Parser {
      /** The number of characters in pattern consumed. */
      int pos;
      /** The number of capturing groups seen so far. */
      int numCapturingGroups;
      /** The names of capturing groups in the regex expression */
      Set capturingGroupNames = new HashSet<>();
      /** The length of pattern. */
      final int limit = pattern.length();
      /** Boolean indicating whether we should look for named capture group backreferences */
      boolean lookForNamedCaptureBackreferences;

      RegExpTree parseTopLevel() {
        // First assume there are no named capture backreferences,
        // because the spec says they should only be recognized
        // if the pattern contains at least one named capture group.
        this.pos = 0;
        this.numCapturingGroups = 0;
        this.lookForNamedCaptureBackreferences = false;
        RegExpTree out = parse();

        // If there were named capture groups, we must parse the pattern string again
        // so we can check for any backreferences to them.
        if (!capturingGroupNames.isEmpty()) {
          this.pos = 0;
          this.numCapturingGroups = 0;
          this.lookForNamedCaptureBackreferences = true;
          out = parse();
        }

        if (pos < limit) {  // Unmatched closed paren maybe.
          throw new IllegalArgumentException(pattern.substring(pos));
        }
        return out;
      }

      RegExpTree parse() {
        // Collects ["foo", "bar", "baz"] for /foo|bar|baz/.
        ImmutableList.Builder alternatives = null;
        // The last item parsed within an alternation.
        RegExpTree preceder = null;

        topLoop:
        while (pos < limit) {
          char ch = pattern.charAt(pos);
          RegExpTree atom;
          switch (ch) {
            case '[':
              atom = parseCharset();
              break;
            case '(':
              atom = parseParenthetical();
              break;
            case ')':
              break topLoop;
            case '\\':
              atom = parseEscape();
              break;
            case '^':
            case '$':
              atom = new Anchor(ch);
              ++pos;
              break;
            case '.':
              // We represent . as a character set to make it easy to simplify
              // things like /.|[\r\n]/.
              atom = DOT_CHARSET;
              ++pos;
              break;
            case '|':
              // An alternative may be empty as in /foo||bar/.
              // The '|' is consumed below.
              atom = Empty.INSTANCE;
              break;
            default:
              // Find a run of concatenated characters to avoid building a
              // tree node per literal character.
              int start = pos;
              int end = pos + 1;
              charsLoop:
              while (end < limit) {
                switch (pattern.charAt(end)) {
                  case '[':
                  case '(':
                  case ')':
                  case '\\':
                  case '^':
                  case '$':
                  case '|':
                  case '.':
                  case '*':
                  case '+':
                  case '?':
                  case '{':
                    break charsLoop;
                  default:
                    // Repetition binds more tightly than concatenation.
                    // Only consume up to "foo" in /foob*/ so that the suffix
                    // operator parser below has the right precedence.
                    if (end + 1 >= limit
                        || !isRepetitionStart(pattern.charAt(end + 1))) {
                      ++end;
                    } else {
                      break charsLoop;
                    }
                }
              }
              atom = new Text(pattern.substring(start, end));
              pos = end;
              break;
          }
          if (pos < limit && isRepetitionStart(pattern.charAt(pos))) {
            atom = parseRepetition(atom);
          }
          if (preceder == null) {
            preceder = atom;
          } else {
            preceder = new Concatenation(preceder, atom);
          }
          // If this is an alternative in a alternation, then add it to the
          // list of complete alternatives, and reset the parser state for the
          // next alternative.
          if (pos < limit && pattern.charAt(pos) == '|') {
            if (alternatives == null) {
              alternatives = ImmutableList.builder();
            }
            alternatives.add(preceder);
            preceder = null;
            ++pos;
          }
        }
        // An alternative may have no parsed content blank as in /foo|/.
        if (preceder == null) { preceder = Empty.INSTANCE; }
        if (alternatives != null) {
          alternatives.add(preceder);
          return new Alternation(alternatives.build());
        } else {
          return preceder;
        }
      }

      /**
       * Handles capturing groups {@code (...)},
       * non-capturing groups {@code (?:...)}, and lookahead assertions
       * {@code (?=...)}.
       */
      private RegExpTree parseParenthetical() {
        checkState(pattern.charAt(pos) == '(');
        int start = pos;
        ++pos;
        ParentheticalType type;
        String captureName = null;
        if (pos < limit && pattern.charAt(pos) == '?') {
          if (pos + 1 < limit) {
            char ch = pattern.charAt(pos + 1);
            switch (ch) {
                // (?:...) Non-capturing groups.
              case ':':
                pos += 2;
                type = ParentheticalType.NONCAPTURING;
                break;

                // (?=...) and (?!...) Lookahead Assertions
              case '=':
                pos += 2;
                type = ParentheticalType.POSITIVE_LOOKAHEAD;
                break;
              case '!':
                pos += 2;
                type = ParentheticalType.NEGATIVE_LOOKAHEAD;
                break;

                // (?<=...) and (?) named groups
              case '<':
                if (pos + 2 < limit && pattern.charAt(pos + 2) == '=') {
                  pos += 3;
                  type = ParentheticalType.POSITIVE_LOOKBEHIND;
                  break;
                } else if (pos + 2 < limit && pattern.charAt(pos + 2) == '!') {
                  pos += 3;
                  type = ParentheticalType.NEGATIVE_LOOKBEHIND;
                  break;
                } else {
                  pos += 2;
                  captureName = scanNamedGroupName();
                  capturingGroupNames.add(captureName);
                  type = ParentheticalType.NAMED_GROUPS;
                  break;
                }
              default:
                throw new IllegalArgumentException(
                    "Malformed parenthetical: " + pattern.substring(start));
            }
          } else {
            throw new IllegalArgumentException(
                "Malformed parenthetical: " + pattern.substring(start));
          }
        } else {
          type = ParentheticalType.CAPTURING;
        }
        RegExpTree body = parse();
        if (pos < limit && pattern.charAt(pos) == ')') {
          ++pos;
        } else {
          throw new IllegalArgumentException(
              "Unclosed parenthetical group: " + pattern.substring(start));
        }
        switch (type) {
          case CAPTURING:
            ++numCapturingGroups;
            return new CapturingGroup(body);
          case NONCAPTURING:
            return body;

          case POSITIVE_LOOKAHEAD:
            return new LookaheadAssertion(body, true);
          case NEGATIVE_LOOKAHEAD:
            return new LookaheadAssertion(body, false);

          case POSITIVE_LOOKBEHIND:
            return new LookbehindAssertion(body, true);
          case NEGATIVE_LOOKBEHIND:
            return new LookbehindAssertion(body, false);

          case NAMED_GROUPS:
            if (captureName != null) {
              ++numCapturingGroups;
              return new NamedCaptureGroup(body, captureName);
            } else {
              throw new IllegalArgumentException(
                  "Malformed named capture group: " + pattern.substring(start));
            }
        }
        throw new AssertionError("Unrecognized ParentheticalType " + type);
      }

      /**
       * Helper that scans the pattern for a named group name. Assumes that {@code pos} points to
       * the character after '<'
       *
       * @return the group name
       */
      private String scanNamedGroupName() {
        int start = pos;
        int end = pos;
        if (!isIdentifierStart(pattern.charAt(start))) {
          throw new IllegalArgumentException(
              "Invalid capture group name: <" + pattern.substring(start));
        }
        ++end;
        while (end < limit) {
          if (pattern.charAt(end) == '>') {
            pos = end + 1;
            return pattern.substring(start, end);
          } else if (isIdentifierPart(pattern.charAt(end))) {
            ++end;
          } else {
            throw new IllegalArgumentException(
                "Invalid capture group name: <" + pattern.substring(start));
          }
        }
        throw new IllegalArgumentException(
            "Malformed named capture group: <" + pattern.substring(start));
      }

      /**
       * Parses a square bracketed character set.
       * Standalone character groups (@code /\d/} are handled by
       * {@link #parseEscape}.
       */
      private RegExpTree parseCharset() {
        checkState(pattern.charAt(pos) == '[');
        ++pos;

        boolean isCaseInsensitive = flags.indexOf('i') >= 0;
        boolean inverse = pos < limit && pattern.charAt(pos) == '^';
        if (inverse) { ++pos; }
        CharRanges ranges = CharRanges.EMPTY;
        CharRanges ieExplicits = CharRanges.EMPTY;
        while (pos < limit && pattern.charAt(pos) != ']') {
          char ch = pattern.charAt(pos);
          int start;
          if (ch == '\\') {
            ++pos;
            char possibleGroupName = pattern.charAt(pos);
            CharRanges group = NAMED_CHAR_GROUPS.get(possibleGroupName);
            if (group != null) {
              ++pos;
              ranges = ranges.union(group);
              continue;
            }
            start = parseEscapeChar();
          } else {
            start = ch;
            ++pos;
          }
          int end = start;
          if (pos + 1 < limit && pattern.charAt(pos) == '-'
              && pattern.charAt(pos + 1) != ']') {
            ++pos;
            ch = pattern.charAt(pos);
            if (ch == '\\') {
              ++pos;
              end = parseEscapeChar();
            } else {
              end = ch;
              ++pos;
            }
          }
          CharRanges range = CharRanges.inclusive(start, end);
          ranges = ranges.union(range);
          if (IE_SPEC_ERRORS.contains(start) && IE_SPEC_ERRORS.contains(end)) {
            ieExplicits = ieExplicits.union(range.intersection(IE_SPEC_ERRORS));
          }
          if (isCaseInsensitive) {
            // If the flags contain the 'i' flag, then it is not correct to
            // say that [^a-z] contains the letter 'A', or that [a-z] does not
            // contain the letter 'A'.
            // We expand out letter groups here so that parse returns something
            // that is valid independent of flags.
            // Calls to simplify(flags) may later reintroduce flag assumptions.
            // but without this step, later steps might conflate
            //     /[a-z]/i
            // and
            //     /[^\0-`{-\uffff]/i
            // which matches nothing because the information about whether the
            // ^ is present has been lost during optimizations and charset
            // unionizing as in /[...]|[^...]/.
            ranges = CaseCanonicalize.expandToAllMatched(ranges);
          }
        }
        ++pos;  // Consume ']'

        if (inverse) {
          ranges = CharRanges.ALL_CODE_UNITS.difference(ranges);
        }

        return new Charset(ranges, ieExplicits);
      }

      /**
       * Parses an escape to a code point. Some of the characters parsed here have special meanings
       * in various contexts, so contexts must filter those instead. E.g. '\b' means a different
       * thing inside a charset than without.
       */
      private int parseEscapeChar() {
        char ch = pattern.charAt(pos++);
        switch (ch) {
          case 'b': return '\b';
          case 'f': return '\f';
          case 'n': return '\n';
          case 'r': return '\r';
          case 't': return '\t';
          case 'u':
            return (flags.contains("u") && pos < limit && pattern.charAt(pos) == '{')
                ? parseBracedUnicodeEscape()
                : parseHex(4);
          case 'v': return '\u000b';
          case 'x': return parseHex(2);
          default:
            if ('0' <= ch && ch <= '7') {
              char codeUnit = (char) (ch - '0');
              // Allow octal literals in the range \0-\377.
              // \41 might be a group, but \041 is not a group.
              // We read, but do not emit octal literals since they
              // are deprecated in ES5.
              int octLimit = min(limit, pos + (ch <= '3' ? 2 : 1) + (ch == '0' ? 1 : 0));
              while (pos < octLimit) {
                ch = pattern.charAt(pos);
                if ('0' <= ch && ch <= '7') {
                  codeUnit = (char) ((codeUnit << 3) + (ch - '0'));
                  ++pos;
                } else {
                  break;
                }
              }
              return codeUnit;
            }
            return ch;
        }
      }

      /**
       * Parses an escape that appears outside a charset.
       */
      private RegExpTree parseEscape() {
        checkState(pattern.charAt(pos) == '\\');
        int start = pos;
        ++pos;
        char ch = pattern.charAt(pos);
        if (ch == 'b' || ch == 'B') {
          ++pos;
          return new WordBoundary(ch);
        } else if ((ch == 'p' || ch == 'P') && flags.contains("u")) {
          // handle ES2018 unicode property tests, e.g.
          // /\p{ASCII_Hex_Digit=true}/ only hex digits
          // /\P{Script=Greek}/  no greek letters
          boolean negated = ch == 'P';
          ++pos;
          if (pos < limit && pattern.charAt(pos) == '{') {
            StringBuilder lhs = new StringBuilder();
            while (++pos < limit
                && ((ch = pattern.charAt(pos)) == '_'
                    || ('a' <= ch && ch <= 'z')
                    || ('A' <= ch && ch <= 'Z')
                    || ('0' <= ch && ch <= '9'))) {
              lhs.append(ch);
            }
            if (pos < limit && ch == '}') {
              // Case of shorthand like /\p{ASCII_Hex_Digit}/u
              ++pos;
              return new UnicodePropertyEscape(null, lhs.toString(), negated);
            } else if (pos < limit && ch == '=') {
              // Case of having '=' like /\p{Script=Greek}/u
              StringBuilder rhs = new StringBuilder();
              while (++pos < limit
                  && ((ch = pattern.charAt(pos)) == '_'
                      || ('a' <= ch && ch <= 'z')
                      || ('A' <= ch && ch <= 'Z')
                      || ('0' <= ch && ch <= '9'))) {
                rhs.append(ch);
              }
              if (pos < limit && ch == '}') {
                ++pos;
                return new UnicodePropertyEscape(lhs.toString(), rhs.toString(), negated);
              } else {
                throw new IllegalArgumentException(
                    "Malformed Unicode Property Escape: expected '}' after "
                        + pattern.substring(start, pos));
              }
            } else {
              throw new IllegalArgumentException(
                  "Malformed Unicode Property Escape: expected '=' or '}' after "
                      + pattern.substring(start, pos));
            }
          } else {
            throw new IllegalArgumentException(
                "Malformed Unicode Property Escape: expected '{' after "
                    + pattern.substring(start, pos));
          }
        } else if ('1' <= ch && ch <= '9') {
          ++pos;
          int possibleGroupIndex = ch - '0';
          if (numCapturingGroups >= possibleGroupIndex) {
            if (pos < limit) {
              char next = pattern.charAt(pos);
              if ('0' <= next && next <= '9') {
                int twoDigitGroupIndex = possibleGroupIndex * 10 + (next - '0');
                if (numCapturingGroups >= twoDigitGroupIndex) {
                  ++pos;
                  possibleGroupIndex = twoDigitGroupIndex;
                }
              }
            }
            return new BackReference(possibleGroupIndex);
          } else {
            // \1 - \7 are octal escapes if there is no such group.
            // \8 and \9 are the literal characters '8' and '9' if there
            // is no such group.
            return new Text(Character.toString(
              possibleGroupIndex <= 7 ? (char) possibleGroupIndex : ch));
          }
        } else if (lookForNamedCaptureBackreferences
            && ch == 'k'
            && pos + 1 < limit
            && pattern.charAt(pos + 1) == '<'
            // According to the spec
            // https://github.com/tc39/proposal-regexp-named-groups#backwards-compatibility-of-new-syntax
            // we want to treat \k as a normal string if there are no named capturing groups present
            && !capturingGroupNames.isEmpty()) {
          pos += 2;
          String potentialName = scanNamedGroupName();
          if (!capturingGroupNames.contains(potentialName)) {
            throw new IllegalArgumentException(
                "Invalid named capture referenced: " + pattern.substring(start));
          }
          return new NamedBackReference(potentialName);
        } else {
          CharRanges charGroup = NAMED_CHAR_GROUPS.get(ch);
          if (charGroup != null) {  // Handle \d, etc.
            ++pos;
            return new Charset(charGroup, CharRanges.EMPTY);
          }
          return new Text(new String(Character.toChars(parseEscapeChar())));
        }
      }

      /** Parses n hex digits to a code-unit. */
      private int parseHex(int n) {
        if (pos + n > limit) {
          throw new IllegalArgumentException(
              "Abbreviated hex escape " + pattern.substring(pos));
        }
        if (n > 7) {
          // We need to guard the MSB to prevent overflow.
          throw new IllegalArgumentException(
              "Cannot parse hexadecimal encoding wider than 28 bits: "
                  + pattern.substring(pos, pos + n));
        }

        int result = 0;
        while (--n >= 0) {
          char ch = pattern.charAt(pos);
          int digit;
          if ('0' <= ch && ch <= '9') {
            digit = ch - '0';
          } else if ('a' <= ch && ch <= 'f') {
            digit = ch + (10 - 'a');
          } else if ('A' <= ch && ch <= 'F') {
            digit = ch + (10 - 'A');
          } else {
            throw new IllegalArgumentException(pattern.substring(pos));
          }
          ++pos;
          result = (result << 4) | digit;
        }
        return result;
      }

      private int parseBracedUnicodeEscape() {
        int openBrace = pos;
        checkState(pattern.charAt(pos++) == '{');

        int closeBrace = pos;
        while (closeBrace < limit && pattern.charAt(closeBrace) != '}') {
          closeBrace++;
        }
        if (closeBrace == limit) {
          throw new IllegalArgumentException(
              "Malformed unicode escape: expected '}' after " + pattern.substring(openBrace));
        } else if (closeBrace == pos) {
          throw new IllegalArgumentException("Empty unicode escape");
        }

        int result = parseHex(closeBrace - pos);
        if (result > 0x10FFFF) {
          throw new IllegalArgumentException(
              "Unicode must be at most 0x10FFFF: " + pattern.substring(openBrace + 1, pos));
        }
        pos++; // Consume the close brace.
        return result;
      }

      private boolean isRepetitionStart(char ch) {
        switch (ch) {
          case '?':
          case '*':
          case '+':
          case '{':
            return true;
          default:
            return false;
        }
      }

      /**
       * Parse a repetition.  {@code x?} is treated as a repetition --
       * an optional production can be matched 0 or 1 time.
       */
      private RegExpTree parseRepetition(RegExpTree body) {
        if (pos == limit) { return body; }
        int min, max;
        switch (pattern.charAt(pos)) {
          case '+':
            ++pos;
            min = 1;
            max = Integer.MAX_VALUE;
            break;
          case '*':
            ++pos;
            min = 0;
            max = Integer.MAX_VALUE;
            break;
          case '?':
            ++pos;
            min = 0;
            max = 1;
            break;
          case '{':
            ++pos;
            int start = pos;
            int end = pattern.indexOf('}', start);
            if (end < 0) {
              pos = start - 1;
              return body;
            }
            String counts = pattern.substring(start, end);
            pos = end + 1;
            int comma = counts.indexOf(',');
            try {
              min = Integer.parseInt(
                  comma >= 0 ? counts.substring(0, comma) : counts);
              max = comma >= 0
                  ? comma + 1 != counts.length()
                      ? Integer.parseInt(counts.substring(comma + 1))
                      : Integer.MAX_VALUE
                  : min;
            } catch (NumberFormatException ex) {
              min = max = -1;
            }
            if (min < 0 || min > max) {
              // Treat the open curly bracket literally.
              pos = start - 1;
              return body;
            }
            break;
          default:
            return body;
        }
        boolean greedy = true;
        if (pos < limit && pattern.charAt(pos) == '?') {
          greedy = false;
          ++pos;
        }
        return new Repetition(body, min, max, greedy);
      }
    }

    return new Parser().parseTopLevel();
  }

  /**
   * @param ch the character
   * @return true if the character is a valid Javascript identifier start character.
   */
  private static boolean isIdentifierStart(char ch) {
    // TODO(yitingwang) This and the one in Scanner.java should share the same implementation
    // Most code is written in pure ASCII create a fast path here.
    if (ch <= 127) {
      return ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch == '_' || ch == '$'));
    }

    // Workaround b/36459436
    // When running under GWT, Character.isLetter only handles ASCII
    // Angular relies heavily on U+0275 (Latin Barred O)
    return ch == 0x0275
        // TODO: UnicodeLetter also includes Letter Number (NI)
        || Character.isLetter(ch);
  }

  /**
   * @param ch the character
   * @return true if the character is allowed in Javascript identifiers.
   */
  private static boolean isIdentifierPart(char ch) {
    // TODO(yitingwang) This and the one in Scanner.java should share the same implementation
    // Most code is written in pure ASCII create a fast path here.
    if (ch <= 127) {
      return ((ch >= 'A' && ch <= 'Z')
          || (ch >= 'a' && ch <= 'z')
          || (ch >= '0' && ch <= '9')
          || (ch == '_' || ch == '$')); // _ or $
    }
    // TODO: identifier part character classes
    // CombiningMark
    //   Non-Spacing mark (Mn)
    //   Combining spacing mark(Mc)
    // Connector punctuation (Pc)
    // Zero Width Non-Joiner
    // Zero Width Joiner
    return isIdentifierStart(ch) || Character.isDigit(ch);
  }

  /**
   * True if, but not necessarily always when the, given regular expression
   * must match the whole input or none of it.
   */
  public static boolean matchesWholeInput(RegExpTree t, String flags) {
    if (flags.indexOf('m') >= 0) { return false; }

    if (!(t instanceof Concatenation)) {
      return false;
    }

    Concatenation c = (Concatenation) t;
    if (c.elements.isEmpty()) { return false; }
    RegExpTree first = c.elements.get(0), last = Iterables.getLast(c.elements);
    if (!(first instanceof Anchor && last instanceof Anchor)) { return false; }
    return ((Anchor) first).type == '^' && ((Anchor) last).type == '$';
  }

  /** Represents a node that never has children such as an anchor or charset. */
  public abstract static class RegExpTreeAtom extends RegExpTree {
    @Override
    public boolean isCaseSensitive() {
      return false;
    }

    @Override
    public boolean containsAnchor() {
      return false;
    }

    @Override
    public final int numCapturingGroups() {
      return 0;
    }

    @Override
    public final ImmutableList children() {
      return ImmutableList.of();
    }
  }

  /** Represents an empty portion of a RegExp such as the middle of "||" */
  public static final class Empty extends RegExpTreeAtom {
    static final Empty INSTANCE = new Empty();

    @Override
    public RegExpTree simplify(String flags) {
      return this;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      // No output
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      // No output
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Empty;
    }

    @Override
    public int hashCode() {
      return 0x7ee06141;
    }
  }

  /** Represents an anchor, namely ^ or $. */
  public static final class Anchor extends RegExpTreeAtom {
    final char type;
    Anchor(char type) { this.type = type; }

    @Override
    public RegExpTree simplify(String flags) {
      return this;
    }

    @Override
    public boolean containsAnchor() {
      return true;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append(type);
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(type);
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Anchor && type == ((Anchor) o).type;
    }

    @Override
    public int hashCode() {
      return type ^ 0xe85317ff;
    }
  }

  /** Represents \b or \B */
  public static final class WordBoundary extends RegExpTreeAtom {
    final char type;

    WordBoundary(char type) {
      this.type = type;
    }

    @Override
    public RegExpTree simplify(String flags) {
      return this;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append('\\').append(type);
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(type);
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof WordBoundary && type == ((WordBoundary) o).type;
    }

    @Override
    public int hashCode() {
      return 0x5673aa29 ^ type;
    }
  }

  /** Represents a reference to a previous group such as \1 or \2 */
  public static final class BackReference extends RegExpTreeAtom {
    final int groupIndex;

    BackReference(int groupIndex) {
      checkArgument(groupIndex >= 0 && groupIndex <= 99);
      this.groupIndex = groupIndex;
    }

    @Override
    public RegExpTree simplify(String flags) {
      return this;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append('\\').append(groupIndex);
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(groupIndex);
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof BackReference
          && groupIndex == ((BackReference) o).groupIndex;
    }

    @Override
    public int hashCode() {
      return 0xff072663 ^ groupIndex;
    }
  }

  /** Represents a reference to a previous named group */
  public static final class NamedBackReference extends RegExpTreeAtom {
    final String groupName;

    NamedBackReference(String groupName) {
      this.groupName = groupName;
    }

    @Override
    public RegExpTree simplify(String flags) {
      return this;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append("\\k<").append(groupName).append('>');
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(groupName);
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof NamedBackReference
          && groupName.equals(((NamedBackReference) o).groupName);
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(groupName);
    }
  }

  /** Represents a run of non-special characters such as "foobar" */
  public static final class Text extends RegExpTreeAtom {
    final String text;

    Text(String text) {
      this.text = text;
    }

    /**
     * @param ch The code-unit to escape.
     * @param next The next code-unit or -1 if indeterminable.
     */
    private static void escapeRegularCharOnto(
        char ch, int next, StringBuilder sb) {
      switch (ch) {
        case '$':
        case '^':
        case '*':
        case '(':
        case ')':
        case '+':
        case '[':
        case '|':
        case '.':
        case '/':
        case '?':
          sb.append('\\').append(ch);
          break;
        case '{':
          // If possibly part of a repetition, then escape.
          // Concatenation is handled by the digitsMightBleed check.
          if ('0' <= next && next <= '9') {
            sb.append('\\');
          }
          sb.append(ch);
          break;
        default:
          escapeCharOnto(ch, sb);
      }
    }

    @Override
    public RegExpTree simplify(String flags) {
      int n = text.length();
      if (n == 0) {
        return Empty.INSTANCE;
      }
      if (flags.indexOf('i') >= 0) {
        String canonicalized = CaseCanonicalize.caseCanonicalize(text);
        if (!text.equals(canonicalized)) {
          return new Text(canonicalized);
        }
      }
      return this;
    }

    @Override
    public boolean isCaseSensitive() {
      for (int i = 0, n = text.length(); i < n; ++i) {
        if (CaseCanonicalize.CASE_SENSITIVE.contains(text.charAt(i))) {
          return true;
        }
      }
      return false;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      for (int i = 0, n = text.length(); i < n; ++i) {
        escapeRegularCharOnto(text.charAt(i), i + 1 < n ? text.charAt(i + 1) : -1, sb);
      }
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append('`').append(text).append('`');
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Text && text.equals(((Text) o).text);
    }

    @Override
    public int hashCode() {
      return text.hashCode() ^ 0x617e310;
    }
  }

  /** Represents a repeating item such as ...+, ...*, or ...{0,1} */
  public static final class Repetition extends RegExpTree {
    final RegExpTree body;
    final int min, max;
    final boolean greedy;

    Repetition(RegExpTree body, int min, int max, boolean greedy) {
      this.body = body;
      this.min = min;
      this.max = max;
      this.greedy = greedy;
    }

    @Override
    public RegExpTree simplify(String flags) {
      RegExpTree body = this.body.simplify(flags);
      if (max == 0 && !body.hasCapturingGroup()) { return Empty.INSTANCE; }
      if (body instanceof Empty || NEVER_MATCHES.equals(body)) { return body; }
      int min = this.min;
      int max = this.max;
      if (body instanceof Repetition) {
        Repetition rbody = (Repetition) body;
        if (rbody.greedy == greedy) {
          long lmin = ((long) min) * rbody.min;
          long lmax = ((long) max) * rbody.max;
          if (lmin < Integer.MAX_VALUE) {
            body = rbody.body;
            min = (int) lmin;
            max = lmax >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) lmax;
          }
        }
      }
      if (min == 1 && max == 1) { return body; }
      boolean greedy = this.greedy || min == max;
      return body.equals(this.body) && min == this.min && max == this.max
          && greedy == this.greedy
          ? this
          : new Repetition(body, min, max, greedy).simplify(flags);
    }

    @Override
    public boolean isCaseSensitive() {
      return body.isCaseSensitive();
    }

    @Override
    public boolean containsAnchor() {
      return body.containsAnchor();
    }

    @Override
    public int numCapturingGroups() {
      return body.numCapturingGroups();
    }

    @Override
    public ImmutableList children() {
      return ImmutableList.of(body);
    }

    private void appendBodySourceCode(StringBuilder sb) {
      if (body instanceof Alternation || body instanceof Concatenation
          || body instanceof Repetition
          || (body instanceof Text && ((Text) body).text.length() > 1)) {
        sb.append("(?:");
        body.appendSourceCode(sb);
        sb.append(')');
      } else {
        body.appendSourceCode(sb);
      }
    }

    private static int suffixLen(int min, int max) {
      // This mirrors the branches that renders a suffix in appendSourceCode below.
      if (max == Integer.MAX_VALUE) {
        switch (min) {
          case 0:  // *
          case 1:  // +
            return 1;
          default:
            return 3 + numDecimalDigits(min); // {3,}
        }
      }
      if (min == 0 && max == 1) {
        return 1;  // ?
      }
      if (min == max) {
        if (min == 1) {
          return 0;  // No suffix needed for {1}.
        }
        return 2 + numDecimalDigits(min);  // {4}
      }
      return 3 + numDecimalDigits(min) + numDecimalDigits(max);  // {2,7}
    }

    private static int numDecimalDigits(int n) {
      if (n < 0) {
        // Negative values should not be passed in.
        throw new AssertionError();
        // If changing this code to support negative values,
        // Integer.MIN_VALUE is a corner-case..
      }
      int nDigits = 1;
      while (n >= 10) {
        ++nDigits;
        n /= 10;
      }
      return nDigits;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      int bodyStart = sb.length();
      appendBodySourceCode(sb);
      int bodyEnd = sb.length();
      int bodyLen = bodyEnd - bodyStart;
      int min = this.min;
      int max = this.max;
      if (min >= 2 && max == Integer.MAX_VALUE || max - min <= 1) {
        int expanded =
           // If min == max then we want to try expanding to the limit and
           // attach the empty suffix, which is equivalent to min = max = 1,
           // i.e. /a/ vs /a{1}/.
           min == max
           // Give aa+ preference over aaa*.
           || max == Integer.MAX_VALUE
           ? min - 1
           : min;
        int expandedMin = min - expanded;
        int expandedMax = max == Integer.MAX_VALUE ? max : max - expanded;
        int suffixLen = suffixLen(min, max);
        int expandedSuffixLen = suffixLen(expandedMin, expandedMax);
        if (bodyLen * expanded + expandedSuffixLen < suffixLen
            && !body.hasCapturingGroup()) {
          // a{2} -> aa
          // a{2,} -> aa+
          // a{2,3} -> aaa?
          while (--expanded >= 0) {
            sb.append(sb, bodyStart, bodyEnd);
          }
          min = expandedMin;
          max = expandedMax;
        }
      }

      if (max == Integer.MAX_VALUE) {
        switch (min) {
          case 0: sb.append('*'); break;
          case 1: sb.append('+'); break;
          default:
            sb.append('{').append(min).append(",}");
        }
      } else if (min == 0 && max == 1) {
        sb.append('?');
      } else if (min == max) {
        if (min != 1) {
          sb.append('{').append(min).append('}');
        }
      } else {
        sb.append('{').append(min).append(',').append(max).append('}');
      }
      if (!greedy) {
        sb.append('?');
      }
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(" min=").append(min).append(", max=").append(max);
      if (!greedy) { sb.append("  not_greedy"); }
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof Repetition)) { return false; }
      Repetition that = (Repetition) o;
      return this.body.equals(that.body)
          && this.min == that.min
          && this.max == that.max
          && this.greedy == that.greedy;
    }

    @Override
    public int hashCode() {
      return min + 31 * (max + 31 * ((greedy ? 1 : 0) + 31 * body.hashCode()));
    }
  }

  /** Represents the possibilities ["foo", "bar" ] for a RegExp /foo|bar/ */
  public static final class Alternation extends RegExpTree {
    final ImmutableList alternatives;

    Alternation(List alternatives) {
      this.alternatives = ImmutableList.copyOf(alternatives);
    }

    @Override
    public RegExpTree simplify(String flags) {
      List alternatives = new ArrayList<>();
      for (RegExpTree alternative : this.alternatives) {
        alternative = alternative.simplify(flags);
        if (alternative instanceof Alternation) {
          alternatives.addAll(((Alternation) alternative).alternatives);
        } else {
          alternatives.add(alternative);
        }
      }
      // Remove duplicates
      RegExpTree last = null;
      for (Iterator it = alternatives.iterator(); it.hasNext();) {
        RegExpTree alternative = it.next();
        if (alternative.equals(NEVER_MATCHES)) { continue; }
        if (alternative.equals(last) && !alternative.hasCapturingGroup()) {
          it.remove();
        } else {
          last = alternative;
        }
      }
      // Collapse character alternatives into character sets.
      for (int i = 0, n = alternatives.size(); i < n; ++i) {
        RegExpTree alternative = alternatives.get(i);
        if ((alternative instanceof Text
             && ((Text) alternative).text.length() == 1)
            || alternative instanceof Charset) {
          int end = i;
          int nCharsets = 0;
          while (end < n) {
            RegExpTree follower = alternatives.get(end);
            if (follower instanceof Charset) {
              ++nCharsets;
            } else if (!(follower instanceof Text
                         && ((Text) follower).text.length() == 1)) {
              break;
            }
            ++end;
          }
          if (end - i >= 3 || (nCharsets != 0 && end - i >= 2)) {
            int[] members = new int[end - i - nCharsets];
            int memberIdx = 0;
            CharRanges chars = CharRanges.EMPTY;
            CharRanges ieExplicits = CharRanges.EMPTY;
            List charAlternatives = alternatives.subList(i, end);
            for (RegExpTree charAlternative : charAlternatives) {
              if (charAlternative instanceof Text) {
                char ch = ((Text) charAlternative).text.charAt(0);
                members[memberIdx++] = ch;
                if (IE_SPEC_ERRORS.contains(ch)) {
                  ieExplicits = ieExplicits.union(CharRanges.inclusive(ch, ch));
                }
              } else if (charAlternative instanceof Charset) {
                Charset cs = (Charset) charAlternative;
                chars = chars.union(cs.ranges);
                ieExplicits = ieExplicits.union(cs.ieExplicits);
              }
            }
            chars = chars.union(CharRanges.withMembers(members));
            charAlternatives.clear();
            charAlternatives.add(
                new Charset(chars, ieExplicits).simplify(flags));
            n = alternatives.size();
          }
        }
      }
      switch (alternatives.size()) {
        case 0: return Empty.INSTANCE;
        case 1: return alternatives.get(0);
        case 2:
          if (alternatives.get(1) instanceof Empty) {  // (?:a|) -> a?
            return new Repetition(alternatives.get(0), 0, 1, true);
          } else if (alternatives.get(0) instanceof Empty) {
            return new Repetition(alternatives.get(1), 0, 1, false);
          }
          break;
      }
      // TODO: maybe pull out common prefix or suffix
      return alternatives.equals(this.alternatives)
          ? this : new Alternation(alternatives);
    }

    @Override
    public boolean isCaseSensitive() {
      for (RegExpTree alternative : alternatives) {
        if (alternative.isCaseSensitive()) { return true; }
      }
      return false;
    }

    @Override
    public boolean containsAnchor() {
      for (RegExpTree alternative : alternatives) {
        if (alternative.containsAnchor()) { return true; }
      }
      return false;
    }

    @Override
    public int numCapturingGroups() {
      int n = 0;
      for (RegExpTree alternative : alternatives) {
        n += alternative.numCapturingGroups();
      }
      return n;
    }

    @Override
    public ImmutableList children() {
      return alternatives;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      for (int i = 0, n = alternatives.size(); i < n; ++i) {
        if (i != 0) {
          sb.append('|');
        }
        alternatives.get(i).appendSourceCode(sb);
      }
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      // Nothing besides children.
    }

    @Override
    public boolean equals(Object o) {
      return this == o || (
          (o instanceof Alternation)
          && alternatives.equals(((Alternation) o).alternatives));
    }

    @Override
    public int hashCode() {
      return 0x51b57cd1 ^ alternatives.hashCode();
    }
  }

  private static final RegExpTree NEVER_MATCHES = new LookaheadAssertion(
      Empty.INSTANCE, false);

  /** Represents a lookahead assertion such as (?=...) or (?!...) */
  public static final class LookaheadAssertion extends RegExpTree {
    final RegExpTree body;
    final boolean positive;

    LookaheadAssertion(RegExpTree body, boolean positive) {
      this.body = body;
      this.positive = positive;
    }

    @Override
    public RegExpTree simplify(String flags) {
      RegExpTree simpleBody = body.simplify(flags);
      if (simpleBody instanceof Empty) {
        if (positive) {  // Always true
          return simpleBody;
        }
      }
      return new LookaheadAssertion(simpleBody, positive);
    }

    @Override
    public boolean isCaseSensitive() {
      return body.isCaseSensitive();
    }

    @Override
    public boolean containsAnchor() {
      return body.containsAnchor();
    }

    @Override
    public int numCapturingGroups() {
      return body.numCapturingGroups();
    }

    @Override
    public ImmutableList children() {
      return ImmutableList.of(body);
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append(positive ? "(?=" : "(?!");
      body.appendSourceCode(sb);
      sb.append(')');
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(positive ? "positive" : "negative");
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof LookaheadAssertion)) { return false; }
      LookaheadAssertion that = (LookaheadAssertion) o;
      return this.positive == that.positive && this.body.equals(that.body);
    }

    @Override
    public int hashCode() {
      return 0x723aba9 ^ body.hashCode();
    }
  }

  /** Represents a lookbehind assertion such as {@code (?<=...) } or  {@code (? children() {
      return ImmutableList.of(body);
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append(positive ? "(?<=" : "(? children() {
      return ImmutableList.of(body);
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append('(');
      body.appendSourceCode(sb);
      sb.append(')');
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      // Nothing besides children.
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof CapturingGroup
          && body.equals(((CapturingGroup) o).body);
    }

    @Override
    public int hashCode() {
      return 0x55781738 ^ body.hashCode();
    }
  }

  /** Represents a named capture group */
  public static final class NamedCaptureGroup extends RegExpTree {
    final RegExpTree body;
    final String name;

    NamedCaptureGroup(RegExpTree body, String name) {
      this.body = body;
      this.name = name;
    }

    @Override
    public RegExpTree simplify(String flags) {
      return new NamedCaptureGroup(body.simplify(flags), name);
    }

    @Override
    public boolean isCaseSensitive() {
      return body.isCaseSensitive();
    }

    @Override
    public boolean containsAnchor() {
      return body.containsAnchor();
    }

    @Override
    public int numCapturingGroups() {
      return 1 + body.numCapturingGroups();
    }

    @Override
    public ImmutableList children() {
      return ImmutableList.of(body);
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append("(?<");
      sb.append(name);
      sb.append('>');
      body.appendSourceCode(sb);
      sb.append(')');
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(" name=").append(name);
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof NamedCaptureGroup
          && name.equals(((NamedCaptureGroup) o).name)
          && body.equals(((NamedCaptureGroup) o).body);
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(name) ^ body.hashCode();
    }
  }

  /** Represents a Unicode Property Escape such as in /\p{Script=Greek}/u */
  public static final class UnicodePropertyEscape extends RegExpTreeAtom {

    private final String propertyName;
    private final String propertyValue;
    private final boolean negated;

    UnicodePropertyEscape(String propertyName, String propertyValue, boolean negated) {
      checkState(propertyValue != null);
      checkArgument(
          propertyName == null || !propertyName.isEmpty(),
          "if '=' is present in a unicode property escape, the name cannot be empty");
      checkArgument(!propertyValue.isEmpty(), "unicode property escape value cannot be empty");
      this.propertyName = propertyName;
      this.propertyValue = propertyValue;
      this.negated = negated;
    }

    @Override
    public RegExpTree simplify(String flags) {
      return this;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      sb.append(negated ? "\\P{" : "\\p{");
      if (propertyName != null) {
        sb.append(propertyName);
        sb.append('=');
      }
      sb.append(propertyValue);
      sb.append("}");
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      // Nothing besides properties and possible negation.
    }

    @Override
    public boolean equals(Object o) {
      return (o instanceof UnicodePropertyEscape)
          && negated == ((UnicodePropertyEscape) o).negated
          && Objects.equals(propertyName, ((UnicodePropertyEscape) o).propertyName)
          && Objects.equals(propertyValue, ((UnicodePropertyEscape) o).propertyValue);
    }

    @Override
    public int hashCode() {
      return Objects.hash(negated, propertyName, propertyValue);
    }
  }

  private static final CharRanges DIGITS = CharRanges.inclusive('0', '9');

  private static final CharRanges UCASE_LETTERS
      = CharRanges.inclusive('A', 'Z');

  private static final CharRanges LCASE_LETTERS
      = CharRanges.inclusive('a', 'z');

  private static final CharRanges LETTERS = UCASE_LETTERS.union(LCASE_LETTERS);

  private static final CharRanges WORD_CHARS = DIGITS
      .union(LETTERS).union(CharRanges.withMembers('_'));

  private static final CharRanges INVERSE_WORD_CHARS
      = CharRanges.ALL_CODE_UNITS.difference(WORD_CHARS);

  private static final CharRanges SPACE_CHARS = CharRanges.withMembers(
      '\t', '\n', '\u000b', '\u000c', '\r', ' ', '\u00a0',
      // Unicode 3.0 Zs
      '\u1680', '\u180e', '\u2000', '\u2001',
      '\u2002', '\u2003', '\u2004', '\u2005',
      '\u2006', '\u2007', '\u2008', '\u2009',
      '\u200a',
      // Line terminator chars
      '\u2028', '\u2029',
      // Unicode 3.0 Zs
      '\u202f', '\u205f', '\u3000',
      // Byte order marker is a space character in ES5 but not ES3.
      '\ufeff'
      );

  /** IE is broken around \s.  IE (6, 7, 8 at least), only recognize these. */
  private static final CharRanges IE_SPACE_CHARS = CharRanges.withMembers(
    '\t', '\n', '\u000b', '\u000c', '\r', ' '
    );

  /** IE is broken around \s.  IE (6, 7, 8 at least), only recognize these. */
  private static final CharRanges IE_SPEC_ERRORS = SPACE_CHARS.difference(
      IE_SPACE_CHARS);

  private static final ImmutableMap NAMED_CHAR_GROUPS =
      ImmutableMap.builder()
          .put('d', DIGITS)
          .put('D', CharRanges.ALL_CODE_UNITS.difference(DIGITS))
          .put('s', SPACE_CHARS)
          .put('S', CharRanges.ALL_CODE_UNITS.difference(SPACE_CHARS))
          .put('w', WORD_CHARS)
          .put('W', INVERSE_WORD_CHARS)
          .buildOrThrow();

  private static final Charset DOT_CHARSET = new Charset(
      CharRanges.ALL_CODE_UNITS.difference(
          CharRanges.withMembers('\n', '\r', '\u2028', '\u2029')),
      CharRanges.EMPTY);

  /** Represents a set of possible characters structured as [a-zA-Z] or [^a-zA-Z] */
  public static final class Charset extends RegExpTreeAtom {
    final CharRanges ranges;
    /**
     * Code units that were mentioned explicitly and that might be matched by
     * a group according to ECMAScript 5 but would not because of specification
     * violations in IE.
     */
    final CharRanges ieExplicits;

    Charset(CharRanges ranges, CharRanges ieExplicits) {
      this.ranges = ranges;
      this.ieExplicits = ieExplicits;
    }

    private static int complexityWordFolded(CharRanges ranges) {
      return min(
          complexityWordFoldedHelper(ranges),
          1 + complexityWordFoldedHelper(CharRanges.ALL_CODE_UNITS.difference(ranges)));
    }

    private static int complexityWordFoldedHelper(CharRanges ranges) {
      int complexity = DecomposedCharset.complexity(ranges);
      if (ranges.containsAll(WORD_CHARS)) {
        complexity =
            min(complexity, 1 + DecomposedCharset.complexity(ranges.difference(WORD_CHARS)));
      }
      if (ranges.containsAll(INVERSE_WORD_CHARS)) {
        complexity =
            min(
                complexity,
                1 + DecomposedCharset.complexity(ranges.difference(INVERSE_WORD_CHARS)));
      }
      return complexity;
    }

    @Override
    public RegExpTree simplify(String flags) {
      if (ranges.isEmpty()) {
        return NEVER_MATCHES;
      }
      CharRanges best = ranges;
      if (flags.indexOf('i') >= 0) {
        Set options = new LinkedHashSet<>();
        options.add(CaseCanonicalize.expandToAllMatched(ranges));
        options.add(CaseCanonicalize.reduceToMinimum(ranges));

        CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS);
        CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS);

        CharRanges lcaseLettersToUpper = lcaseLetters.shift(-32);
        CharRanges ucaseLettersToLower = ucaseLetters.shift(32);

        options.add(ranges.union(ucaseLettersToLower));
        options.add(ranges.union(lcaseLettersToUpper));
        options.add(ranges.union(lcaseLettersToUpper)
                    .union(ucaseLettersToLower));

        options.add(ranges.union(ucaseLettersToLower).difference(ucaseLetters));
        options.add(ranges.union(lcaseLettersToUpper).difference(lcaseLetters));

        int bestComplexity = complexityWordFolded(ranges);

        for (CharRanges option : options) {
          int complexity = complexityWordFolded(option);
          if (complexity < bestComplexity) {
            bestComplexity = complexity;
            best = option;
          }
        }
      }

      if (best.getNumRanges() == 1
          && best.end(0) - best.start(0) == 1) {
        return new Text(Character.toString((char) best.start(0)));
      }

      if (!best.equals(ranges)) {
        return new Charset(best, ieExplicits);
      }

      return this;
    }

    @Override
    public boolean isCaseSensitive() {
      // We could test
      //     !ranges.equals(CaseCanonicalize.expandToAllMatched(ranges))
      // but we get better optimizations by leaving the 'i' flag on in most
      // cases.

      // Check whether skipping all the character groups that are known
      // case-insensitive leaves us with something that matches the above
      // definition.
      CharRanges withoutNamedGroups = decompose().ranges;
      return !withoutNamedGroups.equals(
            CaseCanonicalize.expandToAllMatched(withoutNamedGroups));
    }

    private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
      StringBuilder namedGroups = new StringBuilder();
      CharRanges rangesInterIeExplicits = ranges.intersection(ieExplicits);
      while (true) {
        char groupName = 0;
        CharRanges simplest = null;
        int minComplexity = DecomposedCharset.complexity(ranges);
        for (Map.Entry namedGroup
             : NAMED_CHAR_GROUPS.entrySet()) {
          CharRanges group = namedGroup.getValue();
          if (ranges.containsAll(group)) {
            CharRanges withoutGroup = ranges.difference(group).union(
                rangesInterIeExplicits);
            int complexity = DecomposedCharset.complexity(withoutGroup);
            if (complexity < minComplexity) {
              simplest = withoutGroup;
              groupName = namedGroup.getKey().charValue();
              minComplexity = complexity;
            }
          }
        }
        if (simplest != null) {
          namedGroups.append('\\').append(groupName);
          ranges = simplest;
        } else {
          break;
        }
      }
      return new DecomposedCharset(inverted, ranges, namedGroups.toString());
    }

    DecomposedCharset decompose() {
      CharRanges negRanges = CharRanges.ALL_CODE_UNITS.difference(ranges);
      if (!ieExplicits.isEmpty()) {
        if (negRanges.intersection(ieExplicits).isEmpty()) {
          return decompose(ranges, false);
        } else if (ranges.intersection(ieExplicits).isEmpty()) {
          return decompose(negRanges, true);
        }
      }
      DecomposedCharset positive = decompose(ranges, false);
      DecomposedCharset negative = decompose(negRanges, true);
      return positive.complexity() <= negative.complexity() ? positive : negative;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      if (DOT_CHARSET.ranges.equals(ranges)) {
        sb.append('.');
        return;
      }
      decompose().appendSourceCode(sb);
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      sb.append(ranges);
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Charset && ranges.equals(((Charset) o).ranges);
    }

    @Override
    public int hashCode() {
      return ranges.hashCode() ^ 0xdede2246;
    }
  }

  /** Internal representation for [] charsets */
  static final class DecomposedCharset {
    boolean inverted;
    final CharRanges ranges;
    final String namedGroups;

    DecomposedCharset(
        boolean inverted, CharRanges ranges, String namedGroups) {
      this.inverted = inverted;
      this.ranges = ranges;
      this.namedGroups = namedGroups;
    }

    int complexity() {
      return (inverted ? 1 : 0) + namedGroups.length() + complexity(ranges);
    }

    void appendSourceCode(StringBuilder sb) {
      if (ranges.isEmpty()) {
        if (!inverted && namedGroups.length() == 2) {
          sb.append(namedGroups);
          return;
        } else if (ranges.isEmpty() && namedGroups.isEmpty()) {
          sb.append(inverted ? "[\\S\\s]" : "(?!)");
          return;
        }
      }
      sb.append('[');
      if (inverted) { sb.append('^'); }
      sb.append(namedGroups);
      boolean rangesStartCharset = !inverted && namedGroups.isEmpty();
      boolean emitDashAtEnd = false;
      for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) {
        char start = (char) ranges.start(i);
        char end = (char) (ranges.end(i) - 1);
        switch (end - start) {
          case 0:
            if (start == '-') {
              // Put it at the end where it doesn't need escaping.
              emitDashAtEnd = true;
            } else {
              escapeRangeCharOnto(
                  start, rangesStartCharset, i == 0, i + 1 == n, sb);
            }
            break;
          case 1:
            escapeRangeCharOnto(start, rangesStartCharset, i == 0, false, sb);
            escapeRangeCharOnto(
                end, rangesStartCharset, false, i + 1 == n, sb);
            break;
          default:
            escapeRangeCharOnto(start, rangesStartCharset, i == 0, false, sb);
            sb.append('-');
            escapeRangeCharOnto(end, rangesStartCharset, false, true, sb);
            break;
        }
      }
      if (emitDashAtEnd) { sb.append('-'); }
      sb.append(']');
    }

    static void escapeRangeCharOnto(
        char ch, boolean startIsFlush, boolean atStart, boolean atEnd,
        StringBuilder sb) {
      switch (ch) {
        case '\b':
          sb.append("\\b");
          break;
        case '^':
          sb.append(atStart && startIsFlush ? "\\^" : "^");
          break;
        case '-':
          sb.append(atStart || atEnd ? "-" : "\\-");
          break;
        case '\\':
        case ']':
          sb.append('\\').append(ch);
          break;
        default:
          escapeCharOnto(ch, sb);
      }
    }

    static int complexity(CharRanges ranges) {
      int complexity = 0;
      for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) {
        int start = ranges.start(i);
        int end = ranges.end(i) - 1;
        if (start < 0x20 || start >= 0x7f) {
          complexity += start >= 0x100 ? 6 : 4;
        } else {
          ++complexity;
        }
        switch (end - start) {
          case 0: continue;
          case 1: break;
          default: complexity += 1;
        }
        if (end < 0x20 || end >= 0x7f) {
          complexity += end >= 0x100 ? 6 : 4;
        } else {
          ++complexity;
        }
      }
      return complexity;
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof DecomposedCharset)) {
        return false;
      }
      DecomposedCharset that = (DecomposedCharset) o;
      return this.inverted = that.inverted
          && this.ranges.equals(that.ranges)
          && this.namedGroups.equals(that.namedGroups);
    }

    @Override
    public int hashCode() {
      return ranges.hashCode()
          + 31 * (namedGroups.hashCode() + (inverted ? 1 : 0));
    }
  }

  /** Represents a series of nodes chained one after another such as (?:...)[a-z]*(...) */
  public static final class Concatenation extends RegExpTree {
    final ImmutableList elements;

    Concatenation(RegExpTree a, RegExpTree b) {
      elements = ImmutableList.of(a, b);
    }

    Concatenation(List elements) {
      this.elements = ImmutableList.copyOf(elements);
    }

    @Override
    public RegExpTree simplify(final String flags) {
      class Simplifier {
        final List simplified = new ArrayList<>();

        void simplify(RegExpTree t) {
          if (t instanceof Concatenation) {
            for (RegExpTree child : ((Concatenation) t).elements) {
              simplify(child);
            }
          } else if (t instanceof Empty) {
            // Do nothing
          } else {
            int lastIndex = simplified.size() - 1;
            if (lastIndex >= 0) {
              RegExpTree pairwise = simplifyPairwise(
                  simplified.get(lastIndex), t);
              if (pairwise != null) {
                simplified.set(lastIndex, pairwise);
                return;
              }
            }
            simplified.add(t);
          }
        }

        RegExpTree simplifyPairwise(RegExpTree before, RegExpTree after) {
          if (before instanceof Text && after instanceof Text) {
            return new Text(
                ((Text) before).text + ((Text) after).text).simplify(flags);
          }
          // Fold adjacent repetitions.
          int beforeMin = 1, beforeMax = 1;
          RegExpTree beforeBody = before;
          boolean beforeGreedy = false;
          if (before instanceof Repetition) {
            Repetition r = (Repetition) before;
            beforeMin = r.min;
            beforeMax = r.max;
            beforeBody = r.body;
            beforeGreedy = r.greedy;
          }
          int afterMin = 1, afterMax = 1;
          RegExpTree afterBody = after;
          boolean afterGreedy = false;
          if (after instanceof Repetition) {
            Repetition r = (Repetition) after;
            afterMin = r.min;
            afterMax = r.max;
            afterBody = r.body;
            afterGreedy = r.greedy;
          }
          if (beforeBody.equals(afterBody)
              && !beforeBody.hasCapturingGroup()) {
            long lmin = ((long) beforeMin) + afterMin;
            long lmax = ((long) beforeMax) + afterMax;
            if (lmin < Integer.MAX_VALUE) {
              int min = (int) lmin;
              int max = lmax >= Integer.MAX_VALUE
                  ? Integer.MAX_VALUE : (int) lmax;
              return new Repetition(
                  beforeBody, min, max,
                  beforeGreedy || afterGreedy || min == max);
            }
          }
          return null;
        }
      }

      Simplifier s = new Simplifier();
      for (RegExpTree element : elements) {
        s.simplify(element.simplify(flags));
      }

      switch (s.simplified.size()) {
        case 0: return Empty.INSTANCE;
        case 1: return s.simplified.get(0);
        default: return new Concatenation(s.simplified);
      }
    }

    @Override
    public boolean isCaseSensitive() {
      for (RegExpTree element : elements) {
        if (element.isCaseSensitive()) {
          return true;
        }
      }
      return false;
    }

    @Override
    public boolean containsAnchor() {
      for (RegExpTree element : elements) {
        if (element.containsAnchor()) {
          return true;
        }
      }
      return false;
    }

    @Override
    public int numCapturingGroups() {
      int n = 0;
      for (RegExpTree element : elements) {
        n += element.numCapturingGroups();
      }
      return n;
    }

    @Override
    public ImmutableList children() {
      return elements;
    }

    @Override
    protected void appendSourceCode(StringBuilder sb) {
      // True if the last content written might consume
      // decimal digits written subsequently.
      boolean digitsMightBleed = false;
      for (RegExpTree element : elements) {
        boolean parenthesize = false;
        if (element instanceof Alternation
            || element instanceof Concatenation) {
          parenthesize = true;
        }
        if (parenthesize) {
          sb.append("(?:");
          element.appendSourceCode(sb);
          sb.append(')');
        } else {
          int start = sb.length();
          element.appendSourceCode(sb);
          if (digitsMightBleed && sb.length() > start) {
            char firstChar = sb.charAt(start);
            if ('0' <= firstChar && firstChar <= '9') {
              // Bleeding happened.
              // If the last character would be ambiguous
              // with a repetition, escape it.
              if (sb.charAt(start - 1) == '{') {
                // Concatenation from optimization of
                // /{(?:0,}/ -> /\{0,}/
                sb.insert(start - 1, '\\');
              } else {
                // Or parenthesize otherwise.
                // Concatenation from optimization of
                // /(.)\1(?:0)/ -> /(.)\1(?:0)/.
                sb.insert(start, "(?:").append(')');
              }
            }
          }
        }
        digitsMightBleed = (
            // \1(?:0) bleeds if there are 10 or more
            // capturing groups preceding.
            (element instanceof BackReference
             && ((BackReference) element).groupIndex < 10)
            // foo{(?:10}) bleeds.
            || (element instanceof Text
                && ((Text) element).text.endsWith("{")));
      }
    }

    @Override
    protected void appendDebugInfo(StringBuilder sb) {
      // Nothing besides children.
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof Concatenation
          && elements.equals(((Concatenation) o).elements);
    }

    @Override
    public int hashCode() {
      return 0x20997e3e ^ elements.hashCode();
    }
  }

  static void escapeCharOnto(char ch, StringBuilder sb) {
    switch (ch) {
      case '\u0000':
        sb.append("\\0");
        break;
      case '\f':
        sb.append("\\f");
        break;
      case '\t':
        sb.append("\\t");
        break;
      case '\n':
        sb.append("\\n");
        break;
      case '\r':
        sb.append("\\r");
        break;
      case '\\':
        sb.append("\\\\");
        break;
      default:
        if (ch < 0x20 || ch >= 0x7f) {
          if (ch >= 0x100) {
            sb.append("\\u");
            sb.append("0123456789abcdef".charAt((ch >> 12) & 0xf));
            sb.append("0123456789abcdef".charAt((ch >> 8) & 0xf));
            sb.append("0123456789abcdef".charAt((ch >> 4) & 0xf));
            sb.append("0123456789abcdef".charAt((ch) & 0xf));
          } else {
            sb.append("\\x");
            sb.append("0123456789abcdef".charAt((ch >> 4) & 0xf));
            sb.append("0123456789abcdef".charAt((ch) & 0xf));
          }
        } else {
          sb.append(ch);
        }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy