All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.re2j.Regexp Maven / Gradle / Ivy

The newest version!
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/regexp.go

package com.google.re2j;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * Regular expression abstract syntax tree.
 * Produced by parser, used by compiler.
 * NB, this corresponds to {@code syntax.regexp} in the Go implementation;
 * Go's {@code regexp} is called {@code RE2} in Java.
 */
class Regexp {

  // Node kinds of the syntax tree.  The declaration order is significant:
  // appendTo() compares ordinals against CAPTURE to decide whether a repeated
  // subexpression needs a non-capturing group, and isPseudo() relies on the
  // pseudo ops being declared last.
  enum Op {
    NO_MATCH,           // Matches no strings.
    EMPTY_MATCH,        // Matches empty string.
    LITERAL,            // Matches runes[] sequence
    CHAR_CLASS,         // Matches Runes interpreted as range pair list
    ANY_CHAR_NOT_NL,    // Matches any character except '\n'
    ANY_CHAR,           // Matches any character
    BEGIN_LINE,         // Matches empty string at beginning of line
    END_LINE,           // Matches empty string at end of line
    BEGIN_TEXT,         // Matches empty string at beginning of text
    END_TEXT,           // Matches empty string at end of text
    WORD_BOUNDARY,      // Matches word boundary `\b`
    NO_WORD_BOUNDARY,   // Matches word non-boundary `\B`
    CAPTURE,            // Capturing subexpr with index cap, optional name name
    STAR,               // Matches subs[0] zero or more times.
    PLUS,               // Matches subs[0] one or more times.
    QUEST,              // Matches subs[0] zero or one times.
    REPEAT,             // Matches subs[0] [min, max] times; max=-1 => no limit.
    CONCAT,             // Matches concatenation of subs[]
    ALTERNATE,          // Matches union of subs[]

    // Pseudo ops, used internally by Parser for parsing stack:
    LEFT_PAREN,
    VERTICAL_BAR;

    // Returns true for the parser-internal ops (LEFT_PAREN and everything
    // declared after it).
    boolean isPseudo() {
      return ordinal() >= LEFT_PAREN.ordinal();
    }
  }

  // Shared empty array installed by reinit(); must not be mutated.
  static final Regexp[] EMPTY_SUBS = {};

  Op op;                   // operator
  int flags;               // bitmap of parse flags
  Regexp[] subs;           // subexpressions, if any.  Intended to be non-null,
                           // but the Regexp(Op) constructor leaves it null
                           // until reinit() or the owner assigns it, hence
                           // the null checks in the tree walkers below.
                           // subs[0] is used as the freelist.
  int[] runes;             // matched runes, for LITERAL, CHAR_CLASS
  int min, max;            // min, max for REPEAT
  int cap;                 // capturing index, for CAPTURE
  String name;             // capturing name, for CAPTURE
  // Do update copy ctor when adding new fields!

  // Creates a node with the given operator; every other field keeps its
  // Java default (0 / null).
  Regexp(Op op) {
    this.op = op;
  }

  // Shallow copy constructor: the subs and runes arrays are shared with
  // |that|, not cloned.
  Regexp(Regexp that) {
    this.op = that.op;
    this.flags = that.flags;
    this.subs = that.subs;
    this.runes = that.runes;
    this.min = that.min;
    this.max = that.max;
    this.cap = that.cap;
    this.name = that.name;
  }

  // reinit() resets every field except |op| so the node can be reused.
  void reinit() {
    this.flags = 0;
    subs = EMPTY_SUBS;
    runes = null;
    cap = min = max = 0;
    name = null;
  }

  // toString() returns the Perl syntax for this regular expression.
  @Override public String toString() {
    StringBuilder out = new StringBuilder();
    appendTo(out);
    return out.toString();
  }

  // Emits a backslash before a '-' so that it is not read as a range
  // separator inside a character class.  The rune itself is written by the
  // caller.
  private static void quoteIfHyphen(StringBuilder out, int rune) {
    if (rune == '-') {
      out.append('\\');
    }
  }

  // appendTo() appends the Perl syntax for |this| regular expression to |out|.
  private void appendTo(StringBuilder out) {
    switch (op) {
      case NO_MATCH:
        out.append("[^\\x00-\\x{10FFFF}]");
        break;
      case EMPTY_MATCH:
        out.append("(?:)");
        break;
      case STAR:
      case PLUS:
      case QUEST:
      case REPEAT: {
        Regexp sub = subs[0];
        // Operators declared after CAPTURE (repeats, CONCAT, ALTERNATE) and
        // multi-rune literals must be grouped, otherwise the repetition
        // suffix would bind to only their last element.
        if (sub.op.ordinal() > Op.CAPTURE.ordinal() ||
            sub.op == Op.LITERAL && sub.runes.length > 1) {
          out.append("(?:");
          sub.appendTo(out);
          out.append(')');
        } else {
          sub.appendTo(out);
        }
        switch (op) {
          case STAR:
            out.append('*');
            break;
          case PLUS:
            out.append('+');
            break;
          case QUEST:
            out.append('?');
            break;
          case REPEAT:
            // {n} when min == max, {n,m} when bounded, {n,} when max < 0.
            out.append('{').append(min);
            if (min != max) {
              out.append(',');
              if (max >= 0) {
                out.append(max);
              }
            }
            out.append('}');
            break;
          default:
            break;
        }
        if ((flags & RE2.NON_GREEDY) != 0) {
          out.append('?');
        }
        break;
      }
      case CONCAT:
        for (Regexp sub : subs) {
          // An alternation inside a concatenation needs its own group.
          if (sub.op == Op.ALTERNATE) {
            out.append("(?:");
            sub.appendTo(out);
            out.append(')');
          } else {
            sub.appendTo(out);
          }
        }
        break;
      case ALTERNATE: {
        String sep = "";
        for (Regexp sub : subs) {
          out.append(sep);
          sep = "|";
          sub.appendTo(out);
        }
        break;
      }
      case LITERAL:
        if ((flags & RE2.FOLD_CASE) != 0) {
          out.append("(?i:");
        }
        for (int rune : runes) {
          Utils.escapeRune(out, rune);
        }
        if ((flags & RE2.FOLD_CASE) != 0) {
          out.append(')');
        }
        break;
      case ANY_CHAR_NOT_NL:
        out.append("(?-s:.)");
        break;
      case ANY_CHAR:
        out.append("(?s:.)");
        break;
      case CAPTURE:
        if (name == null || name.isEmpty()) {
          out.append('(');
        } else {
          out.append("(?P<");
          out.append(name);
          out.append(">");
        }
        // An empty capture prints as "()" rather than "((?:))".
        if (subs[0].op != Op.EMPTY_MATCH) {
          subs[0].appendTo(out);
        }
        out.append(')');
        break;
      case BEGIN_TEXT:
        out.append("\\A");
        break;
      case END_TEXT:
        if ((flags & RE2.WAS_DOLLAR) != 0) {
          out.append("(?-m:$)");
        } else {
          out.append("\\z");
        }
        break;
      case BEGIN_LINE:
        out.append('^');
        break;
      case END_LINE:
        out.append('$');
        break;
      case WORD_BOUNDARY:
        out.append("\\b");
        break;
      case NO_WORD_BOUNDARY:
        out.append("\\B");
        break;
      case CHAR_CLASS:
        // runes is a flat list of [lo, hi] pairs; an odd length is malformed.
        if (runes.length % 2 != 0) {
          out.append("[invalid char class]");
          break;
        }
        out.append('[');
        if (runes.length == 0) {
          out.append("^\\x00-\\x{10FFFF}");
        } else if (runes[0] == 0 &&
                   runes[runes.length - 1] == Unicode.MAX_RUNE &&
                   runes.length > 2) {
          // Contains 0 and MAX_RUNE.  Probably a negated class.
          // Print the gaps.
          // The length check excludes the single full range [0, MAX_RUNE]:
          // it has no gaps, so printing it negated would yield "[^]", which
          // is not a valid character class.
          out.append('^');
          for (int i = 1; i < runes.length - 1; i += 2) {
            int lo = runes[i] + 1;
            int hi = runes[i + 1] - 1;
            quoteIfHyphen(out, lo);
            Utils.escapeRune(out, lo);
            if (lo != hi) {
              out.append('-');
              quoteIfHyphen(out, hi);
              Utils.escapeRune(out, hi);
            }
          }
        } else {
          for (int i = 0; i < runes.length; i += 2) {
            int lo = runes[i];
            int hi = runes[i + 1];
            quoteIfHyphen(out, lo);
            Utils.escapeRune(out, lo);
            if (lo != hi) {
              out.append('-');
              quoteIfHyphen(out, hi);
              Utils.escapeRune(out, hi);
            }
          }
        }
        out.append(']');
        break;
      default:  // incl. pseudos
        out.append(op);
        break;
    }
  }

  // maxCap() walks the regexp to find the maximum capture index.
  // Returns 0 when the tree contains no CAPTURE node.
  int maxCap() {
    int m = 0;
    if (op == Op.CAPTURE) {
      m = cap;
    }
    if (subs != null) {
      for (Regexp sub : subs) {
        int n = sub.maxCap();
        if (m < n) {
          m = n;
        }
      }
    }
    return m;
  }

  // namedGroupIndexes() walks the regexp to build a map of indexes for named groups.
  // Keys are capture names, values are the corresponding capture indexes.
  // A fresh, mutable map is returned on every call.
  Map<String, Integer> namedGroupIndexes() {
    Map<String, Integer> indexes = new HashMap<>();
    putNamedGroupIndexes(indexes);
    return indexes;
  }

  // Recursive helper for namedGroupIndexes(): records this node if it is a
  // CAPTURE with a non-null name, then descends into the subexpressions.
  // put() overwrites, so if a name occurs more than once the node visited
  // last wins.
  private void putNamedGroupIndexes(Map<String, Integer> indexes) {
    if (op == Op.CAPTURE && name != null) {
      indexes.put(name, cap);
    }
    if (subs != null) {
      for (Regexp sub : subs) {
        sub.putNamedGroupIndexes(indexes);
      }
    }
  }

  // hashCode() is consistent with equals(): it mixes in only the fields that
  // equals() compares for the node's operator, so structurally equal trees
  // always hash alike.
  @Override public int hashCode() {
    int h = op.hashCode();
    switch (op) {
      case END_TEXT:
        h = 31 * h + (flags & RE2.WAS_DOLLAR);
        break;
      case LITERAL:
      case CHAR_CLASS:
        h = 31 * h + Arrays.hashCode(runes);
        break;
      case ALTERNATE:
      case CONCAT:
        // Element-wise hash; recurses into Regexp.hashCode() for each sub.
        h = 31 * h + Arrays.hashCode(subs);
        break;
      case STAR:
      case PLUS:
      case QUEST:
        h = 31 * h + (flags & RE2.NON_GREEDY);
        h = 31 * h + subs[0].hashCode();
        break;
      case REPEAT:
        h = 31 * h + (flags & RE2.NON_GREEDY);
        h = 31 * h + min;
        h = 31 * h + max;
        h = 31 * h + subs[0].hashCode();
        break;
      case CAPTURE:
        h = 31 * h + cap;
        h = 31 * h + (name == null ? 0 : name.hashCode());
        h = 31 * h + subs[0].hashCode();
        break;
      default:
        // Remaining ops are equal whenever their operators are equal.
        break;
    }
    return h;
  }

  // equals() returns true if this and that have identical structure.
  // Only the fields relevant to the operator are compared; ops without a
  // case below are equal as soon as their operators match.
  @Override public boolean equals(Object that) {
    if (this == that) {
      return true;
    }
    if (!(that instanceof Regexp)) {
      return false;
    }
    Regexp x = this;
    Regexp y = (Regexp) that;
    if (x.op != y.op) {
      return false;
    }
    switch (x.op) {
      case END_TEXT:
        // The parse flags remember whether this is \z or \Z.
        if ((x.flags & RE2.WAS_DOLLAR) != (y.flags & RE2.WAS_DOLLAR)) {
          return false;
        }
        break;
      case LITERAL:
      case CHAR_CLASS:
        if (!Arrays.equals(x.runes, y.runes)) {
          return false;
        }
        break;
      case ALTERNATE:
      case CONCAT:
        if (x.subs.length != y.subs.length) {
          return false;
        }
        for (int i = 0; i < x.subs.length; ++i) {
          if (!x.subs[i].equals(y.subs[i])) {
            return false;
          }
        }
        break;
      case STAR:
      case PLUS:
      case QUEST:
        if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
            !x.subs[0].equals(y.subs[0])) {
          return false;
        }
        break;
      case REPEAT:
        if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
            x.min != y.min || x.max != y.max || !x.subs[0].equals(y.subs[0])) {
          return false;
        }
        break;
      case CAPTURE:
        // Names are compared by value (null-safe), not by reference: two
        // separately parsed trees hold distinct String instances for the
        // same group name.
        if (x.cap != y.cap ||
            (x.name == null ? y.name != null : !x.name.equals(y.name)) ||
            !x.subs[0].equals(y.subs[0])) {
          return false;
        }
        break;
      default:
        break;
    }
    return true;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy