 
                        
        
                        
        com.google.re2j.Matcher Maven / Gradle / Ivy
/*
 * Copyright (c) 2020 The Go Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file.
 */
package com.google.re2j;
import com.google.re2j.MatcherInput.Encoding;
import java.util.Map;
/**
 * A stateful iterator that interprets a regex {@code Pattern} on a specific input. Its interface
 * mimics the JDK 1.4.2 {@code java.util.regex.Matcher}.
 *
 * 
 * Conceptually, a Matcher consists of four parts:
 * 
 * - A compiled regular expression {@code Pattern}, set at construction and fixed for the lifetime
 * of the matcher.*
 *
- The remainder of the input string, set at construction or {@link #reset()} and advanced by
 * each match operation such as {@link #find}, {@link #matches} or {@link #lookingAt}.*
 *
- The current match information, accessible via {@link #start}, {@link #end}, and
 * {@link #group}, and updated by each match operation.*
 *
- The append position, used and advanced by {@link #appendReplacement} and {@link #appendTail}
 * if performing a search and replace from the input to an external {@code StringBuffer}.
 *
 * 
*
 *
 * See the package-level documentation for an overview of how to use this
 * API.
 * 
 *
 * @author [email protected] (Russ Cox)
 */
public final class Matcher {
  // The pattern being matched.
  private final Pattern pattern;
  // The group indexes, in [start, end) pairs.  Zeroth pair is overall match.
  private final int[] groups;
  private final Map namedGroups;
  // The number of submatches (groups) in the pattern.
  private final int groupCount;
  private MatcherInput matcherInput;
  // The input length in UTF16 codes.
  private int inputLength;
  // The append position: where the next append should start.
  private int appendPos;
  // Is there a current match?
  private boolean hasMatch;
  // Have we found the submatches (groups) of the current match?
  // group[0], group[1] are set regardless.
  private boolean hasGroups;
  // The anchor flag to use when repeating the match to find subgroups.
  private int anchorFlag;
  private Matcher(Pattern pattern) {
    if (pattern == null) {
      throw new NullPointerException("pattern is null");
    }
    this.pattern = pattern;
    RE2 re2 = pattern.re2();
    groupCount = re2.numberOfCapturingGroups();
    groups = new int[2 + 2 * groupCount];
    namedGroups = re2.namedGroups;
  }
  /** Creates a new {@code Matcher} with the given pattern and input. */
  Matcher(Pattern pattern, CharSequence input) {
    this(pattern);
    reset(input);
  }
  Matcher(Pattern pattern, MatcherInput input) {
    this(pattern);
    reset(input);
  }
  /** Returns the {@code Pattern} associated with this {@code Matcher}. */
  public Pattern pattern() {
    return pattern;
  }
  /**
   * Resets the {@code Matcher}, rewinding input and discarding any match information.
   *
   * @return the {@code Matcher} itself, for chained method calls
   */
  public Matcher reset() {
    inputLength = matcherInput.length();
    appendPos = 0;
    hasMatch = false;
    hasGroups = false;
    return this;
  }
  /**
   * Resets the {@code Matcher} and changes the input.
   *
   * @param input the new input string
   * @return the {@code Matcher} itself, for chained method calls
   */
  public Matcher reset(CharSequence input) {
    return reset(MatcherInput.utf16(input));
  }
  /**
   * Resets the {@code Matcher} and changes the input.
   *
   * @param bytes utf8 bytes of the input string.
   * @return the {@code Matcher} itself, for chained method calls
   */
  public Matcher reset(byte[] bytes) {
    return reset(MatcherInput.utf8(bytes));
  }
  private Matcher reset(MatcherInput input) {
    if (input == null) {
      throw new NullPointerException("input is null");
    }
    matcherInput = input;
    reset();
    return this;
  }
  /**
   * Returns the start position of the most recent match.
   *
   * @throws IllegalStateException if there is no match
   */
  public int start() {
    return start(0);
  }
  /**
   * Returns the end position of the most recent match.
   *
   * @throws IllegalStateException if there is no match
   */
  public int end() {
    return end(0);
  }
  /**
   * Returns the start position of a subgroup of the most recent match.
   *
   * @param group the group index; 0 is the overall match
   * @throws IllegalStateException if there is no match
   * @throws IndexOutOfBoundsException if {@code group < 0} or {@code group > groupCount()}
   */
  public int start(int group) {
    loadGroup(group);
    return groups[2 * group];
  }
  /**
   * Returns the start of the named group of the most recent match, or -1 if the group was not
   * matched.
   *
   * @param group the group name
   * @throws IllegalArgumentException if no group with that name exists
   */
  public int start(String group) {
    Integer g = namedGroups.get(group);
    if (g == null) {
      throw new IllegalArgumentException("group '" + group + "' not found");
    }
    return start(g);
  }
  /**
   * Returns the end position of a subgroup of the most recent match.
   *
   * @param group the group index; 0 is the overall match
   * @throws IllegalStateException if there is no match
   * @throws IndexOutOfBoundsException if {@code group < 0} or {@code group > groupCount()}
   */
  public int end(int group) {
    loadGroup(group);
    return groups[2 * group + 1];
  }
  /**
   * Returns the end of the named group of the most recent match, or -1 if the group was not
   * matched.
   *
   * @param group the group name
   * @throws IllegalArgumentException if no group with that name exists
   */
  public int end(String group) {
    Integer g = namedGroups.get(group);
    if (g == null) {
      throw new IllegalArgumentException("group '" + group + "' not found");
    }
    return end(g);
  }
  /**
   * Returns the most recent match.
   *
   * @throws IllegalStateException if there is no match
   */
  public String group() {
    return group(0);
  }
  /**
   * Returns the subgroup of the most recent match.
   *
   * @throws IllegalStateException if there is no match
   * @throws IndexOutOfBoundsException if {@code group < 0} or {@code group > groupCount()}
   */
  public String group(int group) {
    int start = start(group);
    int end = end(group);
    if (start < 0 && end < 0) {
      // Means the subpattern didn't get matched at all.
      return null;
    }
    return substring(start, end);
  }
  /**
   * Returns the named group of the most recent match, or {@code null} if the group was not matched.
   *
   * @param group the group name
   * @throws IllegalArgumentException if no group with that name exists
   */
  public String group(String group) {
    Integer g = namedGroups.get(group);
    if (g == null) {
      throw new IllegalArgumentException("group '" + group + "' not found");
    }
    return group(g);
  }
  /**
   * Returns the number of subgroups in this pattern.
   *
   * @return the number of subgroups; the overall match (group 0) does not count
   */
  public int groupCount() {
    return groupCount;
  }
  /** Helper: finds subgroup information if needed for group. */
  private void loadGroup(int group) {
    if (group < 0 || group > groupCount) {
      throw new IndexOutOfBoundsException("Group index out of bounds: " + group);
    }
    if (!hasMatch) {
      throw new IllegalStateException("perhaps no match attempted");
    }
    if (group == 0 || hasGroups) {
      return;
    }
    // Include the character after the matched text (if there is one).
    // This is necessary in the case of inputSequence abc and pattern
    // (a)(b$)?(b)? . If we do pass in the trailing c,
    // the groups evaluate to new String[] {"ab", "a", null, "b" }
    // If we don't, they evaluate to new String[] {"ab", "a", "b", null}
    // We know it won't affect the total matched because the previous call
    // to match included the extra character, and it was not matched then.
    int end = groups[1] + 1;
    if (end > inputLength) {
      end = inputLength;
    }
    boolean ok =
        pattern.re2().match(matcherInput, groups[0], end, anchorFlag, groups, 1 + groupCount);
    // Must match - hasMatch says that the last call with these
    // parameters worked just fine.
    if (!ok) {
      throw new IllegalStateException("inconsistency in matching group data");
    }
    hasGroups = true;
  }
  /**
   * Matches the entire input against the pattern (anchored start and end). If there is a match,
   * {@code matches} sets the match state to describe it.
   *
   * @return true if the entire input matches the pattern
   */
  public boolean matches() {
    return genMatch(0, RE2.ANCHOR_BOTH);
  }
  /**
   * Matches the beginning of input against the pattern (anchored start). If there is a match,
   * {@code lookingAt} sets the match state to describe it.
   *
   * @return true if the beginning of the input matches the pattern
   */
  public boolean lookingAt() {
    return genMatch(0, RE2.ANCHOR_START);
  }
  /**
   * Matches the input against the pattern (unanchored). The search begins at the end of the last
   * match, or else the beginning of the input. If there is a match, {@code find} sets the match
   * state to describe it.
   *
   * @return true if it finds a match
   */
  public boolean find() {
    int start = 0;
    if (hasMatch) {
      start = groups[1];
      if (groups[0] == groups[1]) { // empty match - nudge forward
        start++;
      }
    }
    return genMatch(start, RE2.UNANCHORED);
  }
  /**
   * Matches the input against the pattern (unanchored), starting at a specified position. If there
   * is a match, {@code find} sets the match state to describe it.
   *
   * @param start the input position where the search begins
   * @return true if it finds a match
   * @throws IndexOutOfBoundsException if start is not a valid input position
   */
  public boolean find(int start) {
    if (start < 0 || start > inputLength) {
      throw new IndexOutOfBoundsException("start index out of bounds: " + start);
    }
    reset();
    return genMatch(start, 0);
  }
  /** Helper: does match starting at start, with RE2 anchor flag. */
  private boolean genMatch(int startByte, int anchor) {
    // TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions?
    // From the JDK docs, looks like no.
    boolean ok = pattern.re2().match(matcherInput, startByte, inputLength, anchor, groups, 1);
    if (!ok) {
      return false;
    }
    hasMatch = true;
    hasGroups = false;
    anchorFlag = anchor;
    return true;
  }
  /** Helper: return substring for [start, end). */
  String substring(int start, int end) {
    // UTF_8 is matched in binary mode. So slice the bytes.
    if (matcherInput.getEncoding() == Encoding.UTF_8) {
      return new String(matcherInput.asBytes(), start, end - start);
    }
    // This is fast for both StringBuilder and String.
    return matcherInput.asCharSequence().subSequence(start, end).toString();
  }
  /** Helper for Pattern: return input length. */
  int inputLength() {
    return inputLength;
  }
  /**
   * Quotes '\' and '$' in {@code s}, so that the returned string could be used in
   * {@link #appendReplacement} as a literal replacement of {@code s}.
   *
   * @param s the string to be quoted
   * @return the quoted string
   */
  public static String quoteReplacement(String s) {
    if (s.indexOf('\\') < 0 && s.indexOf('$') < 0) {
      return s;
    }
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < s.length(); ++i) {
      char c = s.charAt(i);
      if (c == '\\' || c == '$') {
        sb.append('\\');
      }
      sb.append(c);
    }
    return sb.toString();
  }
  /**
   * Appends to {@code sb} two strings: the text from the append position up to the beginning of the
   * most recent match, and then the replacement with submatch groups substituted for references of
   * the form {@code $n}, where {@code n} is the group number in decimal. It advances the append
   * position to where the most recent match ended.
   *
   * 
   * To embed a literal {@code $}, use \$ (actually {@code "\\$"} with string escapes). The escape
   * is only necessary when {@code $} is followed by a digit, but it is always allowed. Only
   * {@code $} and {@code \} need escaping, but any character can be escaped.
   *
   * 
   * The group number {@code n} in {@code $n} is always at least one digit and expands to use more
   * digits as long as the resulting number is a valid group number for this pattern. To cut it off
   * earlier, escape the first digit that should not be used.
   *
   * @param sb the {@link StringBuffer} to append to
   * @param replacement the replacement string
   * @return the {@code Matcher} itself, for chained method calls
   * @throws IllegalStateException if there was no most recent match
   * @throws IndexOutOfBoundsException if replacement refers to an invalid group
   */
  public Matcher appendReplacement(StringBuffer sb, String replacement) {
    StringBuilder result = new StringBuilder();
    appendReplacement(result, replacement);
    sb.append(result);
    return this;
  }
  /**
   * Appends to {@code sb} two strings: the text from the append position up to the beginning of the
   * most recent match, and then the replacement with submatch groups substituted for references of
   * the form {@code $n}, where {@code n} is the group number in decimal. It advances the append
   * position to where the most recent match ended.
   *
   * 
   * To embed a literal {@code $}, use \$ (actually {@code "\\$"} with string escapes). The escape
   * is only necessary when {@code $} is followed by a digit, but it is always allowed. Only
   * {@code $} and {@code \} need escaping, but any character can be escaped.
   *
   * 
   * The group number {@code n} in {@code $n} is always at least one digit and expands to use more
   * digits as long as the resulting number is a valid group number for this pattern. To cut it off
   * earlier, escape the first digit that should not be used.
   *
   * @param sb the {@link StringBuilder} to append to
   * @param replacement the replacement string
   * @return the {@code Matcher} itself, for chained method calls
   * @throws IllegalStateException if there was no most recent match
   * @throws IndexOutOfBoundsException if replacement refers to an invalid group
   */
  public Matcher appendReplacement(StringBuilder sb, String replacement) {
    int s = start();
    int e = end();
    if (appendPos < s) {
      sb.append(substring(appendPos, s));
    }
    appendPos = e;
    appendReplacementInternal(sb, replacement);
    return this;
  }
  private void appendReplacementInternal(StringBuilder sb, String replacement) {
    int last = 0;
    int i = 0;
    int m = replacement.length();
    for (; i < m - 1; i++) {
      if (replacement.charAt(i) == '\\') {
        if (last < i) {
          sb.append(replacement.substring(last, i));
        }
        i++;
        last = i;
        continue;
      }
      if (replacement.charAt(i) == '$') {
        int c = replacement.charAt(i + 1);
        if ('0' <= c && c <= '9') {
          int n = c - '0';
          if (last < i) {
            sb.append(replacement.substring(last, i));
          }
          for (i += 2; i < m; i++) {
            c = replacement.charAt(i);
            if (c < '0' || c > '9' || n * 10 + c - '0' > groupCount) {
              break;
            }
            n = n * 10 + c - '0';
          }
          if (n > groupCount) {
            throw new IndexOutOfBoundsException("n > number of groups: " + n);
          }
          String group = group(n);
          if (group != null) {
            sb.append(group);
          }
          last = i;
          i--;
          continue;
        } else if (c == '{') {
          if (last < i) {
            sb.append(replacement.substring(last, i));
          }
          i++; // skip {
          int j = i + 1;
          while (j < replacement.length()
              && replacement.charAt(j) != '}'
              && replacement.charAt(j) != ' ') {
            j++;
          }
          if (j == replacement.length() || replacement.charAt(j) != '}') {
            throw new IllegalArgumentException("named capture group is missing trailing '}'");
          }
          String groupName = replacement.substring(i + 1, j);
          sb.append(group(groupName));
          last = j + 1;
        }
      }
    }
    if (last < m) {
      sb.append(replacement, last, m);
    }
  }
  /**
   * Appends to {@code sb} the substring of the input from the append position to the end of the
   * input.
   *
   * @param sb the {@link StringBuffer} to append to
   * @return the argument {@code sb}, for method chaining
   */
  public StringBuffer appendTail(StringBuffer sb) {
    sb.append(substring(appendPos, inputLength));
    return sb;
  }
  /**
   * Appends to {@code sb} the substring of the input from the append position to the end of the
   * input.
   *
   * @param sb the {@link StringBuilder} to append to
   * @return the argument {@code sb}, for method chaining
   */
  public StringBuilder appendTail(StringBuilder sb) {
    sb.append(substring(appendPos, inputLength));
    return sb;
  }
  /**
   * Returns the input with all matches replaced by {@code replacement}, interpreted as for
   * {@code appendReplacement}.
   *
   * @param replacement the replacement string
   * @return the input string with the matches replaced
   * @throws IndexOutOfBoundsException if replacement refers to an invalid group
   */
  public String replaceAll(String replacement) {
    return replace(replacement, true);
  }
  /**
   * Returns the input with the first match replaced by {@code replacement}, interpreted as for
   * {@code appendReplacement}.
   *
   * @param replacement the replacement string
   * @return the input string with the first match replaced
   * @throws IndexOutOfBoundsException if replacement refers to an invalid group
   */
  public String replaceFirst(String replacement) {
    return replace(replacement, false);
  }
  /** Helper: replaceAll/replaceFirst hybrid. */
  private String replace(String replacement, boolean all) {
    reset();
    StringBuffer sb = new StringBuffer();
    while (find()) {
      appendReplacement(sb, replacement);
      if (!all) {
        break;
      }
    }
    appendTail(sb);
    return sb.toString();
  }
}