All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.re2j.Matcher Maven / Gradle / Ivy

The newest version!
// Copyright 2010 Google Inc. All Rights Reserved.

package com.google.re2j;

import com.google.re2j.RE2.Anchor;

import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;

import static com.google.re2j.RE2.Anchor.ANCHOR_BOTH;
import static com.google.re2j.RE2.Anchor.ANCHOR_START;
import static com.google.re2j.RE2.Anchor.UNANCHORED;

/**
 * A stateful iterator that interprets a regex {@code Pattern} on a
 * specific input.  Its interface mimics the JDK 1.4.2
 * {@code java.util.regex.Matcher}.
 *
 * 

Conceptually, a Matcher consists of four parts: *

    *
  1. A compiled regular expression {@code Pattern}, set at * construction and fixed for the lifetime of the matcher.
  2. * *
  3. The remainder of the input string, set at construction or * {@link #reset()} and advanced by each match operation such as * {@link #find}, {@link #matches} or {@link #lookingAt}.
  4. * *
  5. The current match information, accessible via {@link #start}, * {@link #end}, and {@link #group}, and updated by each match * operation.
  6. * *
  7. The append position, used and advanced by * {@link #appendReplacement} and {@link #appendTail} if performing a * search and replace from the input to an external {@code StringBuffer}. * *
* *

See the package-level * documentation for an overview of how to use this API.

* * @author [email protected] (Russ Cox) */ public final class Matcher { // The pattern being matched. private final Pattern pattern; // The group indexes, in [start, end) pairs. Zeroth pair is overall match. private final int[] groups; // The number of submatches (groups) in the pattern. private final int groupCount; private Slice input; // The append position: where the next append should start. private int appendPos; // Is there a current match? private boolean hasMatch; // Have we found the submatches (groups) of the current match? // group[0], group[1] are set regardless. private boolean hasGroups; // The anchor flag to use when repeating the match to find subgroups. private Anchor anchorFlag; private Matcher(Pattern pattern) { if (pattern == null) { throw new NullPointerException("pattern is null"); } this.pattern = pattern; RE2 re2 = pattern.re2(); groupCount = re2.numberOfCapturingGroups(); groups = new int[2 + 2 * groupCount]; } /** Creates a new {@code Matcher} with the given pattern and input. */ Matcher(Pattern pattern, Slice input) { this(pattern); reset(input); } /** Returns the {@code Pattern} associated with this {@code Matcher}. */ public Pattern pattern() { return pattern; } /** * Resets the {@code Matcher}, rewinding input and * discarding any match information. * * @return the {@code Matcher} itself, for chained method calls */ public Matcher reset() { appendPos = 0; hasMatch = false; hasGroups = false; return this; } /** * Resets the {@code Matcher} and changes the input. * * @param input the new input {@link Slice} * @return the {@code Matcher} itself, for chained method calls */ public Matcher reset(Slice input) { if (input == null) { throw new NullPointerException("input is null"); } reset(); this.input = input; return this; } /** * Returns the start position of the most recent match. * * @throws IllegalStateException if there is no match */ public int start() { return start(0); } /** * Returns the end position of the most recent match. * * @throws IllegalStateException if there is no match */ public int end() { return end(0); } /** * Returns the start position of a subgroup of the most recent match. * * @param group the group index; 0 is the overall match * @throws IllegalStateException if there is no match * @throws IndexOutOfBoundsException * if {@code group < 0} or {@code group > groupCount()} */ public int start(int group) { loadGroup(group); return groups[2 * group]; } /** * Returns the end position of a subgroup of the most recent match. * * @param group the group index; 0 is the overall match * @throws IllegalStateException if there is no match * @throws IndexOutOfBoundsException * if {@code group < 0} or {@code group > groupCount()} */ public int end(int group) { loadGroup(group); return groups[2 * group + 1]; } /** * Returns the most recent match. * * @throws IllegalStateException if there is no match */ public Slice group() { return group(0); } /** * Returns the subgroup of the most recent match. * * @throws IllegalStateException if there is no match * @throws IndexOutOfBoundsException if {@code group < 0} * or {@code group > groupCount()} */ public Slice group(int group) { int start = start(group); int end = end(group); if (start < 0 && end < 0) { // Means the subpattern didn't get matched at all. return null; } return input.slice(start, end - start); } /** * Returns the number of subgroups in this pattern. * * @return the number of subgroups; the overall match (group 0) does not count */ public int groupCount() { return groupCount; } /** Helper: finds subgroup information if needed for group. */ private void loadGroup(int group) { if (group < 0 || group > groupCount) { throw new IndexOutOfBoundsException( "Group index out of bounds: " + group); } if (!hasMatch) { throw new IllegalStateException("perhaps no match attempted"); } if (group == 0 || hasGroups) { return; } // Include the character after the matched text (if there is one). // This is necessary in the case of inputSequence abc and pattern // (a)(b$)?(b)? . If we do pass in the trailing c, // the groups evaluate to new String[] {"ab", "a", null, "b" } // If we don't, they evaluate to new String[] {"ab", "a", "b", null} // We know it won't affect the total matched because the previous call // to match included the extra character, and it was not matched then. int end = groups[1] + 1; if (end > input.length()) { end = input.length(); } boolean ok = pattern.re2().match(input.slice(0, end), groups[0], anchorFlag, groups, 1 + groupCount); // Must match - hasMatch says that the last call with these // parameters worked just fine. if (!ok) { throw new IllegalStateException("inconsistency in matching group data"); } hasGroups = true; } /** * Matches the entire input against the pattern (anchored start and end). * If there is a match, {@code matches} sets the match state to describe it. * * @return true if the entire input matches the pattern */ public boolean matches() { return genMatch(0, ANCHOR_BOTH); } /** * Matches the beginning of input against the pattern (anchored start). * If there is a match, {@code lookingAt} sets the match state to describe it. * * @return true if the beginning of the input matches the pattern */ public boolean lookingAt() { return genMatch(0, ANCHOR_START); } /** * Matches the input against the pattern (unanchored). * The search begins at the end of the last match, or else the beginning * of the input. * If there is a match, {@code find} sets the match state to describe it. * * @return true if it finds a match */ public boolean find() { int start = 0; if (hasMatch) { start = groups[1]; if (groups[0] == groups[1]) { // empty match - nudge forward start++; } } return genMatch(start, UNANCHORED); } /** * Matches the input against the pattern (unanchored), * starting at a specified position. * If there is a match, {@code find} sets the match state to describe it. * * @param start the input position where the search begins * @return true if it finds a match * @throws IndexOutOfBoundsException if start is not a valid input position */ public boolean find(int start) { if (start < 0 || start > input.length()) { throw new IndexOutOfBoundsException( "start index out of bounds: " + start); } reset(); return genMatch(start, UNANCHORED); } /** Helper: does match starting at start, with RE2 anchor flag. */ private boolean genMatch(int startByte, Anchor anchor) { // TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions? // From the JDK docs, looks like no. boolean ok = pattern.re2().match(input, startByte, anchor, groups, 1); if (!ok) { return false; } hasMatch = true; hasGroups = false; anchorFlag = anchor; return true; } /** Helper: return substring for [start, end). */ Slice substring(int start, int end) { return input.slice(start, end - start); } /** Helper for Pattern: return input length. */ int inputLength() { return input.length(); } /** * Appends to {@code so} two slices: the text from the append position up * to the beginning of the most recent match, and then the replacement with * submatch groups substituted for references of the form {@code $n}, where * {@code n} is the group number in decimal. It advances the append position * to the position where the most recent match ended. * *

To embed a literal {@code $}, use \$ (actually {@code "\\$"} with string * escapes). The escape is only necessary when {@code $} is followed by a * digit, but it is always allowed. Only {@code $} and {@code \} need * escaping, but any character can be escaped. * *

The group number {@code n} in {@code $n} is always at least one digit * and expands to use more digits as long as the resulting number is a * valid group number for this pattern. To cut it off earlier, escape the * first digit that should not be used. * * @param so the {@link SliceOutput} to append to * @param replacement the replacement {@link Slice} * @return the {@code Matcher} itself, for chained method calls * @throws IllegalStateException if there was no most recent match * @throws IndexOutOfBoundsException if replacement refers to an invalid group * @throws IllegalArgumentException if replacement has incorrect format */ public Matcher appendReplacement(SliceOutput so, Slice replacement) { int s = start(); int e = end(); if (appendPos < s) { so.writeBytes(input, appendPos, s - appendPos); } appendPos = e; SliceUtils.appendReplacement(so, replacement, this); return this; } /** * Appends to {@code so} the subslice of the input from the * append position to the end of the input. * * @param so the {@link SliceOutput} to append to * @return the argument {@code so}, for method chaining */ public SliceOutput appendTail(SliceOutput so) { so.writeBytes(input, appendPos, input.length() - appendPos); return so; } /** * Returns the input with all matches replaced by {@code replacement}, * interpreted as for {@code appendReplacement}. * * @param replacement the replacement {@link Slice} * @return the input {@link Slice} with the matches replaced * @throws IndexOutOfBoundsException if replacement refers to an invalid group */ public Slice replaceAll(Slice replacement) { return replace(replacement, true); } /** * Returns the input with the first match replaced by {@code replacement}, * interpreted as for {@code appendReplacement}. * * @param replacement the replacement {@link Slice} * @return the input {@link Slice} with the first match replaced * @throws IndexOutOfBoundsException if replacement refers to an invalid group */ public Slice replaceFirst(Slice replacement) { return replace(replacement, false); } /** Helper: replaceAll/replaceFirst hybrid. */ private Slice replace(Slice replacement, boolean all) { reset(); SliceOutput so = new DynamicSliceOutput(input.length()); while (find()) { appendReplacement(so, replacement); if (!all) { break; } } appendTail(so); return so.slice(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy