// com.google.code.regexp.Pattern Maven / Gradle / Ivy
// Show all versions of named-regexp Show documentation
/**
* Copyright (C) 2012-2013 The named-regexp Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.code.regexp;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.PatternSyntaxException;
/**
* A compiled representation of a regular expression. This is a wrapper
* for the java.util.regex.Pattern with support for named capturing
* groups. The named groups are specified with "(?<name>exp)", which
* is identical to Java 7 named groups.
*
* @since 0.1.9
*/
public class Pattern {
/** Pattern to match group names */
private static final String NAME_PATTERN = "[^!=].*?";
/** Pattern to match named capture groups in a pattern string */
private static final java.util.regex.Pattern NAMED_GROUP_PATTERN = java.util.regex.Pattern.compile("\\(\\?<(" + NAME_PATTERN + ")>", java.util.regex.Pattern.DOTALL);
/** Pattern to match back references for named capture groups */
private static final java.util.regex.Pattern BACKREF_NAMED_GROUP_PATTERN = java.util.regex.Pattern.compile("\\\\k<(" + NAME_PATTERN + ")>", java.util.regex.Pattern.DOTALL);
/** Pattern to match properties for named capture groups in a replacement string */
private static final java.util.regex.Pattern PROPERTY_PATTERN = java.util.regex.Pattern.compile("\\$\\{(" + NAME_PATTERN + ")\\}", java.util.regex.Pattern.DOTALL);
/** index of group within patterns above where group name is captured */
private static final int INDEX_GROUP_NAME = 1;
private java.util.regex.Pattern pattern;
private String namedPattern;
private List groupNames;
private Map > groupInfo;
/**
 * Creates a named pattern from a regular expression and match flags.
 *
 * @param regex the expression to be compiled
 * @param flags match flags, a bit mask that may include:
 * <ul>
 *   <li>{@link java.util.regex.Pattern#CASE_INSENSITIVE}</li>
 *   <li>{@link java.util.regex.Pattern#MULTILINE}</li>
 *   <li>{@link java.util.regex.Pattern#DOTALL}</li>
 *   <li>{@link java.util.regex.Pattern#UNICODE_CASE}</li>
 *   <li>{@link java.util.regex.Pattern#CANON_EQ}</li>
 *   <li>{@link java.util.regex.Pattern#UNIX_LINES}</li>
 *   <li>{@link java.util.regex.Pattern#LITERAL}</li>
 *   <li>{@link java.util.regex.Pattern#COMMENTS}</li>
 * </ul>
 */
protected Pattern(String regex, int flags) {
    this.namedPattern = regex;
    // Ordering is deliberate: building the standard pattern resolves named
    // back-references to group indexes, and that lookup reads the group
    // info, so the group info has to exist first.
    this.groupInfo = extractGroupInfo(regex);
    this.pattern = buildStandardPattern(regex, flags);
}
/**
 * Compiles a regular expression into a pattern using no match flags.
 *
 * @param regex the expression to be compiled
 * @return the pattern
 */
public static Pattern compile(String regex) {
    final int noFlags = 0;
    return new Pattern(regex, noFlags);
}
/**
 * Compiles a regular expression into a pattern using the given match flags.
 *
 * @param regex the expression to be compiled
 * @param flags match flags, a bit mask that may include:
 * <ul>
 *   <li>{@link java.util.regex.Pattern#CASE_INSENSITIVE}</li>
 *   <li>{@link java.util.regex.Pattern#MULTILINE}</li>
 *   <li>{@link java.util.regex.Pattern#DOTALL}</li>
 *   <li>{@link java.util.regex.Pattern#UNICODE_CASE}</li>
 *   <li>{@link java.util.regex.Pattern#CANON_EQ}</li>
 *   <li>{@link java.util.regex.Pattern#UNIX_LINES}</li>
 *   <li>{@link java.util.regex.Pattern#LITERAL}</li>
 *   <li>{@link java.util.regex.Pattern#COMMENTS}</li>
 * </ul>
 * @return the pattern
 */
public static Pattern compile(String regex, int flags) {
    final Pattern compiled = new Pattern(regex, flags);
    return compiled;
}
/**
 * Looks up the group index of the first occurrence of a named capture group.
 *
 * @param groupName name of capture group
 * @return group index or -1 if not found
 */
public int indexOf(String groupName) {
    final int firstInstance = 0;
    return indexOf(groupName, firstInstance);
}
/**
 * Gets the group index of a named capture group at the
 * specified instance index. If only one instance of the named
 * group exists, use index 0.
 *
 * @param groupName name of capture group
 * @param index the instance index of the named capture group within
 * the pattern; e.g., index is 2 for the third instance
 * @return group index or -1 if no group has that name
 * @throws IndexOutOfBoundsException if instance index is out of bounds
 */
public int indexOf(String groupName, int index) {
    // A single lookup replaces containsKey + get; the element type must be
    // GroupInfo (not a raw List) for groupIndex() to be callable.
    List<GroupInfo> instances = groupInfo.get(groupName);
    if (instances == null) {
        return -1;
    }
    return instances.get(index).groupIndex();
}
/**
 * Reports the match flags of the wrapped standard pattern.
 *
 * @return The match flags specified when this pattern was compiled
 */
public int flags() {
    final int compiledFlags = this.pattern.flags();
    return compiledFlags;
}
/**
 * Builds a new matcher bound to this pattern and the given input.
 *
 * @param input The character sequence to be matched
 * @return A new matcher for this pattern
 */
public Matcher matcher(CharSequence input) {
    final Matcher namedMatcher = new Matcher(this, input);
    return namedMatcher;
}
/**
 * Exposes the wrapped {@link java.util.regex.Pattern}.
 *
 * @return the underlying standard pattern
 */
public java.util.regex.Pattern pattern() {
    return this.pattern;
}
/**
 * Returns the source of the wrapped standard pattern, i.e. the expression
 * the underlying {@link java.util.regex.Pattern} was compiled from.
 *
 * @return The source of the wrapped pattern
 */
public String standardPattern() {
    final String standardSource = this.pattern.pattern();
    return standardSource;
}
/**
 * Returns the expression exactly as it was given at construction
 * (named groups included).
 *
 * @return The regular expression
 */
public String namedPattern() {
    return this.namedPattern;
}
/**
 * Gets the names of all named capture groups.
 *
 * The list is built from the keys of the group-info map on first use and
 * the same list instance is returned on later calls.
 *
 * @return the list of names
 */
public List<String> groupNames() {
    if (groupNames == null) {
        groupNames = new ArrayList<String>(groupInfo.keySet());
    }
    return groupNames;
}
/**
* Gets the names and group info (group index and string position
* within the named pattern) of all named capture groups
*
* @return a map of group names and their info
*/
public Map > groupInfo() {
return groupInfo;
}
/**
 * Rewrites group-name properties (e.g., {@code ${named}}) in a
 * replacement string into the equivalent numeric reference for the
 * corresponding group (e.g., {@code $2}). A literal "$" in the string
 * must be escaped with a backslash; otherwise this call will try to
 * parse it as a group-name property.
 *
 * This is meant to be used to transform the parameter for:
 * <ul>
 *   <li>{@link Matcher#replaceAll(String)}</li>
 *   <li>{@link Matcher#replaceFirst(String)}</li>
 *   <li>{@link Matcher#appendReplacement(StringBuffer, String)}</li>
 * </ul>
 *
 * @param replacementPattern the input string to be evaluated
 * @return the modified string
 * @throws PatternSyntaxException group name was not found
 */
public String replaceProperties(String replacementPattern) {
    final StringBuilder buffer = new StringBuilder(replacementPattern);
    final StringBuilder converted = replaceGroupNameWithIndex(buffer, PROPERTY_PATTERN, "$");
    return converted.toString();
}
/**
 * Splits the input around matches of this pattern by delegating to the
 * wrapped standard pattern.
 *
 * The result holds, in input order, each substring that ends at a match
 * of this pattern or at the end of the input. When nothing matches, the
 * result has a single element: the input in string form.
 *
 * The limit n controls how often the pattern is applied:
 * <ul>
 *   <li>n greater than zero: applied at most n - 1 times; the array has at
 *       most n entries and the last one holds all input beyond the last
 *       matched delimiter</li>
 *   <li>n non-positive: applied as many times as possible; the array can
 *       have any length</li>
 *   <li>n zero: as above, and trailing empty strings are discarded</li>
 * </ul>
 *
 * @param input The character sequence to be split
 * @param limit The result threshold, as described above
 * @return The array of strings computed by splitting the input around
 * matches of this pattern
 */
public String[] split(CharSequence input, int limit) {
    final String[] pieces = this.pattern.split(input, limit);
    return pieces;
}
/**
 * Splits the input around matches of this pattern by delegating to the
 * single-argument split of the wrapped standard pattern.
 *
 * @param input The character sequence to be split
 * @return The array of strings computed by splitting the input around
 * matches of this pattern
 */
public String[] split(CharSequence input) {
    final String[] pieces = this.pattern.split(input);
    return pieces;
}
/**
 * Returns a string representation of this pattern, which is the original
 * expression including named groups.
 *
 * @return the string
 */
@Override
public String toString() {
    return namedPattern;
}
/**
 * Tells whether the character at a position is escaped, either by a
 * preceding backslash or by sitting inside a quoted section.
 *
 * @param s string to evaluate
 * @param pos the position of the character to evaluate
 * @return true if the character is escaped; otherwise false
 */
static private boolean isEscapedChar(String s, int pos) {
    if (isSlashEscapedChar(s, pos)) {
        return true;
    }
    return isQuoteEscapedChar(s, pos);
}
/**
 * Tells whether the character at a position is escaped with a backslash.
 *
 * @param s string to evaluate
 * @param pos the position of the character to evaluate
 * @return true if the character is escaped; otherwise false
 */
static private boolean isSlashEscapedChar(String s, int pos) {
    // Walk left over the run of backslashes that ends just before pos.
    // An even run is made of escaped (literal) backslashes only; an odd
    // run leaves one backslash over, and that one escapes the character.
    int cursor = pos - 1;
    while (cursor >= 0 && s.charAt(cursor) == '\\') {
        cursor--;
    }
    final int runLength = (pos - 1) - cursor;
    return (runLength & 1) == 1;
}
/**
 * Tells whether the character at a position is quote-escaped, i.e. lies
 * after an opening \Q that has not been closed by a \E.
 *
 * @param s string to evaluate
 * @param pos the position of the character to evaluate
 * @return true if the character is quote-escaped; otherwise false
 */
static private boolean isQuoteEscapedChar(String s, int pos) {
    // only the text to the left of the position matters
    final String head = s.substring(0, pos);
    // walk backwards to the nearest open-quote that is not itself
    // preceded by an escaping backslash
    int open = head.lastIndexOf("\\Q", pos - 1);
    while (open != -1 && isSlashEscapedChar(head, open)) {
        open = head.lastIndexOf("\\Q", open - 1);
    }
    if (open == -1) {
        return false;
    }
    // any close-quote after the open-quote ends the quoted section; it
    // cannot be slash-escaped because a backslash is literal when quoted
    return head.indexOf("\\E", open) == -1;
}
/**
 * Tells whether the character at a position lies inside a regex
 * character class, i.e. after an unescaped '[' with no unescaped ']'
 * between that bracket and the position.
 *
 * @param s string to evaluate
 * @param pos the position of the character to evaluate
 * @return true if the character is inside a character class; otherwise false
 */
static private boolean isInsideCharClass(String s, int pos) {
    final String head = s.substring(0, pos);
    // walk backwards to the nearest open-bracket that is not escaped
    int open = head.lastIndexOf('[', pos - 1);
    while (open != -1 && isEscapedChar(head, open)) {
        open = head.lastIndexOf('[', open - 1);
    }
    if (open == -1) {
        return false;
    }
    // scan from the open-bracket up to the position; the first unescaped
    // close-bracket means the class ended before the position
    final String tail = s.substring(open, pos);
    for (int close = tail.indexOf(']'); close != -1; close = tail.indexOf(']', close + 1)) {
        if (!isEscapedChar(tail, close)) {
            return false;
        }
    }
    return true;
}
/**
* Determines if the parenthesis at the specified position
* of a string is for a non-capturing group, which is one of
* the flag specifiers (e.g., (?s) or (?m) or (?:pattern).
* If the parenthesis is followed by "?", it must be a non-
* capturing group unless it's a named group (which begins
* with "?<"). Make sure not to confuse it with the lookbehind
* construct ("?<=" or "?= 0 && pos + 4 < len)*/ {
String pre = s.substring(pos, pos+4);
isLookbehind = pre.equals("(?<=") || pre.equals("(?= 0 && pos + 2 < len) &&*/
s.charAt(pos + 1) == '?' &&
(isLookbehind || s.charAt(pos + 2) != '<');
}
/**
 * Counts the capturing open-parentheses to the left of a string position.
 * Parentheses that are escaped, that sit inside a character class, or
 * that open a non-capturing construct are not counted.
 *
 * @param s string to evaluate
 * @param pos ending position of string; characters to the left
 * of this position are evaluated
 * @return number of open parentheses
 */
static private int countOpenParens(String s, int pos) {
    int numParens = 0;
    // A plain indexOf scan finds every '(' before pos without compiling a
    // regex on each call.
    for (int i = s.indexOf('('); i != -1 && i < pos; i = s.indexOf('(', i + 1)) {
        // ignore parentheses inside character classes: [0-9()a-f]
        // which are just literals
        if (isInsideCharClass(s, i)) {
            continue;
        }
        // ignore escaped parens
        if (isEscapedChar(s, i)) {
            continue;
        }
        if (!isNoncapturingParen(s, i)) {
            numParens++;
        }
    }
    return numParens;
}
/**
* Parses info on named capture groups from a pattern
*
* @param namedPattern regex the regular expression pattern to parse
* @return list of group info for all named groups
*/
static public Map > extractGroupInfo(String namedPattern) {
Map > groupInfo = new LinkedHashMap >();
java.util.regex.Matcher matcher = NAMED_GROUP_PATTERN.matcher(namedPattern);
while(matcher.find()) {
int pos = matcher.start();
// ignore escaped paren
if (isEscapedChar(namedPattern, pos)) continue;
String name = matcher.group(INDEX_GROUP_NAME);
int groupIndex = countOpenParens(namedPattern, pos);
List list;
if (groupInfo.containsKey(name)) {
list = groupInfo.get(name);
} else {
list = new ArrayList();
}
list.add(new GroupInfo(groupIndex, pos));
groupInfo.put(name, list);
}
return groupInfo;
}
/**
 * Substitutes every unescaped match of a pattern with a fixed string,
 * editing the builder in place. Matches whose first character is escaped
 * are left untouched.
 *
 * @param input the string to evaluate
 * @param pattern the pattern that matches the string to be replaced
 * @param replacement the string to replace the target
 * @return the modified string (original instance of {@code input})
 */
static private StringBuilder replace(StringBuilder input, java.util.regex.Pattern pattern, String replacement) {
    for (java.util.regex.Matcher hits = pattern.matcher(input); hits.find(); ) {
        final int from = hits.start();
        if (!isEscapedChar(input.toString(), from)) {
            input.replace(from, hits.end(), replacement);
            // the text under the matcher just changed, so restart the
            // search against the edited builder
            hits.reset(input);
        }
    }
    return input;
}
/**
 * Rewrites references to group names into references to the matching
 * group number, editing the builder in place: with a backslash prefix,
 * {@code \k<named>} becomes e.g. {@code \2}; with a "$" prefix,
 * {@code ${named}} becomes e.g. {@code $2}. References whose first
 * character is escaped are skipped.
 * This assumes the group names have already been parsed from the pattern.
 *
 * @param input the string to evaluate
 * @param pattern the pattern that matches the string to be replaced
 * @param prefix string to prefix to the replacement (e.g., "$" or "\\")
 * @return the modified string (original instance of {@code input})
 * @throws PatternSyntaxException group name was not found
 */
private StringBuilder replaceGroupNameWithIndex(StringBuilder input, java.util.regex.Pattern pattern, String prefix) {
    final java.util.regex.Matcher refs = pattern.matcher(input);
    while (refs.find()) {
        final int from = refs.start();
        if (isEscapedChar(input.toString(), from)) {
            continue;
        }
        final int zeroBased = indexOf(refs.group(INDEX_GROUP_NAME));
        if (zeroBased < 0) {
            throw new PatternSyntaxException("unknown group name", input.toString(), refs.start(INDEX_GROUP_NAME));
        }
        // the looked-up index is zero-based; emitted references are one-based
        input.replace(from, refs.end(), prefix + (zeroBased + 1));
        // the text under the matcher just changed, so restart the search
        // against the edited builder
        refs.reset(input);
    }
    return input;
}
/**
 * Builds a {@code java.util.regex.Pattern} from a given regular expression
 * pattern (which may contain named groups) and flags. Named-group openers
 * are reduced to a plain "(" and named back-references are rewritten to
 * numeric ones before compiling.
 *
 * @param namedPattern the expression to be compiled
 * @param flags match flags, a bit mask that may include:
 * <ul>
 *   <li>{@link java.util.regex.Pattern#CASE_INSENSITIVE}</li>
 *   <li>{@link java.util.regex.Pattern#MULTILINE}</li>
 *   <li>{@link java.util.regex.Pattern#DOTALL}</li>
 *   <li>{@link java.util.regex.Pattern#UNICODE_CASE}</li>
 *   <li>{@link java.util.regex.Pattern#CANON_EQ}</li>
 *   <li>{@link java.util.regex.Pattern#UNIX_LINES}</li>
 *   <li>{@link java.util.regex.Pattern#LITERAL}</li>
 *   <li>{@link java.util.regex.Pattern#COMMENTS}</li>
 * </ul>
 * @return the standard {@code java.util.regex.Pattern}
 */
private java.util.regex.Pattern buildStandardPattern(String namedPattern, int flags) {
    // replace the named-group construct with left-paren but
    // make sure we're actually looking at the construct (ignore escapes)
    StringBuilder s = new StringBuilder(namedPattern);
    s = replace(s, NAMED_GROUP_PATTERN, "(");
    // named back-references become numeric ones
    s = replaceGroupNameWithIndex(s, BACKREF_NAMED_GROUP_PATTERN, "\\");
    return java.util.regex.Pattern.compile(s.toString(), flags);
}
/*
 * (non-Javadoc)
 * Two patterns are equal when they were built from the same named
 * expression and their wrapped patterns report the same flags.
 * @see java.lang.Object#equals(java.lang.Object)
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    // instanceof is false for null, so this also rejects a null argument
    if (!(obj instanceof Pattern)) {
        return false;
    }
    final Pattern that = (Pattern) obj;
    if (!namedPattern.equals(that.namedPattern)) {
        return false;
    }
    return pattern.flags() == that.pattern.flags();
}
/*
 * (non-Javadoc)
 * Combines the hash of the named expression with the flags of the
 * wrapped pattern.
 * @see java.lang.Object#hashCode()
 */
@Override
public int hashCode() {
    final int sourceHash = namedPattern.hashCode();
    final int compiledFlags = pattern.flags();
    return sourceHash ^ compiledFlags;
}
}