// org.gwtproject.regexp.server.JavaRegExp (Maven / Gradle / Ivy)
/*
* Copyright © 2019 The GWT Project Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gwtproject.regexp.server;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.gwtproject.regexp.shared.GwtIncompatible;
import org.gwtproject.regexp.shared.MatchResult;
import org.gwtproject.regexp.shared.RegExp;
import org.gwtproject.regexp.shared.SplitResult;
/**
 * Pure-Java implementation of {@link RegExp} backed by {@link java.util.regex.Pattern}.
 *
 * <p>The pattern source is handed to {@link Pattern#compile(String, int)} unchanged; only
 * replacement strings passed to {@link #replace(String, String)} are translated from Javascript
 * syntax to Java syntax.
 *
 * <p>Each instance carries a mutable {@code lastIndex} that {@link #exec(String)} reads and
 * updates when the global flag is set. No synchronization is performed on that state.
 */
@GwtIncompatible
public class JavaRegExp implements RegExp {

  // In JS syntax, a \ in the replacement string has no special meaning.
  // In Java syntax, a \ in the replacement string escapes the next character,
  // so \ has to be translated to \\ before the replacement is passed to Java.
  private static final Pattern REPLACEMENT_BACKSLASH = Pattern.compile("\\\\");

  // To produce \\, the Java literal must contain eight backslashes:
  // eight backslashes --> Java string unescape --> four backslashes
  // four backslashes --> replacement-string unescape during preprocessing --> two backslashes
  private static final String REPLACEMENT_BACKSLASH_FOR_JAVA = "\\\\\\\\";

  // In JS syntax, $& in the replacement string stands for the whole match.
  // In Java syntax the equivalent is $0, so $& is translated to $0. The pattern
  // must skip $$&, which is a JS $$ (a literal dollar, see below) followed by an
  // & with no special meaning, and therefore must not be translated.
  private static final Pattern REPLACEMENT_DOLLAR_AMPERSAND =
      Pattern.compile("((?:^|\\G|[^$])(?:\\$\\$)*)\\$&");

  // Keeps whatever preceded the $& (group 1) and appends a literal "$0".
  private static final String REPLACEMENT_DOLLAR_AMPERSAND_FOR_JAVA = "$1\\$0";

  // In JS syntax, $` and $' in the replacement string stand for everything
  // before the match and everything after the match.
  // Java has no equivalent, so $` and $' are detected and rejected. $$` and $$'
  // are a JS $$ (see below) followed by a plain ` or ' and must not be rejected.
  private static final Pattern REPLACEMENT_DOLLAR_APOSTROPHE =
      Pattern.compile("(?:^|[^$])(?:\\$\\$)*\\$[`']");

  // In JS syntax, $$ in the replacement string stands for a single dollar sign.
  // In Java syntax the equivalent is \$, so $$ is translated to \$.
  private static final Pattern REPLACEMENT_DOLLAR_DOLLAR = Pattern.compile("\\$\\$");

  // To produce \$, the Java literal must be six backslashes followed by $:
  // six backslashes + $ --> Java string unescape --> three backslashes + $
  // three backslashes + $ --> replacement-string unescape during preprocessing --> \$
  private static final String REPLACEMENT_DOLLAR_DOLLAR_FOR_JAVA = "\\\\\\$";

  private final boolean globalFlag;
  private final Pattern pattern;
  private final String source;
  private int lastIndex;

  private JavaRegExp(String source, Pattern pattern, boolean globalFlag) {
    this.source = source;
    this.pattern = pattern;
    this.globalFlag = globalFlag;
    lastIndex = 0;
  }

  /**
   * Creates a regular expression object from a pattern with no flags.
   *
   * @param pattern the Javascript regular expression pattern to compile
   * @return a new regular expression
   * @throws RuntimeException if the pattern is invalid
   */
  public static RegExp compile(String pattern) {
    return compile(pattern, "");
  }

  /**
   * Creates a regular expression object from a pattern using the given flags.
   *
   * <p>{@link Pattern#UNIX_LINES} is always enabled. The {@code 'i'} flag enables both {@link
   * Pattern#CASE_INSENSITIVE} and {@link Pattern#UNICODE_CASE}; the {@code 'm'} flag enables
   * {@link Pattern#MULTILINE}.
   *
   * @param pattern the Javascript regular expression pattern to compile
   * @param flags the flags string, containing at most one occurrence of {@code 'g'} ({@link
   *     #getGlobal()}), {@code 'i'} ({@link #getIgnoreCase()}), or {@code 'm'} ({@link
   *     #getMultiline()}).
   * @return a new regular expression
   * @throws RuntimeException if the pattern or the flags are invalid
   */
  public static RegExp compile(String pattern, String flags) {
    // Parse flags
    boolean globalFlag = false;
    int javaPatternFlags = Pattern.UNIX_LINES;
    for (char flag : parseFlags(flags)) {
      switch (flag) {
        case 'g':
          globalFlag = true;
          break;
        case 'i':
          javaPatternFlags |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
          break;
        case 'm':
          javaPatternFlags |= Pattern.MULTILINE;
          break;
        default:
          throw new IllegalArgumentException("Unknown regexp flag: '" + flag + "'");
      }
    }
    Pattern javaPattern = Pattern.compile(pattern, javaPatternFlags);
    return new JavaRegExp(pattern, javaPattern, globalFlag);
  }

  /**
   * Parses a flags string as a set of characters. Does not reject unknown flags.
   *
   * @param flags the flag string to parse
   * @return a set of flags
   * @throws IllegalArgumentException if a flag is duplicated
   */
  private static Set<Character> parseFlags(String flags) {
    Set<Character> flagsSet = new HashSet<>(flags.length());
    for (int flagIndex = 0; flagIndex < flags.length(); flagIndex++) {
      char flag = flags.charAt(flagIndex);
      // Set.add returns false when the element is already present.
      if (!flagsSet.add(flag)) {
        throw new IllegalArgumentException("Flag cannot be specified twice: '" + flag + "'");
      }
    }
    return flagsSet;
  }

  /**
   * Returns a literal pattern <code>String</code> for the specified <code>String</code>.
   *
   * <p>This method produces a <code>String</code> that can be used to create a <code>RegExp
   * </code> that would match the string <code>input</code> as if it were a literal pattern.
   * Metacharacters or escape sequences in the input sequence will be given no special meaning.
   *
   * @param input The string to be literalized
   * @return A literal string replacement
   */
  public static String quote(String input) {
    return Pattern.quote(input);
  }

  /**
   * Searches {@code input} for the next match of this expression.
   *
   * <p>With the global flag set, the search starts at {@code lastIndex}; afterwards {@code
   * lastIndex} is moved to the end of the match, or reset to 0 when nothing matched. Without the
   * global flag the search always starts at 0 and {@code lastIndex} is left untouched.
   *
   * @param input the string to search; {@code null} is treated as "no match"
   * @return the match result, or {@code null} if there is no match or the start index lies
   *     outside {@code input}
   */
  @Override
  public MatchResult exec(String input) {
    // Start the search at lastIndex if the global flag is true.
    int searchStartIndex = globalFlag ? lastIndex : 0;

    Matcher matcher = null;
    // Avoid exceptions: Javascript is more tolerant than Java, so a null input or an
    // out-of-range start index simply counts as "no match".
    if (input != null && searchStartIndex >= 0 && searchStartIndex <= input.length()) {
      Matcher candidate = pattern.matcher(input);
      if (candidate.find(searchStartIndex)) {
        matcher = candidate;
      }
    }

    if (matcher == null) {
      // No match
      if (globalFlag) {
        lastIndex = 0;
      }
      return null;
    }

    // Match: collect group 0 (the whole match) followed by every capturing group.
    int groupCount = matcher.groupCount();
    List<String> groups = new ArrayList<>(1 + groupCount);
    for (int group = 0; group <= groupCount; group++) {
      groups.add(matcher.group(group));
    }
    if (globalFlag) {
      lastIndex = matcher.end();
    }
    return new JavaMatchResult(matcher.start(), input, groups);
  }

  /** Returns whether this expression was compiled with the {@code 'g'} flag. */
  @Override
  public boolean getGlobal() {
    return globalFlag;
  }

  /** Returns whether the underlying pattern has {@link Pattern#CASE_INSENSITIVE} set. */
  @Override
  public boolean getIgnoreCase() {
    return (pattern.flags() & Pattern.CASE_INSENSITIVE) != 0;
  }

  /** Returns the index at which the next global {@link #exec(String)} will start searching. */
  @Override
  public int getLastIndex() {
    return lastIndex;
  }

  /** Returns whether the underlying pattern has {@link Pattern#MULTILINE} set. */
  @Override
  public boolean getMultiline() {
    return (pattern.flags() & Pattern.MULTILINE) != 0;
  }

  /** Returns the pattern string this expression was compiled from. */
  @Override
  public String getSource() {
    return source;
  }

  /**
   * Replaces the first match (or every match, when the global flag is set) in {@code input}.
   *
   * <p>The Javascript replacement sequences {@code \}, {@code $&} and {@code $$} are rewritten
   * to their Java equivalents before the replacement is applied. Any other {@code $} sequence is
   * passed through unchanged and is therefore interpreted by {@link Matcher}.
   *
   * @param input the string in which to search
   * @param replacement the replacement string, in Javascript syntax
   * @return {@code input} with the match(es) replaced
   * @throws UnsupportedOperationException if {@code replacement} uses {@code $`} or {@code $'}
   */
  @Override
  public String replace(String input, String replacement) {
    // Replace \ in the replacement with \\ to escape it for Java replace.
    replacement =
        REPLACEMENT_BACKSLASH.matcher(replacement).replaceAll(REPLACEMENT_BACKSLASH_FOR_JAVA);

    // Replace the Javascript-ese $& in the replacement with Java-ese $0, but
    // watch out for $$&, which should stay $$&, to be changed to \$& below.
    replacement =
        REPLACEMENT_DOLLAR_AMPERSAND
            .matcher(replacement)
            .replaceAll(REPLACEMENT_DOLLAR_AMPERSAND_FOR_JAVA);

    // Test for Javascript-ese $` and $', which we do not support in the pure
    // Java version.
    if (REPLACEMENT_DOLLAR_APOSTROPHE.matcher(replacement).find()) {
      throw new UnsupportedOperationException("$` and $' replacements are not supported");
    }

    // Replace the Javascript-ese $$ in the replacement with Java-ese \$.
    replacement =
        REPLACEMENT_DOLLAR_DOLLAR
            .matcher(replacement)
            .replaceAll(REPLACEMENT_DOLLAR_DOLLAR_FOR_JAVA);

    return globalFlag
        ? pattern.matcher(input).replaceAll(replacement)
        : pattern.matcher(input).replaceFirst(replacement);
  }

  /** Sets the index at which the next global {@link #exec(String)} will start searching. */
  @Override
  public void setLastIndex(int lastIndex) {
    this.lastIndex = lastIndex;
  }

  /** Splits {@code input} around matches of this expression, with no limit on the pieces. */
  @Override
  public SplitResult split(String input) {
    return split(input, -1);
  }

  /**
   * Splits {@code input} around matches of this expression.
   *
   * @param input the string to split
   * @param limit the maximum number of pieces to return; a negative value means no limit
   * @return the pieces, at most {@code limit} of them when {@code limit} is non-negative
   */
  @Override
  public SplitResult split(String input, int limit) {
    String[] result;
    if (source.length() == 0) {
      // Javascript split using a completely empty regular expression splits the
      // string into its constituent characters.
      int resultLength = input.length();
      if (limit >= 0 && resultLength > limit) {
        resultLength = limit;
      }
      result = new String[resultLength];
      for (int i = 0; i < resultLength; i++) {
        result[i] = input.substring(i, i + 1);
      }
    } else {
      // Ask Java for one extra piece so the unsplit remainder lands in result[limit].
      result = pattern.split(input, limit < 0 ? -1 : (limit + 1));
      if (limit >= 0 && result.length > limit) {
        // Chop off the unsplit part of the string which has been put in
        // result[limit]. Javascript split does not return it.
        String[] realResult = new String[limit];
        System.arraycopy(result, 0, realResult, 0, limit);
        result = realResult;
      }
    }
    return new JavaSplitResult(result);
  }

  /**
   * Reports whether {@code input} contains a match.
   *
   * <p>Delegates to {@link #exec(String)}, so with the global flag set it reads and updates
   * {@code lastIndex} in the same way.
   */
  @Override
  public boolean test(String input) {
    return exec(input) != null;
  }
}