
aQute.libg.re.Catalog Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of biz.aQute.bndlib Show documentation
Show all versions of biz.aQute.bndlib Show documentation
bndlib: A Swiss Army Knife for OSGi
The newest version!
package aQute.libg.re;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.eclipse.jdt.annotation.Nullable;
import aQute.libg.re.RE.C;
import aQute.libg.re.RE.F;
import aQute.libg.re.RE.F.Flag;
import aQute.libg.re.RE.G;
import aQute.libg.re.RE.Q;
/**
* This class provides an implementation of the RE types. The class is useful as
* static imports. (For Eclipse users, look at favorites in the preferences.)
* However, it can also be used as base class. If that is the case, field names
* can be used as named capture groups. In this constellation, the static
* methods are also in scope, not requiring many static imports.
 *
 * <pre>
 * void foo() {
 *     class X extends Catalog {
 *         RE match = lit("abc");
 *         RE namedMatch = named(match);
 *     }
 *     X x;
 *     x.....
 * }
 * </pre>
 *
*/
public class Catalog {
/**
 * Wraps an RE that is stored in a field of this object's class in a group
 * that carries the name of that field. The field is located by comparing
 * the field values with the given RE (see {@code findFieldWith}).
 *
 * @param re the RE that is expected to be held by a field of this class
 * @return the group created by {@link #g(String, RE...)} for the field name
 */
public RE named(RE re) {
    final String fieldName = findFieldWith(re);
    assert fieldName != null;
    RE group = g(fieldName, re);
    return group;
}
/**
 * Looks through the fields declared directly by the runtime class of this
 * object and returns the name of the first one whose current value is the
 * very same instance as the given RE.
 *
 * @param re the instance to look for; compared by identity ({@code ==})
 * @return the name of the matching field, or {@code null} when no declared
 *         field holds the instance
 */
String findFieldWith(RE re) {
    for (Field f : getClass().getDeclaredFields()) {
        try {
            f.setAccessible(true);
            if (f.get(this) == re)
                return f.getName();
        } catch (Exception ignored) {
            // best effort: a field that cannot be opened or read is simply
            // not a candidate, keep scanning the remaining fields
        }
    }
    return null;
}
/**
 * Builds the control-character escape for the given character, i.e. the
 * two characters {@code \c} followed by {@code c}. For example,
 * {@code control('b')} yields the expression {@code \cb}.
 *
 * @param c the character that selects the control character
 * @return an RE holding the {@code \c} escape sequence
 */
public static RE control(char c) {
    String sequence = "\\c" + c;
    return new REImpl(sequence);
}
/**
 * Groups the given members without capturing them.
 *
 * @param res the members of the group
 * @return a {@code Group} of type {@code NONCAPTURING} over the members
 */
public static RE g(RE... res) {
    RE group = new Group(Group.Type.NONCAPTURING, res);
    return group;
}
/**
 * Builds an alternation of the given REs. No members gives {@link #empty},
 * a single member is returned as is, and two or more members are joined
 * with {@code |} inside a non-capturing group.
 *
 * @param res the alternatives
 * @return the RE representing the alternation
 */
public static RE or(RE... res) {
    assert res != null;
    if (res.length == 0)
        return empty;
    if (res.length == 1)
        return res[0];

    StringBuilder alternatives = new StringBuilder();
    for (int i = 0; i < res.length; i++) {
        if (i > 0)
            alternatives.append("|");
        alternatives.append(res[i]);
    }
    return new Group(null, alternatives.toString(), Group.Type.NONCAPTURING, names(res));
}
/**
 * Builds an alternation of literal strings. Every string is first turned
 * into an RE with {@link #lit(String)} and the results are combined with
 * {@link #or(RE...)}.
 *
 * @param res the literal alternatives
 * @return the RE representing the alternation
 */
public static RE or(String... res) {
    assert res != null;
    RE[] literals = new RE[res.length];
    for (int i = 0; i < res.length; i++) {
        literals[i] = lit(res[i]);
    }
    return or(literals);
}
/**
 * Merges character classes into one. No members gives {@link #empty}, a
 * single member is returned as is, otherwise the set content of every
 * member ({@code asSetContent()}) is concatenated into a new character
 * class.
 *
 * @param res the character classes to merge
 * @return an RE representing the combined classes
 */
public static RE or(C... res) {
    assert res != null;
    if (res.length == 0)
        return empty;
    if (res.length == 1)
        return res[0];

    StringBuilder content = new StringBuilder();
    for (int i = 0; i < res.length; i++)
        content.append(res[i].asSetContent());
    return new CharacterClass(content.toString());
}
/**
 * Creates a group with the given name around the members. Without members
 * the result is {@link #empty}.
 *
 * @param name the group name; checked (by assertion only) with
 *            {@code isValidGroupName}
 * @param res the members
 * @return the named group, or {@link #empty} when there are no members
 */
public static RE g(String name, RE... res) {
    assert isValidGroupName(name);
    boolean noMembers = res == null || res.length == 0;
    return noMembers ? empty : new Group(name, res);
}
/**
 * Accepts {@code null} (no name) and any name that is fully matched by
 * {@link #javaId}.
 * <p>
 * NOTE(review): java.util.regex restricts group names to ASCII letters and
 * digits, which is narrower than a Java identifier - confirm whether names
 * with {@code _} or {@code $} are handled elsewhere.
 */
private static boolean isValidGroupName(String name) {
    if (name == null)
        return true;
    return javaId.matches(name)
        .isPresent();
}
/**
 * Builds a group in which the members are separated by optional white
 * space. The group always starts with {@link #setWs}; after that a
 * {@link #setWs} is inserted between two consecutive members that are both
 * not white space, and a white space member that directly follows white
 * space is dropped.
 *
 * @param name the name of the group, or {@code null} for no name
 * @param res the members
 * @return the resulting group, or {@link #empty} when there are no members
 */
public static RE term(@Nullable String name, RE... res) {
    assert isValidGroupName(name) : name;
    if (res == null || res.length == 0)
        return empty;

    List<RE> out = new ArrayList<>();
    out.add(setWs);
    // the leading setWs counts as white space for the first member
    boolean lastWs = true;
    for (RE next : res) {
        boolean nextWs = isWhiteSpace(next);
        if (lastWs && nextWs) {
            // white space after white space adds nothing
            continue;
        }
        if (!lastWs && !nextWs) {
            // two adjacent non white space members get optional white space in between
            out.add(setWs);
        }
        out.add(next);
        lastWs = nextWs;
    }
    return new Group(name, out.toArray(RE[]::new));
}
/**
 * Same as {@link #term(String, RE...)} but without a group name.
 *
 * @param res the members
 * @return the group produced by {@link #term(String, RE...)} for a
 *         {@code null} name
 */
public static RE term(RE... res) {
    final String noName = null;
    return term(noName, res);
}
/**
 * Creates a character class such as {@code [abc]} from the characters
 * that are allowed. A leading {@code ^} must not be used to negate the
 * class; call {@code not()} on the result instead.
 *
 * @param allowed the content of the character class
 * @return the character class
 */
public static C cc(String allowed) {
    C characterClass = new CharacterClass(allowed, true, null);
    return characterClass;
}
/**
 * Turns plain text into an RE that matches the text literally. Each of the
 * characters {@code ( ) $ \ { } [ ] ^ + * ? . |} and the space is prefixed
 * with a backslash; all other characters are copied unchanged. When
 * nothing needs escaping the input string itself is used.
 *
 * @param s the literal text
 * @return an RE matching the text literally
 */
public static RE lit(String s) {
    final String special = "()$\\{}[]^+*?.| ";
    StringBuilder escaped = null;
    final int length = s.length();
    for (int i = 0; i < length; i++) {
        char ch = s.charAt(i);
        boolean needsEscape = special.indexOf(ch) >= 0;
        if (needsEscape && escaped == null) {
            // first special character: start the copy with the untouched prefix
            escaped = new StringBuilder();
            escaped.append(s, 0, i);
        }
        if (needsEscape)
            escaped.append("\\");
        if (escaped != null)
            escaped.append(ch);
    }
    String result = escaped == null ? s : escaped.toString();
    return new REImpl(result);
}
/**
 * Converts several strings to literal REs in one go, using
 * {@link #lit(String)} for each of them.
 *
 * @param s the literal strings
 * @return an array with one RE per string, in the same order
 */
public static RE[] lit(String... s) {
    RE[] literals = new RE[s.length];
    for (int i = 0; i < s.length; i++)
        literals[i] = lit(s[i]);
    return literals;
}
/**
 * Creates a literal RE by delegating the quoting to
 * {@link Pattern#quote(String)}.
 *
 * @param s the text to quote
 * @return an RE holding the quoted text
 */
public static RE quote(String s) {
    String quoted = Pattern.quote(s);
    return new REImpl(quoted);
}
/**
 * Refers to a character by its Unicode name, i.e. produces
 * <code>\N{name}</code>.
 *
 * @param name the Unicode character name
 * @return an RE holding the <code>\N{...}</code> sequence
 */
public static RE unicode(String name) {
    String opened = "\\N{".concat(name);
    String sequence = opened.concat("}");
    return new REImpl(sequence);
}
/**
 * Creates a back reference to an earlier capturing group by number, i.e.
 * a backslash followed by the group number. The number is expected to be
 * in the range 1..9 (checked by assertion only).
 *
 * @param group the number of the group to refer to
 * @return an RE holding the numbered back reference
 */
public static RE back(int group) {
    assert group > 0 && group < 10;
    String reference = "\\" + group;
    return new REImpl(reference);
}
/**
 * Creates a back reference to an earlier named capturing group, i.e.
 * <code>\k&lt;group&gt;</code>.
 *
 * @param group the name of the group to refer to
 * @return an RE holding the named back reference
 */
public static RE back(String group) {
    StringBuilder reference = new StringBuilder("\\k<");
    reference.append(group)
        .append(">");
    return new REImpl(reference.toString());
}
/**
 * Creates a list: one clause followed by zero or more repetitions of
 * separator plus clause. Both parts are built with {@link #term(RE...)},
 * so optional white space is allowed around the members.
 *
 * @param clause the element of the list
 * @param separator the separator between two elements
 * @return an RE matching the list
 */
public static RE list(RE clause, RE separator) {
    RE separatorThenClause = term(separator, clause);
    RE tail = set(separatorThenClause);
    return term(clause, tail);
}
/**
 * Creates a list of clauses that uses {@link #comma} as the separator, see
 * {@link #list(RE, RE)}.
 *
 * @param clause the element of the list
 * @return an RE matching the comma separated list
 */
public static RE list(RE clause) {
    RE separator = comma;
    return list(clause, separator);
}
/**
 * Makes the members optional: a greedy quantifier with a minimum of 0 and
 * a maximum of 1.
 *
 * @param res the members that may or may not be present
 * @return the quantified RE
 */
public static Q opt(RE... res) {
    Q optional = new Quantified(0, 1, Quantified.Type.greedy, res);
    return optional;
}
/**
 * Makes a literal optional. The string is converted with
 * {@link #lit(String)} and then passed to {@link #opt(RE...)}.
 *
 * @param s the literal text
 * @return the quantified RE
 */
public static Q opt(String s) {
    RE literal = lit(s);
    return opt(literal);
}
/**
 * Repeats the members one or more times: a greedy quantifier with a
 * minimum of 1 and {@link Integer#MAX_VALUE} as maximum.
 *
 * @param res the members to repeat
 * @return the quantified RE
 */
public static Q some(RE... res) {
    Q oneOrMore = new Quantified(1, Integer.MAX_VALUE, Q.Type.greedy, res);
    return oneOrMore;
}
/**
 * Repeats the members zero or more times: a greedy quantifier with a
 * minimum of 0 and {@link Integer#MAX_VALUE} as maximum.
 *
 * @param res the members to repeat
 * @return the quantified RE
 */
public static Q set(RE... res) {
    Q zeroOrMore = new Quantified(0, Integer.MAX_VALUE, Q.Type.greedy, res);
    return zeroOrMore;
}
/**
 * Static shorthand for {@code re.not()}: returns the negation of the given
 * RE as defined by that RE's own {@code not()} implementation.
 *
 * @param re the RE to negate
 * @return the negated RE
 */
public static RE not(RE re) {
    RE negated = re.not();
    return negated;
}
/**
 * Static shorthand for {@code re.reluctant()}. A reluctant quantifier
 * matches as few characters as possible.
 *
 * @param re the quantified RE
 * @return the reluctant variant of the quantified RE
 */
public static RE reluctant(Q re) {
    RE reluctant = re.reluctant();
    return reluctant;
}
/**
 * Static shorthand for {@code re.greedy()}. A greedy quantifier matches as
 * many characters as possible.
 *
 * @param re the quantified RE
 * @return the greedy variant of the quantified RE
 */
public static RE greedy(Q re) {
    RE greedy = re.greedy();
    return greedy;
}
/**
 * Static shorthand for {@code re.possesive()}. A possessive quantifier
 * does not give back what it matched when the regex engine backtracks.
 *
 * @param re the quantified RE
 * @return the possessive variant of the quantified RE
 */
public static RE possesive(Q re) {
    RE possessive = re.possesive();
    return possessive;
}
/**
 * Builds a zero-or-more repetition of "not {@code s}". For a string of
 * exactly one character the negation of a character class with that
 * character is repeated; for any other length the negation of the raw
 * string (used as is, not escaped) is repeated.
 * <p>
 * NOTE(review): in the multi character case the repeated element is only
 * the negative lookahead produced by {@code REImpl.not()}, which consumes
 * no input - confirm this is the intended pattern.
 *
 * @param s the text that must not be matched
 * @return the repeated negation
 */
public static RE anythingBut(String s) {
    boolean singleCharacter = s.length() == 1;
    RE negated = singleCharacter ? new CharacterClass(s).not() : new REImpl(s).not();
    return set(negated);
}
/**
 * Builds a non-capturing group made of {@link #setAll} followed by the
 * optional literal {@code s} (see {@link #opt(String)}).
 * <p>
 * NOTE(review): the leading {@link #setAll} means any text may precede the
 * optional literal - confirm that this is intended.
 *
 * @param s the literal that may be present
 * @return the resulting group
 */
public static RE maybe(String s) {
    Q optionalLiteral = opt(s);
    return g(setAll, optionalLiteral);
}
/**
 * Repeats the members between {@code minimum} and {@code maximum} times
 * using a greedy quantifier.
 *
 * @param minimum the lowest number of repetitions
 * @param maximum the highest number of repetitions
 * @param res the members to repeat
 * @return the quantified RE
 */
public static Q multiple(int minimum, int maximum, RE... res) {
    Q repeated = new Quantified(minimum, maximum, Quantified.Type.greedy, res);
    return repeated;
}
/**
 * Repeats a literal between {@code minimum} and {@code maximum} times
 * using a greedy quantifier. The string is converted with
 * {@link #lit(String)} first.
 *
 * @param minimum the lowest number of repetitions
 * @param maximum the highest number of repetitions
 * @param lit the literal text to repeat
 * @return the quantified RE
 */
public static Q multiple(int minimum, int maximum, String lit) {
    RE literal = lit(lit);
    return new Quantified(minimum, maximum, Quantified.Type.greedy, literal);
}
/**
 * Repeats the members at least {@code minimum} times using a greedy
 * quantifier; the maximum is {@link Integer#MAX_VALUE}.
 *
 * @param minimum the lowest number of repetitions
 * @param res the members to repeat
 * @return the quantified RE
 */
public static Q atLeast(int minimum, RE... res) {
    final int unbounded = Integer.MAX_VALUE;
    return new Quantified(minimum, unbounded, Quantified.Type.greedy, res);
}
/**
 * Repeats a literal at least {@code minimum} times. The string is
 * converted with {@link #lit(String)} and handed to
 * {@link #atLeast(int, RE...)}.
 *
 * @param minimum the lowest number of repetitions
 * @param lit the literal text to repeat
 * @return the quantified RE
 */
public static Q atLeast(int minimum, String lit) {
    RE literal = lit(lit);
    return atLeast(minimum, literal);
}
/**
 * Switches the {@code CASE_INSENSITIVE} flag on for the given members, so
 * they match regardless of character case.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code CASE_INSENSITIVE} in its first
 *         flag set
 */
public static F caseInsenstive(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.CASE_INSENSITIVE);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code CASE_INSENSITIVE} flag off for the given members, so
 * they match with regard to character case again.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code CASE_INSENSITIVE} in its second
 *         flag set
 */
public static F caseInsenstiveOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.CASE_INSENSITIVE);
    return new Option(null, flags, res);
}
/**
 * Switches the {@code DOTALL} flag on for the given members. With this
 * flag the dot ('.') also matches line terminators.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code DOTALL} in its first flag set
 */
public static F dotall(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.DOTALL);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code DOTALL} flag off for the given members. Without the
 * flag the dot ('.') does not match line terminators.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code DOTALL} in its second flag set
 */
public static F dotallOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.DOTALL);
    return new Option(null, flags, res);
}
/**
 * Switches the {@code COMMENTS} flag on for the given members. With this
 * flag white space and comments are permitted inside the pattern.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code COMMENTS} in its first flag set
 */
public static F comments(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.COMMENTS);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code COMMENTS} flag off for the given members. Without
 * the flag white space and comments in the pattern are no longer ignored.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code COMMENTS} in its second flag set
 */
public static F commentsOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.COMMENTS);
    return new Option(null, flags, res);
}
/**
 * Switches the {@code MULTILINE} flag on for the given members. With this
 * flag '^' and '$' match at the start and end of every line instead of
 * only at the start and end of the input.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code MULTILINE} in its first flag set
 */
public static F multiline(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.MULTILINE);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code MULTILINE} flag off for the given members. Without
 * the flag '^' and '$' match only at the start and end of the whole input.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code MULTILINE} in its second flag set
 */
public static F multilineOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.MULTILINE);
    return new Option(null, flags, res);
}
/**
 * Switches the {@code UNICODE_CHARACTER_CLASS} flag on for the given
 * members. With this flag the predefined and POSIX character classes use
 * their Unicode versions.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code UNICODE_CHARACTER_CLASS} in its
 *         first flag set
 */
public static F unicodeCharacterClass(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.UNICODE_CHARACTER_CLASS);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code UNICODE_CHARACTER_CLASS} flag off for the given
 * members. Without the flag the predefined and POSIX character classes use
 * their ASCII versions.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code UNICODE_CHARACTER_CLASS} in its
 *         second flag set
 */
public static F unicodeCharacterClassOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.UNICODE_CHARACTER_CLASS);
    return new Option(null, flags, res);
}
/**
 * Switches the {@code UNICODE_CASE} flag on for the given members. With
 * this flag case-insensitive matching handles Unicode character cases.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code UNICODE_CASE} in its first flag
 *         set
 */
public static F unicodeCase(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.UNICODE_CASE);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code UNICODE_CASE} flag off for the given members.
 * Without the flag case-insensitive matching does not apply Unicode case
 * handling.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code UNICODE_CASE} in its second flag
 *         set
 */
public static F unicodeCaseOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.UNICODE_CASE);
    return new Option(null, flags, res);
}
/**
 * Switches the {@code UNIX_LINES} flag on for the given members. With this
 * flag only '\n' is recognized as a line terminator.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code UNIX_LINES} in its first flag set
 */
public static F unixLines(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.UNIX_LINES);
    return new Option(flags, null, res);
}
/**
 * Switches the {@code UNIX_LINES} flag off for the given members, so line
 * terminators other than '\n' are recognized again.
 *
 * @param res the members the flag applies to
 * @return an {@code Option} with {@code UNIX_LINES} in its second flag set
 */
public static F unixLinesOff(RE... res) {
    EnumSet<Flag> flags = EnumSet.of(F.Flag.UNIX_LINES);
    return new Option(null, flags, res);
}
/**
 * Combines the members into one atomic group. Once an atomic group has
 * matched, the regex engine does not backtrack into it.
 *
 * @see #atomic(String)
 * @param res the members of the group
 * @return a {@code Group} of type {@code ATOMIC}
 */
public static G atomic(RE... res) {
    G group = new Group(G.Type.ATOMIC, res);
    return group;
}
/**
 * Creates an atomic group around a literal. The string is converted with
 * {@link #lit(String)} first.
 *
 * @param string the literal text
 * @return a {@code Group} of type {@code ATOMIC} holding the literal
 */
public static G atomic(String string) {
    RE literal = lit(string);
    return new Group(G.Type.ATOMIC, literal);
}
/**
 * Creates a lookahead group: the members must match at the current
 * position of the input.
 *
 * @param res the members of the lookahead
 * @return a {@code Group} of type {@code AHEAD}
 */
public static G ahead(RE... res) {
    G lookahead = new Group(G.Type.AHEAD, res);
    return lookahead;
}
/**
 * Creates a lookbehind group: the members must match the input that
 * precedes the current position.
 *
 * @param res the members of the lookbehind
 * @return a {@code Group} of type {@code BEHIND}
 */
public static G behind(RE... res) {
    G lookbehind = new Group(G.Type.BEHIND, res);
    return lookbehind;
}
/**
 * Puts the members one after the other, in the order given.
 *
 * @param res the members of the sequence
 * @return a {@code Group} of type {@code NONE}
 */
public static G seq(RE... res) {
    G sequence = new Group(G.Type.NONE, res);
    return sequence;
}
/**
 * Creates a conditional expression. It is built as the alternation of
 * "lookahead for {@code condition}, then {@code thenExpect}" and
 * {@code otherWiseExpect}.
 *
 * @param condition the RE that is tested with a lookahead
 * @param thenExpect the RE to match when the condition is found
 * @param otherWiseExpect the alternative RE
 * @return the conditional RE
 */
public static RE if_(RE condition, RE thenExpect, RE otherWiseExpect) {
    G guard = ahead(condition);
    RE guarded = seq(guard, thenExpect);
    return or(guarded, otherWiseExpect);
}
/**
 * Creates a conditional expression without an alternative: a lookahead
 * for {@code condition} followed by {@code then} in a non-capturing group.
 *
 * @param condition the RE that is tested with a lookahead
 * @param then the RE to match when the condition is found
 * @return the conditional RE
 */
public static RE if_(RE condition, RE then) {
    G guard = ahead(condition);
    RE body = g(then);
    return seq(guard, body);
}
/**
 * Creates a loop: zero or more repetitions of "lookahead for
 * {@code condition}, then {@code thenExpect}".
 *
 * @param condition the RE that is tested with a lookahead on every round
 * @param thenExpect the RE that is matched on every round
 * @return the repeating RE
 */
public static RE while_(RE condition, RE thenExpect) {
    G guard = ahead(condition);
    G round = seq(guard, thenExpect);
    return set(round);
}
/**
 * Creates zero or more repetitions of "{@code thenExpect} followed by a
 * lookahead for {@code condition}".
 *
 * @param condition the RE that is tested with a lookahead after each
 *            {@code thenExpect}
 * @param thenExpect the RE that is matched on every round
 * @return the repeating RE
 */
public static RE until(RE condition, RE thenExpect) {
    G lookahead = ahead(condition);
    G round = seq(thenExpect, lookahead);
    return set(round);
}
/**
 * Creates a capturing group around the members so that the matched text
 * can be referenced later.
 *
 * @param res the members of the group
 * @return a {@code Group} of type {@code CAPTURING}
 */
public static G capture(RE... res) {
    G capturing = new Group(G.Type.CAPTURING, res);
    return capturing;
}
/**
 * Creates an RE for a string that starts and ends with the same delimiter
 * character. Between the delimiters each element is either a character
 * that is neither the delimiter nor a backslash, or a backslash followed
 * by any character; the delimiter can therefore only appear inside the
 * string when it is escaped with a backslash.
 *
 * @param delimeter the character that opens and closes the string
 * @return an RE matching the delimited string
 */
public static RE string(char delimeter) {
    C quoteChar = cc(String.valueOf(delimeter));
    C plainChar = quoteChar.or(backslash)
        .not();
    RE escapedChar = g(backslash, all);
    RE element = or(plainChar, escapedChar);
    Q content = set(element);
    return g(quoteChar, content, quoteChar);
}
// ---------------------------------------------------------------------
// Predefined building blocks. The declaration order matters: static
// fields are initialized top to bottom and several constants are built
// from earlier ones (e.g. setWs from ws, hexnumber from hexdigit).
// ---------------------------------------------------------------------

// White space (\s), and zero-or-more / one-or-more repetitions of it.
final public static C ws = new Special("\\s");
final public static Q setWs = set(ws);
final public static Q someWs = some(ws);
// Any character (.), and zero-or-more / one-or-more repetitions of it.
final public static RE all = new REImpl(".");
final public static Q setAll = set(all);
final public static RE someAll = some(all);
// Character class holding an escaped backslash.
final public static C backslash = new CharacterClass("\\\\");
// Classes created from a property name. The names correspond to Unicode
// general categories as accepted by java.util.regex \p{...};
// presumably Predefined renders them that way - the meaning of the
// boolean argument is not visible here, TODO confirm.
final public static C Lu = new Predefined("Lu", true);
final public static C Ll = new Predefined("Ll", true);
final public static C Lt = new Predefined("Lt", true);
final public static C Lm = new Predefined("Lm", true);
final public static C Lo = new Predefined("Lo", true);
final public static C Nd = new Predefined("Nd", true);
final public static C Nl = new Predefined("Nl", true);
final public static C No = new Predefined("No", true);
final public static C Z = new Predefined("Z", true);
final public static C P = new Predefined("P", true);
final public static C S = new Predefined("S", true);
final public static C Cc = new Predefined("Cc", true);
final public static C Cf = new Predefined("Cf", true);
final public static C Cn = new Predefined("Cn", true);
// Names corresponding to the POSIX character classes of java.util.regex.
final public static C Lower = new Predefined("Lower", true);
final public static C Upper = new Predefined("Upper", true);
final public static C ASCII = new Predefined("ASCII", true);
final public static C Alpha = new Predefined("Alpha", true);
final public static C Digit = new Predefined("Digit", true);
final public static C Alnum = new Predefined("Alnum", true);
final public static C Punct = new Predefined("Punct", true);
final public static C Graph = new Predefined("Graph", true);
final public static C Print = new Predefined("Print", true);
final public static C Blank = new Predefined("Blank", true);
final public static C Cntrl = new Predefined("Cntrl", true);
final public static C XDigit = new Predefined("XDigit", true);
final public static C Space = new Predefined("Space", true);
// NOTE(review): \w is the java.util.regex "word character" class (letters,
// digits and underscore), so "letter" and "word" are broader than the
// names suggest.
final public static C letter = new Special("\\w");
final public static C dollar = new Special("\\$");
final public static C euro = new Special("€");
final public static Q word = some(letter);
final public static C digit = new Special("\\d");
final public static C nonDigit = digit.not();
// NOTE(review): \b is a word boundary in java.util.regex, the same
// sequence as wordBoundary further down - confirm the intent of "lineEnd".
final public static C lineEnd = new Special("\\b");
final public static C dot = new Special("\\.");
final public static C comma = new Special(",");
final public static C semicolon = new Special(";");
final public static C colon = new Special(":");
// \R is the java.util.regex linebreak matcher; cr, lf, ff and tab hold the
// actual control characters, alarm and escape hold the \a and \e escapes.
final public static C nl = new Special("\\R");
final public static C cr = new Special("\r");
final public static C lf = new Special("\n");
final public static C ff = new Special("\f");
final public static C alarm = new Special("\\a");
final public static C escape = new Special("\\e");
// End of input ($) and "line break or end of input".
final public static RE eof = new Boundary("$");
final public static RE eol = or(nl, eof);
final public static C parOpen = new Special("\\(");
final public static C parClose = new Special("\\)");
// The empty expression; returned by the factory methods when they are
// called without members.
final public static RE empty = new REImpl("");
final public static C tab = new Special("\t");
// One or more digits.
final public static RE number = some(digit);
// NOTE(review): the next three fields are not final, unlike every other
// constant here, and hexdigit only contains the upper case letters A-F.
public static C hexdigit = cc("0-9A-F");
public static C bindigit = cc("0-1");
public static RE hexnumber = some(hexdigit);
final public static C minus = new CharacterClass("-");
final public static C dquote = new CharacterClass("\"");
final public static C squote = new CharacterClass("'");
final public static C backQuote = new CharacterClass("`");
final public static C underscore = new CharacterClass("_");
// One or more of Alpha, Digit, underscore or minus.
final public static Q qualifier = some(or(Alpha, Digit, underscore, minus));
// A number, optionally followed by ".number", optionally followed by
// another ".number", optionally followed by ".qualifier".
final public static RE version = //
g(number, opt(g(dot, number, opt(g(dot, number, opt(g(dot, qualifier)))))));
// Names corresponding to the java.lang.Character based classes of
// java.util.regex (\p{javaLowerCase} etc.).
final public static C javaLowerCase = new Predefined("javaLowerCase", true);
final public static C javaUpperCase = new Predefined("javaUpperCase", true);
final public static C javaWhitespace = new Predefined("javaWhitespace", true);
final public static C javaMirrored = new Predefined("javaMirrored", true);
final public static C javaJavaIdentifierStart = new Predefined("javaJavaIdentifierStart", true);
final public static C javaJavaIdentifierPart = new Predefined("javaJavaIdentifierPart", true);
// An identifier start followed by zero or more identifier parts; used by
// isValidGroupName to validate group names.
final public static RE javaId = seq(javaJavaIdentifierStart, set(javaJavaIdentifierPart));
// A javaId followed by zero or more ".javaId".
final public static RE fullyQualifiedName = seq(javaId, set(dot, javaId));
// Boundary matchers.
final public static RE startOfLine = new Boundary("^");
final public static RE endOfLine = new Boundary("$");
final public static RE wordBoundary = new Boundary("\\b");
final public static RE beginInput = new Boundary("\\A");
final public static RE endOfPreviousMatch = new Boundary("\\G");
final public static RE endOfInputForFinal = new Boundary("\\Z");
final public static RE endOfInput = new Boundary("\\z");
// Names corresponding to a Unicode script (IsLatin), block (InGreek),
// binary property (isAlphabetic) and the currency symbol category (Sc).
final public static C isLatin = new Predefined("IsLatin", true);
final public static C inGreek = new Predefined("InGreek", true);
final public static C isAlphabetic = new Predefined("isAlphabetic", true);
final public static C sc = new Predefined("Sc", true);
static class REImpl implements RE {
/** The regular expression source; returned as is by {@code toString()}. */
final String literal;
/**
 * The group names handed to the constructor, in the order given, or
 * {@code null} when no names were supplied.
 */
final Set<String> groups;
/** The pattern compiled most recently by {@code pattern(...)}, if any. */
volatile Pattern pattern;
/**
 * Creates an RE from its source text, without any group names.
 *
 * @param literal the regular expression source
 */
REImpl(String literal) {
    this.groups = null;
    this.literal = literal;
}
/**
 * Creates an RE from its source text together with the names of its
 * groups. The names are kept in the order given; when no names are passed
 * {@code groups} is {@code null}.
 *
 * @param literal the regular expression source
 * @param names the group names
 */
REImpl(String literal, String... names) {
    this.literal = literal;
    if (names.length > 0)
        this.groups = new LinkedHashSet<>(Arrays.asList(names));
    else
        this.groups = null;
}
/**
 * Negates this RE by wrapping its source in a negative lookahead,
 * {@code (?!...)}. The result carries no group names.
 *
 * @return the negated RE
 */
@Override
public RE not() {
    String negativeLookahead = "(?!" + literal + ")";
    return new REImpl(negativeLookahead);
}
/**
 * @return the regular expression source held in {@code literal}
 */
@Override
public String toString() {
    return this.literal;
}
/**
 * Returns the compiled pattern for this RE with the given flags. The
 * pattern compiled last is remembered in the {@code pattern} field and
 * reused as long as its flags equal the requested ones; otherwise the
 * source ({@code toString()}) is compiled again.
 *
 * @param type the flags to compile with; their {@code option} values are
 *            or-ed together
 * @return the compiled pattern
 */
@Override
public Pattern pattern(RE.F.Flag... type) {
    int options = 0;
    for (int i = 0; i < type.length; i++)
        options |= type[i].option;

    Pattern compiled = pattern;
    boolean stale = compiled == null || compiled.flags() != options;
    if (stale)
        compiled = Pattern.compile(toString(), options);
    pattern = compiled;
    return compiled;
}
/**
 * Tests whether the whole string matches this RE.
 *
 * @param string the input to test
 * @return {@code true} when {@link Matcher#matches()} succeeds
 */
@Override
public boolean isMatch(String string) {
    Matcher m = pattern().matcher(string);
    return m.matches();
}
/**
 * Matches the whole string against this RE ({@link Matcher#matches()}).
 *
 * @param string the input, may be {@code null}
 * @return the match, or an empty Optional when the input is {@code null}
 *         or does not match
 */
@Override
public Optional matches(String string) {
    if (string != null)
        return matches0(string, Matcher::matches);
    return Optional.empty();
}
/**
 * Creates a matcher for the string and applies the given matching
 * operation to it (e.g. {@code Matcher::matches},
 * {@code Matcher::lookingAt}, {@code Matcher::find}).
 *
 * @param string the input, may be {@code null}
 * @param m the matching operation to run on the matcher
 * @return the match, or an empty Optional when the input is {@code null}
 *         or the operation reports no match
 */
Optional<Match> matches0(String string, Predicate<Matcher> m) {
    if (string == null)
        return Optional.empty();
    Matcher matcher = pattern().matcher(string);
    return matches1(string, m, matcher);
}
/**
 * Runs the matching operation on the matcher and, when it succeeds, wraps
 * the matcher in a {@link Match} that gives access to the matched text
 * and to the groups.
 *
 * @param string the input the matcher was created for
 * @param m the matching operation to run on the matcher
 * @param matcher the matcher to run the operation on
 * @return the match, or an empty Optional when the operation reports no
 *         match
 */
private Optional<Match> matches1(String string, Predicate<Matcher> m, Matcher matcher) {
    if (!m.test(matcher))
        return Optional.empty();

    // Shared CharSequence-like behavior, all based on value().
    abstract class Base {
        public int length() {
            return value().length();
        }

        public char charAt(int index) {
            return value().charAt(index);
        }

        public CharSequence subSequence(int start, int end) {
            return value().subSequence(start, end);
        }

        public Matcher getMatcher() {
            return matcher;
        }

        public abstract String value();

        @Override
        public String toString() {
            return value();
        }
    }
    // A group that is addressed by its name.
    class MatchGroupImpl extends Base implements MatchGroup {
        final String name;
        String value;
        int start = -1;
        int end = -1;

        MatchGroupImpl(String name, String value) {
            this.name = name;
            this.value = value;
        }

        @Override
        public String name() {
            return name;
        }

        @Override
        public String value() {
            // use the value captured at construction; only ask the matcher
            // when none was supplied
            if (value == null)
                value = matcher.group(name);
            return value;
        }

        @Override
        public int start() {
            return start < 0 ? start = matcher.start(name) : start;
        }

        @Override
        public int end() {
            // must be the END offset of the group, not its start
            return end < 0 ? end = matcher.end(name) : end;
        }
    }
    // A group that is addressed by its number.
    class MatchGroupImplIndex extends Base implements MatchGroup {
        final int name;
        String value;
        int start = -1;
        int end = -1;

        MatchGroupImplIndex(int name, String value) {
            this.name = name;
            this.value = value;
        }

        @Override
        public String name() {
            return Integer.toString(name);
        }

        @Override
        public String value() {
            // use the value captured at construction; only ask the matcher
            // when none was supplied
            if (value == null)
                value = matcher.group(name);
            return value;
        }

        @Override
        public int start() {
            return start < 0 ? start = matcher.start(name) : start;
        }

        @Override
        public int end() {
            return end < 0 ? end = matcher.end(name) : end;
        }
    }
    // The match as a whole; the group maps are built on first use.
    class MatchImpl extends Base implements Match {
        Map<String, MatchGroup> matchGroups;
        Map<String, String> matchValues;
        // position in this match up to where tryMatch has consumed input
        int rover;

        @Override
        public String name() {
            return "";
        }

        @Override
        public String value() {
            return matcher.group();
        }

        @Override
        public int start() {
            return matcher.start();
        }

        @Override
        public int end() {
            return matcher.end();
        }

        @Override
        public Map<String, MatchGroup> getGroups() {
            if (matchGroups == null) {
                if (groups == null)
                    matchGroups = Collections.emptyMap();
                else {
                    Map<String, MatchGroup> result = new TreeMap<>();
                    for (String name : groups) {
                        String value = matcher.group(name);
                        // groups that did not take part in the match are left out
                        if (value != null) {
                            MatchGroupImpl mg = new MatchGroupImpl(name, value);
                            result.put(name, mg);
                        }
                    }
                    matchGroups = Collections.unmodifiableMap(result);
                }
            }
            return matchGroups;
        }

        @Override
        public Map<String, String> getGroupValues() {
            if (matchValues == null) {
                Map<String, String> result = new LinkedHashMap<>();
                getGroups().forEach((k, v) -> result.put(k, v.value()));
                matchValues = Collections.unmodifiableMap(result);
            }
            return matchValues;
        }

        @Override
        public Optional<MatchGroup> group(String name) {
            if (groups == null)
                throw new IllegalArgumentException("no groups defined");
            if (!groups.contains(name)) {
                throw new IllegalArgumentException("no group name defined: " + name + " in " + groups);
            }
            String value = matcher.group(name);
            if (value == null) {
                // the group exists but did not take part in the match
                return Optional.empty();
            }
            return Optional.of(new MatchGroupImpl(name, value));
        }

        @Override
        public String tryMatch(RE expected) {
            Matcher sub = expected.getMatcher(this);
            sub.region(rover, length());
            if (sub.lookingAt()) {
                rover = sub.end();
                return sub.group();
            } else
                return null;
        }

        @Override
        public Optional<MatchGroup> group(int group) {
            if (matcher.groupCount() < group)
                return Optional.empty();
            String value = matcher.group(group);
            if (value == null)
                return Optional.empty();
            return Optional.of(new MatchGroupImplIndex(group, value));
        }

        @Override
        public String presentGroup(String groupName) {
            String group = matcher.group(groupName);
            if (group == null)
                throw new IllegalArgumentException("no such group " + groupName);
            return group;
        }
    }
    return Optional.of(new MatchImpl());
}
/**
 * Matches the given string using {@link Matcher#lookingAt()}.
 *
 * @param string the input; {@code matches0} returns empty for {@code null}
 * @return the match, or empty when the operation fails
 */
@Override
public Optional lookingAt(String string) {
    Predicate<Matcher> atStart = Matcher::lookingAt;
    return matches0(string, atStart);
}
/**
 * Builds a predicate that creates a matcher for its argument and hands that
 * matcher to {@code subPredicate}. The pattern is obtained once, when this
 * method is called.
 *
 * @param subPredicate the matcher operation that decides the outcome
 * @return a predicate over input values
 */
private Predicate predicate(Predicate subPredicate) {
    final Pattern compiled = pattern();
    return input -> {
        Matcher perInput = compiled.matcher(input);
        return subPredicate.test(perInput);
    };
}
/**
 * Searches the given string using {@link Matcher#find()}.
 *
 * @param string the input; {@code matches0} returns empty for {@code null}
 * @return the first match found, or empty when there is none
 */
@Override
public Optional findIn(String string) {
    Predicate<Matcher> anywhere = Matcher::find;
    return matches0(string, anywhere);
}
/**
 * Returns the stream that {@code stream(String)} produces for the given
 * input.
 *
 * @param string the input to search
 * @return the stream of matches
 */
@Override
public Stream findAllIn(String string) {
return stream(string);
}
/**
 * @return a predicate that applies {@link Matcher#matches()} to a matcher
 *         created for its argument
 */
@Override
public Predicate asMatchPredicate() {
    Predicate<Matcher> wholeInput = Matcher::matches;
    return predicate(wholeInput);
}
/**
 * @return a predicate that applies {@link Matcher#find()} to a matcher
 *         created for its argument
 */
@Override
public Predicate asFindPredicate() {
    Predicate<Matcher> anywhere = Matcher::find;
    return predicate(anywhere);
}
/**
 * @return a predicate that applies {@link Matcher#lookingAt()} to a matcher
 *         created for its argument
 */
@Override
public Predicate asLookingAtPredicate() {
    Predicate<Matcher> atStart = Matcher::lookingAt;
    return predicate(atStart);
}
/**
 * Wraps the match spliterator for the given input in a sequential stream.
 * The spliterator is requested with the ORDERED and NONNULL characteristics.
 *
 * @param string the input to search
 * @return a non-parallel stream of matches
 */
Stream stream(String string) {
    int characteristics = Spliterator.ORDERED | Spliterator.NONNULL;
    return StreamSupport.stream(spliterator(string, characteristics), false);
}
/**
 * Adapts the match iterator for the given input to a spliterator of unknown
 * size.
 *
 * @param string the input to search
 * @param options the spliterator characteristics to report
 * @return the spliterator over the matches
 */
Spliterator spliterator(String string, int options) {
    return Spliterators.spliteratorUnknownSize(iterator(string), options);
}
/**
 * Iterates over the successive {@code find()} results in the given input.
 * <p>
 * All returned matches share one matcher, so a match only describes the
 * current position until the iterator is advanced again.
 * <p>
 * {@code hasNext()} is idempotent: the matcher is advanced at most once per
 * element, and once the input is exhausted it is never asked again (a
 * {@code find()} after a failed {@code find()} would restart from the
 * beginning of the input).
 *
 * @param string the input to search
 * @return an iterator over the matches, in input order
 */
Iterator iterator(String string) {
    Matcher matcher = pattern().matcher(string);
    return new Iterator<Match>() {
        // Outcome of the pending find(); null means the matcher has to be
        // advanced before the next element is known.
        Optional<Match> match;

        @Override
        public boolean hasNext() {
            if (match == null) {
                match = matches1(string, Matcher::find, matcher);
            }
            return match.isPresent();
        }

        @Override
        public Match next() {
            if (!hasNext()) {
                throw new java.util.NoSuchElementException();
            }
            Match current = match.get();
            // Consumed: the following hasNext() must advance the matcher.
            match = null;
            return current;
        }
    };
}
/**
 * Creates a new matcher of this expression's pattern for the given input.
 *
 * @param string the character sequence to match against
 * @return a fresh matcher
 */
@Override
public Matcher getMatcher(CharSequence string) {
    Pattern compiled = pattern();
    return compiled.matcher(string);
}
/**
 * @return {@code true} when the literal is exactly one character long
 */
@Override
public boolean isSingle() {
    int size = literal.length();
    return size == 1;
}
/**
 * This implementation never merges: it returns an empty Optional for every
 * argument.
 *
 * @param re the expression that follows this one
 * @return always empty
 */
@Override
public Optional merge(RE re) {
return Optional.empty();
}
/**
 * @return a new, independent {@link HashSet} holding the group names; empty
 *         when no groups are recorded
 */
@Override
public Set getGroupNames() {
    if (groups == null) {
        return new HashSet<>();
    }
    return new HashSet<>(groups);
}
/**
 * Copies {@code string} to {@code sb}, substituting every match with the
 * text supplied by {@code replacement}.
 *
 * @param sb receives the result
 * @param string the input to scan
 * @param replacement computes the substitute for a match; a {@code null}
 *            result appends nothing for that match
 */
@Override
public void append(StringBuilder sb, String string, Function replacement) {
    int copiedUpTo = 0;
    Iterator<Match> matches = iterator(string);
    while (matches.hasNext()) {
        Match match = matches.next();
        int from = copiedUpTo;
        copiedUpTo = match.end();
        // text between the previous match and this one
        sb.append(string.subSequence(from, match.start()));
        String r = replacement.apply(match);
        if (r != null) {
            sb.append(r);
        }
    }
    // text after the last match
    sb.append(string.substring(copiedUpTo));
}
}
/**
 * A group expression. Its textual form is the type's prefix, the group name
 * followed by {@code >} for named groups, the content, and the type's
 * suffix.
 */
static class Group extends REImpl implements G {
    final Type   type;
    final String name;

    /** Unnamed group of the given type around the given expressions. */
    Group(Type type, RE... res) {
        this(null, toGroupedString(false, res), type, names(res));
    }

    /** Unnamed group of the given type around already rendered content. */
    Group(Type type, String literal) {
        this(null, literal, type);
    }

    /**
     * Named group around the given expressions; a {@code null} name makes it
     * a non-capturing group.
     */
    Group(String name, RE... res) {
        this(name, toGroupedString(false, res), name == null ? Group.Type.NONCAPTURING : Group.Type.NAMED,
            names(res));
    }

    private Group(String name, String literal, Type type, String... names) {
        super(literal, name == null ? names : combine(name, names));
        this.type = type;
        this.name = name;
    }

    /**
     * Flips look-ahead/look-behind groups and NOT/non-capturing groups to
     * their counterpart, keeping the content. Every other type is wrapped in
     * a new NOT group.
     */
    @Override
    public Group not() {
        final Type flipped;
        switch (this.type) {
            case AHEAD :
                flipped = Type.NOT_AHEAD;
                break;
            case BEHIND :
                flipped = Type.NOT_BEHIND;
                break;
            case NOT_AHEAD :
                flipped = Type.AHEAD;
                break;
            case NOT_BEHIND :
                flipped = Type.BEHIND;
                break;
            case NOT :
                flipped = Type.NONCAPTURING;
                break;
            case NONCAPTURING :
                flipped = Type.NOT;
                break;
            default :
                return new Group(Type.NOT, this);
        }
        if (flipped == this.type) {
            return this;
        }
        return new Group(null, literal, flipped);
    }

    @Override
    public String toString() {
        StringBuilder out = new StringBuilder().append(type.prefix);
        if (Type.NAMED == type) {
            out.append(name)
                .append(">");
        }
        return out.append(literal)
            .append(type.suffix)
            .toString();
    }

    /** A group always counts as a single unit. */
    @Override
    public boolean isSingle() {
        return true;
    }

    @Override
    public Type groupType() {
        return type;
    }
}
/**
 * A character class rendered as {@code \p{name}}, or {@code \P{name}} when
 * negated.
 */
static class Predefined extends CharacterClass {
    Predefined(String literal, boolean positive) {
        super(literal, positive, null);
    }

    /** Creates the positive form. */
    Predefined(String literal) {
        this(literal, true);
    }

    /** Every literal is treated as a single unit. */
    @Override
    public boolean isSingle(String literal) {
        return true;
    }

    /** Inside a set the class is written exactly as outside of one. */
    @Override
    public String asSetContent() {
        return toString();
    }

    @Override
    public Predefined not() {
        return new Predefined(literal, !positive);
    }

    @Override
    public String toString() {
        String opener = positive ? "\\p{" : "\\P{";
        return opener + literal + "}";
    }
}
/**
 * A shorthand character class such as {@code \s}. The negated form of a
 * known shorthand is its opposite-case counterpart; any other literal is
 * negated as {@code [^literal]}.
 */
static class Special extends CharacterClass {
    Special(String literal, boolean positive) {
        super(literal, positive, null);
    }

    /** Creates the positive form. */
    Special(String literal) {
        this(literal, true);
    }

    /** Always a single unit, also when negated. */
    @Override
    public boolean isSingle() {
        return true;
    }

    @Override
    public Special not() {
        return new Special(literal, !positive);
    }

    /** Inside a set the class is written exactly as outside of one. */
    @Override
    public String asSetContent() {
        return toString();
    }

    @Override
    public String toString() {
        if (positive) {
            return literal;
        }
        switch (literal) {
            case "\\s" :
                return "\\S";
            case "\\S" :
                return "\\s";
            case "\\w" :
                return "\\W";
            case "\\W" :
                return "\\w";
            case "\\d" :
                return "\\D";
            case "\\D" :
                return "\\d";
            case "\\h" :
                return "\\H";
            case "\\H" :
                return "\\h";
            case "\\v" :
                return "\\V";
            case "\\V" :
                return "\\v";
            default :
                return "[^" + literal + "]";
        }
    }
}
/**
 * An expression with a repetition count and a quantifier type (greedy,
 * reluctant or possessive).
 */
static class Quantified extends REImpl implements Q {
    final Type type;
    final int  minimum;
    final int  maximum;

    /**
     * Quantifies the given expressions; they are rendered through
     * {@code toGroupedString(true, res)} so that the quantifier applies to
     * all of them.
     */
    Quantified(int minimum, int maximum, Type type, RE... res) {
        this(toGroupedString(true, res), minimum, maximum, type, names(res));
    }

    /**
     * @param grouped the already grouped content to repeat
     * @param minimum the minimum number of repetitions, not negative
     * @param maximum the maximum number of repetitions, positive;
     *            {@link Integer#MAX_VALUE} is rendered as "unbounded"
     * @param type the quantifier type
     * @param names the group names contained in {@code grouped}
     */
    public Quantified(String grouped, int minimum, int maximum, Type type, String... names) {
        super(grouped, names);
        assert minimum >= 0;
        assert maximum > 0;
        this.minimum = minimum;
        this.maximum = maximum;
        this.type = type;
    }

    /**
     * Renders the content followed by the shortest quantifier notation
     * ({@code ?}, {@code *}, {@code +}, {@code {n}}, {@code {n,}} or
     * {@code {n,m}}) and the type suffix. A greedy exactly-once quantifier
     * renders as the bare content.
     */
    @Override
    public String toString() {
        if (minimum == 1 && maximum == 1 && type == Type.greedy)
            return literal;
        StringBuilder sb = new StringBuilder();
        sb.append(literal);
        if (minimum == 0 && maximum == 1) {
            sb.append("?");
        } else if (minimum == 0 && maximum == Integer.MAX_VALUE) {
            sb.append("*");
        } else if (minimum == 1 && maximum == Integer.MAX_VALUE) {
            sb.append("+");
        } else if (minimum == maximum) {
            sb.append("{")
                .append(minimum)
                .append("}");
        } else if (maximum == Integer.MAX_VALUE) {
            sb.append("{")
                .append(minimum)
                .append(",")
                .append("}");
        } else {
            sb.append("{")
                .append(minimum)
                .append(",")
                .append(maximum)
                .append("}");
        }
        switch (type) {
            case greedy :
                break;
            case possesive :
                sb.append("+");
                break;
            case reluctant :
                sb.append("?");
                break;
        }
        return sb.toString();
    }

    /** @return the same repetition as a reluctant quantifier */
    @Override
    public RE reluctant() {
        return new Quantified(literal, minimum, maximum, Type.reluctant);
    }

    /** @return the same repetition as a greedy quantifier */
    @Override
    public RE greedy() {
        // Must be Type.greedy; using Type.reluctant here made greedy()
        // indistinguishable from reluctant().
        return new Quantified(literal, minimum, maximum, Type.greedy);
    }

    /** @return the same repetition as a possessive quantifier */
    @Override
    public RE possesive() {
        return new Quantified(literal, minimum, maximum, Type.possesive);
    }
}
/**
 * A boundary matcher identified by its literal text.
 */
static class Boundary extends REImpl {
    Boundary(String literal) {
        super(literal);
    }

    /**
     * Looks up the result by literal: {@code ^} gives
     * {@code Catalog.startOfLine}, {@code $} gives {@code Catalog.endOfLine},
     * {@code \b} gives a new {@code \B} boundary, {@code \A} gives
     * {@code Catalog.endOfInput} and {@code \z} gives
     * {@code Catalog.beginInput}. Any other literal is handled by the
     * superclass.
     */
    @Override
    public RE not() {
        switch (literal) {
            case "^" :
                return Catalog.startOfLine;
            case "$" :
                return Catalog.endOfLine;
            case "\\b" :
                return new Boundary("\\B");
            case "\\A" :
                return Catalog.endOfInput;
            case "\\z" :
                return Catalog.beginInput;
            default :
                return super.not();
        }
    }
}
/**
 * An inline flag expression: {@code (?flags)}, {@code (?flags-flags)} or,
 * when it wraps content, {@code (?flags:content)}. Flags that are both
 * switched on and switched off cancel each other out.
 */
static class Option extends REImpl implements F {
    private static final EnumSet NONE_OF = EnumSet.noneOf(Flag.class);
    final EnumSet                positive;
    final EnumSet                negative;

    /** Applies the flags to the given expressions. */
    Option(EnumSet positive, EnumSet negative, RE... res) {
        this(toGroupedString(false, res), positive, negative, names(res));
    }

    /**
     * @param ungrouped the content the flags apply to, may be empty
     * @param p the flags to switch on, {@code null} for none
     * @param n the flags to switch off, {@code null} for none
     * @param names the group names contained in {@code ungrouped}
     */
    Option(String ungrouped, EnumSet p, EnumSet n, String... names) {
        super(ungrouped, names);
        this.positive = p == null ? NONE_OF : p;
        this.negative = n == null ? NONE_OF : n;
    }

    @Override
    public String toString() {
        // A flag present on both sides has no net effect; drop it from both.
        EnumSet p = EnumSet.copyOf(positive);
        p.removeAll(negative);
        EnumSet n = EnumSet.copyOf(negative);
        n.removeAll(positive);
        if (p.isEmpty() && n.isEmpty())
            return super.toString();
        StringBuilder sb = new StringBuilder();
        sb.append("(?");
        for (Flag f : p) {
            sb.append(f.flag);
        }
        if (!n.isEmpty()) {
            sb.append("-");
            for (Flag f : n) {
                sb.append(f.flag);
            }
        }
        if (!literal.isEmpty()) {
            sb.append(":");
            sb.append(literal);
        }
        sb.append(")");
        return sb.toString();
    }

    /**
     * Combines two adjacent flag-only options into one.
     * <p>
     * Options that wrap content are not merged: the result only carries this
     * option's content, so merging would discard the other option's content
     * or apply its flags to content they were not written for.
     *
     * @param re the expression that follows this one
     * @return the combined option, or empty when {@code re} is not an option
     *         or either side wraps content
     */
    @Override
    public Optional merge(RE re) {
        if (re instanceof Option op && literal.isEmpty() && op.literal.isEmpty()) {
            EnumSet p = EnumSet.copyOf(positive);
            EnumSet n = EnumSet.copyOf(negative);
            p.addAll(op.positive);
            n.addAll(op.negative);
            return Optional.of(new Option(literal, p, n));
        } else
            return Optional.empty();
    }

    /** @return a copy of the flags that are switched on */
    @Override
    public Set positive() {
        return EnumSet.copyOf(positive);
    }

    /** @return a copy of the flags that are switched off */
    @Override
    public Set negative() {
        return EnumSet.copyOf(negative);
    }
}
/**
 * A character class: set content, a positive/negated marker and an optional
 * list of classes it is intersected with ({@code &&}).
 */
static class CharacterClass extends REImpl implements C {
    final boolean          positive;
    final CharacterClass[] conjunction;

    /**
     * @param literal the set content
     * @param positive {@code false} for a negated class
     * @param conjunction the intersected classes, {@code null} for none
     */
    CharacterClass(String literal, boolean positive, CharacterClass[] conjunction) {
        super(literal);
        this.positive = positive;
        if (conjunction == null) {
            this.conjunction = new CharacterClass[0];
        } else {
            this.conjunction = conjunction;
        }
    }

    /** Creates a positive class without intersections. */
    public CharacterClass(String string) {
        this(string, true, new CharacterClass[0]);
    }

    @Override
    public CharacterClass not() {
        return new CharacterClass(literal, !positive, conjunction);
    }

    /** Character classes are never merged with a neighbour. */
    @Override
    public Optional merge(RE other) {
        return Optional.empty();
    }

    /**
     * Renders a positive single-unit class as its bare content; everything
     * else is written in brackets, with {@code ^} when negated and each
     * intersected class prefixed by {@code &&}.
     * <p>
     * https://www.regular-expressions.info/charclassintersect.html
     */
    @Override
    public String toString() {
        if (isSingle() && positive) {
            return asSetContent();
        }
        StringBuilder out = new StringBuilder("[");
        if (!positive) {
            out.append("^");
        }
        out.append(asSetContent());
        for (CharacterClass part : conjunction) {
            out.append("&&")
                .append(part.isSingle() ? part.asSetContent() : part.toString());
        }
        return out.append("]")
            .toString();
    }

    @Override
    public String asSetContent() {
        return literal;
    }

    /**
     * @return {@code true} for a one-character literal or a two-character
     *         literal that starts with a backslash
     */
    boolean isSingle(String literal) {
        int size = literal.length();
        if (size == 1) {
            return true;
        }
        return size == 2 && literal.charAt(0) == '\\';
    }

    @Override
    public boolean isSingle() {
        return positive && isSingle(literal);
    }

    /** Returns a class that additionally intersects with {@code and}. */
    @Override
    public C and(C and) {
        int existing = conjunction.length;
        CharacterClass[] extended = new CharacterClass[existing + 1];
        System.arraycopy(conjunction, 0, extended, 0, existing);
        extended[existing] = (CharacterClass) and;
        return new CharacterClass(asSetContent(), positive, extended);
    }

    /** Returns a class whose content is this content followed by that of {@code or}. */
    @Override
    public C or(C or) {
        String union = asSetContent() + or.asSetContent();
        return new CharacterClass(union, positive, conjunction);
    }
}
/**
 * Collapses neighbouring expressions that can be merged. Each expression is
 * offered to the (possibly already merged) expression before it; when that
 * one returns a merged result the pair is replaced by it.
 *
 * @param res the expressions in order
 * @return {@code res} itself when it has fewer than two elements, otherwise
 *         a new array with the merged sequence
 */
static RE[] merge(RE[] res) {
    if (res.length < 2) {
        return res;
    }
    List<RE> kept = new ArrayList<>();
    RE pending = res[0];
    for (int i = 1; i < res.length; i++) {
        RE next = res[i];
        if (pending == null) {
            pending = next;
            continue;
        }
        RE combined = pending.merge(next)
            .orElse(null);
        if (combined == null) {
            kept.add(pending);
            pending = next;
        } else {
            pending = combined;
        }
    }
    kept.add(pending);
    return kept.toArray(RE[]::new);
}
/**
 * Collects the group names of all given expressions, in argument order.
 * Expressions that report {@code null} for their group names are skipped.
 *
 * @param res the expressions to inspect
 * @return the collected names, possibly empty
 */
static String[] names(RE... res) {
    List<String> collected = new ArrayList<>();
    for (RE re : res) {
        Set<String> groupNames = re.getGroupNames();
        if (groupNames != null) {
            collected.addAll(groupNames);
        }
    }
    return collected.toArray(new String[0]);
}
/**
 * Returns a new array holding all of {@code names} followed by {@code name}.
 *
 * @param name the element to place last
 * @param names the leading elements; {@code null} is treated as empty
 * @return a new array, one element longer than {@code names}
 */
static String[] combine(String name, String... names) {
    int count = names == null ? 0 : names.length;
    String[] result = new String[count + 1];
    if (count > 0) {
        System.arraycopy(names, 0, result, 0, count);
    }
    result[count] = name;
    return result;
}
/**
 * Renders a sequence of expressions as one string after merging
 * neighbours.
 * <p>
 * No expressions give the empty string. One expression that reports itself
 * as single is rendered as is. Anything else is the concatenation of all
 * expressions, wrapped in {@code (?:...)} when {@code force} is set.
 *
 * @param force whether a multi-part result is wrapped in a non-capturing
 *            group
 * @param res the expressions to render
 * @return the rendered text
 */
static String toGroupedString(boolean force, RE... res) {
    RE[] parts = merge(res);
    if (parts.length == 0) {
        return "";
    }
    if (parts.length == 1 && parts[0].isSingle()) {
        return parts[0].toString();
    }
    StringBuilder out = new StringBuilder(force ? "(?:" : "");
    for (RE part : parts) {
        out.append(part);
    }
    if (force) {
        out.append(")");
    }
    return out.toString();
}
/**
 * Tells whether an expression is one of the known whitespace expressions:
 * the {@code ws}, {@code setWs} or {@code someWs} constants, or any
 * expression whose text equals one of a fixed list of whitespace patterns.
 *
 * @param re the expression to test
 * @return {@code true} for a recognised whitespace expression
 */
static boolean isWhiteSpace(RE re) {
    if (re == ws || re == setWs || re == someWs) {
        return true;
    }
    switch (re.toString()) {
        case "\\s" :
        case " " :
        case "\t" :
        case "\\s*" :
        case "\\s+" :
        case "(\\s)*" :
        case "(\\s)+" :
        case " *" :
        case " +" :
        case "( )*" :
        case "( )+" :
            return true;
        default :
            return false;
    }
}
/**
 * Creates an expression by passing the given text to the {@code REImpl}
 * constructor.
 *
 * @param regex the expression text; presumably used verbatim as the pattern
 *            source (confirm against {@code REImpl})
 * @return the new expression
 */
public static RE re(String regex) {
return new REImpl(regex);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy