info.codesaway.util.regex.RefactorUtility Maven / Gradle / Ivy
Show all versions of regexplus Show documentation
package info.codesaway.util.regex;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
* Contains utility functions / fields that are used when refactoring the
* inputted regular expression. These functions / fields are only meant to
* be called from the {@link Refactor} class.
*/
class RefactorUtility
{
/** String pattern to match a group name (excluding occurrence). */
// private static final String groupName = "\\w++";
private static final String groupName = "(?:\\p{javaJavaIdentifierStart}|\\d)\\p{javaJavaIdentifierPart}*+";
/**
* String pattern to match an optional group name (excluding
* occurrence).
*/
// private static final String optGroupName = "\\w*+";
private static final String optGroupName = "(?:(?:\\p{javaJavaIdentifierStart}|\\d)\\p{javaJavaIdentifierPart}*+)?+";
/**
* String pattern to match a group name (including occurrence)
*
* (2 groups)
*
* - Group index wrapped in '[]' or group name (can be empty string)
* - Group occurrence or group index if first group is empty (wrapped in '[]')
*
*/
static final java.util.regex.Pattern fullGroupName = java.util.regex.Pattern
.compile("(\\[-?\\d++]|" + optGroupName + ")(?:\\[(-?\\d++)])?");
// private static String[] parseGroup(String group)
// {
// String groupName;
// String occurrence;
//
// int bracketIndex = group.indexOf('[');
//
// if (bracketIndex != -1)
// {
// // Has bracket
// // [1], [-1], [1][1], groupName[occurrence]
//
// int lastBracketIndex = group.lastIndexOf('[');
//
// if (bracketIndex == lastBracketIndex)
// {
// // Has only one bracket
// // [1], [-1], groupName[occurrence]
//
// if (bracketIndex == 0)
// {
// // Starts with bracket
// // [1], [-1]
//
// if (!group.endsWith("]"))
// throw noNamedGroup(group);
//
// groupName = group;
// occurrence = null;
// }
// else
// {
// // Has bracket, not a beginning
// // groupName[occurrence]
//
// if (!group.endsWith("]"))
// throw noNamedGroup(group);
//
// groupName = group.substring(0, bracketIndex);
//
// // Get part between brackets
// occurrence = group.substring(bracketIndex + 1, group.length() - 1);
// }
// }
// else
// {
// // Has two brackets
// // [1][1], [1][-1]
//
// if (bracketIndex != 0)
// throw noNamedGroup(group);
//
// int closeBracket = group.indexOf(']');
//
// if (closeBracket != lastBracketIndex - 1)
// throw noNamedGroup(group);
//
// if (!group.endsWith("]"))
// throw noNamedGroup(group);
//
// groupName = group.substring(0, lastBracketIndex);
//
// if (groupName.indexOf('[', 1) != -1)
// {
// // Has multiple opening brackets
// // [[1]
// throw noNamedGroup(group);
// }
//
// occurrence = group.substring(lastBracketIndex + 1, group.length() - 1);
// }
// }
// else
// {
// // Has no bracket, just a group name
// groupName = group;
// occurrence = null;
// }
//
// return new String[] { groupName, occurrence };
// }
// static final java.util.regex.Pattern fullGroupName = java.util.regex.Pattern
// .compile("(\\[-?\\d++\\]|-?\\d++|" + optGroupName + ")(?:\\[(-?\\d++)])?");
/**
* String pattern to match an "any group"
*
* (e.g. groupName, groupName[0])
*
* (1 group)
*/
private static final String fullGroupName0 = "(\\[-?\\d++]|" +
groupName + ")(?:\\[0++])?";
/**
* Formats an integer character code (0 through 255) as
* \x
hh.
*/
static final String hexCodeFormat = "\\x%1$02x";
/**
* Formats an integer character code as
* \u
hhhh.
*/
static final String unicodeFormat = "\\u%1$04x";
static final Map posixClasses = new HashMap(
13);
static {
posixClasses.put("alnum", "Alnum");
posixClasses.put("alpha", "Alpha");
posixClasses.put("ascii", "ASCII");
posixClasses.put("blank", "Blank");
posixClasses.put("cntrl", "Cntrl");
posixClasses.put("digit", "Digit");
posixClasses.put("graph", "Graph");
posixClasses.put("lower", "Lower");
posixClasses.put("print", "Print");
posixClasses.put("punct", "Punct");
posixClasses.put("space", "Space");
posixClasses.put("upper", "Upper");
posixClasses.put("xdigit", "XDigit");
}
/**
* Pattern to match parts of the inputted regular expression that need
* to be processed before refactoring.
*/
static final java.util.regex.Pattern preRefactor = java.util.regex.Pattern
.compile(
/*
* matches "(?onFlags-offFlags)" or "(?onFlags-offFlags:" (also
* matches a non-capture group - onFlags/offFlags are omitted)
*
* group: onFlags (empty string if none) group + 1: offFlags
* (empty string if none; null, if omitted) (2 groups)
*/
"\\Q(?\\E(\\w*+)(?:-(\\w*+))?[:\\)]|"
/*
* matches a named capture group
* "(?" (form 0)
* "(?'name'" (form 1)
* "(?P" (form 2)
*
* group: everything after first symbol
* group + 1: the name
* (6 groups)
*/
/* form 0 - 2 start */
+ "\\Q(?\\E(?:"
/* form 0 */
+ "<((" + groupName + ")>?)|"
/* form 1 */
+ "'((" + optGroupName + ")'?)|"
/* form 2 */
+ "P<((" + optGroupName + ")>?)"
/* form 0 - 2 end */
+ ")|"
/*
* matches an unnamed capture group "(" - not followed by a "?" (or a '*', used by verbs)
*
* group: everything (1 group)
*/
+ "(\\((?![?*]))|"
/*
* matches a back reference (by name)
* "\g{name}" (form 0)
* "\k" (form 1)
* "\k'name'" (form 2)
* "\k{name}" (form 3)
* "(?P=name)" (form 4)
*
* group : the name
*
* (5 groups)
* ( can only be an "any group" (e.g. groupName[0]))
*/
// NOTE: escaped closing '}' because required for Android (for some reason)
/* form 0 */
+ "\\Q\\g{\\E" + fullGroupName0 + "\\}|"
/* form 1 - 3 start */
+ "\\\\k(?:"
/* form 1 */
+ "<" + fullGroupName0 + ">|"
/* form 2 */
+ "'" + fullGroupName0 + "'|"
/* form 3 */
+ "\\{" + fullGroupName0 + "\\}"
/* form 1 - 3 end */
+ ")|"
/* form 4 */
+ "\\Q(?P=\\E" + fullGroupName0 + "\\)|"
/*
* matches an assert condition
* "(?(?=", "(?(?!", "(?(?<=), or "(?(?)" (form 0),
* "(?('name')" (form 1), or
* "(?(name)" (form 2)
*
* group: everything after first symbol (excluding ")")
* group + 1: the name
* group + 2: the occurrence (if specified)
* (9 groups)
*/
/* form 0 - 2 start */
+ "\\Q(?(\\E(?:"
/* form 0 */
+ "<(" + fullGroupName + ">?)|"
/* form 1 */
+ "'(" + fullGroupName + "'?)|"
/* form 2 */
+ "(" + fullGroupName + ")"
/* form 0 - 2 end */
+ ")\\)?|"
/*
* matches comment group
* "(?#comment) - comment cannot contain ")"
*
* group: everything
* (1 group)
*/
+ "(\\Q(?#\\E[^\\)]*+\\)?)|"
/*
* matches a "branch reset" subpattern "(?|"
*
* group: everything
* (1 group)
*/
+ "(\\Q(?|\\E)|"
/*
* FAIL verb (from PCRE) - always fails
* (*FAIL) or (*F) - case sensitive
*
* synonym for (?!)
*/
+ "\\Q(*\\E(F(?:AIL)?)\\)|"
/* an open parenthesis - depth + 1 */
+ "\\(|"
/* a closed parenthesis - depth - 1 */
+ "\\)|"
/* a pike (|) - only react if inside condition */
+ "\\||"
/* an open square bracket - starts a character class */
+ "\\[|"
/* a closed square bracket - ends a character class */
+ "\\]|"
/* an open curly brace */
+ "\\{|"
/* a closed curly brace - tracks when needs to be escaped */
// (escaping done since Android development doesn't allow '}'
+ "\\}|"
/* a "#" - starts comments when COMMENTS flag is enabled */
+ "#|"
/* matches a \Q..\E block */
+ "\\\\Q(?s:.*?)(?:\\\\E|$)|"
/* matches an escaped character */
+ "\\\\[^Q]|"
/* matches a line terminator */
+ "[\n\r\u0085\u2028\u2029]"
);
/**
* Pattern to match parts of the inputted regular expression that need
* to be processed after refactoring.
*/
static final java.util.regex.Pattern afterRefactor = java.util.regex.Pattern
.compile(
/*
* matches:
* "\g{##group-mappingName}"
* "\g{##branchGroup-mappingName}"
* "\g{##test-mappingName}"
* "\g{##testF-mappingName}"
*
* group: the number - mapped to the position to show the error
* group + 1: type ("group", "test", or "testF")
* group + 2: mappingName
* (3 groups)
*/
"\\Q\\g{\\E(\\d++)(group|branchGroup|testF?)-([^}]++)\\}|"
/*
* matches "(?onFlags-offFlags)" or "(?onFlags-offFlags:"
* (also matches a non-capture group - onFlags/offFlags are
* omitted)
*
* group: onFlags (empty string if none)
* group + 1: offFlags (empty string if none; null, if omitted)
* (2 groups)
*/
+ "\\Q(?\\E(\\w*+)(?:-(\\w*+))?[:\\)]|"
/*
* matches "\x{hhh..}" - a hex code
*
* group: the number
* (1 group)
*/
+ "\\Q\\x{\\E([0-9a-fA-F]++)\\}|"
/*
* matches "\xh" or "\xhh" - a hex code
*
* group: the number
* (1 group)
*/
+ "\\\\x([0-9a-fA-F]{1,2})|"
/*
* matches a unicode character
*
* group: the number
* (1 group)
*/
+ "\\\\u([0-9a-fA-F]{1,4})|"
/*
* matches a POSIX character class
*
* group: "^" or "" (whether to negate or not)
* group + 1: the class name
* (2 group)
*/
+ "\\[:(\\^?)([A-Za-z]++):]|"
/*
* matches a control character - \cA through \cZ
*
* These are equivalent to \x01 through \x1A (26 decimal).
*
* group: the control character's letter
* (either upper and lower case are allowed)
* (1 group)
*/
+ "\\\\c([A-Za-z])|"
/*
* matches an unnamed capture group "(" - not followed by a "?" (or a '*', used by verbs)
*
* group: everything (1 group)
*/
+ "(\\((?![?*]))|"
/* an open parenthesis - depth + 1 */
+ "\\(|"
/* a closed parenthesis - depth - 1 */
+ "\\)|"
/* an open square bracket - starts a character class */
+ "\\[|"
/* a closed square bracket - ends a character class */
+ "\\]|"
/* a "#" - starts comments when COMMENTS flag is enabled */
+ "#|"
/* matches a \Q..\E block */
+ "\\\\Q(?s:.*?)(?:\\\\E|$)|"
/* matches an escaped character */
+ "\\\\[^Q]|"
/* matches a line terminator */
+ "[\n\r\u0085\u2028\u2029]"
);
// /* matches an escaped character */
// + "\\\\.");
/**
* A cache of Patterns used during the refactoring.
*
* The Integer
key is the number of digits in the number
* of capture groups for the inputted regular expression.
*/
// private static final Map
// refactorPatterns = new HashMap();
/**
* Pattern used during refactoring.
*/
// private static final java.util.regex.Pattern refactorPattern = createRefactorPattern();
static final java.util.regex.Pattern refactor = java.util.regex.Pattern
.compile(
/*
* matches an unnamed subroutine reference
* "(?[-+]n)" (form 0)
*
* group: number (including [-+] to make it relative)
* (1 group)
*
* TODO: add format checks
*/
"\\Q(?\\E([-+]?\\d++)\\)|"
/*
* matches a named subroutine reference
* "(?&group)" (form 0)
* "(?P>group)" (form 1)
*
* group: group name
* group + 1: occurrence
* (2 group)
*
* TODO: add format checks
*/
+ "\\Q(?\\E(?:&|P>)" + fullGroupName + "\\)|"
/*
* matches "(?onFlags-offFlags)" or "(?onFlags-offFlags:"
* (also matches a non-capture group
* - onFlags/offFlags are omitted)
*
* group: onFlags (empty string if none)
* group + 1: offFlags
* (empty string if none; null, if omitted)
*
* (2 groups)
*/
+ "\\Q(?\\E(\\w*+)(?:-(\\w*+))?[:\\)]|"
/*
* matches a named capture group
* "(?" (form 0)
* "(?'name'" (form 1)
* "(?P" (form 2)
*
* group: the name (3 groups)
*/
/* form 0 - 2 start */
+ "\\Q(?\\E(?:"
/* form 0 */
+ "<("
+
groupName
+
")>|"
/* form 1 */
+
"'("
+
groupName
+
")'|"
/* form 2 */
+
"P<("
+
groupName
+
")>"
/* form 0 - 2 end */
+
")|"
/*
* matches an unnamed capture group "(" - not
* followed by a "?" (or a '*', used by verbs)
*
* group: everything (1 group)
*/
+
"(\\((?![?*]))|"
/*
* matches a back reference (by number)
* "\n" (form 0)
* "\gn" (form 1)
* "\g{n}" or "\g{-n}" (form 2)
*
* group: the number
* last group: the next character (if a digit)
* (4 groups)
*/
/* form 0 - 2 start */
+
"(?:"
/* form 0 */
+
"\\\\(\\d++)|"
/* form 1 */
+
"\\\\g(-?\\d++)|"
/* form 2 */
+
"\\Q\\g{\\E(-?\\d++)\\}"
/* form 0 - 2 end */
+
")(\\d?)|"
/*
* matches a back reference (by name)
* "\g{name}" (form 0),
* "\k" (form 1),
* "\k'name'" (form 2),
* "\k{name}" (form 3),
* "(?P=name)" (form 4)
*
* group: everything after the first symbol
* group + 1: the name
* group + 2: the occurrence (if specified)
* last group: the next character (if a digit)
* (16 groups)
*/
/* form 0 - 4 start */
+
"(?:"
/* form 0 */
+
"\\Q\\g{\\E("
+
fullGroupName
+
"\\}?)|"
/* form 1 - 3 start */
+
"\\\\k(?:"
/* form 1 */
+
"<("
+
fullGroupName
+
">?)|"
/* form 2 */
+
"'("
+
fullGroupName
+
"'?)|"
/* form 3 */
+
"\\{("
+
fullGroupName
+
"\\}?)"
/* form 1 - 3 end */
+
")|"
/* form 4 */
+
"\\Q(?P=\\E("
+
fullGroupName
+
"\\)?)"
/* form 0 - 4 end */
+
")(\\d?)|"
/*
* matches an assert condition
* "(?(?=)", "(?(?!)", "(?(?<=)", or "(?(?)" (form 0),
* "(?('name')" (form 1), or
* "(?(name)" (form 2)
*
* group: the name
* group + 1: the occurrence (if specified)
* (6 groups)
*/
/* form 0 - 2 start */
+
"\\Q(?(\\E(?:"
/* form 0 */
+
"<"
+
fullGroupName
+
">|"
/* form 1 */
+
"'"
+
fullGroupName
+
"'|"
/* form 2 */
+
fullGroupName
/* form 0 - 2 end */
+
")\\)|"
/*
* matches a "branch reset" subpattern "(?|"
*
* group: everything (1 group)
*/
+
"(\\Q(?|\\E)|"
// TODO: combine syntax with range
// allows options
// 6/1/2020 - if specify decimal, must have digit before and after decimal
// Fix issue with (?Z[0...2]) and (?Z[0....2]) errors
/*
* matches an unbounded numeric range
* such as "(?Z[<1.234])"
*
* group: "Z" or "NZ"
* group + 1: comparison (such as "<")
* group + 2: value (such as "1.234")
* (3 groups)
*
* TODO: add "?" (optional) tags
* (then, check of format - in refactor step)
*/
+ "\\Q(?\\E(N?Z[id]?(?:\\d++[LU]?)?)\\[([<>]=?)"
+ "(-?(?:[0-9a-zA-Z]++(?:\\.[0-9a-zA-Z]+)?))\\]?\\)?|"
// + "(-?(?:[0-9a-zA-Z]++(?:\\.[0-9a-zA-Z]*)?|\\.[0-9a-zA-Z]++))\\]\\)|"
/*
* matches a numeric range
* "(?Z[start..end])" or "(?NZ[start..end])"
*
* group: "Z" or "NZ" (optional base and L/U)
* //group + 1: "r" for raw mode, or
* group + 1:'>' if exclusive start; if inclusive
* group + 2: start
* group + 3:'<' if exclusive end; if inclusive
* group + 4: end
* (5 groups)
*/
// "\\Q(?\\E(N?Z(r)?(?:\\d++[LU]?)?)\\[(?:(-?\\d++)\\.\\.(-?\\d++)\\])?\\)?|"
+ "\\Q(?\\E(N?Z[id]?(?:\\d++[LU]?)?)\\[(?:([<>])?"
// "(-?[0-9a-zA-Z]++)" +
+ "(-?(?:[0-9a-zA-Z]++(?:\\.[0-9a-zA-Z]+)?))"
// + "(-?(?:[0-9a-zA-Z]++(?:\\.[0-9a-zA-Z]*)?|\\.[0-9a-zA-Z]++))"
+ "\\.\\.([<>])?"
// + "(-?[0-9a-zA-Z]++)"
+ "(-?(?:[0-9a-zA-Z]++(?:\\.[0-9a-zA-Z]+)?|))"
// + "(-?(?:[0-9a-zA-Z]++(?:\\.[0-9a-zA-Z]*)?|\\.[0-9a-zA-Z]++))"
+ "\\]?)?\\)?|"
/* an open parenthesis */
+ "\\(|"
/* a closed parenthesis */
+ "\\)|"
/* a pike (|) */
+ "\\||"
/* an open square bracket - starts a character class */
+ "\\[|"
/* a closed square bracket - ends a character class */
+ "\\]|"
/*
* a "#" - starts comments when COMMENTS flag is
* enabled
*/
+ "#|"
/* matches a \Q..\E block */
+ "\\\\Q(?s:.*?)(?:\\\\E|$)|"
/* matches an escaped character */
+ "\\\\[^Q]|"
/* matches a line terminator */
+ "[\n\r\u0085\u2028\u2029]"
);
// /* matches an escaped character */
// + "\\\\.");
/**
* A cache of Patterns used to match a number with a given digit count.
*
* The Integer
key is the number of digits, and the
* Pattern
value is the pattern to match a number with that
* many digits. The digits are in the first group, and any trailing
* digits are in the second.
*/
private static final Map digitCountPatterns = new HashMap(
2);
static final java.util.regex.Pattern perl_octal = java.util.regex.Pattern
.compile("^([0-3]?[0-7]{1,2})(\\d*+)$");
/**
* Returns the pattern to use when refactoring an inputted regular
* expression.
*
* @param groupCount
* number of capture groups in the inputted regular
* expression
* @return the pattern to use when refactoring
*/
// static java.util.regex.Pattern getRefactorPattern(int groupCount)
// {
// // number of digits in the largest group index
// // (e.g. 999 would be 3, and 1000 would be 4)
// Integer digitCount = digitCount(groupCount);
//
// if (!refactorPatterns.containsKey(digitCount)) {
// java.util.regex.Pattern fixup = createRefactorPattern();
//
// refactorPatterns.put(digitCount, fixup);
// return fixup;
// }
//
// return refactorPatterns.get(digitCount);
// }
/**
* Returns the pattern to use when refactoring an inputted regular
* expression.
*
* @return the pattern to use when refactoring
*/
// static java.util.regex.Pattern getRefactorPattern()
// {
// return refactorPattern;
// }
/**
* Returns a pattern that matches digitCount
digits
* followed by zero or more digits. The pattern has two capture groups.
* The required digits are in the first group, and any trailing digits
* are in the second.
*
* @param digitCount
* the number of digits the pattern should match
* @return a pattern that matches digitCount
digits
* followed by zero or more digits
*/
static java.util.regex.Pattern getDigitCountPattern(int digitCount)
{
Integer digitCountI = digitCount;
if (digitCountPatterns.containsKey(digitCountI))
return digitCountPatterns.get(digitCountI);
else {
java.util.regex.Pattern pattern = java.util.regex.Pattern
.compile("(\\d{1," + digitCount + "})(\\d*+)");
digitCountPatterns.put(digitCountI, pattern);
return pattern;
}
}
/**
* Returns the number of digits in the given number
*
* @param number
* the number whose digit count is returned
* @return the number of digits in the given number
*/
static int digitCount(int number)
{
return String.valueOf(number).length();
}
/**
*
* Returns whether the specified mapping name is an "any group".
*
*
*
* Note: these are groups with an occurrence of 0, for example,
* "groupName[0]". However "[0]" is not an "any group", but instead
* refers to the match itself.
*
*
*
* Any groups allow referring to the first matched group in the case of
* multiple groups with the same name. If there is only one group, the
* "any group" is the first group.
*
*
* @param mappingName
* the mapping name for the group
* @return true
if the given group is an "any group"
*/
static boolean isAnyGroup(String mappingName)
{
// greater than, because "[0]" is not an "any group"
// (requires a group name)
return mappingName.length() > "[0]".length() &&
mappingName.endsWith("[0]");
}
/**
* Removes the trailing "[0]" for the inputted "any group".
*
* @param mappingName
* must be an "any group"
* @return the group name for this "any group"
*/
static String anyGroupName(String mappingName)
{
return mappingName.substring(0, mappingName.length() -
"[0]".length());
}
/**
* Surrounds the passed string in a non-capture group.
*
* Note: If the pattern is compiled with the {@link Pattern#ONLY_CAPTURING_GROUPS} flag, then the
* group will be a
* capture group, and the {@link #totalGroups} will increase by one to
* reflect this.
*
* @param str
* the string to surround
* @return the RegEx for the given string surrounded by a non-capture
* group.
*/
static String nonCaptureGroup(String str)
{
return startNonCaptureGroup() + str + ")";
}
/**
* Returns the string that represents the start of a non-capture group,
* "(?:".
*
* @return the string that represents the start of a non-capture group
*/
static String startNonCaptureGroup()
{
// if (!supportedSyntax(NONCAPTURE_GROUPS)) {
// totalGroups++;
// return "(";
// } else
return "(?:";
}
/**
*
* Returns a regular expression that matches the given target group.
*
*/
static String acceptTestingGroup(int targetGroup)
{
return "(?=\\" + targetGroup + ")";
}
/**
*
* Returns a regular expression which fails if the specified target
* group matches.
*
*/
static String failTestingGroup(int targetGroup)
{
return "(?!\\" + targetGroup + ")";
}
/**
* Returns a regular expression that always fails.
*
* @return a regular expression that always fails
*/
static String fail()
{
// Use basic syntax to allow the internal pattern to be used in languages which don't support assertions
// return "\\b\\B";
// Fails fast (and is self documenting).
// If single-line (no "m" flag), max 4 probes (if text starts with "f", "fa", "fai" or "fail")
// if multi-line, N probes
// return "^fail^";
// Uses assertion, because it's possible java optimizes for it
// Fails fast - 0 probes
return "(?!)";
}
/**
* @throws IllegalArgumentException
* if the string does not contain a parsable integer.
* @see Integer#parseInt(String)
*/
static int parseInt(String string)
{
try {
return Integer.parseInt(string);
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Cannot parse integer: " +
string);
}
}
/**
* Returns a String that is never used as a mapping name
*
* @return
*/
static String neverUsedMappingName()
{
return "$neverUsed";
}
/**
* Indicates whether the specified group name is an unnamed group
*
* @param groupName
* the group name
* @return true
if, and only if, groupName
is
* the name of an unnamed group (e.g. [1])
*/
static boolean isUnnamedGroup(String groupName)
{
return groupName.charAt(0) == '[';
}
/**
* Returns a regular expression that matches the given digits literally.
*
* @param trailingDigits
* the literal digits that follow a back reference
*/
static String fixTrailing(String trailingDigits)
{
if (trailingDigits.length() == 0)
return "";
return "[" + trailingDigits.charAt(0) + "]" +
trailingDigits.substring(1);
}
}