// regexodus.regex.Pattern Maven / Gradle / Ivy
package regexodus.regex;
import com.jtransc.annotation.JTranscInvisible;
import regexodus.REFlags;
import java.io.Serializable;
import java.util.ArrayList;
/**
 * A compiled regular expression whose API mirrors the shape of
 * {@code java.util.regex.Pattern} (same flag constants, {@code compile},
 * {@code matcher}, {@code split}, {@code quote}) while delegating the actual
 * matching to a {@link regexodus.Pattern} held in {@link #internal}.
 * <p>
 * Created by Tommy Ettinger on 6/7/2016.
 */
@SuppressWarnings("WeakerAccess")
@JTranscInvisible
public class Pattern implements Serializable {

    /** The underlying RegExodus pattern that performs all matching. */
    public regexodus.Pattern internal;

    /** The flag bit mask exactly as it was handed to {@code compile}. */
    private int flags;

    /**
     * Not used; present for compatibility.
     */
    public static final int UNIX_LINES = 0x01;

    /**
     * Enables case-insensitive matching.
     * <p>
     * Unicode-aware case-insensitive matching is always enabled by this
     * flag, regardless of unicode flag status.
     * <p>
     * Case-insensitive matching can also be enabled via the embedded flag
     * expression {@code (?i)}.
     */
    public static final int CASE_INSENSITIVE = 0x02;

    /**
     * Permits whitespace and comments in pattern.
     * <p>
     * In this mode, whitespace is ignored, and embedded comments starting
     * with {@code #} are ignored until the end of a line.
     * <p>
     * Comments mode can also be enabled via the embedded flag
     * expression {@code (?x)}.
     */
    public static final int COMMENTS = 0x04;

    /**
     * Enables multiline mode.
     * <p>
     * In multiline mode the expressions {@code ^} and {@code $} match
     * just after or just before, respectively, a line terminator or the end of
     * the input sequence. By default these expressions only match at the
     * beginning and the end of the entire input sequence.
     * <p>
     * Multiline mode can also be enabled via the embedded flag
     * expression {@code (?m)}.
     */
    public static final int MULTILINE = 0x08;

    /**
     * Enables literal mode.
     * <p>
     * In literal mode, metacharacters are not interpreted at all, and they
     * match the exact, literal string used for the Pattern. This is done
     * by running {@link Pattern#quote(String)} on the regexp string when
     * this flag is specified.
     */
    public static final int LITERAL = 0x10;

    /**
     * Enables dotall mode.
     * <p>
     * In dotall mode, the expression {@code .} matches any character,
     * including a line terminator. By default this expression does not match
     * line terminators.
     * <p>
     * Dotall mode can also be enabled via the embedded flag
     * expression {@code (?s)}. (The s is a mnemonic for
     * "single-line" mode, which is what this is called in Perl.)
     */
    public static final int DOTALL = 0x20;

    /**
     * Not used; present for compatibility.
     */
    public static final int UNICODE_CASE = 0x40;

    /**
     * Not used; present for compatibility.
     */
    public static final int CANON_EQ = 0x80;

    /**
     * Enables the Unicode version of Predefined character classes and
     * POSIX character classes.
     * <p>
     * When this flag is specified then the (US-ASCII only)
     * Predefined character classes and POSIX character classes
     * are in conformance with
     * Unicode Technical Standard #18: Unicode Regular Expression,
     * Annex C: Compatibility Properties.
     * <p>
     * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
     * flag expression {@code (?u)}.
     * <p>
     * Specifying this flag may impose a performance penalty.
     */
    public static final int UNICODE_CHARACTER_CLASS = 0x100;

    /**
     * Compiles the given regular expression into a pattern, using no flags.
     *
     * @param regex
     *         The expression to be compiled
     *
     * @return A new pattern for {@code regex}
     *
     * @throws PatternSyntaxException
     *         If the expression's syntax is invalid
     */
    public static Pattern compile(String regex) {
        return new Pattern(regex, 0);
    }

    /**
     * Compiles the given regular expression into a pattern with the given
     * flags.
     * <p>
     * The bit mask is not validated: bits that do not correspond to a flag
     * translated by the constructor ({@link #UNIX_LINES},
     * {@link #UNICODE_CASE}, {@link #CANON_EQ}, or any undefined bit) are
     * stored and reported by {@link #flags()} but have no effect on matching.
     *
     * @param regex
     *         The expression to be compiled
     *
     * @param flags
     *         Match flags, a bit mask that may include
     *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
     *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
     *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
     *         and {@link #COMMENTS}
     *
     * @return A new pattern for {@code regex}
     *
     * @throws PatternSyntaxException
     *         If the expression's syntax is invalid
     */
    public static Pattern compile(String regex, int flags) {
        return new Pattern(regex, flags);
    }

    /**
     * Returns the string form of the underlying {@link #internal} pattern.
     * <p>
     * NOTE(review): this is presumably the regular expression the pattern was
     * compiled from. When {@link #LITERAL} was given, the internal pattern
     * was compiled from the {@link #quote(String) quoted} text, so the result
     * may be the quoted form rather than the caller's original string;
     * confirm against {@code regexodus.Pattern#toString()}.
     *
     * @return The source of this pattern, as reported by the internal pattern
     */
    public String pattern() {
        return internal.toString();
    }

    /**
     * Returns the string representation of this pattern, which is the same
     * value as {@link #pattern()}.
     *
     * @return The string representation of this pattern
     * @since 1.5
     */
    @Override
    public String toString() {
        return internal.toString();
    }

    /**
     * Creates a matcher that will match the given input against this pattern.
     *
     * @param input
     *         The character sequence to be matched
     *
     * @return A new matcher for this pattern
     */
    public Matcher matcher(CharSequence input) {
        return new Matcher(this, input);
    }

    /**
     * Returns this pattern's match flags.
     *
     * @return The match flags specified when this pattern was compiled
     */
    public int flags() {
        return flags;
    }

    /**
     * Compiles the given regular expression and attempts to match the given
     * input against it.
     * <p>
     * An invocation of this convenience method of the form
     * <pre>
     * Pattern.matches(regex, input);</pre>
     * behaves in exactly the same way as the expression
     * <pre>
     * Pattern.compile(regex).matcher(input).matches()</pre>
     * If a pattern is to be used multiple times, compiling it once and reusing
     * it will be more efficient than invoking this method each time.
     *
     * @param regex
     *         The expression to be compiled
     *
     * @param input
     *         The character sequence to be matched
     *
     * @return The result of {@code matches()} on a fresh matcher for
     *         {@code input}
     *
     * @throws PatternSyntaxException
     *         If the expression's syntax is invalid
     */
    public static boolean matches(String regex, CharSequence input) {
        return compile(regex).matcher(input).matches();
    }

    /**
     * Splits the given input sequence around matches of this pattern.
     * <p>
     * The array returned by this method contains each substring of the
     * input sequence that is terminated by another subsequence that matches
     * this pattern or is terminated by the end of the input sequence. The
     * substrings in the array are in the order in which they occur in the
     * input. If this pattern does not match any subsequence of the input then
     * the resulting array has just one element, namely the input sequence in
     * string form.
     * <p>
     * The {@code limit} parameter controls the number of times the
     * pattern is applied and therefore affects the length of the resulting
     * array. If the limit <i>n</i> is greater than zero then the pattern
     * will be applied at most <i>n</i> - 1 times, the array's
     * length will be no greater than <i>n</i>, and the array's last entry
     * will contain all input beyond the last matched delimiter. If <i>n</i>
     * is non-positive then the pattern will be applied as many times as
     * possible and the array can have any length. If <i>n</i> is zero then
     * the pattern will be applied as many times as possible, the array can
     * have any length, and trailing empty strings will be discarded.
     * <p>
     * The input {@code "boo:and:foo"}, for example, yields the following
     * results with these parameters:
     * <table summary="Split examples showing regex, limit, and result">
     * <tr><th>Regex</th><th>Limit</th><th>Result</th></tr>
     * <tr><td>:</td><td>2</td><td>{@code { "boo", "and:foo" }}</td></tr>
     * <tr><td>:</td><td>5</td><td>{@code { "boo", "and", "foo" }}</td></tr>
     * <tr><td>:</td><td>-2</td><td>{@code { "boo", "and", "foo" }}</td></tr>
     * <tr><td>o</td><td>5</td><td>{@code { "b", "", ":and:f", "", "" }}</td></tr>
     * <tr><td>o</td><td>-2</td><td>{@code { "b", "", ":and:f", "", "" }}</td></tr>
     * <tr><td>o</td><td>0</td><td>{@code { "b", "", ":and:f" }}</td></tr>
     * </table>
     *
     * @param input
     *         The character sequence to be split
     *
     * @param limit
     *         The result threshold, as described above
     *
     * @return The array of strings computed by splitting the input
     *         around matches of this pattern
     */
    public String[] split(CharSequence input, int limit) {
        int index = 0;
        final boolean matchLimited = limit > 0;
        // Must be parameterized: on a raw list, toArray(String[]) is typed as
        // Object[] and could not be returned from this method.
        ArrayList<String> matchList = new ArrayList<String>();
        regexodus.Matcher m = new regexodus.Matcher(internal, input);

        // Add segments before each match found
        while (m.find()) {
            if (!matchLimited || matchList.size() < limit - 1) {
                matchList.add(input.subSequence(index, m.start()).toString());
                index = m.end();
            } else if (matchList.size() == limit - 1) { // last one
                matchList.add(input.subSequence(index, input.length()).toString());
                index = m.end();
                // The list now holds 'limit' entries, so no later match could
                // add anything; stop searching.
                break;
            }
        }

        // index never moved, so nothing was consumed: return the whole input
        if (index == 0) {
            return new String[] {input.toString()};
        }

        // Add remaining segment
        if (!matchLimited || matchList.size() < limit) {
            matchList.add(input.subSequence(index, input.length()).toString());
        }

        // Construct result; a limit of zero drops trailing empty strings
        int resultSize = matchList.size();
        if (limit == 0) {
            while (resultSize > 0 && matchList.get(resultSize - 1).isEmpty()) {
                resultSize--;
            }
        }
        String[] result = new String[resultSize];
        return matchList.subList(0, resultSize).toArray(result);
    }

    /**
     * Splits the given input sequence around matches of this pattern.
     * <p>
     * This method works as if by invoking the two-argument {@link
     * #split(java.lang.CharSequence, int) split} method with the given input
     * sequence and a limit argument of zero. Trailing empty strings are
     * therefore not included in the resulting array.
     * <p>
     * The input {@code "boo:and:foo"}, for example, yields the following
     * results with these expressions:
     * <table summary="Split examples showing regex and result">
     * <tr><th>Regex</th><th>Result</th></tr>
     * <tr><td>:</td><td>{@code { "boo", "and", "foo" }}</td></tr>
     * <tr><td>o</td><td>{@code { "b", "", ":and:f" }}</td></tr>
     * </table>
     *
     * @param input
     *         The character sequence to be split
     *
     * @return The array of strings computed by splitting the input
     *         around matches of this pattern
     */
    public String[] split(CharSequence input) {
        return split(input, 0);
    }

    /**
     * Returns a literal pattern {@code String} for the specified
     * {@code String}.
     * <p>
     * This method produces a {@code String} that can be used to
     * create a {@code Pattern} that would match the string
     * {@code s} as if it were a literal pattern. Metacharacters
     * or escape sequences in the input sequence will be given no special
     * meaning.
     * <p>
     * The text is wrapped in {@code \Q ... \E}; every {@code \E} already
     * present in {@code s} is emitted as {@code \E\\E\Q} so that it cannot
     * terminate the quoted section early.
     *
     * @param s The string to be literalized
     * @return A literal string replacement
     */
    public static String quote(String s) {
        // Fast path: nothing inside s can close the quoted section.
        if (s.indexOf("\\E") == -1) {
            return "\\Q" + s + "\\E";
        }

        StringBuilder sb = new StringBuilder(s.length() * 2);
        sb.append("\\Q");
        int current = 0;
        int slashEIndex;
        while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
            // Copy the text before this \E, then close the quote, emit an
            // escaped backslash followed by E, and reopen the quote.
            sb.append(s, current, slashEIndex);
            current = slashEIndex + 2;
            sb.append("\\E\\\\E\\Q");
        }
        sb.append(s, current, s.length());
        sb.append("\\E");
        return sb.toString();
    }

    /**
     * Builds a pattern by translating the supported flag bits into their
     * {@link REFlags} counterparts and compiling with RegExodus.
     * {@link #UNIX_LINES}, {@link #UNICODE_CASE} and {@link #CANON_EQ} have
     * no counterpart here and are only remembered for {@link #flags()}.
     *
     * @param p     The regular expression text
     * @param flags The flag bit mask, stored unmodified
     */
    private Pattern(String p, int flags)
    {
        this.flags = flags;

        int fm = (flags & CASE_INSENSITIVE) != 0 ? REFlags.IGNORE_CASE : 0;
        fm |= (flags & DOTALL) != 0 ? REFlags.DOTALL : 0;
        fm |= (flags & COMMENTS) != 0 ? REFlags.IGNORE_SPACES : 0;
        fm |= (flags & MULTILINE) != 0 ? REFlags.MULTILINE : 0;
        fm |= (flags & UNICODE_CHARACTER_CLASS) != 0 ? REFlags.UNICODE : 0;

        // LITERAL is implemented by quoting the text before compiling it.
        if ((flags & LITERAL) != 0) {
            internal = regexodus.Pattern.compile(quote(p), fm);
        } else {
            internal = regexodus.Pattern.compile(p, fm);
        }
    }
}