All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.vesalainen.regex.Regex Maven / Gradle / Ivy

Go to download

Java Lookahead Parser Generator. The generator produces LALR(k) parsers. Grammar rules are entered using annotations. A rule annotation can be attached to a reducer method, which keeps the rule and its action together.

The newest version!
/*
 * Copyright (C) 2012 Timo Vesalainen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.vesalainen.regex;

import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.PushbackReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Spliterator;
import java.util.function.Consumer;
import javax.lang.model.element.Modifier;
import org.vesalainen.bcc.FieldInitializer;
import org.vesalainen.bcc.SubClass;
import org.vesalainen.bcc.model.El;
import org.vesalainen.grammar.state.DFA;
import org.vesalainen.grammar.state.DFAState;
import org.vesalainen.grammar.state.NFA;
import org.vesalainen.grammar.state.NFAState;
import org.vesalainen.grammar.state.Scope;
import org.vesalainen.parser.util.Input;
import org.vesalainen.parser.util.InputReader;
import org.vesalainen.parser.util.UnderflowException;

/**
 * This regular expression implementation is DFA rather than NFA based. Using DFA is much
 * faster but there are some limitations.
 * Following features are not supported:
 *
 * 

Capturing groups are not supported * *

Reluctant and possessive quantifiers are not supported. * *

Special constructs are not supported * *

You can get Regex instance by using compile method. Compiling takes time. * *

Regex classes are thread safe * * @see java.util.regex.Pattern * @author tkv * @see BNF Syntax for Regular expression */ public abstract class Regex { public enum Option { /** * Enables case-insensitive matching. */ CASE_INSENSITIVE, /** * Creates a matcher for grammars where a fixed string ends the match. Examples * are xml comments ending in --> or PCDATA ending in ]]>. Regular expression * like '.*\\-\\->' without FIXED_ENDER option doesn't work. However with * FIXED_ENDER option the underlying NFA is modified to support that kind * of parsing before DFA creation. Note that the suffix is pushed back after * recognition. */ FIXED_ENDER, /** * Normally parser tries to match as more characters as is possible. For * expression 'LITERAL|LITERALS' it accepts LITERALS for input LITERALS. * if FIXED_ENDER options is used 'LITERAL' is matched instead without reading further * input. */ ACCEPT_IMMEDIATELY; public static boolean supports(Option[] options, Option option) { for (Option opt : options) { if (opt.equals(option)) { return true; } } return false; } }; private static RegexParserIntf regexParser; protected static boolean debug; protected boolean acceptEmpty; protected String expression; protected int minLength; protected int maxLength; /** * Returns the maximum length of accepted string or 2147483647 if expression * can accept infinite length string. * @return */ public int getMaxLength() { return maxLength; } /** * Returns the minimum length of accepted string. * @return */ public int getMinLength() { return minLength; } /** * Returns the expression from which this Regex has been compiled. * @return */ public String getExpression() { return expression; } /** * Returns a regular expression from a literal expression which has '?' * wildcard for any single character and * for any number of character. 
* @param literal * @return */ public static String wildcard(String literal) { StringBuilder sb = new StringBuilder(); for (int ii = 0; ii < literal.length(); ii++) { char cc = literal.charAt(ii); switch (cc) { case '[': case ']': case '(': case ')': case '\\': case '-': case '^': case '+': case '|': case '.': case '{': case '}': case '&': case '$': case ',': sb.append("\\").append(cc); break; case '*': sb.append(".*"); break; case '?': sb.append("."); break; default: sb.append(cc); break; } } return sb.toString(); } /** * Escapes all regex control characters returning expression suitable for * literal parsing. * @param literal * @return Escaped string */ public static String escape(String literal) { StringBuilder sb = new StringBuilder(); for (int ii = 0; ii < literal.length(); ii++) { char cc = literal.charAt(ii); switch (cc) { case '[': case ']': case '(': case ')': case '\\': case '-': case '^': case '*': case '+': case '?': case '|': case '.': case '{': case '}': case '&': case '$': case ',': sb.append("\\").append(cc); break; default: sb.append(cc); break; } } return sb.toString(); } /** * Double escapes all regex control characters plus linefeed and tab * characters returning expression suitable for printing. 
* @param literal * @return Escaped string */ public static String printable(String literal) { StringBuilder sb = new StringBuilder(); for (int ii = 0; ii < literal.length(); ii++) { char cc = literal.charAt(ii); switch (cc) { case '\n': sb.append("\\n"); break; case '\r': sb.append("\\r"); break; case '\t': sb.append("\\t"); break; case '[': case ']': case '(': case ')': case '\\': case '-': case '^': case '*': case '+': case '?': case '|': case '.': case '{': case '}': case '&': case '$': case ',': sb.append("\\\\").append(cc); break; default: sb.append(cc); break; } } return sb.toString(); } public static void setDebug(boolean d) { debug = d; } protected void trace(int a, String msg) { System.err.println(a + ": " + msg); } /** * Return true if text matches the regex * @param text * @return * @throws IOException */ public boolean isMatch(CharSequence text) { try { if (text.length() == 0) { return acceptEmpty; } InputReader reader = Input.getInstance(text); return isMatch(reader); } catch (IOException ex) { throw new IllegalArgumentException("can't happen"); } } /** * Return true if input matches the regex * @param input * @param size * @return * @throws IOException */ public boolean isMatch(PushbackReader input, int size) throws IOException { InputReader reader = Input.getInstance(input, size); return isMatch(reader); } /** * Return true if input matches the regex. Using shared buffer reduces the * need to allocate new buffer for several natches. * @param input * @param shared Shared buffer * @return * @throws IOException * @throws SyntaxErrorException */ public boolean isMatch(PushbackReader input, char[] shared) throws IOException { InputReader reader = Input.getInstance(input, shared); return isMatch(reader); } /** * Return true if input matches the regex. 
* @param reader * @return * @throws IOException * @throws SyntaxErrorException */ public boolean isMatch(InputReader reader) throws IOException { int rc = match(reader); return (rc == 1 && reader.read() == -1); } /** * Attempts to match input to regex * @param text * @return * @throws SyntaxErrorException */ public String match(CharSequence text) { try { if (text.length() == 0) { if (acceptEmpty) { return ""; } else { throw new SyntaxErrorException("empty string not accepted"); } } InputReader reader = Input.getInstance(text); int rc = match(reader); if (rc == 1 && reader.read() == -1) { return reader.getString(); } else { throw new SyntaxErrorException("syntax error" + "\n" + reader.getLineNumber() + ": " + reader.getLine() + "\n" + pointer(reader.getColumnNumber() + 2)); } } catch (IOException ex) { throw new IllegalArgumentException("can't happen"); } } /** * Matches the whole input and returns the matched string * @param in * @param size * @return * @throws IOException * @throws SyntaxErrorException */ public String getMatch(PushbackReader in, int size) throws IOException, SyntaxErrorException { InputReader reader = Input.getInstance(in, size); String s = getMatch(reader); reader.release(); return s; } /** * Matches the whole input and returns the matched string * @param text * @return * @throws SyntaxErrorException */ public String getMatch(CharSequence text) { try { if (text.length() == 0) { if (acceptEmpty) { return ""; } else { throw new SyntaxErrorException("empty string not accepted"); } } InputReader reader = Input.getInstance(text); return getMatch(reader); } catch (IOException ex) { throw new IllegalArgumentException("can't happen"); } } /** * Matches the whole input and returns the matched string * @param in * @param shared * @return * @throws IOException * @throws SyntaxErrorException */ public String getMatch(PushbackReader in, char[] shared) throws IOException, SyntaxErrorException { InputReader reader = Input.getInstance(in, shared); String s = 
getMatch(reader); reader.release(); return s; } /** * Matches the whole input and returns the matched string * @param reader * @return * @throws IOException * @throws SyntaxErrorException */ public String getMatch(InputReader reader) throws IOException, SyntaxErrorException { int rc = match(reader); if (rc == 1 && reader.read() == -1) { return reader.getString(); } else { throw new SyntaxErrorException("syntax error" + "\n" + reader.getLineNumber() + ": " + reader.getLine() + "\n" + pointer(reader.getColumnNumber() + 2)); } } /** * Returns true if input start matches the regular expression * @param text * @return */ public boolean startsWith(CharSequence text) { try { if (text.length() == 0) { if (acceptEmpty) { return true; } else { throw new SyntaxErrorException("empty string not accepted"); } } InputReader reader = Input.getInstance(text); return startsWith(reader); } catch (IOException ex) { throw new IllegalArgumentException("can't happen"); } } /** * Returns true if input start matches the regular expression * @param in * @param size * @return * @throws IOException */ public boolean startsWith(PushbackReader in, int size) throws IOException { InputReader reader = Input.getInstance(in, size); boolean b = startsWith(reader); reader.release(); return b; } /** * Returns true if input start matches the regular expression * @param in * @param shared * @return * @throws IOException */ public boolean startsWith(PushbackReader in, char[] shared) throws IOException { InputReader reader = Input.getInstance(in, shared); boolean b = startsWith(reader); reader.release(); return b; } /** * Returns true if input start matches the regular expression * @param reader * @return * @throws IOException */ public boolean startsWith(InputReader reader) throws IOException { int rc = match(reader); return rc == 1; } /** * Matches the start of text and returns the matched string * @param text * @return * @throws SyntaxErrorException */ public String lookingAt(CharSequence text) { try { 
if (text.length() == 0) { if (acceptEmpty) { return ""; } else { throw new SyntaxErrorException("empty string not accepted"); } } InputReader reader = Input.getInstance(text); return lookingAt(reader); } catch (IOException ex) { throw new IllegalArgumentException("can't happen"); } } /** * Matches the start of text and returns the matched string * @param in * @param size * @return * @throws IOException * @throws SyntaxErrorException */ public String lookingAt(PushbackReader in, int size) throws IOException, SyntaxErrorException { InputReader reader = Input.getInstance(in, size); String s = lookingAt(reader); reader.release(); return s; } /** * Matches the start of text and returns the matched string * @param in * @param shared * @return * @throws IOException * @throws SyntaxErrorException */ public String lookingAt(PushbackReader in, char[] shared) throws IOException, SyntaxErrorException { InputReader reader = Input.getInstance(in, shared); String s = lookingAt(reader); reader.release(); return s; } /** * Matches the start of text and returns the matched string * @param reader * @return * @throws IOException * @throws SyntaxErrorException */ public String lookingAt(InputReader reader) throws IOException, SyntaxErrorException { int rc = match(reader); if (rc == 1) { return reader.getString(); } else { throw new SyntaxErrorException("syntax error" + "\n" + reader.getLineNumber() + ": " + reader.getLine() + "\n" + pointer(reader.getColumnNumber() + 2)); } } private String pointer(int p) { StringBuilder sb = new StringBuilder(); for (int ii = 0; ii < p; ii++) { sb.append(" "); } sb.append("^^^"); return sb.toString(); } /** * Finds next match and returns the matched string * @param text * @return * @throws SyntaxErrorException */ public String find(CharSequence text) { try { if (text.length() == 0) { if (acceptEmpty) { return ""; } else { throw new SyntaxErrorException("empty string not accepted"); } } if (acceptEmpty) { throw new IllegalArgumentException("using find 
for expression that accepts empty string"); } InputReader reader = Input.getInstance(text); int rc = find(reader); if (rc == 1) { return reader.getString(); } else { throw new SyntaxErrorException("string matching '" + expression + "' not found"); } } catch (IOException ex) { throw new IllegalArgumentException("can't happen", ex); } } /** * Finds next match and returns the matched string * @param in * @param size * @return Matched string * @throws IOException * @throws SyntaxErrorException */ public String find(PushbackReader in, int size) throws IOException, SyntaxErrorException { if (acceptEmpty) { throw new IllegalArgumentException("using find for '" + expression + "' that accepts empty string"); } InputReader reader = Input.getInstance(in, size); int rc = find(reader); reader.release(); if (rc == 1) { return reader.getString(); } else { throw new SyntaxErrorException("string matching '" + expression + "' not found"); } } /** * Replaces regular expression matches in text with replacement string * @param text * @param replacement * @return */ public String replace(CharSequence text, CharSequence replacement) { try { if (text.length() == 0) { if (acceptEmpty) { return ""; } } CharArrayWriter caw = new CharArrayWriter(); InputReader reader = Input.getInstance(text); ObsoleteSimpleReplacer fsp = new ObsoleteSimpleReplacer(replacement); replace(reader, caw, fsp); return caw.toString(); } catch (IOException ex) { throw new IllegalArgumentException("can't happen", ex); } } /** * Replaces regular expression matches in text using replacer * @param text * @param replacer * @return * @throws IOException */ public String replace(CharSequence text, ObsoleteReplacer replacer) throws IOException { if (text.length() == 0) { return ""; } CharArrayWriter caw = new CharArrayWriter(); InputReader reader = Input.getInstance(text); replace(reader, caw, replacer); return caw.toString(); } /** * Replaces regular expression matches in text using replacer * @param text * @param 
bufferSize bufferSize must be > text.length() + max insert text * @param replacer * @return * @throws IOException */ public String replace(CharSequence text, int bufferSize, ObsoleteReplacer replacer) throws IOException { if (text.length() == 0) { return ""; } CharArrayWriter caw = new CharArrayWriter(); InputReader reader = Input.getInstance(text, bufferSize); replace(reader, caw, replacer); return caw.toString(); } /** * Writes in to out replacing every match with a string * @param in * @param bufferSize * @param out * @param format * @throws java.io.IOException * @see String.format */ public void replace(PushbackReader in, int bufferSize, Writer out, String format) throws IOException { InputReader reader = Input.getInstance(in, bufferSize); ObsoleteSimpleReplacer fsp = new ObsoleteSimpleReplacer(format); replace(reader, out, fsp); } public void replace(CharSequence text, Writer out, String format) throws IOException { if (text.length() > 0) { InputReader reader = Input.getInstance(text); ObsoleteSimpleReplacer fsp = new ObsoleteSimpleReplacer(format); replace(reader, out, fsp); } } /** * Replaces regular expression matches in input using replacer * @param in * @param bufferSize * @param out * @param replacer * @throws IOException */ public void replace(PushbackReader in, int bufferSize, Writer out, ObsoleteReplacer replacer) throws IOException { InputReader reader = Input.getInstance(in, bufferSize); replace(reader, out, replacer); } public void replace(CharSequence text, Writer out, ObsoleteReplacer replacer) throws IOException { if (text.length() > 0) { InputReader reader = Input.getInstance(text); replace(reader, out, replacer); } } public void replace(PushbackReader in, char[] shared, Writer out, String format) throws IOException { InputReader reader = Input.getInstance(in, shared); ObsoleteSimpleReplacer fsp = new ObsoleteSimpleReplacer(format); replace(reader, out, fsp); } public void replace(PushbackReader in, char[] shared, Writer out, ObsoleteReplacer 
replacer) throws IOException { InputReader reader = Input.getInstance(in, shared); replace(reader, out, replacer); } public void replace(InputReader reader, Writer out, ObsoleteReplacer replacer) throws IOException { int start = 0; int end = 0; PrintWriter pw = null; if (out instanceof PrintWriter) { pw = (PrintWriter) out; } else { pw = new PrintWriter(out); } while (!reader.isEof()) { reader.clear(); try { int rc = find(reader); if (rc == 1) { if (reader.getLength() == 0) { reader.read(); } else { end = reader.getStart(); if (end > start) { reader.write(start, end - start, pw); } start = reader.getEnd(); replacer.replace(reader, pw); } } else { break; } } catch (UnderflowException ex) { System.err.println(); end = reader.getEnd(); reader.write(start, end - start, pw); start = end; reader.clear(); } } end = reader.getEnd(); reader.write(start, end - start, pw); pw.close(); } /** * Splits the input * @param text * @return */ public String[] split(CharSequence text) { return split(text, Integer.MAX_VALUE); } /** * Splits the input * @param text * @param limit See java.util.Pattern.split * @return */ public String[] split(CharSequence text, int limit) { try { InputReader reader = Input.getInstance(text); List list = split(reader, limit); return list.toArray(new String[list.size()]); } catch (IOException ex) { throw new IllegalArgumentException("can't happen", ex); } } /** * Splits the input * @param in * @param bufferSize * @return * @throws IOException */ public String[] split(PushbackReader in, int bufferSize) throws IOException { return split(in, bufferSize, Integer.MAX_VALUE); } public String[] split(PushbackReader in, char[] shared) throws IOException { return split(in, shared, Integer.MAX_VALUE); } /** * Splits the input * @param in * @param bufferSize * @param limit See java.util.Pattern.split * @return * @throws IOException * @see Pattern.split */ public String[] split(PushbackReader in, int bufferSize, int limit) throws IOException { InputReader reader = 
Input.getInstance(in, bufferSize); List list = split(reader, limit); return list.toArray(new String[list.size()]); } public String[] split(PushbackReader in, char[] shared, int limit) throws IOException { InputReader reader = Input.getInstance(in, shared); List list = split(reader, limit); return list.toArray(new String[list.size()]); } private List split(InputReader reader, int limit) throws IOException { List list = new ArrayList<>(); int count = 0; int start = 0; int end = 0; while (!reader.isEof()) { count++; if (count == limit) { CharArrayWriter caw = new CharArrayWriter(); int cc = reader.read(); while (cc != -1) { caw.write(cc); cc = reader.read(); } list.add(caw.toString()); return list; } reader.clear(); int rc = find(reader); if (rc == 1) { if (reader.getLength() == 0) { reader.read(); } else { end = reader.getStart(); list.add(reader.getString(start, end - start)); start = reader.getEnd(); } } else { break; } } end = reader.getEnd(); list.add(reader.getString(start, end - start)); if (limit == 0) { while (!list.isEmpty() && list.get(list.size() - 1).isEmpty()) { list.remove(list.size() - 1); } } return list; } protected abstract int match(InputReader reader) throws IOException; protected abstract int find(InputReader reader) throws IOException; /** * Compiles a literal string into RegExImpl class. This is ok for testing. use RegexBuilder * ant task for release classes * @param expression * @return * @throws IOException */ public static Regex literal(String expression) throws IOException { return compile(escape(expression)); } /** * Compiles a literal string into RegexImpl class. This is ok for testing. use RegexBuilder * ant task for release classes * @param expression * @param options * @return * @throws IOException */ public static Regex literal(String expression, Option... options) throws IOException { return compile(escape(expression), options); } /** * Compiles a regular expression into RegExImpl class. 
* @param expression * @param options * @return * @throws NoSuchMethodException * @throws NoSuchFieldException * @throws IOException * @throws InstantiationException * @throws IllegalAccessException */ private static int regexCount; public static Regex compile(String expression, Option... options) throws IOException { String className = "org.vesalainen.regex.Regex"+regexCount; regexCount++; SubClass subClass = createSubClass(expression, className, options); Regex regex = (Regex) subClass.newInstance(); return regex; } /** * Creates a DFA from regular expression * @param expression * @return */ public static DFA createDFA(String expression) { return createDFA(expression, 1); } /** * Creates a DFA from regular expression * @param expression * @param reducer Reducer marks the accepting state with unique identifier * @param options * @return */ public static DFA createDFA(String expression, int reducer, Option... options) { NFA nfa = createNFA(new Scope>(expression), expression, reducer, options); DFA dfa = nfa.constructDFA(new Scope>(expression)); return dfa; } /** * Creates an NFA from regular expression * @param scope * @param expression * @return */ public static NFA createNFA(Scope> scope, String expression) { return createNFA(scope, expression, 1); } /** * Creates an NFA from regular expression * @param scope * @param expression * @param token Token marks the accepting state with unique identifier * @param options * @return */ public static NFA createNFA(Scope> scope, String expression, int token, Option... options) { if (regexParser == null) { regexParser = RegexParserFactory.newInstance(); } return regexParser.createNFA(scope, expression, token, options); } public static SubClass createSubClass(String expression, String classname, Option... 
options) throws IOException { //return createSubClass(expression, createDFA(expression, 1, options), classname); SubClass subClass = new SubClass(Regex.class, classname, Modifier.PUBLIC); DFA dfa = createDFA(expression, 1, options); subClass.codeDefaultConstructor( FieldInitializer.getInstance(El.getField(Regex.class, "acceptEmpty"), dfa.acceptEmpty()), FieldInitializer.getInstance(El.getField(Regex.class, "expression"), expression), FieldInitializer.getInstance(El.getField(Regex.class, "minLength"), dfa.minDepth()), FieldInitializer.getInstance(El.getField(Regex.class, "maxLength"), dfa.maxDepth()) ); MatchCompiler matchCompiler = new MatchCompiler<>(dfa, -1, 0); if (debug) { //Method trace = Regex.class.getDeclaredMethod("trace", Integer.TYPE, String.class); //matchComp.setDebug(trace); } subClass.overrideMethod(matchCompiler, java.lang.reflect.Modifier.PUBLIC, "match", InputReader.class); dfa = createDFA(expression, 1, options); FindCompiler findCompiler = new FindCompiler<>(dfa, -1, 0); if (debug) { //Method trace = Regex.class.getDeclaredMethod("trace", Integer.TYPE, String.class); //findComp.setDebug(trace); } subClass.overrideMethod(findCompiler, java.lang.reflect.Modifier.PUBLIC, "find", InputReader.class); return subClass; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy