
org.textmapper.lapg.regex.RegexDefParser Maven / Gradle / Ivy
/**
* Copyright 2002-2016 Evgeny Gryaznov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.textmapper.lapg.regex;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import org.textmapper.lapg.common.CharacterSetImpl;
import org.textmapper.lapg.regex.RegexDefLexer.ErrorReporter;
import org.textmapper.lapg.regex.RegexDefLexer.Span;
import org.textmapper.lapg.regex.RegexDefLexer.Tokens;
import org.textmapper.lapg.regex.RegexDefTree.TextSource;
public class RegexDefParser {
/**
 * Thrown by {@code RegexDefParser.parse} when the input cannot be parsed.
 * The exception itself carries no message; the details are sent to the
 * parser's error reporter before it is thrown.
 */
public static class ParseException extends Exception {

    private static final long serialVersionUID = 1L;

    /** Creates an exception without a detail message or cause. */
    public ParseException() {
        super();
    }
}
/** Receives the syntax error raised by parse() and is also handed to the RegexUtil helpers called from applyRule(). */
private final ErrorReporter reporter;
/**
 * Creates a parser that sends its diagnostics to the given reporter.
 *
 * @param reporter sink for error messages; stored as-is, no null check is performed here
 */
public RegexDefParser(ErrorReporter reporter) {
this.reporter = reporter;
}
/** Compile-time switch: when true, shift() and reduce() print a trace of parser actions to System.out. */
private static final boolean DEBUG_SYNTAX = false;
/**
 * Source handle passed to every AST node created in applyRule().
 * It is never assigned inside this class, so it stays null unless set from outside
 * (the field is package-private) -- NOTE(review): confirm which caller initializes it.
 */
TextSource source;
/** Character set builder shared by all RegexUtil.getClassSet() and RegexUtil.toSet() calls made from applyRule(). */
CharacterSetImpl.Builder setbuilder = new CharacterSetImpl.Builder();
// Generated parser tables. Each one is decoded from a packed string by RegexDefLexer.unpack_int.
// NOTE(review): the first argument of unpack_int looks like the number of decoded elements -- confirm in RegexDefLexer.

// Action table indexed by parser state (read by the tmAction(int, int) method).
// An entry >= -2 is returned as the action directly; an entry v < -2 means
// "consult tmLalr starting at index (-v - 3)".
private static final int[] tmAction = RegexDefLexer.unpack_int(39,
"\ufffd\uffff\11\0\12\0\13\0\14\0\uffe5\uffff\uffff\uffff\uffff\uffff\20\0\1\0\uffcf" +
"\uffff\32\0\uffc9\uffff\uffa9\uffff\2\0\uffff\uffff\21\0\22\0\23\0\24\0\25\0\uffff" +
"\uffff\uffff\uffff\uff91\uffff\5\0\6\0\7\0\10\0\33\0\15\0\16\0\uff79\uffff\26\0\17" +
"\0\3\0\30\0\31\0\uffff\uffff\ufffe\uffff");
// Lookahead table: consecutive (symbol, action) pairs. The tmAction(int, int) method scans
// pairs until the symbol matches or a negative symbol slot is reached; the action slot
// of that terminating pair is what is returned when nothing matched.
private static final int[] tmLalr = RegexDefLexer.unpack_int(144,
"\1\0\uffff\uffff\2\0\uffff\uffff\3\0\uffff\uffff\4\0\uffff\uffff\14\0\uffff\uffff" +
"\20\0\uffff\uffff\21\0\uffff\uffff\22\0\uffff\uffff\23\0\uffff\uffff\0\0\35\0\15" +
"\0\35\0\uffff\uffff\ufffe\uffff\1\0\uffff\uffff\2\0\uffff\uffff\3\0\uffff\uffff\4" +
"\0\uffff\uffff\14\0\uffff\uffff\20\0\uffff\uffff\21\0\uffff\uffff\22\0\uffff\uffff" +
"\15\0\35\0\16\0\35\0\uffff\uffff\ufffe\uffff\15\0\uffff\uffff\0\0\0\0\uffff\uffff" +
"\ufffe\uffff\5\0\uffff\uffff\6\0\uffff\uffff\7\0\uffff\uffff\10\0\uffff\uffff\0\0" +
"\4\0\1\0\4\0\2\0\4\0\3\0\4\0\4\0\4\0\14\0\4\0\15\0\4\0\16\0\4\0\20\0\4\0\21\0\4\0" +
"\22\0\4\0\uffff\uffff\ufffe\uffff\1\0\uffff\uffff\2\0\uffff\uffff\3\0\uffff\uffff" +
"\4\0\uffff\uffff\14\0\uffff\uffff\20\0\uffff\uffff\21\0\uffff\uffff\22\0\uffff\uffff" +
"\0\0\34\0\15\0\34\0\16\0\34\0\uffff\uffff\ufffe\uffff\1\0\uffff\uffff\2\0\uffff\uffff" +
"\3\0\uffff\uffff\4\0\uffff\uffff\14\0\uffff\uffff\20\0\uffff\uffff\21\0\uffff\uffff" +
"\22\0\uffff\uffff\0\0\35\0\15\0\35\0\16\0\35\0\uffff\uffff\ufffe\uffff\1\0\uffff" +
"\uffff\2\0\uffff\uffff\3\0\27\0\24\0\27\0\25\0\27\0\uffff\uffff\ufffe\uffff");
// Goto index, addressed by symbol id: entries [symbol] and [symbol + 1] delimit the
// half-open slice of lapg_sym_from / lapg_sym_to that belongs to that symbol (see tmGoto).
private static final int[] lapg_sym_goto = RegexDefLexer.unpack_int(31,
"\0\0\1\0\12\0\23\0\33\0\37\0\40\0\41\0\42\0\43\0\43\0\43\0\43\0\47\0\51\0\52\0\52" +
"\0\56\0\62\0\66\0\67\0\71\0\75\0\76\0\100\0\104\0\110\0\114\0\116\0\121\0\124\0");
// Source states of the goto transitions. tmGoto binary-searches each per-symbol slice,
// so the states within a slice have to be in ascending order.
private static final int[] lapg_sym_from = RegexDefLexer.unpack_int(84,
"\45\0\0\0\5\0\6\0\7\0\15\0\25\0\26\0\27\0\37\0\0\0\5\0\6\0\7\0\15\0\25\0\26\0\27" +
"\0\37\0\0\0\5\0\6\0\7\0\15\0\25\0\26\0\27\0\0\0\5\0\15\0\27\0\14\0\14\0\14\0\14\0" +
"\0\0\5\0\15\0\27\0\12\0\17\0\17\0\0\0\5\0\15\0\27\0\0\0\5\0\15\0\27\0\0\0\5\0\15" +
"\0\27\0\0\0\25\0\26\0\6\0\7\0\25\0\26\0\0\0\0\0\5\0\0\0\5\0\15\0\27\0\0\0\5\0\15" +
"\0\27\0\6\0\7\0\25\0\26\0\6\0\7\0\0\0\5\0\27\0\0\0\5\0\27\0");
// Target states of the goto transitions, parallel to lapg_sym_from (same index = same transition).
private static final int[] lapg_sym_to = RegexDefLexer.unpack_int(84,
"\46\0\1\0\1\0\20\0\20\0\1\0\20\0\20\0\1\0\43\0\2\0\2\0\21\0\21\0\2\0\21\0\21\0\2" +
"\0\44\0\3\0\3\0\22\0\22\0\3\0\22\0\22\0\3\0\4\0\4\0\4\0\4\0\30\0\31\0\32\0\33\0\5" +
"\0\5\0\5\0\5\0\27\0\27\0\35\0\6\0\6\0\6\0\6\0\7\0\7\0\7\0\7\0\10\0\10\0\10\0\10\0" +
"\11\0\36\0\41\0\23\0\23\0\37\0\37\0\45\0\12\0\17\0\13\0\13\0\34\0\13\0\14\0\14\0" +
"\14\0\14\0\24\0\24\0\40\0\40\0\25\0\26\0\15\0\15\0\15\0\16\0\16\0\42\0");
// Number of right-hand-side symbols of each rule, indexed by rule; reduce() pops this many stack entries.
private static final int[] tmRuleLen = RegexDefLexer.unpack_int(30,
"\1\0\1\0\1\0\3\0\1\0\2\0\2\0\2\0\2\0\1\0\1\0\1\0\1\0\3\0\3\0\3\0\1\0\1\0\1\0\1\0" +
"\1\0\1\0\2\0\2\0\3\0\3\0\1\0\2\0\1\0\0\0");
// Symbol id produced by each rule (its left-hand side), indexed by rule; reduce() uses it for the goto lookup.
private static final int[] tmRuleSymbol = RegexDefLexer.unpack_int(30,
"\26\0\26\0\27\0\27\0\30\0\30\0\30\0\30\0\30\0\31\0\31\0\31\0\31\0\31\0\31\0\31\0" +
"\31\0\32\0\32\0\32\0\33\0\33\0\33\0\33\0\33\0\33\0\34\0\34\0\35\0\35\0");
/**
 * Printable names of the grammar symbols, indexed by symbol id.
 * Indices 22..29 line up with the constants declared in Nonterminals;
 * the lower indices are presumably the token ids from RegexDefLexer.Tokens -- TODO confirm.
 * Within this class the names are only read for the DEBUG_SYNTAX trace output.
 */
protected static final String[] tmSymbolNames = new String[] {
"eoi",
"char",
"escaped",
"charclass",
"'.'",
"'*'",
"'+'",
"'?'",
"quantifier",
"op_minus",
"op_union",
"op_intersect",
"'('",
"'|'",
"')'",
"'(?'",
"'['",
"'[^'",
"expand",
"kw_eoi",
"']'",
"'-'",
"input",
"pattern",
"part",
"primitive_part",
"setsymbol",
"charset",
"parts",
"partsopt",
};
/**
 * Symbol ids of the grammar's non-terminals. The interface extends Tokens, so
 * implementors and importers see token and non-terminal ids side by side.
 * Each value is the index of the matching name in tmSymbolNames.
 */
public interface Nonterminals extends Tokens {
// non-terminals
int input = 22;
int pattern = 23;
int part = 24;
int primitive_part = 25;
int setsymbol = 26;
int charset = 27;
int parts = 28;
int partsopt = 29;
}
/**
 * Determines what the parser does in {@code state} when the lookahead is {@code symbol}.
 *
 * <p>Meaning of the returned value:
 * <pre>
 *   -2     Error
 *   -1     Shift
 *   0..n   Reduce (rule index)
 * </pre>
 * A raw table entry below -2 is not an action but a reference into the lookahead
 * table ({@code -3 - n}); it is resolved here and never returned as-is.
 *
 * @param state  current parser state (index into the action table)
 * @param symbol id of the lookahead symbol
 * @return the action code as described above
 */
protected static int tmAction(int state, int symbol) {
    // Plain entry: the table value already is the action.
    if (tmAction[state] >= -2) {
        return tmAction[state];
    }

    // Otherwise walk the (symbol, action) pairs that start at the encoded offset.
    // A negative symbol slot ends the run; its action slot acts as the default.
    int pos = -tmAction[state] - 3;
    while (tmLalr[pos] >= 0 && tmLalr[pos] != symbol) {
        pos += 2;
    }
    return tmLalr[pos + 1];
}
/**
 * Finds the state the parser enters from {@code state} on {@code symbol}.
 * The transitions of one symbol occupy a contiguous slice of the from/to
 * tables, which is binary-searched by source state.
 *
 * @param state  source state
 * @param symbol id of the symbol being shifted or reduced to
 * @return the target state, or -1 when no transition exists
 */
protected static int tmGoto(int state, int symbol) {
    int low = lapg_sym_goto[symbol];
    int high = lapg_sym_goto[symbol + 1] - 1;

    while (low <= high) {
        int mid = (low + high) >> 1;
        int candidate = lapg_sym_from[mid];
        if (candidate < state) {
            low = mid + 1;
        } else if (candidate > state) {
            high = mid - 1;
        } else {
            return lapg_sym_to[mid];
        }
    }
    return -1;
}
// Mutable state of the parse in progress; all four fields are (re)initialized at the start of parse().

/** Index of the top entry of tmStack. */
protected int tmHead;
/** Parse stack; each entry carries a symbol, its value, its offsets and the parser state reached after it. */
protected Span[] tmStack;
/** Current lookahead token, as last returned by tmLexer.next(). */
protected Span tmNext;
/** Lexer supplying the tokens of the parse in progress. */
protected RegexDefLexer tmLexer;
/**
 * Parses the token stream produced by {@code lexer} and returns the resulting AST.
 *
 * <p>The parser repeatedly looks up the action for the current state and lookahead
 * and shifts or reduces until it reaches state 38 (the state in which the result is
 * returned) or hits an error. On failure a "syntax error" message is sent to the
 * reporter with the offsets of the current lookahead token.
 *
 * @param lexer source of tokens; also queried for the line number in the error message
 * @return the value stored just below the top of the stack once state 38 is reached
 * @throws IOException declared because {@code shift()} may read further tokens from the lexer
 * @throws ParseException if the input does not match the grammar
 */
public RegexAstPart parse(RegexDefLexer lexer) throws IOException, ParseException {
    final int finalState = 38;

    tmLexer = lexer;
    tmStack = new Span[1024];
    tmHead = 0;

    // Bottom-of-stack marker holding the initial state.
    Span bottom = new Span();
    bottom.state = 0;
    tmStack[0] = bottom;

    tmNext = tmLexer.next();

    while (tmStack[tmHead].state != finalState) {
        int action = tmAction(tmStack[tmHead].state, tmNext.symbol);
        if (action == -2) {
            // Error action: nothing is shifted or reduced.
            break;
        }
        if (action == -1) {
            shift();
        } else if (action >= 0) {
            reduce(action);
        }
        if (tmStack[tmHead].state == -1) {
            // The goto lookup after the shift/reduce found no transition.
            break;
        }
    }

    if (tmStack[tmHead].state == finalState) {
        return (RegexAstPart) tmStack[tmHead - 1].value;
    }

    String message = MessageFormat.format("syntax error before line {0}",
            tmLexer.getTokenLine());
    reporter.error(message, tmNext.offset, tmNext.endoffset);
    throw new ParseException();
}
/**
 * Pushes the current lookahead token onto the stack, records the state reached
 * through the goto table and, unless the transition failed (state -1) or the token
 * is symbol 0 (listed as "eoi" in tmSymbolNames), fetches the next token from the lexer.
 *
 * <p>The stack is grown on demand, so deeply nested input no longer fails with an
 * {@link ArrayIndexOutOfBoundsException} once the initial capacity is exhausted.
 *
 * @throws IOException declared for the call to the lexer's {@code next()}
 */
protected void shift() throws IOException {
    ensureStackCapacity();
    tmStack[++tmHead] = tmNext;
    tmStack[tmHead].state = tmGoto(tmStack[tmHead - 1].state, tmNext.symbol);
    if (DEBUG_SYNTAX) {
        System.out.println(MessageFormat.format("shift: {0} ({1})", tmSymbolNames[tmNext.symbol], tmLexer.tokenText()));
    }
    if (tmStack[tmHead].state != -1 && tmNext.symbol != 0) {
        tmNext = tmLexer.next();
    }
}

/**
 * Applies grammar rule {@code rule}: builds the entry for the rule's left-hand
 * symbol, runs the rule's semantic action, pops the right-hand-side entries and
 * pushes the new entry with the state given by the goto table.
 *
 * @param rule index of the rule to reduce by (index into tmRuleLen / tmRuleSymbol)
 */
protected void reduce(int rule) {
    final int ruleLength = tmRuleLen[rule];
    final boolean nonEmpty = ruleLength != 0;

    Span left = new Span();
    // By default the result inherits the value of the first right-hand-side symbol;
    // applyRule() overrides it for rules that have an explicit action.
    left.value = nonEmpty ? tmStack[tmHead + 1 - ruleLength].value : null;
    left.symbol = tmRuleSymbol[rule];
    left.state = 0;
    if (DEBUG_SYNTAX) {
        System.out.println("reduce to " + tmSymbolNames[tmRuleSymbol[rule]]);
    }

    // An empty rule covers the zero-width span located at the start of the lookahead.
    Span startsym = nonEmpty ? tmStack[tmHead + 1 - ruleLength] : tmNext;
    left.offset = startsym.offset;
    left.endoffset = nonEmpty ? tmStack[tmHead].endoffset : tmNext.offset;

    applyRule(left, rule, ruleLength);

    for (int e = ruleLength; e > 0; e--) {
        tmStack[tmHead--] = null;
    }

    // An empty rule pops nothing, so this push can grow the stack by one entry.
    ensureStackCapacity();
    tmStack[++tmHead] = left;
    tmStack[tmHead].state = tmGoto(tmStack[tmHead - 1].state, left.symbol);
}

/** Doubles the parse stack when the next push would run past its end. */
private void ensureStackCapacity() {
    if (tmHead + 1 < tmStack.length) {
        return;
    }
    Span[] grown = new Span[tmStack.length * 2];
    System.arraycopy(tmStack, 0, grown, 0, tmStack.length);
    tmStack = grown;
}
/**
 * Runs the semantic action of a grammar rule. It is called by {@code reduce()} before
 * the right-hand-side entries are popped, so {@code tmStack[tmHead]} is the last
 * right-hand-side symbol, {@code tmStack[tmHead - 1]} the one before it, and so on.
 * Rules without a case keep the default value that {@code reduce()} stored in
 * {@code tmLeft}.
 *
 * @param tmLeft     entry being built for the rule's left-hand symbol; its offsets are
 *                   already set and its value may be replaced here
 * @param ruleIndex  index of the rule being reduced
 * @param ruleLength number of right-hand-side symbols (not used by any action)
 */
@SuppressWarnings("unchecked")
protected void applyRule(Span tmLeft, int ruleIndex, int ruleLength) {
    switch (ruleIndex) {
        case 1: // input ::= kw_eoi
            tmLeft.value = new RegexAstChar(-1, source, tmLeft.offset, tmLeft.endoffset);
            break;

        case 2: { // pattern ::= partsopt
            Span parts = tmStack[tmHead];
            tmLeft.value = RegexUtil.emptyIfNull((RegexAstPart) parts.value, source, parts.offset);
            break;
        }

        case 3: { // pattern ::= pattern '|' partsopt
            RegexAstPart alternatives = (RegexAstPart) tmStack[tmHead - 2].value;
            Span branch = tmStack[tmHead];
            tmLeft.value = RegexUtil.createOr(alternatives, (RegexAstPart) branch.value, source, branch.offset);
            break;
        }

        case 5: // part ::= primitive_part '*'
        case 6: // part ::= primitive_part '+'
        case 7: { // part ::= primitive_part '?'
            // '*' -> (0, -1), '+' -> (1, -1), '?' -> (0, 1)
            int min = (ruleIndex == 6) ? 1 : 0;
            int max = (ruleIndex == 7) ? 1 : -1;
            RegexAstPart operand = (RegexAstPart) tmStack[tmHead - 1].value;
            tmLeft.value = new RegexAstQuantifier(operand, min, max, source, tmLeft.offset, tmLeft.endoffset);
            break;
        }

        case 8: { // part ::= primitive_part quantifier
            RegexAstPart operand = (RegexAstPart) tmStack[tmHead - 1].value;
            tmLeft.value = RegexUtil.createQuantifier(operand, source, tmStack[tmHead].offset, tmLeft.endoffset, reporter);
            break;
        }

        case 9:  // primitive_part ::= char
        case 10: // primitive_part ::= escaped
        case 17: // setsymbol ::= char
        case 18: { // setsymbol ::= escaped
            Integer code = (Integer) tmStack[tmHead].value;
            tmLeft.value = new RegexAstChar(code, source, tmLeft.offset, tmLeft.endoffset);
            break;
        }

        case 11: // primitive_part ::= charclass
        case 19: { // setsymbol ::= charclass
            String className = (String) tmStack[tmHead].value;
            tmLeft.value = new RegexAstCharClass(
                    className,
                    RegexUtil.getClassSet((String) tmStack[tmHead].value, setbuilder, reporter, tmLeft.offset, tmLeft.endoffset),
                    source, tmLeft.offset, tmLeft.endoffset);
            break;
        }

        case 12: // primitive_part ::= '.'
            tmLeft.value = new RegexAstAny(source, tmLeft.offset, tmLeft.endoffset);
            break;

        case 13: // primitive_part ::= '(' pattern ')'
            tmLeft.value = RegexUtil.wrap((RegexAstPart) tmStack[tmHead - 1].value);
            break;

        case 14: // primitive_part ::= '[' charset ']'
        case 15: { // primitive_part ::= '[^' charset ']'
            boolean inverted = (ruleIndex == 15);
            tmLeft.value = RegexUtil.toSet((List) tmStack[tmHead - 1].value, reporter, setbuilder, inverted);
            break;
        }

        case 16: { // primitive_part ::= expand
            RegexAstExpand expand = new RegexAstExpand(source, tmLeft.offset, tmLeft.endoffset);
            tmLeft.value = expand;
            RegexUtil.checkExpand(expand, reporter);
            break;
        }

        case 20: { // charset ::= '-'
            List symbols = new ArrayList();
            tmLeft.value = symbols;
            Span dash = tmStack[tmHead];
            symbols.add(new RegexAstChar('-', source, dash.offset, dash.endoffset));
            break;
        }

        case 21: { // charset ::= setsymbol
            List symbols = new ArrayList();
            tmLeft.value = symbols;
            RegexUtil.addSetSymbol(symbols, (RegexAstPart) tmStack[tmHead].value, reporter);
            break;
        }

        case 22: { // charset ::= charset setsymbol
            // The existing list is extended in place; tmLeft keeps pointing at it.
            List symbols = (List) tmStack[tmHead - 1].value;
            RegexUtil.addSetSymbol(symbols, (RegexAstPart) tmStack[tmHead].value, reporter);
            break;
        }

        case 23: { // charset ::= charset '-' %prec char
            List symbols = (List) tmStack[tmHead - 1].value;
            Span dash = tmStack[tmHead];
            symbols.add(new RegexAstChar('-', source, dash.offset, dash.endoffset));
            break;
        }

        case 24: // charset ::= charset '-' char
        case 25: { // charset ::= charset '-' escaped
            List symbols = (List) tmStack[tmHead - 2].value;
            Span upper = tmStack[tmHead];
            RegexUtil.applyRange(symbols,
                    new RegexAstChar((Integer) upper.value, source, upper.offset, upper.endoffset),
                    reporter);
            break;
        }

        case 27: { // parts ::= parts part
            RegexAstPart head = (RegexAstPart) tmStack[tmHead - 1].value;
            RegexAstPart tail = (RegexAstPart) tmStack[tmHead].value;
            tmLeft.value = RegexUtil.createSequence(head, tail);
            break;
        }

        default:
            // Remaining rules have no explicit action.
            break;
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy