// jflex.core.RegExp Maven / Gradle / Ivy
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* JFlex 1.8.2 *
* Copyright (C) 1998-2018 Gerwin Klein *
* All rights reserved. *
* *
* License: BSD *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
package jflex.core;
import java.util.List;
import jflex.core.unicode.CharClasses;
import jflex.core.unicode.IntCharSet;
import jflex.exceptions.CharClassException;
/**
 * Stores a regular expression of the rules section in a JFlex specification.
 *
 * <p>This base class has no content other than its type. The recursive methods below cast
 * {@code this} to {@link RegExp1} or {@link RegExp2}, depending on {@link #type}, to reach the
 * operands of the expression.
 *
 * @author Gerwin Klein
 * @version JFlex 1.8.2
 */
public class RegExp {

  /**
   * The type of the regular expression. This field will be filled with values from class sym.java
   * (generated by cup)
   */
  int type;

  /**
   * Create a new regular expression of the specified type.
   *
   * @param type a value from the cup generated class sym.
   */
  public RegExp(int type) {
    this.type = type;
  }

  /**
   * Returns a String-representation of this regular expression with the specified indentation.
   *
   * @param tab a String that should contain only space characters and that is inserted in front of
   *     the standard String-representation of this object.
   * @return {@code tab} followed by {@link #toString()}.
   */
  public String print(String tab) {
    return tab + toString();
  }

  @Override
  public String toString() {
    return "type = " + typeName();
  }

  /**
   * String representation of the type of this regular expression.
   *
   * @return the entry of {@code sym.terminalNames} at index {@link #type}.
   */
  public String typeName() {
    return sym.terminalNames[type];
  }

  /**
   * Find out if this regexp is a char class or equivalent to one.
   *
   * <p>Single characters and primitive classes qualify directly; an alternative ({@code sym.BAR})
   * qualifies if both of its operands do.
   *
   * @return true if the regexp is equivalent to a char class.
   */
  public boolean isCharClass() {
    switch (type) {
      case sym.CHAR:
      case sym.CHAR_I:
      case sym.PRIMCLASS:
        return true;

      case sym.BAR:
        {
          RegExp2 binary = (RegExp2) this;
          return binary.r1.isCharClass() && binary.r2.isCharClass();
        }

      default:
        return false;
    }
  }

  /**
   * The approximate number of NFA states this expression will need (only works correctly after
   * macro expansion and without negation)
   *
   * @param macros macro table for expansion
   * @return the estimated number of states.
   * @throws RegExpException if the type of this expression is not handled here.
   */
  public int size(Macros macros) {
    RegExp1 unary;
    RegExp2 binary;
    int inner;

    switch (type) {
      case sym.BAR:
        binary = (RegExp2) this;
        return binary.r1.size(macros) + binary.r2.size(macros) + 2;

      case sym.CONCAT:
        binary = (RegExp2) this;
        return binary.r1.size(macros) + binary.r2.size(macros);

      case sym.STAR:
      case sym.PLUS:
        unary = (RegExp1) this;
        return ((RegExp) unary.content).size(macros) + 2;

      case sym.QUESTION:
        unary = (RegExp1) this;
        return ((RegExp) unary.content).size(macros);

      case sym.BANG:
        // this is only a very rough estimate (worst case 2^n)
        // exact size too complicated (probably requires construction)
        unary = (RegExp1) this;
        // compute the operand size once: evaluating it twice doubles the work at every
        // nesting level of negation
        inner = ((RegExp) unary.content).size(macros);
        return inner * inner;

      case sym.TILDE:
        // see sym.BANG
        unary = (RegExp1) this;
        inner = ((RegExp) unary.content).size(macros);
        return inner * inner * 3;

      case sym.STRING:
      case sym.STRING_I:
        unary = (RegExp1) this;
        return ((String) unary.content).length() + 1;

      case sym.CHAR:
      case sym.CHAR_I:
      case sym.CCLASS:
      case sym.CCLASSNOT:
      case sym.CCLASSOP:
      case sym.PRIMCLASS:
        return 2;

      case sym.MACROUSE:
        unary = (RegExp1) this;
        return macros.getDefinition((String) unary.content).size(macros);

      default:
        throw new RegExpException(this);
    }
  }

  /**
   * Reverses a string.
   *
   * @param s the string to reverse
   * @return the reversed string, as produced by {@link StringBuilder#reverse()}.
   */
  static String revString(String s) {
    return new StringBuilder(s).reverse().toString();
  }

  /**
   * Recursively convert tilde (upto) expressions into negation and star.
   *
   * <p>Only the expression types that remain after {@link #normalise(Macros)} are handled; char
   * class lists, char class operations and macro uses are rejected.
   *
   * @return new RegExp equivalent to the current one, but without upto expressions.
   * @throws RegExpException if the type of this expression is not handled here.
   */
  public final RegExp resolveTilde() {
    RegExp1 unary;
    RegExp2 binary;
    RegExp content;

    switch (type) {
      case sym.BAR:
      case sym.CONCAT:
        binary = (RegExp2) this;
        return new RegExp2(type, binary.r1.resolveTilde(), binary.r2.resolveTilde());

      case sym.STAR:
      case sym.PLUS:
      case sym.QUESTION:
      case sym.BANG:
        unary = (RegExp1) this;
        content = (RegExp) unary.content;
        return new RegExp1(type, content.resolveTilde());

      case sym.TILDE:
        // ~a = !([^]* a [^]*) a
        // uses subexpression sharing
        unary = (RegExp1) this;
        content = ((RegExp) unary.content).resolveTilde();

        RegExp any_star = new RegExp1(sym.STAR, anyChar());
        RegExp neg =
            new RegExp1(
                sym.BANG,
                new RegExp2(sym.CONCAT, any_star, new RegExp2(sym.CONCAT, content, any_star)));

        return new RegExp2(sym.CONCAT, neg, content);

      case sym.STRING:
      case sym.STRING_I:
      case sym.CHAR:
      case sym.CHAR_I:
      case sym.PRIMCLASS:
        unary = (RegExp1) this;
        return new RegExp1(unary.type, unary.content);

      default:
        throw new RegExpException(this);
    }
  }

  /**
   * Returns a regexp that matches any character: {@code [^]}
   *
   * @return the regexp for {@code [^]}
   */
  public static RegExp anyChar() {
    return new RegExp1(sym.PRIMCLASS, IntCharSet.allChars());
  }

  /**
   * Confirms that the parameter is a RegExp1 of type sym.PRIMCLASS.
   *
   * @param r the RegExp to check
   * @throws CharClassException if r is not a RegExp1 or not of type sym.PRIMCLASS.
   * @return r cast to RegExp1
   */
  public static RegExp1 checkPrimClass(RegExp r) {
    if (!(r instanceof RegExp1 && r.type == sym.PRIMCLASS)) {
      throw new CharClassException("Not normalised " + r);
    }
    return (RegExp1) r;
  }

  /**
   * Performs the given set operation on the two {@link IntCharSet} parameters.
   *
   * @param op the operation to perform (as {@link sym} constant)
   * @param l the left operand of the expression
   * @param r the right operand of the expression
   * @param ctxt the regular expression containing the provided operator
   * @return the {@link IntCharSet} resulting from the operation
   * @throws RegExpException for {@code ctxt} if the operator is not supported
   */
  public static IntCharSet performClassOp(int op, IntCharSet l, IntCharSet r, RegExp ctxt) {
    IntCharSet set;
    IntCharSet intersection = l.and(r);

    switch (op) {
      case sym.INTERSECTION:
        return intersection;

      case sym.DIFFERENCE:
        // IntCharSet.sub() assumes its argument is a subset, so subtract intersection
        set = IntCharSet.copyOf(l);
        set.sub(intersection);
        return set;

      case sym.SYMMETRICDIFFERENCE:
        // (l union r) minus (l intersect r)
        set = IntCharSet.copyOf(l);
        set.add(r);
        set.sub(intersection);
        return set;

      default:
        throw new RegExpException(ctxt);
    }
  }

  /**
   * Normalise the regular expression to eliminate macro use (expand them), and compound character
   * class expression (compute their content).
   *
   * @param m the macro table used to look up the definition of each {@link sym#MACROUSE}
   * @return a regexp that contains only {@link IntCharSet} char classes and no {@link
   *     sym#MACROUSE}.
   * @throws RegExpException if the type of this expression is not handled here.
   * @throws CharClassException if an element of a char class does not normalise to a {@link
   *     sym#PRIMCLASS}.
   */
  @SuppressWarnings("unchecked")
  public final RegExp normalise(Macros m) {
    RegExp1 unary;
    RegExp2 binary;
    RegExp content;

    switch (type) {
      case sym.BAR:
      case sym.CONCAT:
        binary = (RegExp2) this;
        return new RegExp2(type, binary.r1.normalise(m), binary.r2.normalise(m));

      case sym.STAR:
      case sym.PLUS:
      case sym.QUESTION:
      case sym.BANG:
      case sym.TILDE:
        unary = (RegExp1) this;
        content = (RegExp) unary.content;
        return new RegExp1(type, content.normalise(m));

      case sym.STRING:
      case sym.STRING_I:
      case sym.CHAR:
      case sym.CHAR_I:
      case sym.PRIMCLASS:
        unary = (RegExp1) this;
        return new RegExp1(type, unary.content);

      case sym.CCLASS:
        {
          unary = (RegExp1) this;
          // unchecked: the content of a CCLASS is treated as a list of regexps
          List<RegExp> contents = (List<RegExp>) unary.content;
          // union of all elements, starting from the empty set
          IntCharSet set = new IntCharSet();
          for (RegExp r : contents) {
            RegExp1 n = checkPrimClass(r.normalise(m));
            set.add((IntCharSet) n.content);
          }
          return new RegExp1(sym.PRIMCLASS, set);
        }

      case sym.CCLASSNOT:
        {
          unary = (RegExp1) this;
          // unchecked: the content of a CCLASSNOT is treated as a list of regexps
          List<RegExp> contents = (List<RegExp>) unary.content;
          // complement: start from all characters and remove every element
          IntCharSet set = IntCharSet.allChars();
          for (RegExp r : contents) {
            RegExp1 n = checkPrimClass(r.normalise(m));
            set.sub((IntCharSet) n.content);
          }
          return new RegExp1(sym.PRIMCLASS, set);
        }

      case sym.CCLASSOP:
        {
          // the content is a binary node whose type is the set operator
          unary = (RegExp1) this;
          binary = (RegExp2) unary.content;
          RegExp1 l = checkPrimClass(binary.r1.normalise(m));
          IntCharSet setl = (IntCharSet) l.content;
          RegExp1 r = checkPrimClass(binary.r2.normalise(m));
          IntCharSet setr = (IntCharSet) r.content;
          IntCharSet set = performClassOp(binary.type, setl, setr, this);
          return new RegExp1(sym.PRIMCLASS, set);
        }

      case sym.MACROUSE:
        unary = (RegExp1) this;
        return m.getDefinition((String) unary.content).normalise(m);

      default:
        throw new RegExpException(this);
    }
  }

  /**
   * Make character class partitions based on the classes mentioned in this regexp.
   *
   * <p>Assumption: regexp is normalised.
   *
   * @param c the character classes that receive every {@link sym#PRIMCLASS} set found in this
   *     regexp
   * @param caseless passed through unchanged to {@code c.makeClass}
   * @throws CharClassException if an expression type is encountered that is not handled here.
   */
  public final void makeCCLs(CharClasses c, boolean caseless) {
    RegExp1 unary;
    RegExp2 binary;
    RegExp content;

    switch (type) {
      case sym.BAR:
      case sym.CONCAT:
        binary = (RegExp2) this;
        binary.r1.makeCCLs(c, caseless);
        binary.r2.makeCCLs(c, caseless);
        return;

      case sym.STAR:
      case sym.PLUS:
      case sym.QUESTION:
      case sym.BANG:
      case sym.TILDE:
        unary = (RegExp1) this;
        content = (RegExp) unary.content;
        content.makeCCLs(c, caseless);
        return;

      case sym.STRING:
      case sym.STRING_I:
      case sym.CHAR:
      case sym.CHAR_I:
        // nothing is registered for strings and single characters
        return;

      case sym.PRIMCLASS:
        unary = (RegExp1) this;
        IntCharSet set = (IntCharSet) unary.content;
        c.makeClass(set, caseless);
        return;

      default:
        throw new CharClassException("makeCCLs: unexpected regexp " + this);
    }
  }

  /**
   * Creates a new regexp that matches the reverse text of this one.
   *
   * @return the reverse regexp
   * @throws RegExpException if the type of this expression is not handled here.
   */
  public final RegExp rev() {
    RegExp1 unary;
    RegExp2 binary;
    RegExp content;

    switch (type) {
      case sym.BAR:
        binary = (RegExp2) this;
        return new RegExp2(sym.BAR, binary.r1.rev(), binary.r2.rev());

      case sym.CONCAT:
        // operands swap places: the reverse of "ab" is rev(b) rev(a)
        binary = (RegExp2) this;
        return new RegExp2(sym.CONCAT, binary.r2.rev(), binary.r1.rev());

      case sym.STAR:
      case sym.PLUS:
      case sym.QUESTION:
      case sym.BANG:
        unary = (RegExp1) this;
        content = (RegExp) unary.content;
        return new RegExp1(type, content.rev());

      case sym.TILDE:
        // rewrite the upto expression first, then reverse the result
        return resolveTilde().rev();

      case sym.STRING:
      case sym.STRING_I:
        unary = (RegExp1) this;
        return new RegExp1(unary.type, revString((String) unary.content));

      case sym.CHAR:
      case sym.CHAR_I:
      case sym.PRIMCLASS:
        unary = (RegExp1) this;
        return new RegExp1(unary.type, unary.content);

      default:
        throw new RegExpException(this);
    }
  }
}