
// com.oracle.truffle.regex.tregex.parser.RegexValidator Maven / Gradle / Ivy
/*
* Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
*
* Subject to the condition set forth below, permission is hereby granted to any
* person obtaining a copy of this software, associated documentation and/or
* data (collectively the "Software"), free of charge and under any and all
* copyright rights in the Software, and any and all patent rights owned or
* freely licensable by each licensor hereunder covering either (i) the
* unmodified Software as contributed to or provided by such licensor, or (ii)
* the Larger Works (as defined below), to deal in both
*
* (a) the Software, and
*
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
* one is included with the Software each a "Larger Work" to which the Software
* is contributed by such licensors),
*
* without restriction, including without limitation the rights to copy, create
* derivative works of, display, perform, and distribute the Software and make,
* use, sell, offer for sale, import, export, have made, and have sold the
* Software and the Larger Work(s), and to sublicense the foregoing rights on
* either these or other terms.
*
* This license is subject to the following condition:
*
* The above copyright notice and either this complete permission notice or at a
* minimum a reference to the UPL must be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.oracle.truffle.regex.tregex.parser;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.regex.RegexFlags;
import com.oracle.truffle.regex.RegexOptions;
import com.oracle.truffle.regex.RegexSource;
import com.oracle.truffle.regex.RegexSyntaxException;
import com.oracle.truffle.regex.UnsupportedRegexException;
import com.oracle.truffle.regex.tregex.TRegexOptions;
/**
 * Checks a regular expression for syntax errors without building an AST, and records which
 * regex features the expression uses while doing so.
 *
 * Tokenization is delegated to a {@link RegexLexer}; this class only tracks the stack of open
 * parenthesized expressions and a coarse description of the most recent term.
 */
public class RegexValidator {

    private final RegexSource source;
    private final RegexFlags flags;
    private final RegexLexer lexer;
    /** Populated by {@link #validate()}; {@code null} until then. */
    private RegexFeatures features;

    /**
     * Creates a validator for the given source. The flags are parsed from
     * {@code source.getFlags()} and a lexer is created over the source.
     *
     * @param source the regular expression to validate
     * @param options the options handed to the lexer
     */
    public RegexValidator(RegexSource source, RegexOptions options) {
        this.source = source;
        this.flags = RegexFlags.parseFlags(source.getFlags());
        this.lexer = new RegexLexer(source, flags, options);
    }

    /**
     * Convenience entry point: validates {@code source} using {@link RegexOptions#DEFAULT}.
     *
     * @param source the regular expression to validate
     * @throws RegexSyntaxException when a syntax error is detected in the RegExp
     */
    @TruffleBoundary
    public static void validate(RegexSource source) throws RegexSyntaxException {
        new RegexValidator(source, RegexOptions.DEFAULT).validate();
    }

    /**
     * Validates the regular expression this validator was created for. A fresh
     * {@link RegexFeatures} instance is installed before the dry run starts.
     *
     * @throws RegexSyntaxException when a syntax error is detected in the RegExp
     */
    @TruffleBoundary
    public void validate() throws RegexSyntaxException {
        features = new RegexFeatures();
        parseDryRun();
    }

    /**
     * Returns the features used by the regular expression that was just validated. This property is
     * only populated after a call to {@link #validate()} and should therefore only be accessed
     * then.
     */
    public RegexFeatures getFeatures() {
        assert features != null;
        return features;
    }

    /**
     * Returns the number of capture groups as reported by the lexer.
     */
    @TruffleBoundary
    public int getNumberOfCaptureGroups() {
        return lexer.numberOfCaptureGroups();
    }

    /**
     * Returns the named capture groups as reported by the lexer.
     *
     * NOTE(review): the return type is a raw {@code Map}; its type arguments are determined by
     * {@code RegexLexer#getNamedCaptureGroups()}, which is not visible here. Parameterize this
     * signature once the lexer's return type has been confirmed.
     */
    @TruffleBoundary
    public Map getNamedCaptureGroups() {
        return lexer.getNamedCaptureGroups();
    }

    /**
     * A type representing an entry in the stack of currently open parenthesized expressions in a
     * RegExp.
     */
    private enum RegexStackElem {
        Group,
        LookAheadAssertion,
        LookBehindAssertion
    }

    /**
     * Information about the state of the current term. It can be either null, point to a lookahead
     * assertion node, to a lookbehind assertion node or to some other non-null node.
     */
    private enum CurTermState {
        Null,
        LookAheadAssertion,
        LookBehindAssertion,
        Other
    }

    /**
     * Like {@link RegexParser#parse()}, but does not construct any AST, only checks for syntax
     * errors.
     *
     * This method simulates the state of {@link RegexParser} running {@link RegexParser#parse()}.
     * Most of the syntax errors are handled by {@link RegexLexer}. In order to correctly identify
     * the remaining syntax errors, we need to track only a fraction of the parser's state (the
     * stack of open parenthesized expressions and a short characterization of the last term).
     *
     * Unlike {@link RegexParser#parse()}, this method will never throw an
     * {@link UnsupportedRegexException}.
     *
     * @throws RegexSyntaxException when a syntax error is detected in the RegExp
     */
    private void parseDryRun() throws RegexSyntaxException {
        // Typed stack: the popped element is assigned to a RegexStackElem below, which does not
        // compile against a raw List (remove(int) would return Object).
        List<RegexStackElem> syntaxStack = new ArrayList<>();
        // Number of currently open lookbehind assertions enclosing the current token.
        int lookBehindDepth = 0;
        CurTermState curTermState = CurTermState.Null;
        while (lexer.hasNext()) {
            Token token = lexer.next();
            // Inside a lookbehind, every token other than a character class or a closing
            // parenthesis marks the lookbehind as non-literal.
            if (lookBehindDepth > 0 && token.kind != Token.Kind.charClass && token.kind != Token.Kind.groupEnd) {
                features.setNonLiteralLookBehindAssertions();
            }
            switch (token.kind) {
                case caret:
                    curTermState = CurTermState.Other;
                    break;
                case dollar:
                    if (lookBehindDepth > 0 && !flags.isMultiline()) {
                        features.setEndOfStringAssertionsInLookBehind();
                    }
                    curTermState = CurTermState.Other;
                    break;
                case wordBoundary:
                case nonWordBoundary:
                    if (lookBehindDepth > 0) {
                        features.setWordBoundaryAssertionsInLookBehind();
                    }
                    curTermState = CurTermState.Other;
                    break;
                case backReference:
                    features.setBackReferences();
                    if (lookBehindDepth > 0) {
                        features.setBackReferencesInLookBehind();
                    }
                    curTermState = CurTermState.Other;
                    break;
                case charClass:
                    curTermState = CurTermState.Other;
                    break;
                case quantifier:
                    switch (curTermState) {
                        case Null:
                            throw syntaxError(ErrorMessages.QUANTIFIER_WITHOUT_TARGET);
                        case LookAheadAssertion:
                            // Quantified lookaheads are only rejected when the unicode flag is set.
                            if (flags.isUnicode()) {
                                throw syntaxError(ErrorMessages.QUANTIFIER_ON_LOOKAHEAD_ASSERTION);
                            }
                            break;
                        case LookBehindAssertion:
                            throw syntaxError(ErrorMessages.QUANTIFIER_ON_LOOKBEHIND_ASSERTION);
                        case Other: {
                            Token.Quantifier quantifier = (Token.Quantifier) token;
                            if (lookBehindDepth > 0 && quantifier.getMin() != quantifier.getMax()) {
                                features.setNonTrivialQuantifiersInLookBehind();
                            }
                            // A repetition counts as "large" once either bound exceeds the larger
                            // of the two unroll thresholds.
                            int threshold = Math.max(TRegexOptions.TRegexQuantifierUnrollThresholdSingleCC, TRegexOptions.TRegexQuantifierUnrollThresholdGroup);
                            if (quantifier.getMin() > threshold || quantifier.getMax() > threshold) {
                                features.setLargeCountedRepetitions();
                            }
                            break;
                        }
                    }
                    curTermState = CurTermState.Other;
                    break;
                case alternation:
                    // A new alternative starts with no current term.
                    curTermState = CurTermState.Null;
                    break;
                case captureGroupBegin:
                case nonCaptureGroupBegin:
                    syntaxStack.add(RegexStackElem.Group);
                    curTermState = CurTermState.Null;
                    break;
                case lookAheadAssertionBegin:
                    if (((Token.LookAheadAssertionBegin) token).isNegated()) {
                        features.setNegativeLookAheadAssertions();
                    }
                    if (lookBehindDepth > 0) {
                        features.setLookAheadAssertionsInLookBehind();
                    }
                    syntaxStack.add(RegexStackElem.LookAheadAssertion);
                    curTermState = CurTermState.Null;
                    break;
                case lookBehindAssertionBegin:
                    if (((Token.LookBehindAssertionBegin) token).isNegated()) {
                        features.setNegativeLookBehindAssertions();
                        if (lookBehindDepth > 0) {
                            features.setNegativeLookBehindAssertionsInLookBehind();
                        }
                    }
                    syntaxStack.add(RegexStackElem.LookBehindAssertion);
                    // Incremented only after the checks above, so the opening token itself is
                    // judged against the enclosing lookbehind depth.
                    lookBehindDepth++;
                    curTermState = CurTermState.Null;
                    break;
                case groupEnd:
                    if (syntaxStack.isEmpty()) {
                        throw syntaxError(ErrorMessages.UNMATCHED_RIGHT_PARENTHESIS);
                    }
                    RegexStackElem poppedElem = syntaxStack.remove(syntaxStack.size() - 1);
                    // The kind of expression just closed becomes the current term, which decides
                    // whether a following quantifier is legal.
                    switch (poppedElem) {
                        case LookAheadAssertion:
                            curTermState = CurTermState.LookAheadAssertion;
                            break;
                        case LookBehindAssertion:
                            lookBehindDepth--;
                            curTermState = CurTermState.LookBehindAssertion;
                            break;
                        case Group:
                            curTermState = CurTermState.Other;
                            break;
                    }
                    break;
            }
        }
        // Anything left on the stack is a parenthesized expression that was never closed.
        if (!syntaxStack.isEmpty()) {
            throw syntaxError(ErrorMessages.UNTERMINATED_GROUP);
        }
    }

    /**
     * Creates (but does not throw) a {@link RegexSyntaxException} for this validator's source.
     *
     * @param msg the error message
     * @return the exception for the caller to throw
     */
    private RegexSyntaxException syntaxError(String msg) {
        return new RegexSyntaxException(source, msg);
    }
}