// eu.cqse.check.framework.shallowparser.languages.base.CStyleShallowParserBase
// (source listing taken from the teamscale-check-api artifact)
/*
* Copyright (c) CQSE GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.cqse.check.framework.shallowparser.languages.base;
import static eu.cqse.check.framework.scanner.ETokenType.AND;
import static eu.cqse.check.framework.scanner.ETokenType.ANDAND;
import static eu.cqse.check.framework.scanner.ETokenType.ANDEQ;
import static eu.cqse.check.framework.scanner.ETokenType.CASE;
import static eu.cqse.check.framework.scanner.ETokenType.CATCH;
import static eu.cqse.check.framework.scanner.ETokenType.COLON;
import static eu.cqse.check.framework.scanner.ETokenType.COMMA;
import static eu.cqse.check.framework.scanner.ETokenType.COMP;
import static eu.cqse.check.framework.scanner.ETokenType.CONST;
import static eu.cqse.check.framework.scanner.ETokenType.CONSTEXPR;
import static eu.cqse.check.framework.scanner.ETokenType.DEFAULT;
import static eu.cqse.check.framework.scanner.ETokenType.DO;
import static eu.cqse.check.framework.scanner.ETokenType.ELSE;
import static eu.cqse.check.framework.scanner.ETokenType.EQ;
import static eu.cqse.check.framework.scanner.ETokenType.FINALLY;
import static eu.cqse.check.framework.scanner.ETokenType.IDENTIFIER;
import static eu.cqse.check.framework.scanner.ETokenType.IF;
import static eu.cqse.check.framework.scanner.ETokenType.LBRACE;
import static eu.cqse.check.framework.scanner.ETokenType.LBRACK;
import static eu.cqse.check.framework.scanner.ETokenType.LITERALS;
import static eu.cqse.check.framework.scanner.ETokenType.LPAREN;
import static eu.cqse.check.framework.scanner.ETokenType.MULT;
import static eu.cqse.check.framework.scanner.ETokenType.NOT;
import static eu.cqse.check.framework.scanner.ETokenType.NOTEQ;
import static eu.cqse.check.framework.scanner.ETokenType.OR;
import static eu.cqse.check.framework.scanner.ETokenType.OREQ;
import static eu.cqse.check.framework.scanner.ETokenType.OROR;
import static eu.cqse.check.framework.scanner.ETokenType.QUESTION;
import static eu.cqse.check.framework.scanner.ETokenType.RBRACE;
import static eu.cqse.check.framework.scanner.ETokenType.RBRACK;
import static eu.cqse.check.framework.scanner.ETokenType.RPAREN;
import static eu.cqse.check.framework.scanner.ETokenType.SCOPE;
import static eu.cqse.check.framework.scanner.ETokenType.SEMICOLON;
import static eu.cqse.check.framework.scanner.ETokenType.TRY;
import static eu.cqse.check.framework.scanner.ETokenType.TYPEDEF;
import static eu.cqse.check.framework.scanner.ETokenType.UNION;
import static eu.cqse.check.framework.scanner.ETokenType.USING;
import static eu.cqse.check.framework.scanner.ETokenType.VAR;
import static eu.cqse.check.framework.scanner.ETokenType.WHILE;
import static eu.cqse.check.framework.scanner.ETokenType.XOR;
import static eu.cqse.check.framework.scanner.ETokenType.XOREQ;
import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_EXPRESSION;
import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_METHOD;
import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.IN_TYPE;
import static eu.cqse.check.framework.shallowparser.languages.base.EGenericParserStates.TOP_LEVEL;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.ArrayUtils;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.region.Region;
import eu.cqse.check.framework.scanner.ETokenType;
import eu.cqse.check.framework.scanner.IToken;
import eu.cqse.check.framework.shallowparser.SubTypeNames;
import eu.cqse.check.framework.shallowparser.TokenStreamUtils;
import eu.cqse.check.framework.shallowparser.framework.EShallowEntityType;
import eu.cqse.check.framework.shallowparser.framework.RecognizerBase;
import eu.cqse.check.framework.shallowparser.framework.ShallowParserBase;
import eu.cqse.check.framework.shallowparser.languages.cpp.CppSkipTemplateSpecificationRecognizer;
import eu.cqse.check.framework.shallowparser.languages.cs.CSharpUsingDeclarationRecognizer;
/**
 * Base class for the shallow parsers of C-style languages (C++, Java, C#, Objective C).
 * <p>
 * Both constructors register the complete rule set by calling the {@code create*Rules()} methods,
 * several of which are overridable or abstract. Overriding implementations are therefore executed
 * before the subclass's own field initializers and constructor body have run and must not depend
 * on subclass instance state.
 */
public abstract class CStyleShallowParserBase extends ShallowParserBase<EGenericParserStates> {

    /** C++ operators that have an alternative (string) representation. */
    public static final EnumSet<ETokenType> OPERATORS_WITH_ALTERNATIVE_REPRESENTATION = EnumSet.of(NOT, COMP, NOTEQ,
            ANDAND, OROR, AND, OR, XOR, ANDEQ, OREQ, XOREQ);

    /**
     * The parser states in which statements can be located. Always contains {@link
     * EGenericParserStates#IN_METHOD}; further states may be added via
     * {@link #CStyleShallowParserBase(EnumSet)}.
     */
    private final EGenericParserStates[] availableStatementScopes;

    /** Constructor. Statements are only recognized in state {@code IN_METHOD}. */
    protected CStyleShallowParserBase() {
        super(EGenericParserStates.class, EGenericParserStates.TOP_LEVEL);
        availableStatementScopes = new EGenericParserStates[] { IN_METHOD };
        createRules();
    }

    /**
     * Constructor.
     *
     * @param availableScopes
     *            states in which statements may occur, in addition to {@code IN_METHOD}. May be
     *            empty.
     */
    protected CStyleShallowParserBase(EnumSet<EGenericParserStates> availableScopes) {
        super(EGenericParserStates.class, EGenericParserStates.TOP_LEVEL);
        // The seed array must have length 0: toArray() fills a larger array and pads it with
        // null, so a one-element seed would smuggle a null scope into the result whenever
        // availableScopes is empty.
        availableStatementScopes = ArrayUtils.addAll(new EGenericParserStates[] { IN_METHOD },
                availableScopes.toArray(new EGenericParserStates[0]));
        createRules();
    }

    /** Registers all rules; the order of the calls is the order in which rules are created. */
    private void createRules() {
        createMetaRules();
        createTypeRules();
        createClassElementsRules();
        createStatementRules();
        createSubExpressionRules();
    }

    /** Creates rules for meta elements. */
    protected void createMetaRules() {
        // deal with dangling closing braces by inserting a broken node (endNode
        // intentionally omitted)
        inAnyState().sequence(RBRACE).createNode(EShallowEntityType.META, "dangling closing brace");
    }

    /** Parser rules for module/namespace and type creation. */
    protected void createTypeRules() {
        // types; we have to ensure when skipping to the LBRACE that there is
        // no earlier SEMICOLON
        inAnyState().repeated(getTypeModifier()).markStart().sequence(getTypeKeywords(), getValidIdentifiers())
                .skipBefore(EnumSet.of(SEMICOLON, LBRACE)).sequence(LBRACE).createNode(EShallowEntityType.TYPE, 0, 1)
                .parseUntil(IN_TYPE).sequence(RBRACE).endNode();
    }

    /**
     * Creates rules for C style typedef. Not called by this class itself; subclasses that support
     * typedefs have to invoke it.
     */
    protected void createTypedefRules() {
        RecognizerBase<EGenericParserStates> typeInTypedefAlternative = inAnyState().sequence(TYPEDEF)
                .optional(CONST);
        // typedef of a named type with body: "typedef struct name { ... } alias;"
        typeInTypedefAlternative.sequenceBefore(getTypeKeywords(), IDENTIFIER, LBRACE)
                .createNode(EShallowEntityType.TYPE, 0).parseOnce(TOP_LEVEL).markStart().skipTo(SEMICOLON)
                .endNodeWithName(0);
        // typedef of an anonymous type with body: "typedef struct { ... } alias;"
        typeInTypedefAlternative.sequenceBefore(getTypeKeywords(), LBRACE).createNode(EShallowEntityType.TYPE, 0)
                .parseOnce(TOP_LEVEL).markStart().skipTo(SEMICOLON).endNodeWithName(0);

        // skips to the name identifier (skips anything enclosed in <...>)
        RecognizerBase<EGenericParserStates> simpleTypedefAlternative = inAnyState().sequence(TYPEDEF)
                .createNode(EShallowEntityType.TYPE, 0)
                .skipBeforeWithNesting(Arrays.asList(IDENTIFIER, EnumSet.of(SEMICOLON, RPAREN, LBRACK)),
                        Collections.singletonList(ETokenType.LT), Collections.singletonList(ETokenType.GT), null);
        // array types such as "typedef foo name [COMPLEX STUFF];"
        simpleTypedefAlternative.markStart().sequenceBefore(IDENTIFIER, EnumSet.of(SEMICOLON, LBRACK))
                .skipToWithNesting(SEMICOLON, LBRACK, RBRACK).endNodeWithName(0);
        // name directly followed by a closing parenthesis
        simpleTypedefAlternative.markStart().sequence(IDENTIFIER, RPAREN).skipTo(SEMICOLON).endNodeWithName(0);
    }

    /**
     * Returns the valid type modifiers for the language. The default implementation returns an
     * empty set. Override to use correct modifiers.
     */
    protected EnumSet<ETokenType> getTypeModifier() {
        return EnumSet.noneOf(ETokenType.class);
    }

    /** Returns the set of keywords that start a type. */
    protected abstract EnumSet<ETokenType> getTypeKeywords();

    /** Parser rules for both attributes and methods. */
    protected abstract void createClassElementsRules();

    /** Creates parser rules for statements. */
    protected void createStatementRules() {
        createEmptyStatementRule();
        createLabelRule();
        createElseIfRule();
        createBasicBlockRules();
        createCaseRule();
        createDoWhileRule();
        createGenericBlockRule();
        createSimpleStatementRule();
    }

    /** The empty statement. */
    private void createEmptyStatementRule() {
        inState(availableStatementScopes).sequence(SEMICOLON)
                .createNode(EShallowEntityType.STATEMENT, SubTypeNames.EMPTY_STATEMENT).endNode();
    }

    /** Matches labels. */
    private void createLabelRule() {
        // filter out labels as meta as they do not increase statement count
        inState(availableStatementScopes).sequence(getValidIdentifiers(), COLON)
                .createNode(EShallowEntityType.META, SubTypeNames.LABEL, 0).endNode();
    }

    /** Special rule for else-if (optionally followed by {@code constexpr}). */
    private void createElseIfRule() {
        RecognizerBase<EGenericParserStates> elseIfAlternative = inState(availableStatementScopes)
                .sequence(ELSE, IF).optional(CONSTEXPR)
                .createNode(EShallowEntityType.STATEMENT, new int[] { 0, 1 });
        endBlockRule(elseIfAlternative, EnumSet.of(ELSE), true, false);
    }

    /**
     * Block constructs, such as if/else, while/for/switch, try/catch/finally, synchronized (only in
     * some languages).
     */
    protected void createBasicBlockRules() {
        // NOTE(review): the Integer 0 is passed as node subtype; presumably it refers to the first
        // matched token (the block keyword) - confirm against RecognizerBase#createNode.
        int subtype = 0;
        createBasicBlockRule(getBlockRuleStart(), getSimpleBlockKeywordsWithParentheses(), null, true, false, subtype);
        createBasicBlockRule(getBlockRuleStart(), getSimpleBlockKeywordsWithoutParentheses(), null, false, false,
                subtype);
        createBasicBlockRule(getBlockRuleStart(), EnumSet.of(IF), EnumSet.of(ELSE), true, false, subtype);
        createBasicBlockRule(getBlockRuleStart(), EnumSet.of(TRY, CATCH), EnumSet.of(CATCH, FINALLY), true, true,
                subtype);
    }

    /**
     * Provides language-specific rule that marks the start of a block statement.
     */
    protected RecognizerBase<EGenericParserStates> getBlockRuleStart() {
        return inState(availableStatementScopes);
    }

    /**
     * Creates a rule for recognizing statements of the following kind:
     * <ul>
     * <li>single keyword</li>
     * <li>(optional) expression in parentheses (parsed by the {@link #getSubExpressionRecognizer()
     * subexpression recognizer})</li>
     * <li>either a single or a block statement</li>
     * </ul>
     *
     * @param blockRuleStart
     *            recognizer of tokens that should appear before {@code startTokens} (e.g. see
     *            {@link #getBlockRuleStart()})
     * @param startTokens
     *            the keywords that start the statement
     * @param continuationTokens
     *            list of tokens that indicate a continued statement if encountered after the block. May
     *            be null.
     * @param canBeFollowedByParentheses
     *            indicates whether parenthesis may follow the single keyword
     * @param alwaysBraces
     *            indicates whether the statement(s) of the block is always enclosed in braces
     * @param blockNodeSubtype
     *            the subtype of the block
     */
    protected void createBasicBlockRule(RecognizerBase<EGenericParserStates> blockRuleStart,
            EnumSet<ETokenType> startTokens, @Nullable EnumSet<ETokenType> continuationTokens,
            boolean canBeFollowedByParentheses, boolean alwaysBraces, Object blockNodeSubtype) {
        RecognizerBase<EGenericParserStates> blockRule = blockRuleStart.sequence(startTokens)
                // Since the keyword `using` can be added to a local variable declaration or
                // used as a statement, we have to differentiate them.
                .notPreCondition(new CSharpUsingDeclarationRecognizer());
        if (alwaysBraces) {
            // only match if the keyword is directly followed by the block (or its parentheses)
            if (canBeFollowedByParentheses) {
                blockRule = blockRule.sequenceBefore(EnumSet.of(LBRACE, LPAREN));
            } else {
                blockRule = blockRule.sequenceBefore(LBRACE);
            }
        }
        blockRule = blockRule.createNode(EShallowEntityType.STATEMENT, blockNodeSubtype);
        endBasicBlockRule(blockRule, continuationTokens, canBeFollowedByParentheses, alwaysBraces);
    }

    /**
     * Ends the given {@code blockRule} by recognizing and parsing expressions in parentheses, ending
     * with a single or block statement. In contrast to
     * {@link #endBlockRule(RecognizerBase, EnumSet, boolean, boolean)}, the tokens returned by
     * {@link #getOptionalTokensBeforeBlockBraces()} are accepted in front of the opening brace.
     *
     * Note: Be sure {@link RecognizerBase#createNode} was called before applying these rules.
     *
     * @param continuationTokens
     *            list of tokens that indicate a continued statement if encountered after the block. May
     *            be null.
     * @param canBeFollowedByParentheses
     *            whether a parenthesized expression is skipped first
     * @param alwaysBraces
     *            whether everything up to the opening brace is skipped
     *
     * @see #createBasicBlockRule(RecognizerBase, EnumSet, EnumSet, boolean, boolean, Object)
     */
    public void endBasicBlockRule(RecognizerBase<EGenericParserStates> blockRule,
            @Nullable EnumSet<ETokenType> continuationTokens, boolean canBeFollowedByParentheses,
            boolean alwaysBraces) {
        blockRule = skipToBlockBody(blockRule, canBeFollowedByParentheses, alwaysBraces);

        Set<ETokenType> optionalTokensBeforeBlockBraces = getOptionalTokensBeforeBlockBraces();
        // first alternative: block in braces
        if (optionalTokensBeforeBlockBraces.isEmpty()) {
            endWithPossibleContinuation(blockRule.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE),
                    continuationTokens);
        } else {
            endWithPossibleContinuation(blockRule.optional(optionalTokensBeforeBlockBraces).sequence(LBRACE)
                    .parseUntil(IN_METHOD).sequence(RBRACE), continuationTokens);
        }
        // second alternative: single statement without braces
        endWithPossibleContinuation(blockRule.parseOnce(IN_METHOD), continuationTokens);
    }

    /**
     * Shared prefix of {@link #endBasicBlockRule} and {@link #endBlockRule}: optionally skips a
     * parenthesized expression and optionally skips everything before the opening brace.
     */
    private RecognizerBase<EGenericParserStates> skipToBlockBody(RecognizerBase<EGenericParserStates> blockRule,
            boolean canBeFollowedByParentheses, boolean alwaysBraces) {
        if (canBeFollowedByParentheses) {
            blockRule = blockRule.skipNested(LPAREN, RPAREN, getSubExpressionRecognizer());
        }
        if (alwaysBraces) {
            blockRule = blockRule.skipBefore(LBRACE);
        }
        return blockRule;
    }

    /**
     * Returns a set of tokens that may appear between the parentheses and braces of a block statement.
     * The default implementation returns an empty set.
     */
    protected Set<ETokenType> getOptionalTokensBeforeBlockBraces() {
        return Collections.emptySet();
    }

    /**
     * Creates a rule for parsing case statements inside a switch statement. The generated parser
     * matches on the {@code case} (or {@code default}) keyword and then skips to the
     * corresponding colon indicating the end of the case condition, taking care of nested colons where
     * applicable. This ensures that case statements with arbitrary constant expressions are parsed
     * correctly, e.g.:
     *
     * <pre>
     * switch(foo) { case 1 + FOO.BAR: case (char) 2: return 1; default: return 2; }
     * switch(foo) { case bar ? 1 : 2: return 1; default: return 2; }
     * </pre>
     */
    protected void createCaseRule() {
        // Case statement is parsed as meta, as it is hardly a statement on its own.
        inState(availableStatementScopes).markStart().sequence(CASE)
                // QUESTION/COLON are treated as a nesting pair so that the colon of a ternary
                // expression is not mistaken for the end of the case condition
                .skipToWithNesting(getCaseStatementEndTokens(), QUESTION, COLON)
                .createNode(EShallowEntityType.META, 0, new Region(1, -2)).endNode();
        inState(availableStatementScopes).sequence(DEFAULT, getCaseStatementEndTokens())
                .createNode(EShallowEntityType.META, 0).endNode();
    }

    /** Creates the do-while rule. */
    private void createDoWhileRule() {
        RecognizerBase<EGenericParserStates> doWhileAlternative = inState(availableStatementScopes).sequence(DO)
                .createNode(EShallowEntityType.STATEMENT, 0);
        // body in braces
        doWhileAlternative.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE, WHILE)
                .skipNested(LPAREN, RPAREN, getSubExpressionRecognizer()).optional(ETokenType.SEMICOLON).endNode();
        // body is a single statement
        doWhileAlternative.parseOnce(IN_METHOD).sequence(WHILE).skipNested(LPAREN, RPAREN, getSubExpressionRecognizer())
                .optional(ETokenType.SEMICOLON).endNode();
    }

    /** Generic (anonymous) block. Only registered for state {@code IN_METHOD}. */
    private void createGenericBlockRule() {
        inState(IN_METHOD).sequence(LBRACE).createNode(EShallowEntityType.STATEMENT, SubTypeNames.ANONYMOUS_BLOCK)
                .parseUntil(IN_METHOD).sequence(RBRACE).endNode();
    }

    /**
     * Contributes rules for detecting simple statements (local variable declarations, assignments,
     * etc.) inside methods.
     *
     * @param localVariableIdentifiers
     *            token types that are accepted as the name of a local variable
     * @param statementStartTokens
     *            token types that start a simple statement
     */
    public void contributeSimpleStatementRules(Set<ETokenType> localVariableIdentifiers,
            Set<ETokenType> statementStartTokens) {
        // statement starting with a parenthesis followed by an identifier
        completeSimpleStatement(inState(availableStatementScopes).sequence(LPAREN).markStart().sequence(IDENTIFIER));

        // heuristic for detecting local variables
        completeSimpleStatement(
                typePatternInState(availableStatementScopes).skipNested(LBRACE, RBRACE).markStart()
                        .sequenceBefore(localVariableIdentifiers, EnumSet.of(COMMA, EQ, SEMICOLON, LBRACK)),
                SubTypeNames.LOCAL_VARIABLE);

        // union local variables
        completeSimpleStatement(inState(availableStatementScopes).sequence(EnumSet.of(UNION)).skipNested(LBRACE, RBRACE)
                .markStart().sequenceBefore(localVariableIdentifiers, SEMICOLON), SubTypeNames.LOCAL_VARIABLE);

        // function pointer local variables
        completeSimpleStatement(typePatternInState(availableStatementScopes).repeated(LPAREN).sequence(MULT).markStart()
                .sequenceBefore(localVariableIdentifiers, RPAREN), SubTypeNames.LOCAL_VARIABLE);

        // C#8 using declaration
        completeSimpleStatement(inState(availableStatementScopes).sequence(USING).optional(VAR).skipTo(IDENTIFIER)
                .markStart().sequence(IDENTIFIER));

        // using declaration with "var" declaration: using var client = newHttpClient();
        completeSimpleStatement(inState(availableStatementScopes).skipNested(LBRACE, RBRACE).sequence(USING)
                .sequence(VAR).markStart().sequenceBefore(IDENTIFIER));

        createRuleForSimpleStatementHavingStartTokens(statementStartTokens);
        completeSimpleStatement(typePatternInState(availableStatementScopes));
        completeSimpleStatement(inState(availableStatementScopes).sequence(LITERALS));
    }

    /** Create rule to match simple statements starting with given tokens. */
    protected void createRuleForSimpleStatementHavingStartTokens(Set<ETokenType> statementStartTokens) {
        completeSimpleStatement(inState(availableStatementScopes).sequence(statementStartTokens));
    }

    /** Simple statement. */
    protected void createSimpleStatementRule() {
        contributeSimpleStatementRules(getValidIdentifiers(), getStatementStartTokens());
    }

    /**
     * Creates rules for dealing with constructs in subexpressions, such as anonymous classes, lambdas,
     * etc.
     */
    protected void createSubExpressionRules() {
        // default implementation does nothing
    }

    /**
     * Returns a recognizer used for detecting sub expressions (anonymous classes, lambdas, etc.) within
     * expressions. This may return null (which is done by the default implementation).
     */
    public RecognizerBase<EGenericParserStates> getSubExpressionRecognizer() {
        return null;
    }

    /**
     * Returns the set of all valid identifiers, i.e., token types that can be used to name elements in
     * the language. The default implementation only allows {@code IDENTIFIER}.
     */
    public Set<ETokenType> getValidIdentifiers() {
        return EnumSet.of(IDENTIFIER);
    }

    /**
     * Returns the set of all keywords that start a simple block with optional parentheses (see
     * implementers for examples).
     */
    protected abstract EnumSet<ETokenType> getSimpleBlockKeywordsWithParentheses();

    /**
     * Returns the set of all keywords that start a simple block but are never followed by parentheses
     * (see implementers for examples).
     */
    protected abstract EnumSet<ETokenType> getSimpleBlockKeywordsWithoutParentheses();

    /**
     * Returns the set of all tokens that can end a case statement (Java also allows an arrow starting
     * from Java 14). The default implementation only allows {@code COLON}.
     */
    public EnumSet<ETokenType> getCaseStatementEndTokens() {
        return EnumSet.of(COLON);
    }

    /**
     * Returns a set of all tokens that can start a statement, besides a type (see
     * {@link #typePatternInState(EGenericParserStates...)}) and a literal.
     */
    public abstract EnumSet<ETokenType> getStatementStartTokens();

    /** Creates a recognizer that matches all valid types. */
    public abstract RecognizerBase<EGenericParserStates> typePattern(
            RecognizerBase<EGenericParserStates> currentState);

    /**
     * Creates a recognizer that matches all valid types, starting from the given state.
     */
    protected RecognizerBase<EGenericParserStates> typePatternInState(EGenericParserStates... states) {
        return typePattern(inState(states));
    }

    /**
     * Ends the block rule. If a block statement contains conditions within parentheses, the parser just
     * skips them. If this statement continues with a code block (in braces), this rule also parses the
     * block.
     *
     * @param continuationTokens
     *            list of tokens that indicate a continued statement if encountered after the block. May
     *            be null.
     * @param canBeFollowedByParentheses
     *            whether a parenthesized expression is skipped first
     * @param alwaysBraces
     *            whether everything up to the opening brace is skipped
     */
    protected void endBlockRule(RecognizerBase<EGenericParserStates> blockRule,
            EnumSet<ETokenType> continuationTokens, boolean canBeFollowedByParentheses, boolean alwaysBraces) {
        blockRule = skipToBlockBody(blockRule, canBeFollowedByParentheses, alwaysBraces);
        // first alternative: block in braces; second alternative: single statement
        endWithPossibleContinuation(blockRule.sequence(LBRACE).parseUntil(IN_METHOD).sequence(RBRACE),
                continuationTokens);
        endWithPossibleContinuation(blockRule.parseOnce(IN_METHOD), continuationTokens);
    }

    /** Completes a recognizer for a simple statement using subtype {@code SIMPLE_STATEMENT}. */
    public void completeSimpleStatement(RecognizerBase<EGenericParserStates> baseRecognizer) {
        completeSimpleStatement(baseRecognizer, SubTypeNames.SIMPLE_STATEMENT);
    }

    /** Completes a recognizer for a simple statement with the given subtype. */
    public void completeSimpleStatement(RecognizerBase<EGenericParserStates> baseRecognizer, String subtype) {
        RecognizerBase<EGenericParserStates> alternative = baseRecognizer
                .createNode(EShallowEntityType.STATEMENT, subtype, 0).skipBeforeWithNesting(
                        EnumSet.of(SEMICOLON, RBRACE), LBRACE, RBRACE, LPAREN, RPAREN, getSubExpressionRecognizer());
        alternative.sequence(SEMICOLON).endNode();
        // this (empty) alternative captures the case where a statement is not
        // closed by a semicolon, so we deliberately leave it open. While in
        // most languages this is an error (and then this rule helps us to
        // continue parsing), in C++ you can construct valid statements without
        // semicolon using macros (although it is discouraged).
        alternative.sequence();
    }

    /** Creates rules for parsing lambdas with arrows like in Java or C#. */
    protected void createLambdaWithArrowRules(ETokenType arrowType) {
        // lambda expressions
        completeLambda(inState(IN_EXPRESSION).sequence(getValidIdentifiers()), arrowType);
        completeLambda(inState(IN_EXPRESSION).sequence(LPAREN).skipTo(RPAREN), arrowType);

        // additional rule for parsing lambda expressions (without braces). see
        // completeLambda() for details
        // the node start is moved one token to the right, so the shallow
        // entities produced by this rule don't include the arrow (instead it
        // will be included in the parent entity)
        inState(IN_EXPRESSION).sequence(arrowType)
                .createNode(EShallowEntityType.STATEMENT, SubTypeNames.LAMBDA_EXPRESSION, null, 1)
                .skipBeforeWithNesting(EnumSet.of(RPAREN, SEMICOLON, RBRACE, COMMA), LPAREN, RPAREN, LBRACE, RBRACE,
                        getSubExpressionRecognizer())
                .endNode();
    }

    /** Completes a rule for parsing lambda expressions. */
    private static void completeLambda(RecognizerBase<EGenericParserStates> ruleStart, ETokenType arrowType) {
        RecognizerBase<EGenericParserStates> lambdaAlternative = ruleStart.createNode(EShallowEntityType.METHOD,
                SubTypeNames.LAMBDA);
        // lambda with a body in braces
        lambdaAlternative.sequence(arrowType, LBRACE).parseUntil(IN_METHOD).sequence(RBRACE).endNode();

        // We start parsing before the arrow as this allows our special
        // statement rule to capture this case. This is required, as this kind
        // of expression is not terminated by a semicolon.
        lambdaAlternative.sequenceBefore(arrowType).parseOnce(IN_EXPRESSION).endNode();
    }

    /**
     * Skips over the optional parameters and returns the new offset or {@link RecognizerBase#NO_MATCH}
     * if it is malformed.
     *
     * @param tokens
     *            the token stream
     * @param currentOffset
     *            the offset at which the opening parenthesis is expected
     * @return {@code currentOffset} if there is no opening parenthesis at that offset, the offset
     *         directly after the matching closing parenthesis if there is one, and
     *         {@link RecognizerBase#NO_MATCH} if the closing parenthesis is missing
     */
    public static int skipOptionalParameters(List<IToken> tokens, int currentOffset) {
        if (!TokenStreamUtils.hasTokenTypeSequence(tokens, currentOffset, LPAREN)) {
            // no parameter list present, nothing to skip
            return currentOffset;
        }
        // search starts behind the opening parenthesis
        int closingParenthesis = TokenStreamUtils.findMatchingClosingToken(tokens, currentOffset + 1, LPAREN, RPAREN);
        if (closingParenthesis == TokenStreamUtils.NOT_FOUND) {
            return RecognizerBase.NO_MATCH;
        }
        return closingParenthesis + 1;
    }

    /**
     * Creates a new recognizer that can match a scope prefix for a method-like construct. This includes
     * sequences of identifiers with double colon, possibly intermixed with template arguments.
     */
    public RecognizerBase<EGenericParserStates> createScopeRecognizer() {
        // remember the start of the recognizer chain (we can not use the
        // result of the method chain, as this would be the last recognizer)
        return createRecognizer(start -> start.optional(SCOPE).sequence(IDENTIFIER)
                .optionalSubRecognizer(new CppSkipTemplateSpecificationRecognizer()).sequence(SCOPE));
    }
}