eu.cqse.check.framework.shallowparser.languages.python.PythonShallowParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of teamscale-check-api Show documentation
The Teamscale Custom Check API allows users to extend Teamscale by writing custom analyses that create findings.
There is a newer version: 2024.7.2
/*
 * Copyright (c) CQSE GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.cqse.check.framework.shallowparser.languages.python;

import static eu.cqse.check.framework.scanner.ETokenType.AT;
import static eu.cqse.check.framework.scanner.ETokenType.CASE;
import static eu.cqse.check.framework.scanner.ETokenType.CLASS;
import static eu.cqse.check.framework.scanner.ETokenType.COLON;
import static eu.cqse.check.framework.scanner.ETokenType.COMMA;
import static eu.cqse.check.framework.scanner.ETokenType.DEDENT;
import static eu.cqse.check.framework.scanner.ETokenType.DEF;
import static eu.cqse.check.framework.scanner.ETokenType.DOT;
import static eu.cqse.check.framework.scanner.ETokenType.ELIF;
import static eu.cqse.check.framework.scanner.ETokenType.ELSE;
import static eu.cqse.check.framework.scanner.ETokenType.EOL;
import static eu.cqse.check.framework.scanner.ETokenType.EXCEPT;
import static eu.cqse.check.framework.scanner.ETokenType.FINALLY;
import static eu.cqse.check.framework.scanner.ETokenType.FOR;
import static eu.cqse.check.framework.scanner.ETokenType.FROM;
import static eu.cqse.check.framework.scanner.ETokenType.IDENTIFIER;
import static eu.cqse.check.framework.scanner.ETokenType.IF;
import static eu.cqse.check.framework.scanner.ETokenType.IMPORT;
import static eu.cqse.check.framework.scanner.ETokenType.INDENT;
import static eu.cqse.check.framework.scanner.ETokenType.LAMBDA;
import static eu.cqse.check.framework.scanner.ETokenType.LBRACE;
import static eu.cqse.check.framework.scanner.ETokenType.LBRACK;
import static eu.cqse.check.framework.scanner.ETokenType.LPAREN;
import static eu.cqse.check.framework.scanner.ETokenType.MATCH;
import static eu.cqse.check.framework.scanner.ETokenType.RBRACE;
import static eu.cqse.check.framework.scanner.ETokenType.RBRACK;
import static eu.cqse.check.framework.scanner.ETokenType.RPAREN;
import static eu.cqse.check.framework.scanner.ETokenType.SEMICOLON;
import static eu.cqse.check.framework.scanner.ETokenType.TRY;
import static eu.cqse.check.framework.scanner.ETokenType.WHILE;
import static eu.cqse.check.framework.scanner.ETokenType.WITH;
import static eu.cqse.check.framework.shallowparser.languages.python.PythonShallowParser.EPythonParserStates.ANY;
import static eu.cqse.check.framework.shallowparser.languages.python.PythonShallowParser.EPythonParserStates.IN_CLASS;
import static eu.cqse.check.framework.shallowparser.languages.python.PythonShallowParser.EPythonParserStates.IN_LAMBDA;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;

import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.collections.UnmodifiableSet;
import org.conqat.lib.commons.region.Region;

import eu.cqse.check.framework.scanner.ETokenType;
import eu.cqse.check.framework.scanner.IToken;
import eu.cqse.check.framework.shallowparser.SubTypeNames;
import eu.cqse.check.framework.shallowparser.TokenStreamUtils;
import eu.cqse.check.framework.shallowparser.framework.EShallowEntityType;
import eu.cqse.check.framework.shallowparser.framework.ParserState;
import eu.cqse.check.framework.shallowparser.framework.RecognizerBase;
import eu.cqse.check.framework.shallowparser.framework.ShallowParserBase;
import eu.cqse.check.framework.shallowparser.languages.python.PythonShallowParser.EPythonParserStates;

/**
 * Shallow parser for Python.
 * <p>
 * All rules are registered in the constructor. The order of the {@code create*Rules()} calls is
 * kept deliberately, as later rules (e.g. the simple statement rule) are far more general than
 * earlier ones.
 */
public class PythonShallowParser extends ShallowParserBase<EPythonParserStates> {

	/**
	 * All token types that can be used as identifier. {@code match} and {@code case} are included
	 * besides plain identifiers, as they may also appear as ordinary names.
	 */
	private static final UnmodifiableSet<ETokenType> VALID_IDENTIFIERS = CollectionUtils
			.asUnmodifiable(EnumSet.of(IDENTIFIER, MATCH, CASE));

	/** The states used in this parser. */
	public enum EPythonParserStates {

		/**
		 * Any state apart from in-class states. Typically, any construct can occur at any place.
		 */
		ANY,

		/** State applying within a class scope. */
		IN_CLASS,

		/** Within a lambda expression. */
		IN_LAMBDA
	}

	/** Constructor. Registers all parsing rules. */
	public PythonShallowParser() {
		super(EPythonParserStates.class, EPythonParserStates.ANY);

		createErrorRules();
		createImportRules();
		createDecoratorRules();
		createClassRules();
		createFunctionRules();
		createLambdaRule();
		createStatementRules();
	}

	/** Creates rules that handle erroneous input (stray indentation tokens). */
	private void createErrorRules() {
		// unmatched indent/dedent: no endNode to keep the node incomplete
		inAnyState().sequence(INDENT).createNode(EShallowEntityType.META, "Unmatched indent");
		inAnyState().sequence(DEDENT).createNode(EShallowEntityType.META, "Unmatched dedent");
	}

	/**
	 * Creates parsing rules for imports. Both {@code import ...} and {@code from ... import ...} are
	 * turned into a META node that spans up to the end of the line.
	 */
	private void createImportRules() {
		inAnyState().sequence(EnumSet.of(IMPORT, FROM)).createNode(EShallowEntityType.META, 0).skipTo(EOL).endNode();
	}

	/**
	 * Creates parsing rules for decorators, i.e. an {@code @} followed by a (possibly dotted) name
	 * and an optional parenthesized argument list. Lambdas inside the argument list are parsed via
	 * {@link #createLambdaSubRecognizer()}.
	 */
	private void createDecoratorRules() {
		inAnyState().sequence(AT, VALID_IDENTIFIERS).repeated(DOT, VALID_IDENTIFIERS)
				.createNode(EShallowEntityType.META, SubTypeNames.DECORATOR, new Region(1, -1))
				.skipNested(LPAREN, RPAREN, createLambdaSubRecognizer()).endNode();
	}

	/**
	 * Creates parsing rules for classes. The class body is parsed in state
	 * {@link EPythonParserStates#IN_CLASS}.
	 */
	private void createClassRules() {
		RecognizerBase<EPythonParserStates> classAlternative = inAnyState().sequence(CLASS, VALID_IDENTIFIERS)
				.skipNested(LPAREN, RPAREN).sequence(COLON).createNode(EShallowEntityType.TYPE, SubTypeNames.CLASS, 1);
		addBlockClosingAlternatives(classAlternative, IN_CLASS);
	}

	/**
	 * Creates parsing rules for functions. Considers that type hinting could be used for parameters and
	 * a function too (see PEP 0484), which is why everything between the parameter list and the colon
	 * is skipped.
	 */
	private void createFunctionRules() {
		RecognizerBase<EPythonParserStates> functionAlternative = inAnyState().sequence(DEF, VALID_IDENTIFIERS)
				.createNode(EShallowEntityType.METHOD, SubTypeNames.METHOD, 1).skipNested(LPAREN, RPAREN).skipTo(COLON);
		addBlockClosingAlternatives(functionAlternative, ANY);
	}

	/** Creates parsing rules for statements. */
	private void createStatementRules() {
		// empty statement
		inAnyState().sequence(SEMICOLON).createNode(EShallowEntityType.STATEMENT, SubTypeNames.EMPTY_STATEMENT)
				.endNode();

		// block statements; the colon is searched outside of any bracket nesting, as
		// colons may also occur within brackets, braces and parentheses
		RecognizerBase<EPythonParserStates> ifAlternative = inAnyState()
				.sequence(EnumSet.of(IF, ELIF, ELSE, TRY, EXCEPT, FINALLY, WHILE, FOR, WITH))
				.createNode(EShallowEntityType.STATEMENT, 0)
				.skipToWithNesting(COLON, Arrays.asList(LBRACK, LBRACE, LPAREN), Arrays.asList(RBRACK, RBRACE, RPAREN));
		addBlockClosingAlternatives(ifAlternative, ANY);

		// (MATCH, CASE) block statements for valid identifiers. (TS-34832)
		// find COLON before EOL since it could find a COLON anywhere after the sequence
		RecognizerBase<EPythonParserStates> ifAlternativeValidIdentifiers = inAnyState()
				.sequence(EnumSet.of(MATCH, CASE)).preCondition(new NotAMethodCallRecognizer())
				.preCondition(createRecognizer(start -> start.skipBefore(EnumSet.of(COLON, EOL)).sequence(COLON)))
				.createNode(EShallowEntityType.STATEMENT, 0)
				.skipToWithNesting(COLON, Arrays.asList(LBRACK, LBRACE, LPAREN), Arrays.asList(RBRACK, RBRACE, RPAREN));
		addBlockClosingAlternatives(ifAlternativeValidIdentifiers, ANY);

		// remove any isolated EOLs (the rule matches the token but creates no node)
		inAnyState().sequence(EOL);

		// in class attributes
		inState(IN_CLASS).sequenceBefore(VALID_IDENTIFIERS).subRecognizer(new PythonAttributeRecognizer(), 1, 1)
				.endNode();

		// simple statement; DEF and CLASS are excluded as they start their own entities
		inState(ANY, IN_CLASS).sequenceBefore(EnumSet.complementOf(EnumSet.of(DEF, CLASS)))
				.subRecognizer(new PythonSimpleStatementRecognizer(), 1, 1).endNode();

		/*
		 * Parse a simple statement inside a lambda. This is treated separately as in this case the comma
		 * can occur as statement separator (CR#22933).
		 */
		inState(IN_LAMBDA).sequenceBefore(EnumSet.complementOf(EnumSet.of(DEF, CLASS)))
				.subRecognizer(new PythonSimpleStatementRecognizer(true), 1, 1).endNode();
	}

	/**
	 * Creates a rule for parsing lambdas. The lambda body is parsed once in state
	 * {@link EPythonParserStates#IN_LAMBDA} and the node ends before the first token that cannot be
	 * part of the lambda expression anymore.
	 */
	private void createLambdaRule() {
		inState(ANY).sequence(LAMBDA).skipTo(COLON).createNode(EShallowEntityType.METHOD, SubTypeNames.LAMBDA)
				.parseOnce(IN_LAMBDA)
				.sequenceBefore(EnumSet.of(RBRACK, RPAREN, DEDENT, EOL, COMMA, /* e.g. inside dictionary */RBRACE))
				.endNode();
	}

	/**
	 * Creates a recognizer that detects a lambda expression and parses into it.
	 */
	private RecognizerBase<EPythonParserStates> createLambdaSubRecognizer() {
		return createRecognizer(start -> start.sequenceBefore(LAMBDA).parseOnce(ANY));
	}

	/**
	 * Adds two different rules for closing a block:
	 * <ul>
	 * <li>Closing a block by finding a dedent</li>
	 * <li>Single line that ends with EOL, typically this means multiple statements on one line</li>
	 * </ul>
	 *
	 * @param matchingAlternative
	 *            The block recognizer to be closed.
	 * @param innerBlockState
	 *            The {@link EPythonParserStates} used within the block this method is closing.
	 */
	private static void addBlockClosingAlternatives(RecognizerBase<EPythonParserStates> matchingAlternative,
			EPythonParserStates innerBlockState) {
		// the indented-block alternative is registered first, the one-line alternative second
		matchingAlternative.sequence(EOL, INDENT).parseUntil(innerBlockState).sequence(DEDENT).endNode();
		matchingAlternative.parseUntil(innerBlockState).sequence(EOL).endNode();
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * In addition to the tokens filtered by the base class, an EOL that directly follows another EOL
	 * is filtered.
	 */
	@Override
	protected boolean isFilteredToken(IToken token, IToken previousToken) {
		if (super.isFilteredToken(token, previousToken)) {
			return true;
		}
		// Don't allow double EOLs
		return previousToken != null && previousToken.getType() == EOL && token.getType() == EOL;
	}

	/**
	 * Detects if the given LPAREN is not the beginning of a method call. Used as precondition directly
	 * after a {@code match}/{@code case} token to tell a statement such as {@code match (a, b):} apart
	 * from a call to a function that happens to be named {@code match()} or {@code case()}.
	 */
	private static class NotAMethodCallRecognizer extends RecognizerBase<EPythonParserStates> {

		/**
		 * @return {@link #NO_MATCH} if the token at {@code startOffset} opens a parenthesized group that
		 *         is followed by something other than a colon (i.e. looks like a method call),
		 *         otherwise a matching offset.
		 */
		@Override
		protected int matchesLocally(ParserState<EPythonParserStates> parserState, List<IToken> tokens,
				int startOffset) {
			// No token left after the keyword: there is no parenthesis and hence no call.
			// Guarding here avoids an IndexOutOfBoundsException at the end of the token stream.
			if (startOffset >= tokens.size()) {
				return startOffset;
			}
			if (tokens.get(startOffset).getType() != LPAREN) {
				return startOffset;
			}
			if (startOffset < 1) {
				return NO_MATCH;
			}

			// We will search for the closing parenthesis and then check if the next token is
			// not a COLON, in that case we are looking at someone calling a method named
			// case()/match(), otherwise it is probably a match followed by a tuple
			int closingParenthesis = TokenStreamUtils.findMatchingClosingToken(tokens, startOffset + 1, LPAREN,
					RPAREN);
			if (closingParenthesis != TokenStreamUtils.NOT_FOUND && tokens.size() > closingParenthesis + 1
					&& tokens.get(closingParenthesis + 1).getType() != COLON) {
				return NO_MATCH;
			}
			return super.matchesLocally(parserState, tokens, startOffset);
		}
	}
}