eu.cqse.check.framework.util.tokens.TokenStreamTransformationPattern Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of teamscale-check-api Show documentation
The Teamscale Custom Check API allows users to extend Teamscale by writing custom analyses that create findings.
There is a newer version: 2024.7.2
/*
 * Copyright (c) CQSE GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.cqse.check.framework.util.tokens;

import static eu.cqse.check.framework.scanner.ETokenType.IDENTIFIER;

import java.lang.ref.WeakReference;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.conqat.lib.commons.assertion.CCSMAssert;
import org.conqat.lib.commons.collections.CollectionUtils;

import eu.cqse.check.framework.core.CheckException;
import eu.cqse.check.framework.scanner.ArtificialTokenOriginIds;
import eu.cqse.check.framework.scanner.ELanguage;
import eu.cqse.check.framework.scanner.ETokenType;
import eu.cqse.check.framework.scanner.IToken;
import eu.cqse.check.framework.scanner.ScannerUtils;
import eu.cqse.check.framework.shallowparser.TokenStreamUtils;

/**
 * Transforms a token stream according to a search and a replace pattern. Both patterns can contain
 * variables in the form of {@code $name}.
 * <p>
 * Example:
 * <ul>
 * <li>Language: Java</li>
 * <li>Search Pattern: {@code assertNotNull($a)}</li>
 * <li>Replacement Pattern: {@code $a != null}</li>
 * <li>Input: {@code assertNotNull(foo)}</li>
 * <li>Output: {@code foo != null}</li>
 * </ul>
 * <p>
 * Variables will not match the semicolon token. Additionally, variables can be postfixed with a
 * number, e.g. {@code $a1} to signal that the code matched to {@code $a1} should only be
 * exactly 1 token in length. For C# a block {@code {}} around the result is removed when this
 * matches a {@code out Type Identifier} variable introduction to avoid scoping issues, see
 * {@link #removeBlockIfCSharpOutIntroducesVariable}.
 */
public class TokenStreamTransformationPattern {

	/**
	 * Prefix of pattern variables.
	 */
	private static final String VARIABLE_PREFIX = "$";

	/** The search pattern transformed into matchers, in the order of the search pattern tokens. */
	private final List<IMatcher> matchers = new ArrayList<>();

	/** The replacement pattern as tokens of the input language. */
	private final List<IToken> replacementPatternTokens;

	/**
	 * The language that was used to scan both pattern strings. The pattern is only applied to token
	 * streams of this language.
	 */
	private final ELanguage language;

	/** Returned if the matcher did not match. */
	public static final int NO_MATCH = -1;

	/**
	 * Constructor.
	 * 
	 * @throws CheckException
	 *             if the search pattern has invalid syntax.
	 */
	public TokenStreamTransformationPattern(String searchPatternString, String replacementPatternString,
			ELanguage language) throws CheckException {
		List searchPatternTokens = ScannerUtils.getTokens(searchPatternString, language,
				ArtificialTokenOriginIds.TOKEN_STREAM_TRANSFORMATION_PATTERN);
		createMatchers(searchPatternTokens);
		replacementPatternTokens = ScannerUtils.getTokens(replacementPatternString, language,
				ArtificialTokenOriginIds.TOKEN_STREAM_TRANSFORMATION_PATTERN);
		this.language = language;
	}

	/**
	 * Creates matchers from the given search pattern tokens and appends them to {@link #matchers}.
	 * <p>
	 * An identifier token whose text starts with {@link #VARIABLE_PREFIX} becomes a
	 * {@link VariableMatcher} that ends at the type of the following pattern token or at a semicolon.
	 * Every other token becomes a {@link TokenTypeMatcher} for its exact type and text.
	 *
	 * @param searchPatternTokens
	 *            the scanned tokens of the search pattern
	 * @throws CheckException
	 *             if the pattern has invalid syntax, i.e. its last token is a variable.
	 */
	private void createMatchers(List<IToken> searchPatternTokens) throws CheckException {
		int tokenCount = searchPatternTokens.size();
		for (int index = 0; index < tokenCount; index++) {
			IToken current = searchPatternTokens.get(index);
			String text = current.getText();
			ETokenType type = current.getType();

			boolean isVariable = type == IDENTIFIER && text.startsWith(VARIABLE_PREFIX);
			if (!isVariable) {
				matchers.add(new TokenTypeMatcher(type, text));
				continue;
			}

			// A variable needs a following token that tells the matcher where the variable ends.
			if (index == tokenCount - 1) {
				throw new CheckException("The last token in the search pattern may not be a variable!");
			}
			ETokenType followingType = searchPatternTokens.get(index + 1).getType();
			matchers.add(new VariableMatcher(text, EnumSet.of(followingType, ETokenType.SEMICOLON)));
		}
	}

	/**
	 * Applies the pattern to the given tokens, starting at the given position.
	 * <p>
	 * For C# a block {@code {}} around the result is removed when this matches a
	 * {@code out Type Identifier} variable introduction to avoid scoping issues, see
	 * {@link #removeBlockIfCSharpOutIntroducesVariable}.
	 *
	 * @param language
	 *            the language of the token stream
	 * @param tokens
	 *            the token stream to match against
	 * @param position
	 *            the index in {@code tokens} at which the search pattern must start to match
	 * @return the transformed tokens together with the number of input tokens that were matched, or
	 *         {@code null} in case the pattern does not match at this position
	 */
	private @Nullable Result apply(ELanguage language, List<IToken> tokens, int position) {
		Map<String, List<IToken>> variables = new HashMap<>();
		int matchedTokens = matchSearchPattern(tokens, position, variables);
		if (matchedTokens == NO_MATCH) {
			return null;
		}
		// The first matched token provides offset, line and origin for the generated tokens.
		List<IToken> replacement = createResult(tokens.get(position), variables);
		return removeBlockIfCSharpOutIntroducesVariable(language, replacement, variables, matchedTokens);
	}

	/**
	 * TS-38134 Removes block scope if a C# out parameter is used that introduces a new variable as
	 * otherwise that variable would have the wrong scope.
	 */
	private static @NonNull Result removeBlockIfCSharpOutIntroducesVariable(ELanguage language, List result,
			Map> variables, int matchedTokens) {
		if (language == ELanguage.CS && surroundedWithBlock(result)
				&& containsCSharpOutVariableIntroduction(variables.values())) {
			// remove block consisting of {} around result
			return new Result(result.subList(1, result.size() - 1), matchedTokens);
		} else {
			return new Result(result, matchedTokens);
		}
	}

	/**
	 * Returns whether the tokens are surrounded by a block consisting of {@code {}}, i.e. whether the
	 * first token is a left brace and the last token is a right brace. An empty list is not surrounded
	 * by a block.
	 */
	private static boolean surroundedWithBlock(List<IToken> result) {
		if (result.isEmpty()) {
			return false;
		}
		ETokenType firstType = result.get(0).getType();
		ETokenType lastType = result.get(result.size() - 1).getType();
		return firstType == ETokenType.LBRACE && lastType == ETokenType.RBRACE;
	}

	/**
	 * Returns whether any of the variables contains a C# out variable introduction which has a pattern
	 * of {@code out Type Identifier}.
	 */
	private static boolean containsCSharpOutVariableIntroduction(Collection> variables) {
		return variables.stream()
				.anyMatch(t -> IntStream.range(0, t.size() - 2).anyMatch(i -> t.get(i).getType() == ETokenType.OUT
						&& t.get(i + 2).getType().getTokenClass() == ETokenType.ETokenClass.IDENTIFIER));
	}

	/**
	 * Applies the given patterns for the language of the token stream on it and replaces all matches
	 * with the transformed tokens. For C# a block {@code {}} around the result is removed when this
	 * matches a {@code out Type Identifier} variable introduction to avoid scoping issues, see
	 * {@link #removeBlockIfCSharpOutIntroducesVariable}.
	 * <p>
	 * The language is taken from the first token. Patterns whose fixed tokens do not all occur
	 * somewhere in the stream are discarded up front. At every position the first pattern that matches
	 * wins; matching then continues behind the matched tokens.
	 *
	 * @param tokens
	 *            the token stream to transform; it is not modified
	 * @param patterns
	 *            the patterns to apply, in order of priority
	 * @return the transformed token stream, or an empty list if {@code tokens} is empty
	 */
	public static List<IToken> applyPatterns(List<IToken> tokens,
			List<TokenStreamTransformationPattern> patterns) {
		if (tokens.isEmpty()) {
			return CollectionUtils.emptyList();
		}
		ELanguage language = tokens.get(0).getLanguage();
		List<TokenStreamTransformationPattern> candidatePatterns = patterns.stream()
				.filter(pattern -> pattern.checkIfTypeMatcherMatches(tokens)).collect(Collectors.toList());

		List<IToken> transformedTokens = new ArrayList<>();
		int position = 0;
		while (position < tokens.size()) {
			Result match = applyPatterns(language, tokens, candidatePatterns, position);
			if (match != null) {
				transformedTokens.addAll(match.getTransformedTokens());
				position += match.getMatchedTokens();
			} else {
				// No pattern matches here: keep the token as is and try the next position.
				transformedTokens.add(tokens.get(position));
				position++;
			}
		}
		return transformedTokens;
	}

	/**
	 * Applies the given patterns for the given language at the given position. The result of the first
	 * match is returned. If no pattern matches, {@code null} is returned. Patterns that were created
	 * for a different language are skipped. For C# a block {@code {}} around the result is removed when
	 * this matches a {@code out Type Identifier} variable introduction to avoid scoping issues, see
	 * {@link #removeBlockIfCSharpOutIntroducesVariable}.
	 *
	 * @param language
	 *            the language of the token stream
	 * @param tokens
	 *            the token stream to match against
	 * @param patterns
	 *            the patterns to try, in order
	 * @param position
	 *            the index in {@code tokens} at which the patterns are tried
	 */
	private static @Nullable Result applyPatterns(ELanguage language, List<IToken> tokens,
			List<TokenStreamTransformationPattern> patterns, int position) {
		for (TokenStreamTransformationPattern pattern : patterns) {
			if (pattern.language == language) {
				Result match = pattern.apply(language, tokens, position);
				if (match != null) {
					return match;
				}
			}
		}
		return null;
	}

	/**
	 * Checks if all {@link TokenTypeMatcher}s of {@link #matchers} have at least one match somewhere in
	 * the given tokens. If not, we do not have to apply the pattern. This is an early exit based on the
	 * assumption that most files do not contain the text we search for, e.g. {@code getOrDefault}. This
	 * way we can skip applying the more expensive {@link VariableMatcher}s in many cases.
	 *
	 * @param tokens
	 *            the token stream the pattern would be applied to
	 * @return {@code false} if at least one {@link TokenTypeMatcher} has no match in {@code tokens},
	 *         {@code true} otherwise
	 */
	private boolean checkIfTypeMatcherMatches(List<IToken> tokens) {
		for (IMatcher matcher : matchers) {
			if (!(matcher instanceof TokenTypeMatcher)) {
				continue;
			}
			if (!((TokenTypeMatcher) matcher).hasAnyMatch(tokens)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Matches the matchers against the given tokens and returns the variable map created by the
	 * matchers. Returns {@link #NO_MATCH} if the matchers do not match the token stream.
	 */
	private int matchSearchPattern(List tokens, int startPosition, Map> variables) {
		int tokenPosition = startPosition;
		for (IMatcher matcher : matchers) {
			if (tokenPosition >= tokens.size()) {
				return NO_MATCH;
			}
			int nextPosition = matcher.apply(tokens, tokenPosition, variables);
			if (nextPosition == NO_MATCH) {
				return NO_MATCH;
			}
			CCSMAssert.isTrue(nextPosition > tokenPosition, "Matcher did not advance token stream.");
			tokenPosition = nextPosition;
		}
		return tokenPosition - startPosition;
	}

	/**
	 * Creates the result token list based on the given variable map.
	 */
	private List createResult(IToken baseToken, Map> variables) {
		List result = new ArrayList<>();
		for (IToken token : replacementPatternTokens) {
			String text = token.getText();
			if (token.getType() == IDENTIFIER && text.startsWith(VARIABLE_PREFIX)) {
				List variableMatch = variables.get(text);
				CCSMAssert.isNotNull(variableMatch, "Variable " + text + " was not matched");
				result.addAll(variableMatch);
			} else {
				result.add(token.newToken(token.getType(), baseToken.getOffset(), baseToken.getLineNumber(),
						token.getText(), baseToken.getOriginId()));
			}
		}
		return result;
	}

	/** Returns the language that was passed to the constructor of this pattern. */
	public ELanguage getLanguage() {
		return this.language;
	}

	/** Result of successfully matching one pattern. */
	private static class Result {

		/** The transformed tokens. */
		private final List transformedTokens;

		/** The number of tokens that matched in the input token stream. */
		private final int matchedTokens;

		private Result(List transformedTokens, int matchedTokens) {
			this.transformedTokens = transformedTokens;
			this.matchedTokens = matchedTokens;
		}

		/**
		 * @see #matchedTokens
		 */
		private int getMatchedTokens() {
			return matchedTokens;
		}

		/**
		 * @see #transformedTokens
		 */
		private List getTransformedTokens() {
			return transformedTokens;
		}
	}

	/** Matches part of the search pattern against the token stream. */
	private interface IMatcher {

		/**
		 * Tries to match this matcher at the given position in the token stream. May modify the given
		 * variables map. Returns {@link TokenStreamTransformationPattern#NO_MATCH} if the matcher does not
		 * apply at this position. Otherwise, returns the position where the next matcher should be applied.
		 */
		int apply(List tokens, int position, Map> variables);
	}

	/**
	 * Matches if the token at the current position has a certain type and text.
	 */
	private static class TokenTypeMatcher implements IMatcher {

		/** The token type to match. */
		private final ETokenType type;

		/** The expected text. */
		private final String text;

		private TokenTypeMatcher(ETokenType type, String text) {
			this.type = type;
			this.text = text;
		}

		@Override
		public int apply(List tokens, int position, Map> variables) {
			IToken token = tokens.get(position);
			if (token.getType() == type && token.getText().equals(text)) {
				return position + 1;
			}
			return NO_MATCH;
		}

		private boolean hasAnyMatch(List tokens) {
			IToken tokenByTypeAndText = TokenStreamUtils.getTokenByTypeAndText(tokens, text,
					Collections.singleton(type));
			return tokenByTypeAndText != null;
		}

		@Override
		public String toString() {
			return "TokenTypeMatcher[type=" + type + ",text=" + text + "]";
		}
	}

	/**
	 * Matches a variable from the current position in the token stream to the first occurrence of the
	 * end token.
	 */
	private static class VariableMatcher implements IMatcher {

		/** The pattern to find out how many tokens long a match should be */
		private static final Pattern MATCH_LENGTH_PATTERN = Pattern.compile("\\$[a-zA-Z]+([0-9]+)");

		/** The variable to match. */
		private final String variableName;

		/**
		 * The token type that signals the end of the variable match.
		 */
		private final Set endTokenType;

		/**
		 * The number of tokens that the variable should match. This may be null to express that the current
		 * variable does not contain a variable count. This means, it will greedily match as many tokens as
		 * possible.
		 */
		private Integer numberOfTokensToMatch = null;

		/**
		 * Indicates that {@link #endTokenType} has not been found in the token stream. Therefore, we don't
		 * have to search again the next time the matcher is applied.
		 */
		private boolean reachedEndOfTokenStream = false;

		private VariableMatcher(String variableName, Set endTokenType) {
			this.variableName = variableName;
			this.endTokenType = endTokenType;

			Matcher matcher = MATCH_LENGTH_PATTERN.matcher(variableName);
			if (matcher.matches()) {
				numberOfTokensToMatch = Integer.parseInt(matcher.group(1));
			}
		}

		@Override
		public int apply(List tokens, int position, Map> variables) {
			// If we did not find anything the last time, we will not find anything this time.
			if (reachedEndOfTokenStream) {
				return NO_MATCH;
			}
			int endIndex = TokenStreamUtils.findFirstTopLevel(tokens, position, endTokenType,
					List.of(ETokenType.LPAREN), List.of(ETokenType.RPAREN));
			if (endIndex == TokenStreamUtils.NOT_FOUND) {
				reachedEndOfTokenStream = true;
				return NO_MATCH;
			}
			if (position == endIndex) {
				return NO_MATCH;
			}

			if (numberOfTokensToMatch != null && endIndex > position + numberOfTokensToMatch) {
				endIndex = position + numberOfTokensToMatch;
			}

			variables.put(variableName, tokens.subList(position, endIndex));
			return endIndex;
		}

		@Override
		public String toString() {
			return "VariableMatcher[variableName=" + variableName + ",endTokenType=" + endTokenType + "]";
		}
	}
}