de.janno.evaluator.dice.Tokenizer Maven / Gradle / Ivy
The newest version!
package de.janno.evaluator.dice;
import com.google.common.collect.ImmutableList;
import lombok.NonNull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class Tokenizer {
private final static String ALL_NUMBER_REGEX = "\\d+\\.?\\d*";
private final static Pattern SMALL_DECIMAL_PATTERN = Pattern.compile("\\d{1,9}\\.\\d{1,9}");
private final static Pattern SMALL_INTEGER_PATTERN = Pattern.compile("\\d{1,9}");
private final ImmutableList tokenBuilders;
private final String escapeCharacter;
private final ImmutableList allOperatorAndFunctionNamePatterns;
public Tokenizer(Parameters parameters) {
escapeCharacter = parameters.getEscapeBrackets().stream()
.map(BracketPair::toString).collect(Collectors.joining(" or "));
ImmutableList.Builder builder = ImmutableList.builder();
Stream.concat(parameters.getExpressionBrackets().stream(), parameters.getFunctionBrackets().stream())
.distinct() //expression and function brackets are allowed to contain the same elements
.forEach(c -> {
builder.add(new TokenBuilder(escapeForRegexAndAddCaseInsensitivity(c.getOpen()), expressionPosition -> Token.openTokenOf(c, expressionPosition), false));
builder.add(new TokenBuilder(escapeForRegexAndAddCaseInsensitivity(c.getClose()), expressionPosition -> Token.closeTokenOf(c, expressionPosition), false));
});
parameters.getFunctions().forEach(function -> builder.add(new TokenBuilder(escapeForRegexAndAddCaseInsensitivity(function.getName()), expressionPosition -> Token.of(function, expressionPosition), false)));
parameters.getOperators().forEach(operator -> builder.add(new TokenBuilder(escapeForRegexAndAddCaseInsensitivity(operator.getName()), expressionPosition -> Token.of(operator, expressionPosition), false)));
builder.add(new TokenBuilder(escapeForRegexAndAddCaseInsensitivity(parameters.getSeparator()), Token::separator, false));
parameters.getEscapeBrackets().forEach(b -> builder.add(new TokenBuilder(buildEscapeBracketsRegex(b), expressionPosition -> Token.of(expressionPosition.getValue().substring(1, expressionPosition.getValue().length() - 1), expressionPosition), true)));
builder.add(new TokenBuilder(ALL_NUMBER_REGEX, expressionPosition -> {
if (SMALL_INTEGER_PATTERN.matcher(expressionPosition.getValue()).matches() || SMALL_DECIMAL_PATTERN.matcher(expressionPosition.getValue()).matches()) {
return Token.of(expressionPosition.getValue(), expressionPosition);
}
throw new ExpressionException("The number '%s' is too big".formatted(expressionPosition.getValue()), expressionPosition);
}, false));
tokenBuilders = builder.build();
List duplicateRegex = tokenBuilders.stream().collect(Collectors.groupingBy(TokenBuilder::regex))
.entrySet().stream()
.filter(e -> e.getValue().size() > 1)
.map(Map.Entry::getKey)
.toList();
if (!duplicateRegex.isEmpty()) {
throw new IllegalArgumentException("The following regex for tokenizing where used more then once: " + duplicateRegex);
}
allOperatorAndFunctionNamePatterns = Stream.concat(
parameters.getOperators().stream().map(Operator::getName),
parameters.getFunctions().stream().map(Function::getName))
.map(n -> Pattern.compile(escapeForRegexAndAddCaseInsensitivity(n)))
.collect(ImmutableList.toImmutableList());
}
private static String buildEscapeBracketsRegex(BracketPair bracketPair) {
return String.format("%s.*?%s", escapeForRegexAndAddCaseInsensitivity(bracketPair.getOpen()), escapeForRegexAndAddCaseInsensitivity(bracketPair.getClose()));
}
private static String escapeForRegexAndAddCaseInsensitivity(String in) {
return "(?i)\\Q%s\\E(?-i)".formatted(in);
}
private static int countLeadingWhitespaces(String input) {
int spaceCount = 0;
for (char c : input.toCharArray()) {
if (c == ' ') {
spaceCount++;
} else {
break;
}
}
return spaceCount;
}
public List tokenize(final String input) throws ExpressionException {
List preTokens = new ArrayList<>();
String current = input.trim();
//input it trimmed and has never leading spaces
int currentPositionWithSpace = 0;
Optional currentMatch;
do {
currentMatch = getBestMatch(current, currentPositionWithSpace);
if (currentMatch.isPresent()) {
Token token = currentMatch.get();
preTokens.add(token);
int matchLength = token.getExpressionPosition().getValue().length();
currentPositionWithSpace += matchLength;
String substringWithSpace = current.substring(matchLength);
currentPositionWithSpace += countLeadingWhitespaces(substringWithSpace);
current = substringWithSpace.trim();
}
} while (currentMatch.isPresent());
if (!current.isEmpty()) {
int nextPositionWithMatch = findNextMatchForErrorMessage(current);
String nonMatchingString = current.substring(0, nextPositionWithMatch);
throw new ExpressionException("No matching operator for '%s', non-functional text and value names must to be surrounded by %s".formatted(nonMatchingString, escapeCharacter),
ExpressionPosition.of(currentPositionWithSpace, nonMatchingString));
}
return setOperatorType(preTokens);
}
private int findNextMatchForErrorMessage(String current) {
int i=0;
while(!current.isEmpty()){
i++;
current = current.substring(1);
try {
Optional match = getBestMatch(current, i);
if (match.isPresent()) {
return i;
}
} catch (ExpressionException e) {
//next error, we want to return only the current error
return i;
}
}
return i;
}
private List setOperatorType(List in) throws ExpressionException {
ImmutableList.Builder builder = ImmutableList.builder();
boolean lastOperatorWasUnaryLeft = false;
for (int i = 0; i < in.size(); i++) {
Token token = in.get(i);
Optional previousOpenBracket = getPreviousOpenBrackets(in, i);
if (previousOpenBracket.isPresent()) {
token = Token.addOpenBracket(token, previousOpenBracket.get());
}
Optional followingCloseBracket = getFollowingCloseBrackets(in, i);
if (followingCloseBracket.isPresent()) {
token = Token.addCloseBracket(token, followingCloseBracket.get());
}
if (token.getOperator().isPresent()) {
Token left = i == 0 ? null : in.get(i - 1);
Token right = i == in.size() - 1 ? null : in.get(i + 1);
Operator.OperatorType type = determineAndValidateOperatorType(token, left, right, lastOperatorWasUnaryLeft);
builder.add(Token.of(token.getOperator().get(), type, token.getExpressionPosition()));
lastOperatorWasUnaryLeft = type == Operator.OperatorType.UNARY && token.getOperator().get().getAssociativityForOperantType(Operator.OperatorType.UNARY) == Operator.Associativity.LEFT;
} else {
builder.add(token);
lastOperatorWasUnaryLeft = false;
}
}
return builder.build();
}
private Optional getPreviousOpenBrackets(List in, int index) {
if (index <= 0) {
return Optional.empty();
}
int i = index - 1;
StringBuilder brackets = new StringBuilder();
while (i >= 0 && in.get(i).getBrackets().isPresent() && in.get(i).isOpenBracket()) {
brackets.append(in.get(i).getBrackets().get().getOpen());
i--;
}
if (brackets.isEmpty()) {
return Optional.empty();
}
return Optional.of(brackets.toString());
}
private Optional getFollowingCloseBrackets(List in, int index) {
if (index > in.size() - 2) {
return Optional.empty();
}
int i = index + 1;
StringBuilder brackets = new StringBuilder();
while (i < in.size() && in.get(i).getBrackets().isPresent() && in.get(i).isCloseBracket()) {
brackets.append(in.get(i).getBrackets().get().getClose());
i++;
}
if (brackets.isEmpty()) {
return Optional.empty();
}
return Optional.of(brackets.toString());
}
private Operator.OperatorType determineAndValidateOperatorType(@NonNull Token token, @Nullable Token left, @Nullable Token right, boolean lastOperatorWasUnaryLeft) throws ExpressionException {
//todo cleanup
//operator is already checked
Operator operator = token.getOperator().orElseThrow();
boolean leftLiteralOrBracket = left != null && (left.getLiteral().isPresent() || left.isCloseBracket() || (left.getOperator().isPresent() && lastOperatorWasUnaryLeft));
boolean rightLiteralOrBracket = right != null && (right.getLiteral().isPresent() || right.isOpenBracket() ||
(right.getOperator().isPresent() && right.getOperator().get().getAssociativityForOperantType(Operator.OperatorType.UNARY) == Operator.Associativity.RIGHT)
|| (right.getFunction().isPresent()));
if (leftLiteralOrBracket && rightLiteralOrBracket) {
if (!operator.supportBinaryOperation()) {
throw new ExpressionException("Operator %s does not support binary operations".formatted(operator.getName()), token.getExpressionPosition());
}
return Operator.OperatorType.BINARY;
}
if (!operator.supportUnaryOperation()) {
throw new ExpressionException("Operator %s does not support unary operations".formatted(operator.getName()), token.getExpressionPosition());
}
Operator.Associativity operatorAssociativity = operator.getAssociativityForOperantType(Operator.OperatorType.UNARY);
if (operatorAssociativity == Operator.Associativity.LEFT && !leftLiteralOrBracket) {
throw new ExpressionException("Operator %s has left associativity but the left value was: %s".formatted(operator.getName(), Optional.ofNullable(left).map(Object::toString).orElse("empty")), token.getExpressionPosition());
}
if (operatorAssociativity == Operator.Associativity.RIGHT && !rightLiteralOrBracket) {
throw new ExpressionException("Operator %s has right associativity but the right value was: %s".formatted(operator.getName(), Optional.ofNullable(right).map(Object::toString).orElse("empty")), token.getExpressionPosition());
}
return Operator.OperatorType.UNARY;
}
private Optional getBestMatch(String input, int position) throws ExpressionException {
List allMatches = getAllMatches(input, position);
int maxLength = allMatches.stream()
.map(Token::getExpressionPosition)
.map(ExpressionPosition::getValue)
.mapToInt(String::length)
.max()
.orElse(0);
List maxLengthMatches = allMatches.stream()
.filter(m -> m.getExpressionPosition().getValue().length() == maxLength)
.toList();
if (maxLengthMatches.isEmpty()) {
return Optional.empty();
}
if (maxLengthMatches.size() > 1) {
throw new IllegalStateException("More then one operator matched the input %s: %s".formatted(input, maxLengthMatches.stream().map(Token::toString).toList()));
}
return Optional.of(maxLengthMatches.getFirst());
}
private List getAllMatches(String input, int position) throws ExpressionException {
ImmutableList.Builder matchBuilder = ImmutableList.builder();
for (Tokenizer.TokenBuilder tokenBuilder : tokenBuilders) {
Optional firstMatch = getFirstMatch(input, tokenBuilder, position);
firstMatch.ifPresent(matchBuilder::add);
}
return matchBuilder.build();
}
private Optional getFirstMatch(String input, TokenBuilder tokenBuilder, int position) throws ExpressionException {
Matcher matcher = tokenBuilder.pattern().matcher(input);
if (matcher.find()) {
String matchGroup = matcher.group().trim();
return Optional.of(tokenBuilder.toToken().apply(ExpressionPosition.of(position, matchGroup)));
}
return Optional.empty();
}
public boolean expressionContainsOperatorOrFunction(String expression) {
return allOperatorAndFunctionNamePatterns.stream().anyMatch(p -> p.matcher(expression).find());
}
private interface ToToken {
Token apply(ExpressionPosition expressionPosition) throws ExpressionException;
}
private record TokenBuilder(String regex, ToToken toToken, boolean multiLine) {
Pattern pattern() {
if (multiLine) {
return Pattern.compile("^\\s*%s\\s*".formatted(regex), Pattern.DOTALL);
}
return Pattern.compile("^\\s*%s\\s*".formatted(regex));
}
}
}