// com.microsoft.semantickernel.implementation.templateengine.tokenizer.CodeTokenizer (Maven / Gradle / Ivy)
// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantickernel.implementation.templateengine.tokenizer;
import com.microsoft.semantickernel.exceptions.SKException;
import com.microsoft.semantickernel.implementation.Verify;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.Block;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.FunctionIdBlock;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.NamedArgBlock;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.Symbols;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.ValBlock;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.VarBlock;
import com.microsoft.semantickernel.templateengine.semantickernel.TemplateException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import javax.annotation.Nullable;
/**
 * Simple tokenizer used for default SK template code language.
 *
 * <p>BNF parsed by TemplateTokenizer:
 *
 * <pre>
 * [template]   ::= "" | [block] | [block] [template]
 * [block]      ::= [sk-block] | [text-block]
 * [sk-block]   ::= "{{" [variable] "}}" | "{{" [value] "}}" | "{{" [function-call] "}}"
 * [text-block] ::= [any-char] | [any-char] [text-block]
 * [any-char]   ::= any char
 * </pre>
 *
 * <p>BNF parsed by CodeTokenizer:
 *
 * <pre>
 * [template]      ::= "" | [variable] " " [template] | [value] " " [template]
 *                     | [function-call] " " [template]
 * [variable]      ::= "$" [valid-name]
 * [value]         ::= "'" [text] "'" | '"' [text] '"'
 * [function-call] ::= [function-id] | [function-id] [parameter]
 * [parameter]     ::= [variable] | [value]
 * </pre>
 *
 * <p>BNF parsed by dedicated blocks:
 *
 * <pre>
 * [function-id]  ::= [valid-name] | [valid-name] "." [valid-name]
 * [valid-name]   ::= [valid-symbol] | [valid-symbol] [valid-name]
 * [valid-symbol] ::= [letter] | [digit] | "_"
 * [letter]       ::= "a" | "b" ... | "z" | "A" | "B" ... | "Z"
 * [digit]        ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
 * </pre>
 */
public class CodeTokenizer {

    /**
     * Initializes a new instance of the {@link CodeTokenizer} class.
     */
    public CodeTokenizer() {
    }

    /** Returns true when {@code c} is the character that starts a variable token. */
    private static boolean isVarPrefix(char c) {
        return c == Symbols.VarPrefix;
    }

    /** Returns true when {@code c} is whitespace according to {@link Character#isWhitespace(char)}. */
    private static boolean isBlankSpace(char c) {
        return Character.isWhitespace(c);
    }

    /** Returns true when {@code c} is a single or double quote. */
    private static boolean isQuote(char c) {
        return c == Symbols.DblQuote || c == Symbols.SglQuote;
    }

    /** Returns true when {@code c} may follow the escape character inside a quoted value. */
    private static boolean canBeEscaped(char c) {
        return c == Symbols.DblQuote || c == Symbols.SglQuote || c == Symbols.EscapeChar;
    }

    /**
     * Tries to interpret a token as a named argument.
     *
     * @param tokenContent the raw token text
     * @return a {@link NamedArgBlock} when both a name and a value can be extracted and the
     *     resulting block reports itself valid; {@code null} otherwise
     */
    @SuppressWarnings("NullAway")
    @Nullable
    private static NamedArgBlock getNamedArg(String tokenContent) {
        String name = NamedArgBlock.tryGetName(tokenContent);
        String value = NamedArgBlock.tryGetValue(tokenContent);
        if (Verify.isNullOrEmpty(name) || Verify.isNullOrEmpty(value)) {
            return null;
        }

        NamedArgBlock block = new NamedArgBlock(tokenContent, name, value);
        return block.isValid() ? block : null;
    }

    /**
     * Tokenize a code block, without checking for syntax errors.
     *
     * <p>The input is trimmed first. A {@code null} input yields a new empty list and a blank
     * input yields an unmodifiable empty list. Otherwise the text is scanned once and split into
     * {@code VarBlock}, {@code ValBlock}, {@code FunctionIdBlock} and named-argument blocks, in
     * the order they appear.
     *
     * @param text Text to parse
     * @return A list of blocks
     * @throws TemplateException if two tokens are not separated by whitespace, or if the input
     *     ends while no token type is active
     * @throws SKException if a named argument value does not start with a quote or the variable
     *     prefix
     */
    public List tokenize(String text) {
        if (text == null) {
            return new ArrayList<>();
        }

        // Remove spaces, which are ignored anyway
        text = text.trim();

        // Render NULL to ""
        if (text.isEmpty()) {
            return Collections.unmodifiableList(new ArrayList<>());
        }

        // Track what type of token we're reading
        TokenTypes currentTokenType = TokenTypes.None;

        // Track the content of the current token
        StringBuilder currentTokenContent = new StringBuilder();

        // Quote character that opened the most recent plain value token
        char textValueDelimiter = '\0';

        List blocks = new ArrayList<>();
        char nextChar = text.charAt(0);

        // Tokens must be separated by spaces, track their presence
        boolean spaceSeparatorFound = false;

        // Named args may contain string values that contain spaces. These are used
        // to determine when a space occurs between quotes.
        boolean namedArgSeparatorFound = false;
        char namedArgValuePrefix = '\0';

        // 1 char only edge case
        if (text.length() == 1) {
            switch (nextChar) {
                case Symbols.VarPrefix:
                    blocks.add(new VarBlock(text));
                    break;
                case Symbols.DblQuote:
                case Symbols.SglQuote:
                    blocks.add(new ValBlock(text));
                    break;
                default:
                    blocks.add(new FunctionIdBlock(text));
                    break;
            }
            return blocks;
        }

        boolean skipNextChar = false;

        // The loop handles currentChar, which always lags one position behind nextChar;
        // the very last character of the text is therefore appended after the loop.
        for (int nextCharCursor = 1; nextCharCursor < text.length(); nextCharCursor++) {
            char currentChar = nextChar;
            nextChar = text.charAt(nextCharCursor);

            if (skipNextChar) {
                skipNextChar = false;
                continue;
            }

            // First char is easy
            if (nextCharCursor == 1) {
                if (isVarPrefix(currentChar)) {
                    currentTokenType = TokenTypes.Variable;
                } else if (isQuote(currentChar)) {
                    currentTokenType = TokenTypes.Value;
                    textValueDelimiter = currentChar;
                } else {
                    // A function Id starts here
                    currentTokenType = TokenTypes.FunctionId;
                }
                currentTokenContent.append(currentChar);
                continue;
            }

            // While reading a value between quotes
            if (currentTokenType == TokenTypes.Value
                || (currentTokenType == TokenTypes.NamedArg && isQuote(namedArgValuePrefix))) {
                // If the current char is escaping the next special char:
                // - skip the current char (escape char)
                // - add the next (special char)
                // - jump to the one after (to handle "\\" properly)
                if (currentChar == Symbols.EscapeChar && canBeEscaped(nextChar)) {
                    currentTokenContent.append(nextChar);
                    skipNextChar = true;
                    continue;
                }

                currentTokenContent.append(currentChar);

                // Each closing check is guarded by its own token type. textValueDelimiter still
                // holds the quote of the last plain value, so without the guard a quoted
                // named-argument value would be closed (as a ValBlock) by that stale character.
                if (currentTokenType == TokenTypes.Value && currentChar == textValueDelimiter) {
                    // When we reach the end of the value
                    blocks.add(new ValBlock(currentTokenContent.toString()));
                    currentTokenContent = new StringBuilder();
                    currentTokenType = TokenTypes.None;
                    spaceSeparatorFound = false;
                } else if (currentTokenType == TokenTypes.NamedArg
                    && currentChar == namedArgValuePrefix) {
                    // When we reach the end of the quoted named argument value
                    blocks.add(NamedArgBlock.from(currentTokenContent.toString()));
                    currentTokenContent = new StringBuilder();
                    currentTokenType = TokenTypes.None;
                    spaceSeparatorFound = false;
                    namedArgSeparatorFound = false;
                    namedArgValuePrefix = '\0';
                }
                continue;
            }

            // If we're not between quotes, a space signals the end of the current token
            // Note: there might be multiple consecutive spaces
            if (isBlankSpace(currentChar)) {
                if (currentTokenType == TokenTypes.Variable) {
                    blocks.add(new VarBlock(currentTokenContent.toString()));
                    currentTokenContent = new StringBuilder();
                } else if (currentTokenType == TokenTypes.FunctionId) {
                    String tokenContent = currentTokenContent.toString();
                    // This isn't an expected block at this point but the TemplateTokenizer
                    // should throw an error when a named arg is used without a function call
                    NamedArgBlock namedArg = getNamedArg(tokenContent);
                    if (namedArg != null) {
                        blocks.add(namedArg);
                    } else {
                        blocks.add(new FunctionIdBlock(tokenContent));
                    }
                    currentTokenContent = new StringBuilder();
                } else if (currentTokenType == TokenTypes.NamedArg && namedArgSeparatorFound
                    && namedArgValuePrefix != 0) {
                    blocks.add(NamedArgBlock.from(currentTokenContent.toString()));
                    currentTokenContent = new StringBuilder();
                    namedArgSeparatorFound = false;
                    namedArgValuePrefix = '\0';
                }

                // Whatever was being read, a space always returns the scanner to "no token"
                spaceSeparatorFound = true;
                currentTokenType = TokenTypes.None;
                continue;
            }

            // If reading a named argument and either the '=' or the value prefix ($, ', or ")
            // haven't been found
            if (currentTokenType == TokenTypes.NamedArg
                && (!namedArgSeparatorFound || namedArgValuePrefix == 0)) {
                if (!namedArgSeparatorFound) {
                    if (currentChar == Symbols.NamedArgBlockSeparator) {
                        namedArgSeparatorFound = true;
                    }
                } else {
                    // First char after the separator decides how the value is read
                    namedArgValuePrefix = currentChar;
                    if (!isQuote(namedArgValuePrefix)
                        && namedArgValuePrefix != Symbols.VarPrefix) {
                        throw new SKException(
                            "Named argument values need to be prefixed with a quote or "
                                + Symbols.VarPrefix);
                    }
                }
                currentTokenContent.append(currentChar);
                continue;
            }

            // If we're not inside a quoted value and we're not processing a space
            currentTokenContent.append(currentChar);

            if (currentTokenType == TokenTypes.None) {
                if (!spaceSeparatorFound) {
                    throw new TemplateException(
                        TemplateException.ErrorCodes.SYNTAX_ERROR,
                        "Tokens must be separated by one space least");
                }

                if (isQuote(currentChar)) {
                    // A quoted value starts here
                    currentTokenType = TokenTypes.Value;
                    textValueDelimiter = currentChar;
                } else if (isVarPrefix(currentChar)) {
                    // A variable starts here
                    currentTokenType = TokenTypes.Variable;
                } else if (blocks.isEmpty()) {
                    // A function Id starts here
                    currentTokenType = TokenTypes.FunctionId;
                } else {
                    // A named arg starts here
                    currentTokenType = TokenTypes.NamedArg;
                }
            }
        }

        // Capture last token
        currentTokenContent.append(nextChar);
        switch (currentTokenType) {
            case Value:
                blocks.add(new ValBlock(currentTokenContent.toString()));
                break;
            case Variable:
                blocks.add(new VarBlock(currentTokenContent.toString()));
                break;
            case FunctionId:
                NamedArgBlock namedArg = getNamedArg(currentTokenContent.toString());
                // This isn't an expected block at this point but the TemplateTokenizer should
                // throw an error when a named arg is used without a function call
                if (namedArg != null) {
                    blocks.add(namedArg);
                } else {
                    blocks.add(new FunctionIdBlock(currentTokenContent.toString()));
                }
                break;
            case NamedArg:
                blocks.add(NamedArgBlock.from(currentTokenContent.toString()));
                break;
            case None:
                throw new TemplateException(
                    TemplateException.ErrorCodes.SYNTAX_ERROR,
                    "Tokens must be separated by one space least");
        }

        return blocks;
    }

    /** Kind of token the scanner is currently reading. */
    private enum TokenTypes {
        None,
        Value,
        Variable,
        FunctionId,
        NamedArg
    }
}