All downloads are free. Search and download functionality uses the official Maven repository.

com.microsoft.semantickernel.implementation.templateengine.tokenizer.CodeTokenizer Maven / Gradle / Ivy

// Copyright (c) Microsoft. All rights reserved.
package com.microsoft.semantickernel.implementation.templateengine.tokenizer;

import com.microsoft.semantickernel.exceptions.SKException;
import com.microsoft.semantickernel.implementation.Verify;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.Block;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.FunctionIdBlock;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.NamedArgBlock;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.Symbols;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.ValBlock;
import com.microsoft.semantickernel.implementation.templateengine.tokenizer.blocks.VarBlock;
import com.microsoft.semantickernel.templateengine.semantickernel.TemplateException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import javax.annotation.Nullable;

/**
 * Simple tokenizer used for default SK template code language.
 * 

* BNF parsed by TemplateTokenizer: [template] ::= "" | [block] | [block] [template] [block] * ::= [sk-block] | [text-block] [sk-block] ::= "{{" [variable] "}}" | "{{" [value] "}}" | * "{{" [function-call] "}}" [text-block] ::= [any-char] | [any-char] [text-block] [any-char] * ::= any char *

* BNF parsed by CodeTokenizer: [template] ::= "" | [variable] " " [template] | [value] " " * [template] | [function-call] " [variable] ::= "$" [valid-name] [value] ::= "'" * [text] "'" | '"' [text] '"' [function-call] ::= [function-id] | [function-id] [parameter] * [parameter] ::= [variable] | [value] *

* BNF parsed by dedicated blocks [function-id] ::= [valid-name] | [valid-name] "." [valid-name] * [valid-name] ::= [valid-symbol] | [valid-symbol] [valid-name] [valid-symbol] ::= [letter] | * [digit] | "_" [letter] ::= "a" | "b" ... | "z" | "A" | "B" ... | "Z" [digit] ::= * "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" */ public class CodeTokenizer { /** * Initializes a new instance of the {@link CodeTokenizer} class. */ public CodeTokenizer() { } private static boolean isVarPrefix(char c) { return (c == Symbols.VarPrefix); } private static boolean IsBlankSpace(char c) { return Character.isWhitespace(c); } private static boolean isQuote(char c) { return c == Symbols.DblQuote || c == Symbols.SglQuote; } private static boolean CanBeEscaped(char c) { return c == Symbols.DblQuote || c == Symbols.SglQuote || c == Symbols.EscapeChar; } @SuppressWarnings("NullAway") @Nullable private static NamedArgBlock getNamedArg(String tokenContent) { String name = NamedArgBlock.tryGetName(tokenContent); String value = NamedArgBlock.tryGetValue(tokenContent); if (Verify.isNullOrEmpty(name) || Verify.isNullOrEmpty(value)) { return null; } NamedArgBlock block = new NamedArgBlock(tokenContent, name, value); if (block.isValid()) { return block; } return null; } /** * Tokenize a code block, without checking for syntax errors * * @param text Text to parse * @return A list of blocks */ public List tokenize(String text) { if (text == null) { return new ArrayList<>(); } // Remove spaces, which are ignored anyway text = text.trim(); // Render NULL to "" if (text.isEmpty()) { return Collections.unmodifiableList(new ArrayList<>()); } // Track what type of token we're reading TokenTypes currentTokenType = TokenTypes.None; // Track the content of the current token StringBuilder currentTokenContent = new StringBuilder(); char textValueDelimiter = '\0'; List blocks = new ArrayList<>(); char nextChar = text.charAt(0); // Tokens must be separated by spaces, track their presence boolean 
spaceSeparatorFound = false; // Named args may contain string values that contain spaces. These are used // to determine when a space occurs between quotes. boolean namedArgSeparatorFound = false; char namedArgValuePrefix = '\0'; // 1 char only edge case if (text.length() == 1) { switch (nextChar) { case Symbols.VarPrefix: blocks.add(new VarBlock(text)); break; case Symbols.DblQuote: case Symbols.SglQuote: blocks.add(new ValBlock(text)); break; default: blocks.add(new FunctionIdBlock(text)); break; } return blocks; } boolean skipNextChar = false; for (int nextCharCursor = 1; nextCharCursor < text.length(); nextCharCursor++) { char currentChar = nextChar; nextChar = text.charAt(nextCharCursor); if (skipNextChar) { skipNextChar = false; continue; } // First char is easy if (nextCharCursor == 1) { if (isVarPrefix(currentChar)) { currentTokenType = TokenTypes.Variable; } else if (isQuote(currentChar)) { currentTokenType = TokenTypes.Value; textValueDelimiter = currentChar; } else { // A function Id starts here currentTokenType = TokenTypes.FunctionId; } currentTokenContent.append(currentChar); continue; } // While reading a values between quotes if (currentTokenType == TokenTypes.Value || (currentTokenType == TokenTypes.NamedArg && isQuote(namedArgValuePrefix))) { // If the current char is escaping the next special char: // - skip the current char (escape char) // - add the next (special char) // - jump to the one after (to handle "\\" properly) if (currentChar == Symbols.EscapeChar && CanBeEscaped(nextChar)) { currentTokenContent.append(nextChar); skipNextChar = true; continue; } currentTokenContent.append(currentChar); // When we reach the end of the value if (currentChar == textValueDelimiter) { blocks.add(new ValBlock(currentTokenContent.toString())); currentTokenContent = new StringBuilder(); currentTokenType = TokenTypes.None; spaceSeparatorFound = false; } else if (currentChar == namedArgValuePrefix && currentTokenType == TokenTypes.NamedArg) { 
blocks.add(NamedArgBlock.from(currentTokenContent.toString())); currentTokenContent = new StringBuilder(); currentTokenType = TokenTypes.None; spaceSeparatorFound = false; namedArgSeparatorFound = false; namedArgValuePrefix = '\0'; } continue; } // If we're not between quotes, a space signals the end of the current token // Note: there might be multiple consecutive spaces if (IsBlankSpace(currentChar)) { if (currentTokenType == TokenTypes.Variable) { blocks.add(new VarBlock(currentTokenContent.toString())); currentTokenContent = new StringBuilder(); } else if (currentTokenType == TokenTypes.FunctionId) { String tokenContent = currentTokenContent.toString(); // This isn't an expected block at this point but the TemplateTokenizer should throw an error when // a named arg is used without a function call NamedArgBlock namedArg = getNamedArg(tokenContent); if (namedArg != null) { blocks.add(namedArg); } else { blocks.add(new FunctionIdBlock(tokenContent)); } currentTokenContent = new StringBuilder(); currentTokenType = TokenTypes.None; } else if (currentTokenType == TokenTypes.NamedArg && namedArgSeparatorFound && namedArgValuePrefix != 0) { blocks.add( NamedArgBlock.from(currentTokenContent.toString())); currentTokenContent = new StringBuilder(); namedArgSeparatorFound = false; namedArgValuePrefix = '\0'; currentTokenType = TokenTypes.None; } spaceSeparatorFound = true; currentTokenType = TokenTypes.None; continue; } // If reading a named argument and either the '=' or the value prefix ($, ', or ") haven't been found if (currentTokenType == TokenTypes.NamedArg && (!namedArgSeparatorFound || namedArgValuePrefix == 0)) { if (!namedArgSeparatorFound) { if (currentChar == Symbols.NamedArgBlockSeparator) { namedArgSeparatorFound = true; } } else { namedArgValuePrefix = currentChar; if (!isQuote(namedArgValuePrefix) && namedArgValuePrefix != Symbols.VarPrefix) { throw new SKException( "Named argument values need to be prefixed with a quote or " + Symbols.VarPrefix); } } 
currentTokenContent.append(currentChar); continue; } // If we're not inside a quoted value and we're not processing a space currentTokenContent.append(currentChar); if (currentTokenType == TokenTypes.None) { if (!spaceSeparatorFound) { throw new TemplateException( TemplateException.ErrorCodes.SYNTAX_ERROR, "Tokens must be separated by one space least"); } if (isQuote(currentChar)) { // A quoted value starts here currentTokenType = TokenTypes.Value; textValueDelimiter = currentChar; } else if (isVarPrefix(currentChar)) { // A variable starts here currentTokenType = TokenTypes.Variable; } else if (blocks.isEmpty()) { // A function Id starts here currentTokenType = TokenTypes.FunctionId; } else { // A named arg starts here currentTokenType = TokenTypes.NamedArg; } } } // Capture last token currentTokenContent.append(nextChar); switch (currentTokenType) { case Value: blocks.add(new ValBlock(currentTokenContent.toString())); break; case Variable: blocks.add(new VarBlock(currentTokenContent.toString())); break; case FunctionId: NamedArgBlock namedArg = getNamedArg(currentTokenContent.toString()); // This isn't an expected block at this point but the TemplateTokenizer should throw an error when // a named arg is used without a function call if (namedArg != null) { blocks.add(namedArg); } else { blocks.add(new FunctionIdBlock(currentTokenContent.toString())); } break; case NamedArg: blocks.add(NamedArgBlock.from(currentTokenContent.toString())); break; case None: throw new TemplateException( TemplateException.ErrorCodes.SYNTAX_ERROR, "Tokens must be separated by one space least"); } return blocks; } private enum TokenTypes { None(0), Value(1), Variable(2), FunctionId(3), NamedArg(4); TokenTypes(int i) { } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy