// io.codemodder.plugins.llm.Tokens (Maven / Gradle / Ivy)
// Show all versions of codemodder-plugin-llm | Show documentation
package io.codemodder.plugins.llm;
import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.EncodingRegistry;
import com.knuddels.jtokkit.api.EncodingType;
import java.util.List;
/** A set of utilities around LLM tokens. */
public final class Tokens {

  /**
   * Shared encoding registry, created once and reused by every call to {@link #countTokens}.
   *
   * <p>Building a registry is comparatively expensive, and jtokkit documents the registry as safe
   * to share between callers, so it is not rebuilt per invocation.
   */
  private static final EncodingRegistry REGISTRY = Encodings.newDefaultEncodingRegistry();

  private Tokens() {}

  /**
   * Estimates the number of tokens the messages will consume.
   *
   * <p>Each message is charged {@code tokensPerMessage} tokens of overhead plus the number of
   * tokens its text encodes to. One further {@code tokensPerMessage} is added at the end to
   * account for the priming of the reply.
   *
   * <p>This does not yet support estimating the number of tokens the functions will consume, since
   * the unofficial solutions are brittle.
   *
   * <p>We should be able to replace this with {@code TikTokensUtil.tokens} when the feature is
   * released.
   *
   * @param messages The messages.
   * @param tokensPerMessage The number of tokens consumed per message by the given model.
   * @param encodingType The encoding type used by the model.
   * @return The number of tokens.
   * @see <a
   *     href="https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb">How
   *     to count tokens with tiktoken</a>
   */
  public static int countTokens(
      final List<String> messages, final int tokensPerMessage, final EncodingType encodingType) {
    final Encoding encoding = REGISTRY.getEncoding(encodingType);

    int count = 0;
    for (final String message : messages) {
      // Fixed per-message overhead, followed by the encoded length of the message text.
      count += tokensPerMessage;
      count += encoding.countTokens(message);
    }
    count += tokensPerMessage; // Every reply is primed with <|start|>assistant<|message|>.
    return count;
  }
}