package io.codemodder.plugins.llm;

import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.EncodingRegistry;
import com.knuddels.jtokkit.api.EncodingType;
import com.theokanning.openai.completion.chat.ChatMessage;
import java.util.List;

/** A set of utilities around LLM tokens. */
public final class Tokens {

  private Tokens() {}

  /**
   * Estimates the number of tokens the messages will consume.
   *
   * <p>This does not yet support estimating the number of tokens the functions will consume, since
   * the unofficial solutions are brittle.
   *
   * <p>We should be able to replace this with {@code TikTokensUtil.tokens} when the feature is
   * released.
   *
   * @param messages The messages.
   * @param tokensPerMessage The number of tokens consumed per message by the given model.
   * @param encodingType The encoding type used by the model.
   * @return The number of tokens.
   * @see <a
   *     href="https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb">How
   *     to count tokens with tiktoken</a>
   */
  public static int countTokens(
      final List<ChatMessage> messages,
      final int tokensPerMessage,
      final EncodingType encodingType) {
    EncodingRegistry registry = Encodings.newDefaultEncodingRegistry();
    Encoding encoding = registry.getEncoding(encodingType);

    int count = 0;
    for (ChatMessage message : messages) {
      count += tokensPerMessage;
      count += encoding.countTokens(message.getContent());
      count += encoding.countTokens(message.getRole());
    }
    count += tokensPerMessage; // Every reply is primed with <|start|>assistant<|message|>.

    return count;
  }
}
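For illustration, here is a minimal usage sketch (not part of the original file). The class name, the sample messages, and the tokensPerMessage value of 3 (the per-message overhead the tiktoken cookbook cites for gpt-3.5-turbo/gpt-4 class models, which use the cl100k_base encoding) are assumptions; pick the values that match your model.

// Hypothetical caller, assumed to live alongside Tokens in the same package.
package io.codemodder.plugins.llm;

import com.knuddels.jtokkit.api.EncodingType;
import com.theokanning.openai.completion.chat.ChatMessage;
import com.theokanning.openai.completion.chat.ChatMessageRole;
import java.util.List;

public class TokensExample {
  public static void main(String[] args) {
    // Build a small chat prompt using the openai-java ChatMessage type.
    List<ChatMessage> messages =
        List.of(
            new ChatMessage(ChatMessageRole.SYSTEM.value(), "You are a helpful assistant."),
            new ChatMessage(ChatMessageRole.USER.value(), "Summarize this diff for me."));

    // Assumption: 3 tokens of per-message overhead and CL100K_BASE, matching
    // gpt-3.5-turbo/gpt-4 style models per the tiktoken cookbook.
    int estimate = Tokens.countTokens(messages, 3, EncodingType.CL100K_BASE);
    System.out.println("Estimated prompt tokens: " + estimate);
  }
}

Estimating the count up front like this lets a caller check a prompt against the model's context window before making the API call, rather than discovering an overflow from an error response.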