// io.quarkiverse.langchain4j.llama3.copy.AOT (artifact listing header from Maven/Gradle/Ivy page)
package io.quarkiverse.langchain4j.llama3.copy;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Map;
import java.util.Objects;
/**
 * Support for AOT preloading of GGUF metadata with GraalVM's Native Image.
 *
 * <p>
 * To preload a model at build time, pass {@code -Dllama.PreloadGGUF=/path/to/model.gguf}
 * to the native-image builder command. At runtime, the preloaded model will be used
 * iff the specified and preloaded file names (base name) match.
 */
public final class AOT {

    /**
     * A model parsed ahead of time: everything except the tensor data, which is
     * re-mapped from the GGUF file at run time by {@link #tryUsePreLoaded(Path, int)}.
     *
     * <p>
     * NOTE(review): the {@code Map} components appear to have lost their generic
     * type arguments (likely stripped during extraction); presumably
     * {@code Map<String, GGUF.GGUFTensorInfo>} — confirm against
     * {@code GGUF.getTensorInfos()} and restore.
     *
     * @param modelFileName    base file name (no directory) used for the runtime match
     * @param model            the model with vocabulary/configuration but no tensor weights
     * @param tensorDataOffset byte offset of the tensor data section within the GGUF file
     * @param tensorInfos      per-tensor metadata needed to re-map the weights at run time
     */
    public record PartialModel(String modelFileName, Llama model, long tensorDataOffset,
            Map tensorInfos) {
    }

    // Evaluated during class initialization. When this class is initialized at
    // image build time, the parsed metadata is baked into the native image heap;
    // with the property unset this is simply null and preloading is disabled.
    private static final PartialModel PRELOADED_GGUF = preLoadGGUF(System.getProperty("llama.PreloadGGUF"));

    /** Static-only holder; not instantiable. */
    private AOT() {
    }

    /**
     * Parses the GGUF model at {@code modelPath}, loading everything except the
     * tensor weights (those are re-mapped lazily at run time).
     *
     * @param modelPath path to the GGUF file, or {@code null}/empty to disable preloading
     * @return the partially loaded model, or {@code null} if {@code modelPath} is absent
     * @throws IllegalArgumentException if {@code modelPath} does not denote a regular file
     * @throws UncheckedIOException if reading the model fails
     */
    public static PartialModel preLoadGGUF(String modelPath) {
        if (modelPath == null || modelPath.isEmpty()) {
            return null;
        }
        Path path = Path.of(modelPath);
        // Files.isRegularFile already returns false for a nonexistent path,
        // so a separate Files.exists check is redundant.
        if (!Files.isRegularFile(path)) {
            throw new IllegalArgumentException("Cannot pre-load model: " + path);
        }
        try {
            GGUF gguf = GGUF.loadModel(path);
            try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
                return new PartialModel(
                        path.getFileName().toString(),
                        // final argument false: skip loading tensor weights here;
                        // only metadata/vocabulary are materialized at build time
                        ModelLoader.loadModel(fileChannel, gguf, Llama3.Options.DEFAULT_MAX_TOKENS, false),
                        gguf.getTensorDataOffset(),
                        gguf.getTensorInfos());
            }
        } catch (IOException e) {
            // Preserve the cause. UncheckedIOException is a RuntimeException,
            // so existing callers that catch RuntimeException are unaffected.
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Tries to reuse a compatible AOT preloaded model.
     * The file name (base name) must match with the preloaded file name.
     * No checksum/hash is checked for performance reasons.
     *
     * @param modelPath     path of the model requested at run time
     * @param contextLength context length to configure on the returned model
     * @return a fully loaded model reusing the preloaded metadata, or {@code null}
     *         if nothing was preloaded or the file names do not match
     * @throws IOException if re-mapping the tensor data from {@code modelPath} fails
     */
    public static Llama tryUsePreLoaded(Path modelPath, int contextLength) throws IOException {
        AOT.PartialModel preLoaded = AOT.PRELOADED_GGUF;
        if (preLoaded == null) {
            return null; // no pre-loaded model stored
        }
        String optionsModel = modelPath.getFileName().toString();
        String preLoadedModel = preLoaded.modelFileName();
        if (!Objects.equals(optionsModel, preLoadedModel)) {
            // Preloaded and specified model file names didn't match.
            // NOTE(review): only the base name is compared — two different files
            // with the same name would silently reuse the preloaded metadata.
            return null;
        }
        Llama baseModel = preLoaded.model();
        try (var timer = Timer.log("Load tensors from pre-loaded model");
                var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
            // Load only the tensors (mmap slices); metadata comes from the preload.
            Map tensorEntries = GGUF.loadTensors(fileChannel, preLoaded.tensorDataOffset(),
                    preLoaded.tensorInfos());
            Llama.Weights weights = ModelLoader.loadWeights(tensorEntries, baseModel.configuration());
            return new Llama(baseModel.configuration().withContextLength(contextLength), baseModel.tokenizer(), weights);
        }
    }
}