All Downloads are FREE. Search and download functionalities are using the official Maven repository.

tri.ai.embedding.LegacyEmbeddingIndex.kt Maven / Gradle / Ivy

/*-
 * #%L
 * tri.promptfx:promptkt
 * %%
 * Copyright (C) 2023 - 2025 Johns Hopkins University Applied Physics Laboratory
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package tri.ai.embedding

import com.fasterxml.jackson.module.kotlin.readValue
import tri.ai.embedding.LocalFolderEmbeddingIndex.Companion.EMBEDDINGS_FILE_NAME_LEGACY
import tri.ai.openai.OpenAiModelIndex.ADA_ID
import tri.ai.openai.jsonMapper
import tri.ai.text.chunks.TextChunkInDoc
import tri.ai.text.chunks.TextChunkRaw
import tri.ai.text.chunks.process.LocalFileManager
import tri.ai.text.chunks.process.LocalFileManager.originalFile
import tri.ai.text.chunks.process.LocalFileManager.textCacheFile
import tri.ai.text.chunks.process.LocalTextDocIndex.Companion.createTextDoc
import tri.ai.text.chunks.process.TextDocEmbeddings.putEmbeddingInfo
import tri.util.fine
import tri.util.info
import java.io.File

/**
 * In-memory view of a legacy embeddings file.
 * This should be used only for importing legacy formats.
 *
 * The file is read as a JSON object whose values are [LegacyEmbeddingInfo] entries, one per document.
 */
class LegacyEmbeddingIndex {
    /**
     * Legacy entries keyed by the JSON object key.
     * The upgrade code in this file only reads the values; the meaning of the keys is not established here.
     */
    var info: Map<String, LegacyEmbeddingInfo> = mapOf()

    companion object {
        /**
         * Parses [file] as JSON using the shared `jsonMapper` and returns an index holding the parsed entries.
         * Exceptions raised while reading or parsing are not caught here and propagate to the caller.
         */
        fun loadFrom(file: File): LegacyEmbeddingIndex {
            val index = LegacyEmbeddingIndex()
            // The deserialization target type is inferred from the declared type of `info`.
            index.info = jsonMapper.readValue(file)
            return index
        }
    }
}

//region WORKING WITH LEGACY DATA

/**
 * Legacy format for embedding information about a single document.
 * Mutable properties with defaults keep the no-argument constructor available for JSON deserialization.
 */
class LegacyEmbeddingInfo {
    /** Path of the document as recorded in the legacy file. */
    var path: String = ""

    /** Sections of the document, each with character offsets and an optional embedding vector. */
    var sections: List<LegacyEmbeddingSectionInfo> = listOf()
}

/**
 * Legacy format for embedding section information.
 * Mutable properties with defaults keep the no-argument constructor available for JSON deserialization.
 */
class LegacyEmbeddingSectionInfo {
    /**
     * Embedding vector for this section; empty when no vector was stored.
     * NOTE(review): element type assumed to be Double to match `putEmbeddingInfo` — confirm against its signature.
     */
    var embedding: List<Double> = listOf()

    /** Start offset of the section, passed as the first argument of `TextChunkInDoc` during upgrade. */
    var start: Int = 0

    /** End offset of the section, passed as the second argument of `TextChunkInDoc` during upgrade. */
    var end: Int = 0
}

//endregion

//region UPGRADING LEGACY DATA

/**
 * Imports a legacy embeddings file into this index.
 * Only supports upgrading from the OpenAI embeddings file, `embeddings.json`.
 *
 * Does nothing when the current index file already exists or the legacy file is missing.
 * Any [Exception] raised during the import is caught and reported as an info log message; it is not rethrown.
 */
fun LocalFolderEmbeddingIndex.upgradeEmbeddingIndex() {
    val root = rootDir
    val currentIndexFile = indexFile
    val legacyFile = File(root, EMBEDDINGS_FILE_NAME_LEGACY)
    // Guard: upgrade only when there is no current index and a legacy file is available.
    if (currentIndexFile.exists() || !legacyFile.exists())
        return

    fine("Checking legacy embeddings file for embedding vectors: $legacyFile")
    try {
        // Convert every legacy entry up front; a failure on any entry aborts before anything is added.
        val convertedDocs = LegacyEmbeddingIndex.loadFrom(legacyFile).info.values.map { legacyDoc ->
            val source = LocalFileManager.fixPath(File(legacyDoc.path), root)?.originalFile()
                ?: throw IllegalArgumentException("File not found: ${legacyDoc.path}")
            source.createTextDoc().apply {
                all = TextChunkRaw(source.textCacheFile().readText())
                chunks.addAll(legacyDoc.sections.map { section ->
                    TextChunkInDoc(section.start, section.end).apply {
                        // Sections without a stored vector get no embedding info.
                        if (section.embedding.isNotEmpty())
                            putEmbeddingInfo(ADA_ID, section.embedding, EmbeddingPrecision.FIRST_EIGHT)
                    }
                })
            }
        }

        // `count` evaluates the predicate for every document, so each one is offered to the index.
        val addedCount = convertedDocs.count { doc -> addIfNotPresent(doc) }
        if (addedCount > 0) {
            info("Upgraded legacy embeddings file to new format.")
            saveIndex()
            info("Legacy embeddings file $legacyFile can be deleted unless needed for previous versions of PromptFx.")
        } else {
            fine("No new embeddings found in legacy embeddings file.")
        }
    } catch (x: Exception) {
        info("Failed to load legacy embeddings file: ${x.message}")
    }
}

//endregion




© 2015 - 2025 Weber Informatics LLC | Privacy Policy