// tri.ai.text.chunks.process.LocalTextDocIndex.kt
// Part of the promptkt artifact: LLM and prompt engineering.
/*-
* #%L
* tri.promptfx:promptkt
* %%
* Copyright (C) 2023 - 2024 Johns Hopkins University Applied Physics Laboratory
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package tri.ai.text.chunks.process
import tri.ai.text.chunks.*
import tri.ai.text.chunks.process.LocalFileManager.extractTextContent
import tri.ai.text.chunks.process.LocalFileManager.textCacheFile
import tri.ai.text.chunks.process.LocalFileManager.textFiles
import java.io.File
import java.time.Instant
import java.time.LocalDateTime
/**
 * Manages a set of [TextDoc]s coming from a local file structure.
 *
 * Documents are tracked in [docIndex] by document id, each paired with the file its text is
 * read from. The index can be written to and restored from an index file via [saveIndex] and
 * [loadIndex].
 *
 * @param rootFolder folder that is processed for documents
 * @param _indexFile file used to persist the index; when `null`, `docs.json` inside [rootFolder] is used
 */
class LocalTextDocIndex(
    private val rootFolder: File,
    _indexFile: File? = null
) {

    /** Index of documents by id; each value pairs the file the text is read from with its document. */
    val docIndex = mutableMapOf<String, Pair<File, TextDoc>>()

    private val indexFile = _indexFile ?: File(rootFolder, "docs.json")

    //region PROCESSING

    /**
     * Scrapes text from the documents under the root folder and registers a [TextDoc] for each
     * resulting text file.
     *
     * @param reindexAll when `true`, the index is cleared and rebuilt; when `false`, entries whose
     *   id is already present in [docIndex] are kept as-is and only new ids are added. The flag is
     *   also forwarded to the text extraction step.
     */
    fun processDocuments(reindexAll: Boolean) {
        rootFolder.extractTextContent(reindexAll)
        if (reindexAll) {
            docIndex.clear()
        }

        // Collect candidates for every text file first; if two files yield the same id, the later one wins.
        val scanned = mutableMapOf<String, Pair<File, TextDoc>>()
        rootFolder.textFiles().forEach { file ->
            val doc = file.createTextDoc()
            scanned[doc.metadata.id] = file to doc
        }

        // After a full reindex the index is empty, so every candidate is added;
        // otherwise entries that are already indexed are never overwritten.
        scanned.forEach { (id, entry) ->
            if (id !in docIndex) {
                docIndex[id] = entry
            }
        }
    }

    /**
     * Breaks up every indexed document into chunks.
     *
     * @param chunker chunker applied to each document
     * @param reindexAll when `true`, documents that already have chunks are re-chunked as well
     */
    fun processChunks(chunker: TextChunker, reindexAll: Boolean) {
        docIndex.values.forEach { (file, doc) ->
            chunker.process(file, doc, reindexAll)
        }
    }

    /**
     * Breaks up a single document into chunks using this chunker.
     *
     * Nothing happens when the document already has chunks and [reindexAll] is `false`.
     *
     * @param file file whose full text becomes the document's raw content
     * @param doc document whose raw content and chunks are replaced
     * @param reindexAll when `true`, existing chunks are discarded and recomputed
     */
    fun TextChunker.process(file: File, doc: TextDoc, reindexAll: Boolean) {
        if (reindexAll || doc.chunks.isEmpty()) {
            // Read before clearing so a failed read does not wipe the existing chunks.
            val raw = TextChunkRaw(file.readText())
            doc.chunks.clear()
            doc.all = raw
            doc.chunks.addAll(chunk(raw))
        }
    }

    //endregion

    //region SAVING INDEX

    /**
     * Loads the index from the index file, replacing the current content of [docIndex].
     *
     * Does nothing when the index file does not exist. The current index is only replaced once
     * the file has been read and every entry resolved, so a failure leaves [docIndex] unchanged.
     *
     * @throws IllegalStateException if a stored document has no path
     */
    fun loadIndex() {
        if (!indexFile.exists()) return

        val loaded = mutableMapOf<String, Pair<File, TextDoc>>()
        TextLibrary.loadFrom(indexFile).docs.forEach { doc ->
            val path = checkNotNull(doc.metadata.path) {
                "Document '${doc.metadata.id}' in index file $indexFile has no path."
            }
            loaded[doc.metadata.id] = File(path).textCacheFile() to doc
        }

        docIndex.clear()
        docIndex.putAll(loaded)
    }

    /** Saves all documents currently in [docIndex] to the index file. */
    fun saveIndex() {
        val library = TextLibrary("").apply {
            docs.addAll(docIndex.values.map { it.second })
        }
        TextLibrary.saveTo(library, indexFile)
    }

    //endregion

    companion object {
        /**
         * Create a [TextDoc] with metadata from a file's attributes.
         *
         * The document is constructed from the file name. Its title is the file name without
         * extension, its date/time is the file's last-modified time in the system default time
         * zone, its path is the file's URI, and its relative path is the file name.
         */
        fun File.createTextDoc() = TextDoc(name).apply {
            metadata.title = nameWithoutExtension
            metadata.dateTime = LocalDateTime.ofInstant(
                Instant.ofEpochMilli(lastModified()),
                java.time.ZoneId.systemDefault()
            )
            metadata.path = toURI()
            metadata.relativePath = name
        }
    }
}