// tri.ai.text.chunks.LocalTextDocumentSet.kt (Maven / Gradle / Ivy)
/*-
* #%L
* tri.promptfx:promptkt
* %%
* Copyright (C) 2023 - 2024 Johns Hopkins University Applied Physics Laboratory
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package tri.ai.text.chunks
import org.apache.poi.hwpf.extractor.WordExtractor
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import tri.ai.embedding.pdfText
import java.io.File
/**
* Document set managed within local file structure.
* Designed to save/restore metadata and chunk information to the file system.
*/
class LocalTextDocumentSet(
private val rootFolder: File,
_indexFile: File? = null
) {
private val documents = mutableMapOf()
private val chunks = mutableListOf()
private val indexFile = _indexFile ?: File(rootFolder, "docs.json")
//region PROCESSING
/**
 * Scrapes text from documents in the root folder and registers them in the document index.
 *
 * Non-text formats are first converted to `.txt` files via [preprocessDocumentFormats]; every
 * `.txt` file directly inside the root folder is then registered under its absolute path.
 *
 * @param reindexAll when true, all stored documents and chunks are discarded and every text file
 *   is registered again; when false, only files not yet present in the index are added and
 *   existing entries are left untouched
 */
fun processDocuments(reindexAll: Boolean) {
    preprocessDocumentFormats(reindexAll)
    if (reindexAll) {
        documents.clear()
        chunks.clear()
    }
    // Membership must be tested against the live index. Testing a freshly created local map
    // is always true, which would silently replace every existing document on incremental runs.
    rootFiles(".txt")
        .filter { reindexAll || it.absolutePath !in documents }
        .forEach { documents[it.absolutePath] = TextDocumentImpl(it) }
}
/**
 * Breaks up documents into chunks.
 *
 * When [reindexAll] is true, the stored chunks are cleared and every indexed document is passed
 * to [chunker]. Otherwise only documents whose key is not among the ids of documents already
 * referenced by a stored chunk are passed to [chunker].
 *
 * @param chunker splits a document into chunks
 * @param reindexAll whether to discard existing chunks and re-chunk every document
 * @return the chunks produced by this call; they are also appended to the stored chunk list
 */
fun processChunks(chunker: TextChunker, reindexAll: Boolean): List {
// NOTE(review): `List` above and `filterIsInstance()` below carry no type arguments in this copy
// of the file — presumably lost during extraction; confirm against the original source.
val newChunks = if (reindexAll) {
chunks.clear()
documents.flatMap { chunker.chunk(it.value) }
} else {
// Ids of documents that already have at least one stored chunk.
// NOTE(review): these ids are compared with the keys of `documents`, so this assumes
// `metadata.id` matches the key a document was registered under — TODO confirm.
val existing = chunks.filterIsInstance().map { it.doc.metadata.id }.toSet()
documents.filterKeys { it !in existing }.flatMap { chunker.chunk(it.value) }
}
chunks.addAll(newChunks)
return newChunks
}
//endregion
//region SAVING INDEX
/**
 * Restores documents and chunks from the index file.
 *
 * Does nothing when the index file does not exist. Otherwise the current documents and chunks
 * are discarded and replaced by the content of the stored index.
 */
fun loadIndex() {
    if (!indexFile.exists()) return

    documents.clear()
    chunks.clear()

    val stored = LocalDocumentIndex.loadFrom(indexFile)
    // One document object per stored entry, backed by the file named by the entry's metadata id.
    val docByInfo = stored.documents.associateWith { info -> TextDocumentImpl(File(info.metadata.id)) }
    documents.putAll(docByInfo.mapKeys { (info, _) -> info.metadata.id })

    // Rebuild each stored section against the document object created for its entry.
    val restored = stored.documents.flatMap { info ->
        info.sections.map { span -> TextSection(docByInfo[info]!!, span.first..span.last) }
    }
    chunks.addAll(restored)
}
/**
 * Saves index to file.
 *
 * Stored chunks are grouped by their source document; each group becomes one index entry holding
 * the document's metadata, its attributes, and one section record per chunk.
 */
fun saveIndex() {
    val index = LocalDocumentIndex().apply {
        // NOTE(review): `filterIsInstance()` has no type argument in this copy of the file —
        // presumably lost during extraction; confirm against the original source.
        documents = chunks
            .filterIsInstance()
            .groupBy { chunk -> chunk.doc }
            .map { (doc, docChunks) ->
                LocalDocumentInfo(
                    doc.metadata,
                    doc.attributes,
                    docChunks.map { chunk -> TextSectionInfo(chunk) }
                )
            }
    }
    LocalDocumentIndex.saveTo(index, indexFile)
}
//endregion
//region FROM LocalEmbeddingIndex.kt
/**
 * Lists the files directly inside the root folder whose name ends with [ext].
 * Returns an empty list when the folder listing is unavailable (`listFiles` returns null).
 */
private fun rootFiles(ext: String): List<File> {
    val matches = rootFolder.listFiles { _, name -> name.endsWith(ext) } ?: return emptyList()
    return matches.toList()
}
//region ALTERNATE FORMAT PROCESSING
/**
 * Converts documents in other formats (PDF, DOCX, DOC) to text files.
 * Existing text files are kept unless [reprocessAll] is set.
 */
private fun preprocessDocumentFormats(reprocessAll: Boolean = false) {
    // Extension -> text extractor, processed in this order.
    val extractors: List<Pair<String, (File) -> String>> = listOf(
        ".pdf" to { source: File -> pdfText(source) },
        ".docx" to { source: File -> docxText(source) },
        ".doc" to { source: File -> docText(source) }
    )
    for ((ext, extract) in extractors) {
        preprocess(ext, reprocessAll, extract)
    }
}
/**
 * Writes a `.txt` companion next to each root-folder file that has the given extension.
 *
 * @param ext file extension including the leading dot (e.g. ".pdf"), as matched by [rootFiles]
 * @param reprocessAll when true, existing text files are overwritten; otherwise only missing
 *   text files are created
 * @param op extracts the text content of a source file
 */
private fun preprocess(ext: String, reprocessAll: Boolean, op: (File) -> String) {
    rootFiles(ext).forEach { source ->
        // Swap only the trailing extension. A plain replace() would also rewrite any earlier
        // occurrence of `ext` in the path (e.g. a parent folder named "scans.pdf.d").
        val txtFile = File(source.absolutePath.removeSuffix(ext) + ".txt")
        if (reprocessAll || !txtFile.exists()) {
            txtFile.writeText(op(source))
        }
    }
}
/**
 * Extract text from DOCX.
 * The file stream and the extractor are closed once the text has been read.
 */
private fun docxText(file: File): String =
    file.inputStream().use { stream ->
        XWPFWordExtractor(XWPFDocument(stream)).use { extractor -> extractor.text }
    }
/**
 * Extract text from DOC.
 * The file stream and the extractor are closed once the text has been read.
 */
private fun docText(file: File): String =
    file.inputStream().use { stream ->
        WordExtractor(stream).use { extractor -> extractor.text }
    }
//endregion
//endregion
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy