All Downloads are FREE. Search and download functionalities are using the official Maven repository.

tri.ai.text.chunks.process.LocalFileManager.kt Maven / Gradle / Ivy

/*-
 * #%L
 * tri.promptfx:promptkt
 * %%
 * Copyright (C) 2023 - 2024 Johns Hopkins University Applied Physics Laboratory
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package tri.ai.text.chunks.process

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import tri.util.pdf.PdfUtils
import tri.util.poi.WordDocUtils
import java.io.File
import java.io.FileFilter
import java.net.URI

/**
 * Handles local files, including extraction of text from various formats, caching of text in associated ".txt" files,
 * managing of changes to paths, and retrieval of original files.
 */
object LocalFileManager {

    const val TXT = "txt"
    const val PDF = "pdf"
    private const val DOCX = "docx"
    private const val DOC = "doc"

    /**
     * Extensions supported by the embedding index, either raw text or with available scrapers.
     * Order matters: [originalFile] probes candidates in this order, so [TXT] must stay last.
     */
    private val SUPPORTED_EXTENSIONS = listOf(PDF, DOCX, DOC, TXT)

    /**
     * Attempt to fix a file path, when the file may have been moved to another directory.
     *
     * A file with the same name inside [alternateFolder] takes precedence; the original [file] is only
     * returned when no such file exists in the alternate folder.
     *
     * @param file the file as originally referenced
     * @param alternateFolder folder searched for a file with the same name as [file]
     * @return the same-named file in [alternateFolder] if it exists, else [file] if it exists, else null
     * @throws IllegalArgumentException if [alternateFolder] is not a directory
     */
    fun fixPath(file: File, alternateFolder: File): File? {
        require(alternateFolder.isDirectory) { "Not a directory: $alternateFolder" }
        val relocated = File(alternateFolder, file.name)
        return when {
            relocated.exists() -> relocated
            file.exists() -> file
            else -> null
        }
    }

    //region FILE MANAGEMENT

    /** Return file filter for files that are convertible to text, but not a text cache file themselves. */
    val fileWithTextContentFilter = FileFilter { file -> file.hasTextContent() && !file.isLikelyTextCache() }

    /**
     * List all files with .txt extension (case-insensitive) in a given folder.
     * Returns an empty list if the receiver is not a directory or cannot be listed.
     */
    fun File.textFiles(): List<File> =
        listFiles { f -> f.extension.lowercase() == TXT }?.toList() ?: emptyList()

    /**
     * List files that are convertible to text in a given folder, skipping likely text cache files.
     * @throws IllegalArgumentException if the receiver is not a directory
     */
    fun File.listFilesWithTextContent(): List<File> {
        require(isDirectory) { "Not a directory: $this" }
        return listFiles(fileWithTextContentFilter)?.toList() ?: emptyList()
    }

    /** Return true if the file is convertible to text, based on its (case-insensitive) extension. */
    private fun File.hasTextContent() = extension.lowercase() in SUPPORTED_EXTENSIONS

    /**
     * Return true if the file has a .txt extension and there is an associated "original" file that likely matches,
     * i.e. an existing sibling with the same base name and a supported non-text extension.
     */
    private fun File.isLikelyTextCache(): Boolean {
        if (extension.lowercase() != TXT) return false
        // No sibling at all means there is nothing this file could be a cache of.
        val original = originalFile() ?: return false
        return original.extension.lowercase() != TXT
    }

    /**
     * Find the file that might be the "original" associated with a .txt file.
     *
     * Probes the parent folder for an existing file with the same base name and each supported extension
     * (lowercase spelling only), in the order of [SUPPORTED_EXTENSIONS]. Because [TXT] is the last candidate,
     * the result may be the .txt file itself when no other format is present.
     *
     * @return the first existing candidate, or null if none exists
     */
    fun File.originalFile(): File? {
        val baseName = nameWithoutExtension
        return SUPPORTED_EXTENSIONS
            .map { ext -> File(parentFile, "$baseName.$ext") }
            .firstOrNull { it.exists() }
    }

    /** Map a file to an associated .txt file: same folder and base name, with a ".txt" extension. */
    fun File.textCacheFile(): File = File(parentFile, "$nameWithoutExtension.txt")

    /** Map a file to an associated metadata file: same folder and base name, with a ".meta.json" suffix. */
    fun File.metadataFile(): File = File(parentFile, "$nameWithoutExtension.meta.json")

    //endregion

    //region SCRAPING

    /**
     * Reads text content from a given URI, or the text file matching its contents.
     * Uses the text cache, so the associated .txt file is read if present and written otherwise.
     * @throws IllegalArgumentException if [uri] cannot be converted to a [File]
     */
    fun readText(uri: URI): String =
        File(uri).fileToText(true)

    /**
     * Scrape all documents with text content in a folder, writing their text cache and metadata files.
     * Plain .txt files are skipped since they need no extraction.
     *
     * @param reprocessAll if true, re-extracts every document and overwrites existing cache files;
     *   if false, only documents without a cache file are processed
     * @throws IllegalArgumentException if the receiver is not a directory
     */
    fun File.extractTextContent(reprocessAll: Boolean = false) {
        require(isDirectory) { "Not a directory: $this" }
        listFiles { f ->
            f.hasTextContent() && f.extension.lowercase() != TXT
                && (reprocessAll || !f.textCacheFile().exists())
        }?.forEach {
            // Bypass the cache read: every selected file either has no cache or must be reprocessed.
            it.scrapeAndCache()
        }
    }

    /**
     * Get text from a file by extension (matched case-insensitively).
     * @param useCache if true, reads/writes to a .txt file in the same directory, creating it if it doesn't already exist
     * @return the cached text when [useCache] is set and the cache exists, otherwise freshly extracted text
     */
    fun File.fileToText(useCache: Boolean): String = when {
        useCache && textCacheFile().exists() -> textCacheFile().readText()
        useCache -> scrapeAndCache()
        else -> scrapeText()
    }

    /** Extract text directly from the file, ignoring any cache; unknown extensions are read as plain text. */
    private fun File.scrapeText(): String = when (extension.lowercase()) {
        PDF -> PdfUtils.pdfText(this)
        DOC -> WordDocUtils.readDoc(this)
        DOCX -> WordDocUtils.readDocx(this)
        else -> readText() // kotlin.io File.readText
    }

    /** Extract text from the file, then (over)write its text cache file and metadata file. */
    private fun File.scrapeAndCache(): String = scrapeText().also {
        textCacheFile().writeText(it)
        extractMetadata()
    }

    /**
     * Extract metadata from a given file and save it adjacent to the file so it can be easily accessed later.
     *
     * Null values and blank strings are dropped. The metadata file (see [metadataFile]) is only written when at
     * least one property remains. Files other than PDF/DOC/DOCX (matched case-insensitively) yield no metadata.
     *
     * @return the filtered metadata properties, possibly empty
     */
    // NOTE(review): value type assumed from the null-filter below; confirm against PdfUtils/WordDocUtils signatures.
    fun File.extractMetadata(): Map<String, Any?> {
        val props = when (extension.lowercase()) {
            PDF -> PdfUtils.pdfMetadata(this)
            DOC -> WordDocUtils.readDocMetadata(this)
            DOCX -> WordDocUtils.readDocxMetadata(this)
            else -> emptyMap()
        }.filterValues { it != null && (it !is String || it.isNotBlank()) }
        if (props.isNotEmpty()) {
            ObjectMapper()
                .registerModule(JavaTimeModule())
                .writerWithDefaultPrettyPrinter()
                .writeValue(metadataFile(), props)
        }
        return props
    }

    //endregion
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy