All downloads are free. Search and download functionality uses the official Maven repository.

tri.ai.embedding.EmbeddingDocument.kt Maven / Gradle / Ivy

/*-
 * #%L
 * tri.promptfx:promptkt
 * %%
 * Copyright (C) 2023 - 2024 Johns Hopkins University Applied Physics Laboratory
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package tri.ai.embedding

import com.fasterxml.jackson.annotation.JsonIgnore
import java.io.File

/**
 * A document, identified by the [path] of its raw text file, with a list of sections.
 *
 * @property path path of the raw text file; looked up relative to a root directory first,
 * then as given (see [rawTextUrl]).
 */
class EmbeddingDocument(val path: String) {
    /** The sections of the document. */
    val sections: MutableList<EmbeddingSection> = mutableListOf()

    /** The file-name portion of [path]. Ignored by Jackson. */
    @get:JsonIgnore
    val shortName: String
        get() = File(path).name

    /**
     * [shortName] with everything from its last `.` onward removed.
     * Returned unchanged when [shortName] contains no `.`. Ignored by Jackson.
     */
    @get:JsonIgnore
    val shortNameWithoutExtension: String
        get() = shortName.substringBeforeLast('.')

    /**
     * Locates the file with the raw text.
     *
     * @param rootDir directory against which [path] is resolved first
     * @return [path] resolved under [rootDir] if that file exists, otherwise [path] on its own
     * if that exists, otherwise `null`
     */
    fun rawTextUrl(rootDir: File): File? =
        // Order matters: the rootDir-relative candidate wins over the bare path.
        listOf(File(rootDir, path), File(path)).firstOrNull { it.exists() }

    /**
     * Locates the original file the raw text was taken from.
     *
     * Looks next to the raw text file for a file with the same base name and one of the
     * [SUPPORTED_EXTENSIONS], in list order.
     *
     * @param rootDir directory passed on to [rawTextUrl]
     * @return the first existing sibling candidate, the raw text file itself if none exists,
     * or `null` if the raw text file cannot be located
     */
    fun originalUrl(rootDir: File): File? {
        val rawFile = rawTextUrl(rootDir) ?: return null
        val baseName = rawFile.nameWithoutExtension
        // Lazy sequence: stop probing the file system at the first candidate that exists.
        return SUPPORTED_EXTENSIONS.asSequence()
            .map { File(rawFile.parentFile, "$baseName.$it") }
            .firstOrNull { it.exists() }
            ?: rawFile
    }

    /**
     * Reads the raw text of the document.
     *
     * @param rootDir directory passed on to [rawTextUrl]
     * @return the file content, or an "Unable to locate ..." message if the file cannot be found
     */
    fun readText(rootDir: File): String =
        rawTextUrl(rootDir)?.readText() ?: missingFileMessage(rootDir)

    /**
     * Reads the raw text of a single section.
     *
     * @param rootDir directory passed on to [rawTextUrl]
     * @param section section whose `start` (inclusive) and `end` (exclusive) offsets select the text
     * @return the selected slice of the document text, or the full "Unable to locate ..." message
     * if the file cannot be found
     * @throws IndexOutOfBoundsException if the section offsets fall outside the document text
     */
    fun readText(rootDir: File, section: EmbeddingSection): String {
        // Do not slice the error message with section offsets: return it whole instead.
        val text = rawTextUrl(rootDir)?.readText() ?: return missingFileMessage(rootDir)
        return text.substring(section.start, section.end)
    }

    /** Message returned in place of document text when the raw text file cannot be found. */
    private fun missingFileMessage(rootDir: File) = "Unable to locate $path in $rootDir"

    companion object {
        /** Extensions supported by the embedding index, either raw text or with available scrapers. */
        val SUPPORTED_EXTENSIONS = listOf("pdf", "doc", "docx", "txt")
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy