org.hildan.ocr.reference.ReferenceImages.kt Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of simple-ocr Show documentation
A simple OCR that recognizes characters in an image given a set of base character images
The newest version!
package org.hildan.ocr.reference

import org.hildan.ocr.TextDetector
import org.hildan.ocr.readImage
import java.awt.image.BufferedImage
import java.net.URLDecoder
import java.net.URLEncoder
import java.nio.file.Path
import java.util.*
import javax.imageio.ImageIO
import kotlin.io.path.createDirectories
import kotlin.io.path.listDirectoryEntries
import kotlin.io.path.nameWithoutExtension
import kotlin.streams.toList

/** Matches one or more consecutive whitespace characters. */
private val whiteSpaceRegex = "\\s+".toRegex()

/**
 * A reference image to recognize text.
 *
 * @property image the image data
 * @property text the text value corresponding to the image
 */
data class ReferenceImage(
    val image: BufferedImage,
    val text: String,
)

/**
 * Entry point for loading [ReferenceImage]s from the file system.
 */
object ReferenceImages {

    /**
     * Reads [ReferenceImage]s from the given [directory], associating them with text based on their names.
     *
     * If it contains no spaces, the name without extension of the image file is used as text for the image.
     * Otherwise, only the part of the name up to the first space is used as text.
     *
     * This way, more info can be added to the name after a space, which can be useful to map several images to the
     * same text, or have images for lowercase and uppercase letters on case-insensitive file systems (for example,
     * `"a.png"` and `"A upper.png"`).
     *
     * The retained part of the name is then unescaped: the words `dot`, `dash`, `slash` and `backslash` stand for the
     * corresponding characters, and anything else is URL-decoded.
     *
     * A [glob] pattern can be used to filter the files from the given [directory].
     *
     * @param directory the directory whose entries are listed
     * @param glob the pattern that entries of [directory] must match to be read
     * @return one [ReferenceImage] per matching entry of [directory]
     */
    fun readFrom(directory: Path, glob: String = "*.png"): List<ReferenceImage> =
        directory.listDirectoryEntries(glob).map { imagePath ->
            ReferenceImage(image = imagePath.readImage(), text = inferTextFromPath(imagePath))
        }
}

// Only the part of the file name (without extension) located before the first space is used as text. Anything after
// that space is ignored, which allows disambiguating files on case-insensitive file systems.
private fun inferTextFromPath(it: Path): String {
    val escapedText = it.nameWithoutExtension.substringBefore(' ')
    return escapedText.unescapeFilenameToChar()
}

/**
 * Splits the given [sampleImage] into sub-images of text elements, and saves them as files into the given [outputDir].
 * Each file is named based on the result of [subImageFilenameWithoutExt], which is called for each sub-image.
 *
 * Sub-images are often individual characters, but sometimes several characters can be grouped together due to kerning.
 * For instance, a lowercase letter following an uppercase T or V can be part of a single sub-image (Te, To, Va...).
 *
 * @param sampleImage the image to split into sub-images
 * @param outputDir the directory in which the PNG files are written; it is created if necessary
 * @param subImageFilenameWithoutExt computes the file name (without the `.png` extension) of a sub-image from its
 * index and content; defaults to a random UUID
 * @return the paths of the written files, one per sub-image
 * @throws java.io.IOException if a sub-image cannot be written, including when no PNG writer accepts it
 */
fun TextDetector.splitAndSaveSubImages(
    sampleImage: BufferedImage,
    outputDir: Path,
    subImageFilenameWithoutExt: (index: Int, subImg: BufferedImage) -> String = { _, _ -> UUID.randomUUID().toString() },
): List<Path> {
    outputDir.createDirectories()
    return splitTextElements(sampleImage).mapIndexed { index, subImg ->
        val target = outputDir.resolve(subImageFilenameWithoutExt(index, subImg) + ".png")
        // ImageIO.write reports "no appropriate writer" by returning false instead of throwing, in which case no file
        // is produced: fail loudly rather than return the path of a file that does not exist
        if (!ImageIO.write(subImg, "png", target.toFile())) {
            throw java.io.IOException("No PNG writer could encode sub-image #$index, nothing was written to $target")
        }
        target
    }
}

/**
 * Splits the given [sampleImage] into sub-images of text elements, and saves them as files into the given [imageStore].
 * The image store reuses images and doesn't write duplicates.
 *
 * Sub-images are often individual characters, but sometimes several characters can be grouped together due to kerning.
 * For instance, a lowercase letter following an uppercase T or V can be part of a single sub-image (Te, To, Va...).
 *
 * @param sampleImage the image to split into sub-images
 * @param imageStore the store that each sub-image is handed to
 * @return the path given by [imageStore] for each sub-image, in the order of the sub-images
 */
fun TextDetector.splitAndSaveSubImages(
    sampleImage: BufferedImage,
    imageStore: UniqueImageStore,
): List<Path> = splitTextElements(sampleImage).map { imageStore.saveOrGetPath(it) }

/**
 * Splits every image of [sampleImagesDir] matching [sampleImagesGlob] into sub-images of text elements, and saves
 * those sub-images in the given [outputDir], with no exact duplicates.
 *
 * A sub-image is usually a single character. However, kerning can glue several characters together: a lowercase
 * letter that follows an uppercase T or V may end up in the same sub-image (Te, To, Va...).
 *
 * The saved sub-images are meant to be manually renamed according to their text content, so that the OCR can use
 * them as reference images. They can then be loaded using [ReferenceImages.readFrom].
 */
fun TextDetector.splitAndSaveSubImages(sampleImagesDir: Path, outputDir: Path, sampleImagesGlob: String = "*") {
    val store = UniqueImageStore(outputDir)
    // all sample images are read before the first one is split
    val sampleImages = sampleImagesDir.listDirectoryEntries(sampleImagesGlob).map { it.readImage() }
    for (sampleImage in sampleImages) {
        splitAndSaveSubImages(sampleImage, store)
    }
}

/**
 * Splits the given [sampleImage] into sub-images of text elements, and saves them as files into the given [outputDir].
 * Each file is named based on the characters (more specifically, the unicode code points) in [sampleText].
 * Characters that are not valid as file names are escaped.
 *
 * Whitespace in [sampleText] is ignored: the n-th sub-image is named after the n-th non-whitespace code point.
 *
 * ## Important note
 *
 * Sub-images are often individual characters, but sometimes several characters can be grouped together due to kerning.
 * For instance, a lowercase letter following an uppercase T or V can be part of a single sub-image (Te, To, Va...).
 *
 * If the kerning of your font causes this kind of grouping, this method will not properly map images to characters.
 * In that case, please prefer [splitAndSaveSubImages].
 *
 * @param sampleImage the image to split into sub-images
 * @param sampleText the text that [sampleImage] represents
 * @param outputDir the directory in which the sub-image files are written
 * @return the paths of the written files, one per sub-image
 * @throws IndexOutOfBoundsException if [sampleImage] is split into more sub-images than there are non-whitespace
 * code points in [sampleText]
 */
fun TextDetector.splitAndSaveCharacterImages(
    sampleImage: BufferedImage,
    sampleText: String,
    outputDir: Path,
): List<Path> {
    val codePoints = sampleText.replace(whiteSpaceRegex, "").splitCodePoints()
    return splitAndSaveSubImages(sampleImage, outputDir) { index, _ ->
        // same exception type as a plain index access, but with a message explaining the mismatch
        val character = codePoints.getOrElse(index) {
            throw IndexOutOfBoundsException(
                "Sub-image #$index has no matching character: sampleText only contains ${codePoints.size} " +
                    "non-whitespace code points",
            )
        }
        character.escapeCharForFilename()
    }
}

// Splits this string into one string per Unicode code point, so that a surrogate pair stays in a single element.
private fun String.splitCodePoints(): List<String> =
    codePoints().toArray().map { codePoint -> String(Character.toChars(codePoint)) }

/**
 * Converts this single-character string into text that can be used as a file name.
 *
 * `.`, `-`, `/` and `\` are replaced by the words `dot`, `dash`, `slash` and `backslash`, `*` is percent-encoded, and
 * any other value is URL-encoded as UTF-8.
 */
// extending string (rather than Char) to support code points above the BMP
private fun String.escapeCharForFilename(): String = when (this) {
    "." -> "dot"
    "-" -> "dash"
    "/" -> "slash"
    "\\" -> "backslash"
    // URLEncoder leaves '*' as is, but it is not allowed in Windows file names; its percent-encoded form is turned
    // back into '*' by URLDecoder
    "*" -> "%2A"
    else -> URLEncoder.encode(this, Charsets.UTF_8)
}

/**
 * Converts this escaped file name text back into the text it stands for.
 *
 * The words `dot`, `dash`, `slash` and `backslash` are replaced by the corresponding character, and any other value
 * is URL-decoded as UTF-8.
 */
// returning string (rather than Char) to support code points above the BMP
private fun String.unescapeFilenameToChar(): String {
    val aliasedChar = when (this) {
        "dot" -> "."
        "dash" -> "-"
        "slash" -> "/"
        "backslash" -> "\\"
        else -> null
    }
    return aliasedChar ?: URLDecoder.decode(this, Charsets.UTF_8)
}