com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdfocr-tesseract4 Show documentation
pdfOCR-Tesseract4 is an iText add-on for Java to recognize and extract text in scanned documents and images. It can also convert them into fully ISO-compliant PDF or PDF/A-3u files that are accessible, searchable, and suitable for archiving
The newest version!
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see .
 */
package com.itextpdf.pdfocr.tesseract4;

import com.itextpdf.commons.actions.confirmations.ConfirmEvent;
import com.itextpdf.commons.actions.confirmations.EventConfirmationType;
import com.itextpdf.commons.utils.MessageFormatUtil;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.itextpdf.pdfocr.AbstractPdfOcrEventHelper;
import com.itextpdf.pdfocr.tesseract4.actions.events.PdfOcrTesseract4ProductEvent;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4ExceptionMessageConstant;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrInputTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.logs.Tesseract4LogMessageConstant;

import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.TesseractException;
import org.slf4j.LoggerFactory;

/**
 * The implementation of {@link AbstractTesseract4OcrEngine} for tesseract OCR.
 *
 * 
 * This class provides possibilities to use features of "tesseract"
 * using tess4j.
 *
 * 
 * Please note that this class is not thread-safe, in other words this Tesseract engine cannot
 * be used for multithreaded processing. You should create one instance per thread
 */
public class Tesseract4LibOcrEngine extends AbstractTesseract4OcrEngine {

    /**
     * {@link net.sourceforge.tess4j.ITesseract} Instance.
     * (depends on OS type)
     */
    private ITesseract tesseractInstance = null;

    /**
     * Pattern for matching ASCII string.
     */
    private static final Pattern ASCII_STRING_PATTERN = Pattern.compile("^[\\u0000-\\u007F]*$");

    /**
     * Creates a new {@link Tesseract4LibOcrEngine} instance.
     *
     * @param tesseract4OcrEngineProperties set of properteis
     */
    public Tesseract4LibOcrEngine(
            final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) {
        super(tesseract4OcrEngineProperties);
        tesseractInstance = TesseractOcrUtil
                .initializeTesseractInstance(isWindows(), null,
                        null, null);
    }

    /**
     * Gets tesseract instance.
     *
     * @return initialized {@link net.sourceforge.tess4j.ITesseract} instance
     */
    public ITesseract getTesseractInstance() {
        return tesseractInstance;
    }

    /**
     * Initializes instance of tesseract if it haven't been already
     * initialized or it have been disposed and sets all the required
     * properties.
     *
     * @param outputFormat selected {@link OutputFormat} for tesseract
     */
    public void initializeTesseract(final OutputFormat outputFormat) {
        if (getTesseractInstance() == null
                || TesseractOcrUtil
                .isTesseractInstanceDisposed(getTesseractInstance())) {
            tesseractInstance = TesseractOcrUtil
                    .initializeTesseractInstance(isWindows(), getTessData(),
                            getLanguagesAsString(),
                            getTesseract4OcrEngineProperties()
                                    .getPathToUserWordsFile());
        }
        getTesseractInstance()
                .setTessVariable("tessedit_create_hocr",
                        outputFormat.equals(OutputFormat.HOCR) ? "1" : "0");

        if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
            getTesseractInstance().setTessVariable("preserve_interword_spaces", "1");
        }

        getTesseractInstance().setTessVariable("user_defined_dpi", "300");
        if (getTesseract4OcrEngineProperties()
                .getPathToUserWordsFile() != null) {
            getTesseractInstance()
                    .setTessVariable("load_system_dawg", "0");
            getTesseractInstance()
                    .setTessVariable("load_freq_dawg", "0");
            getTesseractInstance()
                    .setTessVariable("user_words_suffix",
                            getTesseract4OcrEngineProperties()
                                    .getDefaultUserWordsSuffix());
            getTesseractInstance()
                    .setTessVariable("user_words_file",
                            getTesseract4OcrEngineProperties()
                                    .getPathToUserWordsFile());
        }

        TesseractOcrUtil.setTesseractProperties(getTesseractInstance(),
                getTessData(), getLanguagesAsString(),
                getTesseract4OcrEngineProperties().getPageSegMode(),
                getTesseract4OcrEngineProperties().getPathToUserWordsFile());
    }

    /**
     * Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
     * of input image (by default 1st).
     *
     * Please note that list of output files is accepted instead of a single file because
     * page number parameter is not respected in case of TIFF images not requiring preprocessing.
     * In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties}
     * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
     * is expected to be same as number of pages in the image, otherwise, only one file is expected
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFiles {@link java.util.List} of output files
     *                                          (one per each page)
     * @param outputFormat selected {@link OutputFormat} for tesseract
     * @param pageNumber number of page to be processed
     * @param dispatchEvent indicates if event needs to be dispatched
     * @param eventHelper event helper
     */
    void doTesseractOcr(final File inputImage,
            final List outputFiles, final OutputFormat outputFormat,
            final int pageNumber, final boolean dispatchEvent, AbstractPdfOcrEventHelper eventHelper) {
        PdfOcrTesseract4ProductEvent event = null;
        if (eventHelper == null) {
            eventHelper = new Tesseract4EventHelper();
        }
        // usage event
        if (dispatchEvent) {
            event = onEvent(eventHelper);
        }
        try {
            // check tess data path for non ASCII characters
            validateTessDataPath(getTessData());
            validateLanguages(getTesseract4OcrEngineProperties()
                    .getLanguages());
            initializeTesseract(outputFormat);

            // if preprocessing is not needed and provided image is tiff,
            // the image will be paginated and separate pages will be OCRed
            List resultList = new ArrayList();
            if (!getTesseract4OcrEngineProperties().isPreprocessingImages()
                    && ImagePreprocessingUtil.isTiffImage(inputImage)) {
                resultList = getOcrResultForMultiPage(inputImage,
                        outputFormat);
            } else {
                resultList.add(getOcrResultForSinglePage(inputImage,
                        outputFormat, pageNumber));
            }

            // list of result strings is written to separate files
            // (one for each page)
            for (int i = 0; i < resultList.size(); i++) {
                String result = resultList.get(i);
                File outputFile = i >= outputFiles.size()
                        ? null : outputFiles.get(i);
                if (result != null && outputFile != null) {
                    try (Writer writer = new OutputStreamWriter(
                            new FileOutputStream(outputFile.getAbsolutePath()),
                            StandardCharsets.UTF_8)) {
                        writer.write(result);
                    } catch (IOException e) {
                        throw new PdfOcrInputTesseract4Exception(
                                PdfOcrTesseract4ExceptionMessageConstant.CANNOT_WRITE_TO_FILE, e);
                    }
                }
            }

            // statistics event
            onEventStatistics(eventHelper);

            // confirm on_demand event
            if (event != null && event.getConfirmationType() == EventConfirmationType.ON_DEMAND) {
                eventHelper.onEvent(new ConfirmEvent(event));
            }
        } catch (PdfOcrTesseract4Exception e) {
            LoggerFactory.getLogger(getClass())
                    .error(e.getMessage());
            throw new PdfOcrTesseract4Exception(e.getMessage(), e);
        } finally {
            if (tesseractInstance != null) {
                TesseractOcrUtil.disposeTesseractInstance(tesseractInstance);
            }
            if (getTesseract4OcrEngineProperties().getPathToUserWordsFile()
                    != null
                    && getTesseract4OcrEngineProperties().isUserWordsFileTemporary()) {
                TesseractHelper.deleteFile(
                        getTesseract4OcrEngineProperties()
                                .getPathToUserWordsFile());
            }
        }
    }

    /**
     * Validates Tess Data path,
     * checks if tess data path contains only ASCII charset.
     * Note: tesseract lib has issues with non ASCII characters in tess data path.
     *
     * @param tessDataPath {@link java.lang.String} path to tess data
     */
    private static void validateTessDataPath(final String tessDataPath) {
        Matcher asciiStringMatcher = ASCII_STRING_PATTERN.matcher(tessDataPath);

        if (!asciiStringMatcher.matches()) {
            throw new PdfOcrTesseract4Exception(
                    PdfOcrTesseract4ExceptionMessageConstant
                            .PATH_TO_TESS_DATA_DIRECTORY_CONTAINS_NON_ASCII_CHARACTERS);
        }
    }

    /**
     * Gets OCR result from provided multi-page image and returns result as
     * list of strings for each page. This method is used for tiff images
     * when preprocessing is not needed.
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFormat selected {@link OutputFormat} for tesseract
     * @return list of result string that will be written to a temporary files
     * later
     */
    private List getOcrResultForMultiPage(final File inputImage,
            final OutputFormat outputFormat) {
        List resultList = new ArrayList();
        try {
            initializeTesseract(outputFormat);
            TesseractOcrUtil util = new TesseractOcrUtil();
            util.initializeImagesListFromTiff(inputImage);
            int numOfPages = util.getListOfPages().size();
            for (int i = 0; i < numOfPages; i++) {
                String result = util.getOcrResultAsString(
                        getTesseractInstance(),
                        util.getListOfPages().get(i),
                        outputFormat);
                resultList.add(result);
            }
        } catch (TesseractException e) {
            String msg = MessageFormatUtil
                    .format(Tesseract4LogMessageConstant.TESSERACT_FAILED,
                            e.getMessage());
            LoggerFactory.getLogger(getClass())
                    .error(msg);
            throw new PdfOcrTesseract4Exception(
                    PdfOcrTesseract4ExceptionMessageConstant
                            .TESSERACT_FAILED);
        } finally {
            TesseractOcrUtil
                    .disposeTesseractInstance(getTesseractInstance());
        }
        return resultList;
    }

    /**
     * Gets OCR result from provided single page image and preprocesses it if
     * it is needed.
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFormat selected {@link OutputFormat} for tesseract
     * @param pageNumber number of page to be OCRed
     * @return result as string that will be written to a temporary file later
     */
    private String getOcrResultForSinglePage(final File inputImage,
            final OutputFormat outputFormat,
            final int pageNumber) {
        String result = null;
        try {
            // preprocess if required
            if (getTesseract4OcrEngineProperties().isPreprocessingImages()) {
                // preprocess and try to ocr
                result = new TesseractOcrUtil().getOcrResultAsString(
                        getTesseractInstance(),
                        ImagePreprocessingUtil
                                .preprocessImage(inputImage, pageNumber,
                                        getTesseract4OcrEngineProperties().getImagePreprocessingOptions()),
                        outputFormat);
            }
            if (result == null) {
                BufferedImage bufferedImage = ImagePreprocessingUtil
                        .readImage(inputImage);
                if (bufferedImage != null) {
                    try {
                        result = new TesseractOcrUtil()
                                .getOcrResultAsString(getTesseractInstance(),
                                        bufferedImage, outputFormat);
                    } catch (Exception e) {
                        LoggerFactory.getLogger(getClass())
                                .info(MessageFormatUtil.format(
                                        Tesseract4LogMessageConstant
                                                .CANNOT_PROCESS_IMAGE,
                                        e.getMessage()));
                    }
                }
                if (result == null) {
                    // perform ocr using original input image
                    result = new TesseractOcrUtil()
                            .getOcrResultAsString(getTesseractInstance(),
                                    inputImage, outputFormat);
                }
            }
        } catch (Exception e) {
            LoggerFactory.getLogger(getClass())
                    .error(MessageFormatUtil
                            .format(Tesseract4LogMessageConstant
                                            .TESSERACT_FAILED,
                                    e.getMessage()));
            throw new PdfOcrTesseract4Exception(
                    PdfOcrTesseract4ExceptionMessageConstant
                            .TESSERACT_FAILED);
        }

        return result;
    }
}