com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdfocr-tesseract4 Show documentation
pdfOCR-Tesseract4 is an iText add-on for Java to recognize and extract text in scanned documents and images. It can also convert them into fully ISO-compliant PDF or PDF/A-3u files that are accessible, searchable, and suitable for archiving
The newest version!
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see .
 */
package com.itextpdf.pdfocr.tesseract4;

import com.itextpdf.commons.actions.confirmations.ConfirmEvent;
import com.itextpdf.commons.actions.confirmations.EventConfirmationType;
import com.itextpdf.commons.actions.contexts.IMetaInfo;
import com.itextpdf.commons.actions.data.ProductData;
import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.io.image.ImageType;
import com.itextpdf.pdfocr.AbstractPdfOcrEventHelper;
import com.itextpdf.pdfocr.IOcrEngine;
import com.itextpdf.pdfocr.IProductAware;
import com.itextpdf.pdfocr.OcrProcessContext;
import com.itextpdf.pdfocr.PdfOcrMetaInfoContainer;
import com.itextpdf.pdfocr.TextInfo;
import com.itextpdf.pdfocr.statistics.PdfOcrOutputType;
import com.itextpdf.pdfocr.statistics.PdfOcrOutputTypeStatisticsEvent;
import com.itextpdf.pdfocr.tesseract4.actions.data.PdfOcrTesseract4ProductData;
import com.itextpdf.pdfocr.tesseract4.actions.events.PdfOcrTesseract4ProductEvent;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4ExceptionMessageConstant;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrInputTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.logs.Tesseract4LogMessageConstant;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import org.slf4j.LoggerFactory;

/**
 * The implementation of {@link IOcrEngine}.
 *
 * 
 * This class provides possibilities to perform OCR, to read data from input
 * files and to return contained text in the required format.
 * Also, there are possibilities to use features of "tesseract"
 * (optical character recognition engine for various operating systems).
 */
public abstract class AbstractTesseract4OcrEngine implements IOcrEngine, IProductAware {

    /**
     * Supported image formats.
     */
    private static final Set SUPPORTED_IMAGE_FORMATS =
            Collections.unmodifiableSet(new HashSet<>(
                    Arrays.asList(ImageType.BMP, ImageType.PNG,
                            ImageType.TIFF, ImageType.JPEG)));

    Set processedUUID = new HashSet<>();

    /**
     * Set of properties.
     */
    private Tesseract4OcrEngineProperties tesseract4OcrEngineProperties;

    private ThreadLocal threadLocalMetaInfo = new ThreadLocal<>();

    /**
     * Creates a new {@link Tesseract4OcrEngineProperties} instance
     * based on another {@link Tesseract4OcrEngineProperties} instance (copy
     * constructor).
     *
     * @param tesseract4OcrEngineProperties the other {@link Tesseract4OcrEngineProperties} instance
     */
    public AbstractTesseract4OcrEngine(
            Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) {
        this.tesseract4OcrEngineProperties = tesseract4OcrEngineProperties;
    }

    /**
     * Performs tesseract OCR for the first (or for the only) image page.
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFile output file for the result for the first page
     * @param outputFormat selected {@link OutputFormat} for tesseract
     */
    public void doTesseractOcr(File inputImage, File outputFile,
            OutputFormat outputFormat) {
        doTesseractOcr(inputImage, outputFile, outputFormat, new OcrProcessContext(new Tesseract4EventHelper()));
    }

    /**
     * Performs tesseract OCR for the first (or for the only) image page.
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFile output file for the result for the first page
     * @param outputFormat selected {@link OutputFormat} for tesseract
     * @param ocrProcessContext ocr process context
     */
    public void doTesseractOcr(File inputImage, File outputFile,
            OutputFormat outputFormat,
            OcrProcessContext ocrProcessContext) {
        doTesseractOcr(inputImage, Collections.singletonList(outputFile),
                outputFormat, 1, ocrProcessContext.getOcrEventHelper());
    }

    /**
     * Performs OCR using provided {@link IOcrEngine} for the given list of
     * input images and saves output to a text file using provided path.
     *
     * @param inputImages {@link java.util.List} of images to be OCRed
     * @param txtFile file to be created
     */
    public void createTxtFile(final List inputImages, final File txtFile) {
        createTxtFile(inputImages, txtFile, new OcrProcessContext(new Tesseract4EventHelper()));
    }

    /**
     * Performs OCR using provided {@link IOcrEngine} for the given list of
     * input images and saves output to a text file using provided path.
     *
     * @param inputImages {@link java.util.List} of images to be OCRed
     * @param txtFile file to be created
     * @param ocrProcessContext ocr process context
     */
    public void createTxtFile(final List inputImages, final File txtFile,
            final OcrProcessContext ocrProcessContext) {
        LoggerFactory.getLogger(getClass())
                .info(MessageFormatUtil.format(
                        Tesseract4LogMessageConstant.START_OCR_FOR_IMAGES,
                        inputImages.size()));

        AbstractPdfOcrEventHelper storedEventHelper;
        if (ocrProcessContext.getOcrEventHelper() == null) {
            storedEventHelper = new Tesseract4EventHelper();
        } else {
            storedEventHelper = ocrProcessContext.getOcrEventHelper();
        }

        PdfOcrTesseract4ProductEvent event = PdfOcrTesseract4ProductEvent.createProcessImageEvent(
                storedEventHelper.getSequenceId(), null, storedEventHelper.getConfirmationType());
        storedEventHelper.onEvent(event);

        try {
            // set Tesseract4FileResultEventHelper
            ocrProcessContext.setOcrEventHelper(new Tesseract4FileResultEventHelper(storedEventHelper));

            StringBuilder content = new StringBuilder();
            for (File inputImage : inputImages) {
                content.append(doImageOcr(inputImage, OutputFormat.TXT, ocrProcessContext));
            }

            // write to file
            TesseractHelper.writeToTextFile(txtFile.getAbsolutePath(),
                    content.toString());

            if (event.getConfirmationType() == EventConfirmationType.ON_DEMAND) {
                storedEventHelper.onEvent(new ConfirmEvent(event));
            }
        } finally {
            ocrProcessContext.setOcrEventHelper(storedEventHelper);
        }
    }

    /**
     * Gets properties for {@link AbstractTesseract4OcrEngine}.
     *
     * @return set properties {@link Tesseract4OcrEngineProperties}
     */
    public final Tesseract4OcrEngineProperties getTesseract4OcrEngineProperties() {
        return tesseract4OcrEngineProperties;
    }

    /**
     * Sets properties for {@link AbstractTesseract4OcrEngine}.
     *
     * @param tesseract4OcrEngineProperties set of properties
     * {@link Tesseract4OcrEngineProperties} for {@link AbstractTesseract4OcrEngine}
     */
    public final void setTesseract4OcrEngineProperties(
            final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) {
        this.tesseract4OcrEngineProperties = tesseract4OcrEngineProperties;
    }

    /**
     * Gets list of languages concatenated with "+" symbol to a string
     * in format required by tesseract.
     * @return {@link java.lang.String} of concatenated languages
     */
    public final String getLanguagesAsString() {
        if (getTesseract4OcrEngineProperties().getLanguages().size() > 0) {
            return String.join("+",
                    getTesseract4OcrEngineProperties().getLanguages());
        } else {
            return getTesseract4OcrEngineProperties().getDefaultLanguage();
        }
    }

    /**
     * Reads data from the provided input image file and returns retrieved
     * data in the format described below.
     *
     * @param input input image {@link java.io.File}
     * @return {@link java.util.Map} where key is {@link java.lang.Integer}
     * representing the number of the page and value is
     * {@link java.util.List} of {@link TextInfo} elements where each
     * {@link TextInfo} element contains a word or a line and its 4
     * coordinates(bbox)
     */
    public final Map> doImageOcr(
            final File input) {
        verifyImageFormatValidity(input);
        return ((TextInfoTesseractOcrResult)processInputFiles(input,
                OutputFormat.HOCR, new Tesseract4EventHelper())).getTextInfos();
    }

    /**
     * Reads data from the provided input image file and returns retrieved
     * data in the format described below.
     *
     * @param input input image {@link java.io.File}
     * @param ocrProcessContext ocr process context
     * @return {@link java.util.Map} where key is {@link java.lang.Integer}
     * representing the number of the page and value is
     * {@link java.util.List} of {@link TextInfo} elements where each
     * {@link TextInfo} element contains a word or a line and its 4
     * coordinates(bbox)
     */
    public final Map> doImageOcr(
            final File input,
            OcrProcessContext ocrProcessContext) {
        verifyImageFormatValidity(input);
        return ((TextInfoTesseractOcrResult)processInputFiles(input,
                OutputFormat.HOCR, ocrProcessContext.getOcrEventHelper())).getTextInfos();
    }

    /**
     * Reads data from the provided input image file and returns retrieved
     * data as string.
     *
     * @param input input image {@link java.io.File}
     * @param outputFormat return {@link OutputFormat} result
     * @param ocrProcessContext ocr process context
     * @return OCR result as a {@link java.lang.String} that is
     * returned after processing the given image
     */
    public final String doImageOcr(final File input,
            final OutputFormat outputFormat,
            final OcrProcessContext ocrProcessContext) {
        String result = "";
        verifyImageFormatValidity(input);
        ITesseractOcrResult processedData = processInputFiles(input, outputFormat,
                ocrProcessContext.getOcrEventHelper());
        if (processedData != null) {
            if (outputFormat.equals(OutputFormat.TXT)) {
                result = ((StringTesseractOcrResult)processedData).getData();
            } else {
                StringBuilder outputText = new StringBuilder();
                Map> outputMap =
                        ((TextInfoTesseractOcrResult)processedData).getTextInfos();
                for (int page : outputMap.keySet()) {
                    StringBuilder pageText = new StringBuilder();
                    for (TextInfo textInfo : outputMap.get(page)) {
                        pageText.append(textInfo.getText());
                        pageText.append(System.lineSeparator());
                    }
                    outputText.append(pageText);
                    outputText.append(System.lineSeparator());
                }
                result = outputText.toString();
            }
        }
        return result;
    }

    /**
     * Reads data from the provided input image file and returns retrieved
     * data as string.
     *
     * @param input        input image {@link java.io.File}
     * @param outputFormat return {@link OutputFormat} result
     *
     * @return OCR result as a {@link java.lang.String} that is
     * returned after processing the given image
     */
    public final String doImageOcr(final File input,
            final OutputFormat outputFormat) {
        return doImageOcr(input, outputFormat, new OcrProcessContext(
                new Tesseract4EventHelper()));
    }

    /**
     * Checks current os type.
     *
     * @return boolean true is current os is windows, otherwise - false
     */
    public boolean isWindows() {
        return identifyOsType().toLowerCase().contains("win");
    }

    /**
     * Identifies type of current OS and return it (win, linux).
     *
     * @return type of current os as {@link java.lang.String}
     */
    public String identifyOsType() {
        String os = System.getProperty("os.name") == null
                ? System.getProperty("OS") : System.getProperty("os.name");
        return os.toLowerCase();
    }

    /**
     * Validates list of provided languages and
     * checks if they all exist in given tess data directory.
     *
     * @param languagesList {@link java.util.List} of provided languages
     * @throws PdfOcrTesseract4Exception if tess data wasn't found for one of the
     * languages from the provided list
     */
    public void validateLanguages(final List languagesList)
            throws PdfOcrTesseract4Exception {
        String suffix = ".traineddata";
        if (languagesList.size() == 0) {
            if (!new File(getTessData()
                    + java.io.File.separatorChar
                    + getTesseract4OcrEngineProperties().getDefaultLanguage()
                    + suffix)
                    .exists()) {
                throw new PdfOcrInputTesseract4Exception(
                        PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE)
                        .setMessageParams(
                                getTesseract4OcrEngineProperties()
                                        .getDefaultLanguage()
                                        + suffix,
                                getTessData());
            }
        } else {
            for (String lang : languagesList) {
                if (!new File(getTessData()
                        + java.io.File.separatorChar + lang + suffix)
                        .exists()) {
                    throw new PdfOcrInputTesseract4Exception(
                            PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_LANGUAGE)
                            .setMessageParams(lang + suffix, getTessData());
                }
            }
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public PdfOcrMetaInfoContainer getMetaInfoContainer() {
        return new PdfOcrMetaInfoContainer(new Tesseract4MetaInfo());
    }

    @Override
    public ProductData getProductData() {
        return PdfOcrTesseract4ProductData.getInstance();
    }

    @Override
    public boolean isTaggingSupported() {
        return false;
    }

    /**
     * Performs tesseract OCR using command line tool
     * or a wrapper for Tesseract OCR API.
     *
     * Please note that list of output files is accepted instead of a single file because
     * page number parameter is not respected in case of TIFF images not requiring preprocessing.
     * In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties}
     * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
     * is expected to be same as number of pages in the image, otherwise, only one file is expected
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFiles {@link java.util.List} of output files
     *                                          (one per each page)
     * @param outputFormat selected {@link OutputFormat} for tesseract
     * @param pageNumber number of page to be processed
     */
    void doTesseractOcr(File inputImage,
            List outputFiles, OutputFormat outputFormat,
            int pageNumber, AbstractPdfOcrEventHelper eventHelper) {
        doTesseractOcr(inputImage, outputFiles, outputFormat, pageNumber, true, eventHelper);
    }

    /**
     * Performs tesseract OCR using command line tool
     * or a wrapper for Tesseract OCR API.
     *
     * Please note that list of output files is accepted instead of a single file because
     * page number parameter is not respected in case of TIFF images not requiring preprocessing.
     * In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties}
     * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
     * is expected to be same as number of pages in the image, otherwise, only one file is expected
     *
     * @param inputImage input image {@link java.io.File}
     * @param outputFiles {@link java.util.List} of output files
     *                                          (one per each page)
     * @param outputFormat selected {@link OutputFormat} for tesseract
     * @param pageNumber number of page to be processed
     * @param dispatchEvent indicates if event needs to be dispatched
     * @param eventHelper event helper
     */
    abstract void doTesseractOcr(File inputImage,
            List outputFiles, OutputFormat outputFormat,
            int pageNumber, boolean dispatchEvent,
            AbstractPdfOcrEventHelper eventHelper);

    /**
     * Gets path to provided tess data directory.
     *
     * @return path to provided tess data directory as
     * {@link java.lang.String}
     */
    String getTessData() {
        if (getTesseract4OcrEngineProperties().getPathToTessData() == null) {
            throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant
                    .PATH_TO_TESS_DATA_IS_NOT_SET);
        } else {
            return getTesseract4OcrEngineProperties().getPathToTessData()
                    .getAbsolutePath();
        }
    }

    PdfOcrTesseract4ProductEvent onEvent(final AbstractPdfOcrEventHelper eventHelper) {
        // usage event
        PdfOcrTesseract4ProductEvent event =
                PdfOcrTesseract4ProductEvent.createProcessImageEvent(eventHelper.getSequenceId(),
                        null, eventHelper.getConfirmationType());
        eventHelper.onEvent(event);
        return event;
    }

    void onEventStatistics(final AbstractPdfOcrEventHelper eventHelper) {
        eventHelper.onEvent(new PdfOcrOutputTypeStatisticsEvent(PdfOcrOutputType.DATA,
                PdfOcrTesseract4ProductData.getInstance()));
    }

    /**
     * Reads data from the provided input image file.
     *
     * @param input input image {@link File}
     * @param outputFormat {@link OutputFormat} for the result returned
     *                                         by {@link IOcrEngine}
     * @param eventHelper event helper
     * @return {@link ITesseractOcrResult} instance, either {@link StringTesseractOcrResult}
     *     if output format is TXT, or {@link TextInfoTesseractOcrResult} if the output format is HOCR
     */
    private ITesseractOcrResult processInputFiles(
            final File input, final OutputFormat outputFormat,
            final AbstractPdfOcrEventHelper eventHelper) {
        Map> imageData =
                new LinkedHashMap>();
        StringBuilder data = new StringBuilder();
        List tempFiles = new ArrayList();
        ITesseractOcrResult result = null;
        try {
            // image needs to be paginated only if it's tiff
            // or preprocessing isn't required
            int realNumOfPages = !ImagePreprocessingUtil.isTiffImage(input)
                    ? 1 : ImagePreprocessingUtil.getNumberOfPageTiff(input);
            int numOfPages =
                    getTesseract4OcrEngineProperties().isPreprocessingImages()
                            ? realNumOfPages : 1;
            int numOfFiles =
                    getTesseract4OcrEngineProperties().isPreprocessingImages()
                            ? 1 : realNumOfPages;

            for (int page = 1; page <= numOfPages; page++) {
                String extension = outputFormat.equals(OutputFormat.HOCR)
                        ? ".hocr" : ".txt";
                for (int i = 0; i < numOfFiles; i++) {
                    tempFiles.add(createTempFile(extension));
                }

                doTesseractOcr(input, tempFiles, outputFormat, page, true, eventHelper);
                if (outputFormat.equals(OutputFormat.HOCR)) {
                    List tempTxtFiles = null;
                    if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) {
                        tempTxtFiles = new ArrayList<>();
                        for (int i = 0; i < numOfFiles; i++) {
                            tempTxtFiles.add(createTempFile(".txt"));
                        }
                        doTesseractOcr(input, tempTxtFiles, OutputFormat.TXT, page, false, eventHelper);
                    }
                    Map> pageData = TesseractHelper
                            .parseHocrFile(tempFiles, tempTxtFiles,
                                    getTesseract4OcrEngineProperties());

                    if (getTesseract4OcrEngineProperties()
                            .isPreprocessingImages()) {
                        imageData.put(page, pageData.get(1));
                    } else {
                        imageData = pageData;
                    }
                    result = new TextInfoTesseractOcrResult(imageData);
                } else {
                    for (File tmpFile : tempFiles) {
                        if (Files.exists(
                                java.nio.file.Paths
                                        .get(tmpFile.getAbsolutePath()))) {
                            data.append(TesseractHelper.readTxtFile(tmpFile));
                        }
                    }
                    result = new StringTesseractOcrResult(data.toString());
                }
            }
        } catch (IOException e) {
            LoggerFactory.getLogger(getClass())
                    .error(MessageFormatUtil.format(
                            Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE,
                            e.getMessage()));
        } finally {
            for (File file : tempFiles) {
                TesseractHelper.deleteFile(file.getAbsolutePath());
            }
        }
        return result;
    }

    /**
     * Creates a temporary file with given extension.
     *
     * @param extension file extension for a new file {@link java.lang.String}
     * @return a new created {@link java.io.File} instance
     */
    private File createTempFile(final String extension) {
        String tmpFileName = TesseractOcrUtil.getTempFilePath(
                UUID.randomUUID().toString(), extension);
        return new File(tmpFileName);
    }

    /**
     * Validates input image format.
     * Allowed image formats are listed
     * in {@link AbstractTesseract4OcrEngine#SUPPORTED_IMAGE_FORMATS}
     *
     * @param image input image {@link java.io.File}
     * @throws PdfOcrTesseract4Exception if image format is invalid
     */
    private void verifyImageFormatValidity(final File image)
            throws PdfOcrTesseract4Exception {
        ImageType type = ImagePreprocessingUtil.getImageType(image);
        boolean isValid = SUPPORTED_IMAGE_FORMATS.contains(type);
        if (!isValid) {
            LoggerFactory.getLogger(getClass()).error(MessageFormatUtil
                    .format(Tesseract4LogMessageConstant
                                    .CANNOT_READ_INPUT_IMAGE,
                            image.getAbsolutePath()));
            throw new PdfOcrInputTesseract4Exception(
                    PdfOcrTesseract4ExceptionMessageConstant.INCORRECT_INPUT_IMAGE_FORMAT)
                    .setMessageParams(image.getName());
        }
    }

    interface ITesseractOcrResult {
    }

    static class StringTesseractOcrResult implements ITesseractOcrResult {
        private String data;

        StringTesseractOcrResult(String data) {
            this.data = data;
        }

        String getData() {
            return data;
        }
    }

    static class TextInfoTesseractOcrResult implements ITesseractOcrResult {
        private Map> textInfos;

        TextInfoTesseractOcrResult(Map> textInfos) {
            this.textInfos = textInfos;
        }

        Map> getTextInfos() {
            return this.textInfos;
        }
    }
}