All downloads are free. Search and download functionality uses the official Maven repository.

com.itextpdf.pdfocr.tesseract4.Tesseract4ExecutableOcrEngine Maven / Gradle / Ivy

Go to download

pdfOCR-Tesseract4 is an iText add-on for Java to recognize and extract text in scanned documents and images. It can also convert them into fully ISO-compliant PDF or PDF/A-3u files that are accessible, searchable, and suitable for archiving

The newest version!
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.pdfocr.tesseract4;

import com.itextpdf.commons.actions.confirmations.ConfirmEvent;
import com.itextpdf.commons.actions.confirmations.EventConfirmationType;
import com.itextpdf.commons.utils.MessageFormatUtil;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.UUID;

import com.itextpdf.pdfocr.AbstractPdfOcrEventHelper;
import com.itextpdf.pdfocr.tesseract4.actions.events.PdfOcrTesseract4ProductEvent;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4ExceptionMessageConstant;
import com.itextpdf.pdfocr.tesseract4.logs.Tesseract4LogMessageConstant;

import net.sourceforge.lept4j.Pix;
import org.slf4j.LoggerFactory;

/**
 * The implementation of {@link AbstractTesseract4OcrEngine} for tesseract OCR.
 *
 * 

* This class provides possibilities to use features of "tesseract" CL tool * (optical character recognition engine for various operating systems). * Please note that it's assumed that "tesseract" has already been * installed locally. */ public class Tesseract4ExecutableOcrEngine extends AbstractTesseract4OcrEngine { /** * Path to the tesseract executable. * By default it's assumed that "tesseract" already exists in the "PATH". */ private String pathToExecutable; /** * Creates a new {@link Tesseract4ExecutableOcrEngine} instance. * * @param tesseract4OcrEngineProperties set of properties */ public Tesseract4ExecutableOcrEngine( final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { super(tesseract4OcrEngineProperties); setPathToExecutable("tesseract"); } /** * Creates a new {@link Tesseract4ExecutableOcrEngine} instance. * * @param executablePath path to tesseract executable * @param tesseract4OcrEngineProperties set of properties */ public Tesseract4ExecutableOcrEngine(final String executablePath, final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { super(tesseract4OcrEngineProperties); setPathToExecutable(executablePath); } /** * Gets path to tesseract executable. * * @return path to tesseract executable */ public final String getPathToExecutable() { return pathToExecutable; } /** * Sets path to tesseract executable. * By default it's assumed that "tesseract" already exists in the "PATH". * * @param path path to tesseract executable */ public final void setPathToExecutable(final String path) { pathToExecutable = path; } /** * Performs tesseract OCR using command line tool for the selected page * of input image (by default 1st). * * Please note that list of output files is accepted instead of a single file because * page number parameter is not respected in case of TIFF images not requiring preprocessing. 
* In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties} * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list * is expected to be same as number of pages in the image, otherwise, only one file is expected * * @param inputImage input image {@link java.io.File} * @param outputFiles {@link java.util.List} of output files * (one per each page) * @param outputFormat selected {@link OutputFormat} for tesseract * @param pageNumber number of page to be processed * @param dispatchEvent indicates if event needs to be dispatched * @param eventHelper event helper */ void doTesseractOcr(final File inputImage, final List outputFiles, final OutputFormat outputFormat, final int pageNumber, final boolean dispatchEvent, AbstractPdfOcrEventHelper eventHelper) { List params = new ArrayList(); String execPath = null; String imagePath = null; String workingDirectory = null; PdfOcrTesseract4ProductEvent event = null; if (eventHelper == null) { eventHelper = new Tesseract4EventHelper(); } if (dispatchEvent) { event = onEvent(eventHelper); } try { imagePath = inputImage.getAbsolutePath(); // path to tesseract executable if (getPathToExecutable() == null || getPathToExecutable().isEmpty()) { throw new PdfOcrTesseract4Exception( PdfOcrTesseract4ExceptionMessageConstant .CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE); } else { if (isWindows()) { execPath = addQuotes(getPathToExecutable()); } else { execPath = getPathToExecutable(); } } checkTesseractInstalled(execPath); // path to tess data addTessData(params); // validate languages before preprocessing started validateLanguages(getTesseract4OcrEngineProperties() .getLanguages()); // preprocess input file if needed imagePath = preprocessImage(inputImage, pageNumber); // get the input file parent directory as working directory // as tesseract cannot parse non ascii characters in input path String imageParentDir = 
TesseractOcrUtil.getParentDirectoryFile(imagePath); String replacement = isWindows() ? "" : "/"; workingDirectory = imageParentDir.replace("file:///", replacement) .replace("file:/", replacement); // input file addInputFile(params, imagePath); // output file addOutputFile(params, outputFiles.get(0), outputFormat, imagePath); // page segmentation mode addPageSegMode(params); // add user words if needed addUserWords(params, imagePath); // required languages addLanguages(params); addOutputFormat(params, outputFormat); addPreserveInterwordSpaces(params); // set default user defined dpi addDefaultDpi(params); // run tesseract process TesseractHelper.runCommand(execPath, params, workingDirectory); // statistics event onEventStatistics(eventHelper); // confrim on_demand event if (event != null && event.getConfirmationType() == EventConfirmationType.ON_DEMAND) { eventHelper.onEvent(new ConfirmEvent(event)); } } catch (PdfOcrTesseract4Exception e) { LoggerFactory.getLogger(getClass()) .error(e.getMessage()); throw new PdfOcrTesseract4Exception(e.getMessage(), e); } finally { try { if (imagePath != null && !inputImage.getAbsolutePath().equals(imagePath)) { TesseractHelper.deleteFile(imagePath); } } catch (SecurityException e) { LoggerFactory.getLogger(getClass()) .error(MessageFormatUtil.format( Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, imagePath, e.getMessage())); } try { if (getTesseract4OcrEngineProperties() .getPathToUserWordsFile() != null && getTesseract4OcrEngineProperties().isUserWordsFileTemporary()) { TesseractHelper.deleteFile( getTesseract4OcrEngineProperties() .getPathToUserWordsFile()); } } catch (SecurityException e) { LoggerFactory.getLogger(getClass()) .error(MessageFormatUtil.format( Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, getTesseract4OcrEngineProperties() .getPathToUserWordsFile(), e.getMessage())); } } } /** * Sets hocr output format. 
* * @param command result command as list of strings */ private void setHocrOutput(final List command) { command.add("-c"); command.add("tessedit_create_hocr=1"); } /** * Sets preserve_interword_spaces option. * * @param command result command as list of strings */ private void addPreserveInterwordSpaces(final List command) { if (getTesseract4OcrEngineProperties().isUseTxtToImproveHocrParsing()) { command.add("-c"); command.add("preserve_interword_spaces=1"); } } /** * Add output format. * * @param command result command as list of strings * @param outputFormat output format */ private void addOutputFormat(final List command, OutputFormat outputFormat) { if (outputFormat == OutputFormat.HOCR) { setHocrOutput(command); } } /** * Add path to user-words file for tesseract executable. * * @param command result command as list of strings */ private void addUserWords(final List command, final String imgPath) { if (getTesseract4OcrEngineProperties().getPathToUserWordsFile() != null && !getTesseract4OcrEngineProperties() .getPathToUserWordsFile().isEmpty()) { File userWordsFile = new File(getTesseract4OcrEngineProperties() .getPathToUserWordsFile()); // Workaround for a non-ASCII characters in path // Currently works only if the user words (or output files) reside in the same directory as the input image // Leaves only a filename in this case, otherwise - absolute path to output file String filePath = areEqualParentDirectories(imgPath, userWordsFile.getAbsolutePath()) ? userWordsFile.getName() : userWordsFile.getAbsolutePath(); command.add("--user-words"); command.add(addQuotes(filePath)); command.add("--oem"); command.add("0"); } } /** * Set default DPI for image. * * @param command result command as list of strings */ private void addDefaultDpi(final List command) { command.add("-c"); command.add("user_defined_dpi=300"); } /** * Adds path to tess data to the command list. 
* * @param command result command as list of strings */ private void addTessData(final List command) { command.add("--tessdata-dir"); command.add(addQuotes(getTessData())); } /** * Adds selected Page Segmentation Mode as parameter. * * @param command result command as list of strings */ private void addPageSegMode(final List command) { if (getTesseract4OcrEngineProperties().getPageSegMode() != null) { command.add("-c"); command.add("tessedit_pageseg_mode=" + getTesseract4OcrEngineProperties().getPageSegMode()); } } /** * Add list of selected languages concatenated to a string as parameter. * * @param command result command as list of strings */ private void addLanguages(final List command) { if (getTesseract4OcrEngineProperties().getLanguages().size() > 0) { command.add("-l"); command.add(getLanguagesAsString()); } } /** * Adds path to the input image file. * * @param command result command as list of strings * @param imagePath path to the input image file as string */ private void addInputFile(final List command, final String imagePath) { command.add(addQuotes(new File(imagePath).getName())); } /** * Adds path to temporary output file with result. * * @param command result command as list of strings * @param outputFile output file with result * @param outputFormat selected {@link OutputFormat} for tesseract */ private void addOutputFile(final List command, final File outputFile, final OutputFormat outputFormat, final String inputImagePath) { String extension = outputFormat.equals(OutputFormat.HOCR) ? ".hocr" : ".txt"; try { // Workaround for a non-ASCII characters in path // Currently works only if the user words (or output files) reside in the same directory as the input image // Leaves only a filename in this case, otherwise - absolute path to output file String filePath = areEqualParentDirectories(inputImagePath, outputFile.getAbsolutePath()) ? 
outputFile.getName() : outputFile.getAbsolutePath(); String fileName = new String( filePath.toCharArray(), 0, filePath.indexOf(extension)); LoggerFactory.getLogger(getClass()).info( MessageFormatUtil.format( Tesseract4LogMessageConstant.CREATED_TEMPORARY_FILE, outputFile.getAbsolutePath())); command.add(addQuotes(fileName)); } catch (Exception e) { throw new PdfOcrTesseract4Exception(PdfOcrTesseract4ExceptionMessageConstant .TESSERACT_FAILED); } } /** * Surrounds given string with quotes. * * @param value string to be wrapped into quotes * @return wrapped string */ private String addQuotes(final String value) { // choosing correct quotes for system if (isWindows()) { return "\"" + value + "\""; } else { return "'" + value + "'"; } } /** * Preprocess given image if it is needed. * * @param inputImage original input image {@link java.io.File} * @param pageNumber number of page to be OCRed * @return path to output image as {@link java.lang.String} * @throws PdfOcrTesseract4Exception if preprocessing cannot be done or file * is invalid */ private String preprocessImage(final File inputImage, final int pageNumber) throws PdfOcrTesseract4Exception { String tmpFileName = TesseractOcrUtil .getTempFilePath(UUID.randomUUID().toString(), getExtension(inputImage)); String path = inputImage.getAbsolutePath(); try { if (getTesseract4OcrEngineProperties().isPreprocessingImages()) { Pix pix = ImagePreprocessingUtil .preprocessImage(inputImage, pageNumber, getTesseract4OcrEngineProperties().getImagePreprocessingOptions()); TesseractOcrUtil.savePixToPngFile(tmpFileName, pix); if (!Files.exists(Paths.get(tmpFileName))) { BufferedImage img = TesseractOcrUtil.convertPixToImage(pix); if (img != null) { TesseractOcrUtil.saveImageToTempPngFile(tmpFileName, img); } } } if (!getTesseract4OcrEngineProperties().isPreprocessingImages() || !Files.exists(Paths.get(tmpFileName))) { TesseractOcrUtil.createTempFileCopy(path, tmpFileName); } if (Files.exists(Paths.get(tmpFileName))) { path = 
tmpFileName; } } catch (IOException e) { LoggerFactory.getLogger(getClass()) .error(MessageFormatUtil.format( Tesseract4LogMessageConstant .CANNOT_READ_INPUT_IMAGE, e.getMessage())); } return path; } /** * Check whether tesseract executable is installed on the machine and * provided path to tesseract executable is correct. * @param execPath path to tesseract executable * @throws PdfOcrTesseract4Exception if tesseract is not installed or * provided path to tesseract executable is incorrect, * i.e. running "{@link #getPathToExecutable()} --version" command failed. */ private void checkTesseractInstalled(String execPath) throws PdfOcrTesseract4Exception { try { TesseractHelper.runCommand(execPath, Collections.singletonList("--version")); } catch (PdfOcrTesseract4Exception e) { throw new PdfOcrTesseract4Exception( PdfOcrTesseract4ExceptionMessageConstant.TESSERACT_NOT_FOUND, e); } } /** * Gets input image file extension. * * @param inputImage input file * @return file extension as a {@link java.lang.String} */ private String getExtension(File inputImage) { if (inputImage != null) { int index = inputImage.getAbsolutePath().lastIndexOf('.'); if (index > 0) { String extension = new String( inputImage.getAbsolutePath().toCharArray(), index, inputImage.getAbsolutePath().length() - index); return extension.toLowerCase(); } } return ".png"; } /** * Checks whether parent directories are equal for the passed file paths. * * @param firstPath path to the first file * @param secondPath path to the second file * @return true if parent directories are equal, otherwise - false */ private boolean areEqualParentDirectories(final String firstPath, final String secondPath) { String firstParentDir = TesseractOcrUtil.getParentDirectoryFile(firstPath); String secondParentDir = TesseractOcrUtil .getParentDirectoryFile(secondPath); return firstParentDir != null && firstParentDir.equals(secondParentDir); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy