All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.pdfocr.OcrPdfCreator Maven / Gradle / Ivy

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2020 iText Group NV
    Authors: iText Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see .
 */
package com.itextpdf.pdfocr;

import com.itextpdf.io.font.otf.ActualTextIterator;
import com.itextpdf.io.font.otf.Glyph;
import com.itextpdf.io.font.otf.GlyphLine;
import com.itextpdf.io.image.ImageData;
import com.itextpdf.io.util.MessageFormatUtil;
import com.itextpdf.kernel.counter.event.IMetaInfo;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.font.PdfTrueTypeFont;
import com.itextpdf.kernel.font.PdfType0Font;
import com.itextpdf.kernel.font.PdfType1Font;
import com.itextpdf.kernel.font.PdfType3Font;
import com.itextpdf.kernel.geom.PageSize;
import com.itextpdf.kernel.geom.Point;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.DocumentProperties;
import com.itextpdf.kernel.pdf.PdfAConformanceLevel;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfDocumentInfo;
import com.itextpdf.kernel.pdf.PdfOutputIntent;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.PdfViewerPreferences;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.pdf.canvas.PdfCanvas;
import com.itextpdf.kernel.pdf.canvas.PdfCanvasConstants.TextRenderingMode;
import com.itextpdf.kernel.pdf.layer.PdfLayer;
import com.itextpdf.layout.Canvas;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.Paragraph;
import com.itextpdf.layout.element.Text;
import com.itextpdf.layout.font.FontProvider;
import com.itextpdf.layout.property.TextAlignment;
import com.itextpdf.pdfa.PdfADocument;
import com.itextpdf.pdfocr.OcrPdfCreatorMetaInfo.PdfDocumentType;
import com.itextpdf.pdfocr.events.IThreadLocalMetaInfoAware;

import java.io.File;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link OcrPdfCreator} is the class that creates PDF documents containing input
 * images and text that was recognized using provided {@link IOcrEngine}.
 *
 * {@link OcrPdfCreator} provides possibilities to set list of input images to
 * be used for OCR, to set scaling mode for images, to set color of text in
 * output PDF document, to set fixed size of the PDF document's page and to
 * perform OCR using given images and to return
 * {@link com.itextpdf.kernel.pdf.PdfDocument} as result.
 * OCR is based on the provided {@link IOcrEngine}
 * (e.g. tesseract reader). This parameter is obligatory and it should be
 * provided in constructor
 * or using setter.
 */
public class OcrPdfCreator {

    /**
     * The logger.
     */
    private static final Logger LOGGER = LoggerFactory
            .getLogger(OcrPdfCreator.class);

    /**
     * Indices in array representing bbox.
     */
    private static final int LEFT_IDX = 0;
    private static final int TOP_IDX = 1;
    private static final int RIGHT_IDX = 2;
    private static final int BOTTOM_IDX = 3;

    /**
     * Selected {@link IOcrEngine}.
     */
    private IOcrEngine ocrEngine;

    /**
     * Set of properties.
     */
    private OcrPdfCreatorProperties ocrPdfCreatorProperties;

    /**
     * Creates a new {@link OcrPdfCreator} instance.
     *
     * @param ocrEngine {@link IOcrEngine} selected OCR Reader
     */
    public OcrPdfCreator(final IOcrEngine ocrEngine) {
        this(ocrEngine, new OcrPdfCreatorProperties());
    }

    /**
     * Creates a new {@link OcrPdfCreator} instance.
     *
     * @param ocrEngine selected OCR Reader {@link IOcrEngine}
     * @param ocrPdfCreatorProperties set of properties for {@link OcrPdfCreator}
     */
    public OcrPdfCreator(final IOcrEngine ocrEngine,
            final OcrPdfCreatorProperties ocrPdfCreatorProperties) {
        setOcrEngine(ocrEngine);
        setOcrPdfCreatorProperties(ocrPdfCreatorProperties);
    }

    /**
     * Gets properties for {@link OcrPdfCreator}.
     *
     * @return set properties {@link OcrPdfCreatorProperties}
     */
    public final OcrPdfCreatorProperties getOcrPdfCreatorProperties() {
        return ocrPdfCreatorProperties;
    }

    /**
     * Sets properties for {@link OcrPdfCreator}.
     *
     * @param ocrPdfCreatorProperties set of properties
     * {@link OcrPdfCreatorProperties} for {@link OcrPdfCreator}
     */
    public final void setOcrPdfCreatorProperties(
            final OcrPdfCreatorProperties ocrPdfCreatorProperties) {
        this.ocrPdfCreatorProperties = ocrPdfCreatorProperties;
    }

    /**
     * Performs OCR with set parameters using provided {@link IOcrEngine} and
     * creates PDF using provided {@link com.itextpdf.kernel.pdf.PdfWriter} and
     * {@link com.itextpdf.kernel.pdf.PdfOutputIntent}.
     * PDF/A-3u document will be created if
     * provided {@link com.itextpdf.kernel.pdf.PdfOutputIntent} is not null.
     *
     * @param inputImages {@link java.util.List} of images to be OCRed
     * @param pdfWriter the {@link com.itextpdf.kernel.pdf.PdfWriter} object
     *                  to write final PDF document to
     * @param pdfOutputIntent {@link com.itextpdf.kernel.pdf.PdfOutputIntent}
     *                        for PDF/A-3u document
     * @return result PDF/A-3u {@link com.itextpdf.kernel.pdf.PdfDocument}
     * object
     * @throws OcrException if it was not possible to read provided or
     * default font
     */
    public final PdfDocument createPdfA(final List inputImages,
            final PdfWriter pdfWriter,
            final PdfOutputIntent pdfOutputIntent)
            throws OcrException {
        LOGGER.info(MessageFormatUtil.format(
                PdfOcrLogMessageConstant.START_OCR_FOR_IMAGES,
                inputImages.size()));

        IMetaInfo storedMetaInfo = null;
        if (ocrEngine instanceof IThreadLocalMetaInfoAware) {
            storedMetaInfo = ((IThreadLocalMetaInfoAware)ocrEngine).getThreadLocalMetaInfo();
            ((IThreadLocalMetaInfoAware)ocrEngine).setThreadLocalMetaInfo(
                    new OcrPdfCreatorMetaInfo(((IThreadLocalMetaInfoAware)ocrEngine).getThreadLocalMetaInfo(),
                            UUID.randomUUID(),
                            null != pdfOutputIntent ? PdfDocumentType.PDFA : PdfDocumentType.PDF));
        }

        // map contains:
        // keys: image files
        // values:
        // map pageNumber -> retrieved text data(text and its coordinates)
        Map>> imagesTextData =
                new LinkedHashMap>>();
        try {
            for (File inputImage : inputImages) {
                imagesTextData.put(inputImage,
                        ocrEngine.doImageOcr(inputImage));
            }
        } finally {
            if (ocrEngine instanceof IThreadLocalMetaInfoAware) {
                ((IThreadLocalMetaInfoAware)ocrEngine).setThreadLocalMetaInfo(storedMetaInfo);
            }
        }


        // create PdfDocument
        return createPdfDocument(pdfWriter, pdfOutputIntent, imagesTextData);
    }

    /**
     * Performs OCR with set parameters using provided {@link IOcrEngine} and
     * creates PDF using provided {@link com.itextpdf.kernel.pdf.PdfWriter}.
     *
     * @param inputImages {@link java.util.List} of images to be OCRed
     * @param pdfWriter the {@link com.itextpdf.kernel.pdf.PdfWriter} object
     *                  to write final PDF document to
     * @return result {@link com.itextpdf.kernel.pdf.PdfDocument} object
     * @throws OcrException if provided font is incorrect
     */
    public final PdfDocument createPdf(final List inputImages,
            final PdfWriter pdfWriter)
            throws OcrException {
        return createPdfA(inputImages, pdfWriter, null);
    }

    /**
     * Gets used {@link IOcrEngine}.
     *
     * Returns {@link IOcrEngine} reader object to perform OCR.
     * @return selected {@link IOcrEngine} instance
     */
    public final IOcrEngine getOcrEngine() {
        return ocrEngine;
    }

    /**
     * Sets {@link IOcrEngine} reader object to perform OCR.
     * @param reader selected {@link IOcrEngine} instance
     */
    public final void setOcrEngine(final IOcrEngine reader) {
        ocrEngine = reader;
    }

    /**
     * Adds image (or its one page) and text that was found there to canvas.
     *
     * @param pdfDocument result {@link com.itextpdf.kernel.pdf.PdfDocument}
     * @param imageSize size of the image according to the selected
     *                  {@link ScaleMode}
     * @param pageText text that was found on this image (or on this page)
     * @param imageData input image if it is a single page or its one page if
     *                 this is a multi-page image
     * @param createPdfA3u true if PDF/A3u document is being created
     * @throws OcrException if PDF/A3u document is being created and provided
     * font contains notdef glyphs
     */
    private void addToCanvas(final PdfDocument pdfDocument,
            final Rectangle imageSize,
            final List pageText, final ImageData imageData,
            final boolean createPdfA3u) throws OcrException {
        final Rectangle rectangleSize =
                ocrPdfCreatorProperties.getPageSize() == null
                        ? imageSize : ocrPdfCreatorProperties.getPageSize();
        PageSize size = new PageSize(rectangleSize);
        PdfPage pdfPage = pdfDocument.addNewPage(size);
        PdfCanvas canvas = new NotDefCheckingPdfCanvas(pdfPage, createPdfA3u);

        PdfLayer[] layers = createPdfLayers(ocrPdfCreatorProperties.getImageLayerName(),
                ocrPdfCreatorProperties.getTextLayerName(),
                pdfDocument);

        if (layers[0] != null) {
            canvas.beginLayer(layers[0]);
        }
        addImageToCanvas(imageData, imageSize, canvas);
        if (layers[0] != null && layers[0] != layers[1]) {
            canvas.endLayer();
        }

        // how much the original image size changed
        float multiplier = imageData == null
                ? 1 : imageSize.getWidth()
                / PdfCreatorUtil.getPoints(imageData.getWidth());
        if (layers[1] != null && layers[0] != layers[1]) {
            canvas.beginLayer(layers[1]);
        }

        try {
            addTextToCanvas(imageSize, pageText, canvas, multiplier,
                    pdfPage.getMediaBox());
        } catch (OcrException e) {
            LOGGER.error(MessageFormatUtil.format(
                    OcrException.CANNOT_CREATE_PDF_DOCUMENT,
                    e.getMessage()));
            throw new OcrException(OcrException.CANNOT_CREATE_PDF_DOCUMENT)
                    .setMessageParams(e.getMessage());
        }
        if (layers[1] != null) {
            canvas.endLayer();
        }
    }

    /**
     * Creates a new PDF document using provided properties, adds images with
     * recognized text.
     *
     * @param pdfWriter the {@link com.itextpdf.kernel.pdf.PdfWriter} object
     *                  to write final PDF document to
     * @param pdfOutputIntent {@link com.itextpdf.kernel.pdf.PdfOutputIntent}
     *                        for PDF/A-3u document
     * @param imagesTextData map that contains input image files as keys,
     *                       and as value: map pageNumber -> text for the page
     * @return result {@link com.itextpdf.kernel.pdf.PdfDocument} object
     */
    private PdfDocument createPdfDocument(final PdfWriter pdfWriter,
            final PdfOutputIntent pdfOutputIntent,
            final Map>> imagesTextData) {
        PdfDocument pdfDocument;
        boolean createPdfA3u = pdfOutputIntent != null;
        if (createPdfA3u) {
            pdfDocument = new PdfADocument(pdfWriter,
                    PdfAConformanceLevel.PDF_A_3U, pdfOutputIntent,
                    new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo()));
        } else {
            pdfDocument = new PdfDocument(pdfWriter,
                    new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo()));
        }

        // pdfLang should be set in PDF/A mode
        boolean hasPdfLangProperty = ocrPdfCreatorProperties.getPdfLang() != null
                && !ocrPdfCreatorProperties.getPdfLang().equals("");
        if (createPdfA3u && !hasPdfLangProperty) {
            LOGGER.error(MessageFormatUtil.format(
                    OcrException.CANNOT_CREATE_PDF_DOCUMENT,
                    PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET));
            throw new OcrException(OcrException.CANNOT_CREATE_PDF_DOCUMENT)
                    .setMessageParams(PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET);
        }

        // add metadata
        if (hasPdfLangProperty) {
            pdfDocument.getCatalog()
                    .setLang(new PdfString(ocrPdfCreatorProperties.getPdfLang()));
        }

        // set title if it is not empty
        if (ocrPdfCreatorProperties.getTitle() != null) {
            pdfDocument.getCatalog().setViewerPreferences(
                    new PdfViewerPreferences().setDisplayDocTitle(true));
            PdfDocumentInfo info = pdfDocument.getDocumentInfo();
            info.setTitle(ocrPdfCreatorProperties.getTitle());
        }

        // reset passed font provider
        ocrPdfCreatorProperties.getFontProvider().reset();

        addDataToPdfDocument(imagesTextData, pdfDocument, createPdfA3u);

        return pdfDocument;
    }

    /**
     * Places provided images and recognized text to the result PDF document.
     *
     * @param imagesTextData map that contains input image
     *                       files as keys, and as value:
     *                       map pageNumber -> text for the page
     * @param pdfDocument result {@link com.itextpdf.kernel.pdf.PdfDocument}
     * @param createPdfA3u true if PDF/A3u document is being created
     * @throws OcrException if input image cannot be read or provided font
     * contains NOTDEF glyphs
     */
    private void addDataToPdfDocument(
            final Map>> imagesTextData,
            final PdfDocument pdfDocument,
            final boolean createPdfA3u) throws OcrException {
        for (Map.Entry>> entry
                : imagesTextData.entrySet()) {
            try {
                File inputImage = entry.getKey();
                List imageDataList =
                        PdfCreatorUtil.getImageData(inputImage,
                                ocrPdfCreatorProperties.getImageRotationHandler());
                LOGGER.info(MessageFormatUtil.format(
                        PdfOcrLogMessageConstant.NUMBER_OF_PAGES_IN_IMAGE,
                        inputImage.toString(), imageDataList.size()));

                Map> imageTextData = entry.getValue();
                if (imageTextData.keySet().size() > 0) {
                    for (int page = 0; page < imageDataList.size(); ++page) {
                        ImageData imageData = imageDataList.get(page);
                        final Rectangle imageSize =
                                PdfCreatorUtil.calculateImageSize(
                                        imageData,
                                        ocrPdfCreatorProperties.getScaleMode(),
                                        ocrPdfCreatorProperties.getPageSize());

                        if (imageTextData.containsKey(page + 1)) {
                            addToCanvas(pdfDocument, imageSize,
                                    imageTextData.get(page + 1),
                                    imageData, createPdfA3u);
                        }
                    }
                }
            } catch (IOException e) {
                LOGGER.error(MessageFormatUtil.format(
                        PdfOcrLogMessageConstant.CANNOT_ADD_DATA_TO_PDF_DOCUMENT,
                        e.getMessage()));
            }
        }
    }

    /**
     * Places given image to canvas to background to a separate layer.
     *
     * @param imageData input image as {@link java.io.File}
     * @param imageSize size of the image according to the selected
     *                  {@link ScaleMode}
     * @param pdfCanvas canvas to place the image
     */
    private void addImageToCanvas(final ImageData imageData,
            final Rectangle imageSize,
            final PdfCanvas pdfCanvas) {
        if (imageData != null) {
            if (ocrPdfCreatorProperties.getPageSize() == null) {
                pdfCanvas.addImage(imageData, imageSize, false);
            } else {
                final Point coordinates =
                        PdfCreatorUtil.calculateImageCoordinates(
                        ocrPdfCreatorProperties.getPageSize(), imageSize);
                final Rectangle rect =
                        new Rectangle(
                                (float)coordinates.x, (float)coordinates.y,
                                imageSize.getWidth(), imageSize.getHeight());
                pdfCanvas.addImage(imageData, rect, false);
            }
        }
    }

    /**
     * Places retrieved text to canvas to a separate layer.
     *
     * @param imageSize size of the image according to the selected
     *                  {@link ScaleMode}
     * @param pageText text that was found on this image (or on this page)
     * @param pdfCanvas canvas to place the text
     * @param multiplier coefficient to adjust text placing on canvas
     * @param pageMediaBox page parameters
     * @throws OcrException if PDF/A3u document is being created and provided
     * font contains notdef glyphs
     */
    private void addTextToCanvas(
            final Rectangle imageSize,
            final List pageText,
            final PdfCanvas pdfCanvas,
            final float multiplier,
            final Rectangle pageMediaBox)
            throws OcrException {
        if (pageText != null && pageText.size() > 0) {
            final Point imageCoordinates =
                    PdfCreatorUtil.calculateImageCoordinates(
                    ocrPdfCreatorProperties.getPageSize(), imageSize);
            for (TextInfo item : pageText) {
                String line = item.getText();

                final float bboxWidthPt = getWidthPt(item, multiplier);
                final float bboxHeightPt = getHeightPt(item, multiplier);
                FontProvider fontProvider = getOcrPdfCreatorProperties()
                        .getFontProvider();
                String fontFamily = getOcrPdfCreatorProperties()
                        .getDefaultFontFamily();
                if (lineNotEmpty(line, bboxHeightPt, bboxWidthPt)) {
                    Document document = new Document(pdfCanvas.getDocument());
                    document.setFontProvider(fontProvider);

                    // Scale the text width to fit the OCR bbox
                    final float fontSize = PdfCreatorUtil.calculateFontSize(
                            document, line, fontFamily,
                            bboxHeightPt, bboxWidthPt);

                    final float lineWidth = PdfCreatorUtil.getRealLineWidth(document,
                            line, fontFamily, fontSize);

                    final float xOffset = getXOffsetPt(item, multiplier);
                    final float yOffset = getYOffsetPt(item, multiplier, imageSize);

                    Canvas canvas = new Canvas(pdfCanvas, pageMediaBox);
                    canvas.setFontProvider(fontProvider);

                    Text text = new Text(line)
                            .setHorizontalScaling(bboxWidthPt / lineWidth);

                    Paragraph paragraph = new Paragraph(text)
                            .setMargin(0)
                            .setMultipliedLeading(1.2f);
                    paragraph.setFontFamily(fontFamily)
                            .setFontSize(fontSize);
                    paragraph.setWidth(bboxWidthPt * 1.5f);

                    if (ocrPdfCreatorProperties.getTextColor() != null) {
                        paragraph.setFontColor(ocrPdfCreatorProperties.getTextColor());
                    } else {
                        paragraph.setTextRenderingMode(TextRenderingMode.INVISIBLE);
                    }

                    canvas.showTextAligned(paragraph,
                            xOffset + (float)imageCoordinates.x,
                            yOffset + (float)imageCoordinates.y,
                            TextAlignment.LEFT);
                    canvas.close();
                }
            }
        }
    }

    /**
     * Creates layers for image and text according rules set in {@link OcrPdfCreatorProperties}.
     *
     * @param imageLayerName name of the image layer
     * @param textLayerName name of the text layer
     * @param pdfDocument document to add layers to
     *
     * @return array of two layers: first layer is for image, second layer is for text.
     * Elements may be null meaning that layer creation is not requested
     */
    private static PdfLayer[] createPdfLayers(
            String imageLayerName,
            String textLayerName,
            PdfDocument pdfDocument) {
        if (imageLayerName == null && textLayerName == null) {
            return new PdfLayer[] {null, null};
        } else if (imageLayerName == null) {
            return new PdfLayer[]{null, new PdfLayer(textLayerName, pdfDocument)};
        } else if (textLayerName == null) {
            return new PdfLayer[]{new PdfLayer(imageLayerName, pdfDocument), null};
        } else if (imageLayerName.equals(textLayerName)) {
            PdfLayer pdfLayer = new PdfLayer(imageLayerName, pdfDocument);
            return new PdfLayer[] {pdfLayer, pdfLayer};
        } else {
            return new PdfLayer[] {new PdfLayer(imageLayerName, pdfDocument), new PdfLayer(textLayerName, pdfDocument)};
        }
    }

    /**
     * Get left bound of text chunk.
     */
    private static float getLeft(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return textInfo.getBbox().get(LEFT_IDX) * multiplier;
        } else {
            return textInfo.getBboxRect().getLeft() * multiplier;
        }
    }

    /**
     * Get right bound of text chunk.
     */
    private static float getRight(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return (textInfo.getBbox().get(RIGHT_IDX) + 1) * multiplier - 1;
        } else {
            return (textInfo.getBboxRect().getRight() + 1) * multiplier - 1;
        }
    }

    /**
     * Get top bound of text chunk.
     */
    private static float getTop(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return textInfo.getBbox().get(TOP_IDX) * multiplier;
        } else {
            return textInfo.getBboxRect().getTop() * multiplier;
        }
    }

    /**
     * Get bottom bound of text chunk.
     */
    private static float getBottom(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return (textInfo.getBbox().get(BOTTOM_IDX) + 1) * multiplier - 1;
        } else {
            return (textInfo.getBboxRect().getBottom() + 1) * multiplier - 1;
        }
    }

    /**
     * Check if line is not empty.
     */
    private static boolean lineNotEmpty(String line, float bboxHeightPt, float bboxWidthPt) {
        return !line.isEmpty() && bboxHeightPt > 0 && bboxWidthPt > 0;
    }

    /**
     * Get width of text chunk in points.
     */
    private static float getWidthPt(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return PdfCreatorUtil.getPoints(
                    getRight(textInfo, multiplier) - getLeft(textInfo, multiplier));
        } else {
            return getRight(textInfo, multiplier) - getLeft(textInfo, multiplier);
        }
    }

    /**
     * Get height of text chunk in points.
     */
    private static float getHeightPt(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return PdfCreatorUtil.getPoints(
                    getBottom(textInfo, multiplier) - getTop(textInfo, multiplier));
        } else {
            return getTop(textInfo, multiplier) - getBottom(textInfo, multiplier);
        }
    }

    /**
     * Get horizontal text offset in points.
     */
    private static float getXOffsetPt(TextInfo textInfo, float multiplier) {
        if (textInfo.getBboxRect() == null) {
            return PdfCreatorUtil.getPoints(getLeft(textInfo, multiplier));
        } else {
            return getLeft(textInfo, multiplier);
        }
    }

    /**
     * Get vertical text offset in points.
     */
    private static float getYOffsetPt(TextInfo textInfo, float multiplier, Rectangle imageSize) {
        if (textInfo.getBboxRect() == null) {
            return imageSize.getHeight() - PdfCreatorUtil.getPoints(getBottom(textInfo, multiplier));
        } else {
            return getBottom(textInfo, multiplier);
        }
    }

    /**
     * A handler for PDF canvas that validates existing glyphs.
     */
    private static class NotDefCheckingPdfCanvas extends PdfCanvas {
        private static final long serialVersionUID = 708713860707664107L;
        private final boolean createPdfA3u;
        public NotDefCheckingPdfCanvas(PdfPage page, boolean createPdfA3u) {
            super(page);
            this.createPdfA3u = createPdfA3u;
        }

        @Override
        public PdfCanvas showText(GlyphLine text) {
            ActualTextCheckingGlyphLine glyphLine =
                    new ActualTextCheckingGlyphLine(text);
            PdfFont currentFont = getGraphicsState().getFont();
            boolean notDefGlyphsExists = false;
            // default value for error message, it'll be updated with the
            // unicode of the not found glyph
            String message = PdfOcrLogMessageConstant
                    .COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER;
            for (int i = glyphLine.start; i < glyphLine.end; i++) {
                if (isNotDefGlyph(currentFont, glyphLine.get(i))) {
                    notDefGlyphsExists = true;
                    message = MessageFormatUtil.format(PdfOcrLogMessageConstant
                                    .COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER,
                            glyphLine.get(i).getUnicode());
                    if (this.createPdfA3u) {
                        // exception is thrown only if PDF/A document is
                        // being created
                        throw new OcrException(message);
                    }
                    // setting actual text to NotDef glyph
                    glyphLine.setActualTextToGlyph(i,
                            glyphLine.toUnicodeString(i, i + 1));
                    // setting a fake unicode deliberately to pass further
                    // checks for actual text necessity during iterating over
                    // glyphline chunks with ActualTextIterator
                    Glyph glyph = new Glyph(glyphLine.get(i));
                    glyph.setUnicode(-1);
                    glyphLine.set(i, glyph);
                }
            }
            // Warning is logged if not PDF/A document is being created
            if (notDefGlyphsExists) {
                LOGGER.warn(message);
            }
            return this.showText(glyphLine, new ActualTextIterator(glyphLine));
        }

        private static boolean isNotDefGlyph(PdfFont font, Glyph glyph) {
            if (font instanceof PdfType0Font
                    || font instanceof PdfTrueTypeFont) {
                return glyph.getCode() == 0;
            } else if (font instanceof PdfType1Font
                    || font instanceof PdfType3Font) {
                return glyph.getCode() == -1;
            }
            return false;
        }
    }

    /**
     * A handler for GlyphLine that checks existing actual text not to
     * overwrite it.
     */
    private static class ActualTextCheckingGlyphLine extends GlyphLine {
        private static final long serialVersionUID = -946356392098459518L;

        public ActualTextCheckingGlyphLine(GlyphLine other) {
            super(other);
        }

        public void setActualTextToGlyph(int i, String text) {
            // set actual text if it doesn't exist for i-th glyph
            if ((this.actualText == null || this.actualText.size() <= i
                    || this.actualText.get(i) == null)) {
                super.setActualText(i, i + 1, text);
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy