com.lowagie.text.pdf.parser.PdfTextExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of openpdf Show documentation
There is a newer version: 2.0.3
/*
 * Copyright 2008 by Kevin Day.
 *
 * Contributions copyright 2014 Tizra Inc.
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * If you didn't download this code from the following link, you should check if
 * you aren't using an obsolete version:
 * https://github.com/LibrePDF/OpenPDF
 */
package com.lowagie.text.pdf.parser;


import com.lowagie.text.ExceptionConverter;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PRStream;
import com.lowagie.text.pdf.PRTokeniser;
import com.lowagie.text.pdf.PdfArray;
import com.lowagie.text.pdf.PdfContentParser;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfLiteral;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.RandomAccessFileOrArray;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Extracts text from a PDF file.
 * <p>
 * An extractor wraps a {@link PdfReader} and a {@link TextAssembler}. Each call to
 * {@link #getTextFromPage(int)} resets the assembler, parses the content stream(s)
 * of the requested page and returns the text gathered by a
 * {@link PdfContentStreamHandler}.
 *
 * @since 2.1.4
 */
@SuppressWarnings("WeakerAccess")
public class PdfTextExtractor {

    /**
     * The PdfReader that holds the PDF file.
     */
    private final PdfReader reader;

    /**
     * The {@link TextAssembler} that will receive render notifications and
     * provide resultant text.
     */
    private final TextAssembler renderListener;

    /**
     * Creates a new Text Extractor object, using a {@link MarkedUpTextAssembler}
     * built from the reader as the render listener.
     *
     * @param reader the reader with the PDF
     */
    public PdfTextExtractor(PdfReader reader) {
        this(reader, new MarkedUpTextAssembler(reader));
    }

    /**
     * Creates a new Text Extractor object, using a {@link MarkedUpTextAssembler}
     * as the render listener.
     *
     * @param reader               the reader with the PDF
     * @param usePdfMarkupElements should we use higher level tags for PDF markup entities?
     *                             (passed through to the {@link MarkedUpTextAssembler})
     */
    public PdfTextExtractor(PdfReader reader, boolean usePdfMarkupElements) {
        this(reader, new MarkedUpTextAssembler(reader, usePdfMarkupElements));
    }

    /**
     * Creates a new Text Extractor object.
     *
     * @param reader         the reader with the PDF
     * @param renderListener the render listener that will be used to analyze renderText
     *                       operations and provide resultant text
     */
    public PdfTextExtractor(PdfReader reader, TextAssembler renderListener) {
        this.reader = reader;
        this.renderListener = renderListener;
    }

    /**
     * Gets the content bytes of a page.
     *
     * @param pageNum the 1-based page number of the page you want to get the
     *                content stream from
     * @return a byte array with the effective content stream of the page; an
     *         empty array if the reader has no such page or the page has no
     *         {@code /Contents} entry
     * @throws IOException if reading the page content fails
     */
    private byte[] getContentBytesForPage(int pageNum) throws IOException {
        // The safe file is only held so that it stays open while the page is
        // read; try-with-resources closes it afterwards.
        try (RandomAccessFileOrArray ignored = reader.getSafeFile()) {
            PdfDictionary pageDictionary = reader.getPageN(pageNum);
            if (pageDictionary == null) {
                return new byte[0];
            }
            PdfObject contentObject = pageDictionary.get(PdfName.CONTENTS);
            return getContentBytesFromContentObject(contentObject);
        }
    }

    /**
     * Gets the content bytes from a content object, which may be a reference,
     * a stream or an array. References are resolved and arrays are flattened
     * recursively; the bytes of all streams found are concatenated in order.
     *
     * @param contentObject the object to read bytes from; may be {@code null}
     *                      when a page has no content or a reference cannot be
     *                      resolved
     * @return the content bytes, never {@code null}; empty when
     *         {@code contentObject} is {@code null}
     * @throws IOException           if reading the stream bytes fails
     * @throws IllegalStateException if the object is neither a reference, a
     *                               stream nor an array
     */
    private byte[] getContentBytesFromContentObject(PdfObject contentObject) throws IOException {
        // A missing /Contents entry (or an unresolvable reference) means the
        // page is empty; treat it as "no bytes" instead of failing with an NPE.
        if (contentObject == null) {
            return new byte[0];
        }

        final byte[] result;
        switch (contentObject.type()) {
            case PdfObject.INDIRECT:
                PRIndirectReference ref = (PRIndirectReference) contentObject;
                PdfObject directObject = PdfReader.getPdfObject(ref);
                result = getContentBytesFromContentObject(directObject);
                break;
            case PdfObject.STREAM:
                PRStream stream = (PRStream) PdfReader.getPdfObject(contentObject);
                result = PdfReader.getStreamBytes(stream);
                break;
            case PdfObject.ARRAY:
                // Stitch together all content before calling processContent(),
                // because processContent() resets state.
                ByteArrayOutputStream allBytes = new ByteArrayOutputStream();
                PdfArray contentArray = (PdfArray) contentObject;
                for (PdfObject pdfObject : contentArray.getElements()) {
                    allBytes.write(getContentBytesFromContentObject(pdfObject));
                }
                result = allBytes.toByteArray();
                break;
            default:
                throw new IllegalStateException("Unable to handle Content of type " + contentObject.getClass());
        }
        return result;
    }

    /**
     * Gets the text from a page.
     *
     * @param page the 1-based page number of page
     * @return a String with the content as plain text (without PDF syntax)
     * @throws IOException on error
     */
    public String getTextFromPage(int page) throws IOException {
        return getTextFromPage(page, false);
    }

    /**
     * Gets the text from a page.
     * <p>
     * The render listener is reset and told the page number before the page
     * content is processed, so text from earlier calls is not carried over.
     *
     * @param page               the 1-based page number we are interested in
     * @param useContainerMarkup should we put tags in for PDF markup container elements (not
     *                           really HTML at the moment). NOTE: this flag is
     *                           currently not read by this method.
     * @return result of extracting the text; an empty string if the reader has
     *         no dictionary for the requested page
     * @throws IOException on error
     */
    public String getTextFromPage(int page, boolean useContainerMarkup) throws IOException {
        PdfDictionary pageDict = reader.getPageN(page);
        if (pageDict == null) {
            return "";
        }
        PdfDictionary resources = pageDict.getAsDict(PdfName.RESOURCES);

        renderListener.reset();
        renderListener.setPage(page);
        PdfContentStreamHandler handler = new PdfContentStreamHandler(renderListener);
        processContent(getContentBytesForPage(page), resources, handler);
        return handler.getResultantText();
    }

    /**
     * Processes PDF syntax.
     * <p>
     * The content is parsed one operation at a time; the last element of each
     * parsed operand list is the operator, which is dispatched to the handler
     * together with the full operand list. Processing is wrapped in a
     * {@code div class='t-extracted-page'} context on the handler. The context
     * is only popped when parsing completes without an exception.
     *
     * @param contentBytes the bytes of a content stream
     * @param resources    the resources that come with the content stream
     * @param handler      interprets events caused by recognition of operations in a
     *                     content stream.
     * @throws ExceptionConverter wrapping any exception raised while parsing
     *                            or while the handler invokes an operator
     */
    public void processContent(byte[] contentBytes, PdfDictionary resources,
                               PdfContentStreamHandler handler) {
        handler.pushContext("div class='t-extracted-page'");
        try {
            PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes));
            // parse() refills the list it is given; an empty result marks the
            // end of the content stream.
            List<PdfObject> operands = new ArrayList<>();
            while (!ps.parse(operands).isEmpty()) {
                PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);
                handler.invokeOperator(operator, operands, resources);
            }
        } catch (Exception e) {
            throw new ExceptionConverter(e);
        }
        handler.popContext();
    }
}