com.lowagie.text.pdf.parser.PdfContentReaderTool Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of openpdf Show documentation
There is a newer version: 2.0.3
/*
 * Copyright 2008 by Kevin Day.
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * If you didn't download this code from the following link, you should check if
 * you aren't using an obsolete version:
 * https://github.com/LibrePDF/OpenPDF
 */
package com.lowagie.text.pdf.parser;

import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.RandomAccessFileOrArray;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

/**
 * Tool that parses the content of a PDF document.
 * 
 * @since 2.1.4
 */
@SuppressWarnings("WeakerAccess")
public class PdfContentReaderTool {

    /**
     * Shows the detail of a dictionary. This is similar to the PdfLister
     * functionality.
     * 
     * @param dic
     *            the dictionary of which you want the detail
     * @return a String representation of the dictionary
     */
    public static String getDictionaryDetail(PdfDictionary dic) {
        return getDictionaryDetail(dic, 0);
    }

    /**
     * Shows the detail of a dictionary.
     * 
     * @param dic
     *            the dictionary of which you want the detail
     * @param depth
     *            the depth of the current dictionary (for nested dictionaries)
     * @return a String representation of the dictionary
     */
    public static String getDictionaryDetail(PdfDictionary dic, int depth) {
        StringBuilder builder = new StringBuilder();
        builder.append('(');
        List subDictionaries = new ArrayList<>();
        for (PdfName key : dic.getKeys()) {
            PdfObject val = dic.getDirectObject(key);
            if (val.isDictionary()) {
                subDictionaries.add(key);
            }
            builder.append(key);
            builder.append('=');
            builder.append(val);
            builder.append(", ");
        }
        builder.setLength(builder.length() - 2);
        builder.append(')');
        PdfName pdfSubDictionaryName;
        for (Object subDictionary : subDictionaries) {
            pdfSubDictionaryName = (PdfName) subDictionary;
            builder.append('\n');
            for (int i = 0; i < depth + 1; i++) {
                builder.append('\t');
            }
            builder.append("Subdictionary ");
            builder.append(pdfSubDictionaryName);
            builder.append(" = ");
            builder.append(getDictionaryDetail(
                    dic.getAsDict(pdfSubDictionaryName), depth + 1));
        }
        return builder.toString();
    }

    /**
     * Writes information about a specific page from PdfReader to the specified
     * output stream.
     * 
     * @since 2.1.5
     * @param reader
     *            the PdfReader to read the page content from
     * @param pageNum
     *            the page number to read
     * @param out
     *            the output stream to send the content to
     * @throws IOException  thrown when an I/O operation goes wrong
     */
    public static void listContentStreamForPage(PdfReader reader, int pageNum, PrintWriter out)
            throws IOException {
        out.println("==============Page " + pageNum + "====================");
        out.println("- - - - - Dictionary - - - - - -");
        PdfDictionary pageDictionary = reader.getPageN(pageNum);
        out.println(getDictionaryDetail(pageDictionary));
        out.println("- - - - - Content Stream - - - - - -");
        RandomAccessFileOrArray f = reader.getSafeFile();

        byte[] contentBytes = reader.getPageContent(pageNum, f);
        f.close();

        InputStream is = new ByteArrayInputStream(contentBytes);
        int ch;
        while ((ch = is.read()) != -1) {
            out.print((char) ch);
        }

        out.println("\n- - - - - Text Extraction - - - - - -");
        PdfTextExtractor extractor = new PdfTextExtractor(reader,
                new MarkedUpTextAssembler(reader));
        String extractedText = extractor.getTextFromPage(pageNum);
        if (extractedText.length() != 0) {
            out.println(extractedText);
        } else {
            out.println("No text found on page " + pageNum);
        }

        out.println();

    }

    /**
     * Writes information about each page in a PDF file to the specified output
     * stream.
     * 
     * @since 2.1.5
     * @param pdfFile
     *            a File instance referring to a PDF file
     * @param out
     *            the output stream to send the content to
     * @throws IOException  thrown when an I/O operation goes wrong
     */
    public static void listContentStream(File pdfFile, PrintWriter out)
            throws IOException {
        PdfReader reader = new PdfReader(pdfFile.getCanonicalPath());

        int maxPageNum = reader.getNumberOfPages();

        for (int pageNum = 1; pageNum <= maxPageNum; pageNum++) {
            listContentStreamForPage(reader, pageNum, out);
        }

    }

    /**
     * Writes information about the specified page in a PDF file to the
     * specified output stream.
     * 
     * @since 2.1.5
     * @param pdfFile
     *            a File instance referring to a PDF file
     * @param pageNum
     *            the page number to read
     * @param out
     *            the output stream to send the content to
     * @throws IOException  thrown when an I/O operation goes wrong
     */
    public static void listContentStream(File pdfFile, int pageNum,
            PrintWriter out) throws IOException {
        PdfReader reader = new PdfReader(pdfFile.getCanonicalPath());

        listContentStreamForPage(reader, pageNum, out);
    }

    /**
     * Writes information about each page in a PDF file to the specified file,
     * or System.out.
     * 
     * @param args the arguments passed to the command line
     */
    public static void main(String[] args) {
        try {
            if (args.length < 1 || args.length > 3) {
                System.out.println("Usage:  PdfContentReaderTool  [|stdout] []");
                return;
            }

            PrintWriter writer = new PrintWriter(System.out);
            if (args.length >= 2) {
                if (args[1].compareToIgnoreCase("stdout") != 0) {
                    System.out.println("Writing PDF content to " + args[1]);
                    writer = new PrintWriter(new FileOutputStream(new File(args[1])));
                }
            }

            int pageNum = -1;
            if (args.length >= 3) {
                pageNum = Integer.parseInt(args[2]);
            }

            if (pageNum == -1) {
                listContentStream(new File(args[0]), writer);
            } else {
                listContentStream(new File(args[0]), pageNum, writer);
            }
            writer.flush();

            if (args.length >= 2) {
                writer.close();
                System.out.println("Finished writing content to " + args[1]);
            }
        } catch (Exception e) {
            e.printStackTrace(System.err);
        }
    }

}