org.archive.modules.extractor.PDFParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.
There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.extractor;

import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PdfArray;

import java.io.*;
import java.util.*;


/** Supports PDF parsing operations.  For now this primarily means
 *  extracting URIs, but the logic in extractURIs() could easily be adopted/extended
 * for a variety of PDF processing tasks.
 *
 * @author Parker Thompson
 *
 */
//TODO make this more effecient, it currently had to read the whole file into memory
// before processing can begin, and appears to take much longer than it "should"
// to parse small, but admittedly complex, documents.
public class PDFParser {

    protected ArrayList foundURIs;
    protected ArrayList> encounteredReferences;
    protected PdfReader documentReader;
    protected byte[] document;
    protected PdfDictionary catalog;

    public PDFParser(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }
    public PDFParser(byte[] doc) throws IOException{
        resetState();
        document = doc;
        initialize();
    }

    /** Reinitialize the object as though a new one were created.
     */
    protected void resetState(){
        foundURIs = new ArrayList();
        encounteredReferences = new ArrayList>();
        documentReader = null;
        document = null;
        catalog = null;

        for(int i=0; i < encounteredReferences.size(); i++){
            encounteredReferences.add(new ArrayList());
        }
    }

    /**
     * Reset the object and initialize it with a new byte array (the document).
     * @param doc
     * @throws IOException
     */
    public void resetState(byte[] doc) throws IOException{
        resetState();
        document = doc;
        initialize();
    }

    /** Reinitialize the object as though a new one were created, complete
     * with a valid pointer to a document that can be read
     * @param doc
     * @throws IOException
     */
    public void resetState(String doc) throws IOException{
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Read a file named 'doc' and store its' bytes for later processing.
     * @param doc
     * @throws IOException
     */
    protected void getInFromFile(String doc) throws IOException{
        File documentOnDisk = new File(doc);

        long length = documentOnDisk.length();
        document = new byte[(int)length];

        FileInputStream inStream = new FileInputStream(documentOnDisk);

        inStream.read(document);
    }

    /**
     * Indicates, based on a PDFObject's generation/id pair whether
     * the parser has already encountered this object (or a reference to it)
     * so we don't infinitely loop on circuits within the PDF.
     * @param generation
     * @param id
     * @return True if already seen.
     */
    protected boolean haveSeen(int generation, int id){

        // if we can't store this generation grow our list until we can
        if(generation >= encounteredReferences.size()){
            for(int i=encounteredReferences.size(); i <= generation; i++){
                encounteredReferences.add(new ArrayList());
            }

            // clearly we haven't seen it
            return false;
        }

        ArrayList generationList
         = encounteredReferences.get(generation);
        
        for (int i: generationList) {
            if(i == id){
                return true;
            }
        }
        return false;
    }

    /**
     * Note that an object (id/generation pair) has been seen by this parser
     * so that it can be handled differently when it is encountered again.
     * @param generation
     * @param id
     */
    protected void markAsSeen(int generation, int id){
        ArrayList objectIds = encounteredReferences.get(generation);
        objectIds.add(id);
    }

    /**
     * Get a list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     * @return A list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     */
    public ArrayList getURIs(){
        return foundURIs;
    }

    /**
     * Initialize opens the document for reading.  This is done implicitly
     * by the constuctor.  This should only need to be called directly following
     * a reset.
     * @throws IOException
     */
    protected void initialize() throws IOException{
        if(document != null){
            documentReader = new PdfReader(document);
        }

        catalog = documentReader.getCatalog();
    }

    /**
     * Extract URIs from all objects found in a Pdf document's catalog.
     * Returns an array list representing all URIs found in the document catalog tree.
     * @return URIs from all objects found in a Pdf document's catalog.
     */
    public ArrayList extractURIs(){
        extractURIs(catalog);
        return getURIs();
    }

    /**
     * Parse a PdfDictionary, looking for URIs recursively and adding
     * them to foundURIs
     * @param entity
     */
    @SuppressWarnings("unchecked")
    protected void extractURIs(PdfObject entity){

            // deal with dictionaries
            if(entity.isDictionary()){

                PdfDictionary dictionary= (PdfDictionary)entity;

                Set allkeys = dictionary.getKeys();
                for (PdfName key: allkeys) {
                    PdfObject value = dictionary.get(key);

                    // see if it's the key is a UR[I,L]
                    if( key.toString().equals("/URI") ||
		            key.toString().equals("/URL") ) {
                        foundURIs.add(value.toString());

                    }else{
                        this.extractURIs(value);
                    }

                }

            // deal with arrays
            }else if(entity.isArray()){

                PdfArray array = (PdfArray)entity;
                for (PdfObject pdfObject : (Iterable)array.getArrayList()) {
                    this.extractURIs(pdfObject);
                }

            // deal with indirect references
            }else if(entity.getClass() == PRIndirectReference.class){

                    PRIndirectReference indirect = (PRIndirectReference)entity;

                    // if we've already seen a reference to this object
                    if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){
                        return;

                    // note that we've seen it if it's new
                    }else{
                        markAsSeen(indirect.getGeneration(), indirect.getNumber() );
                    }

                    // dereference the "pointer" and process the object
                    indirect.getReader(); // FIXME: examine side-effects
                    PdfObject direct = PdfReader.getPdfObject(indirect);

                    this.extractURIs(direct);
            }
    }

    public static void main(String[] argv){

        try {
            PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf");
            ArrayList uris = parser.extractURIs();
            Iterator i = uris.iterator();
            while(i.hasNext()){
                String uri = (String)i.next();
                System.out.println("got uri: " + uri);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}