org.apache.pdfbox.pdfparser.ConformingPDFParser Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 *  Copyright 2010 adam.
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.pdfbox.pdfparser;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.cos.COSUnread;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.ConformingPDDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.XrefEntry;
import org.apache.pdfbox.persistence.util.COSObjectKey;

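/**
 * Parser that reads a PDF file through random access rather than as a sequential
 * stream: {@link #parse()} starts at the end of the file, follows the trailer to the
 * cross-reference table and then reads every object registered in the document's
 * object pool.
 *
 * @author Adam Nichols
 */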
public class ConformingPDFParser extends BaseParser {
    /** Random-access handle on the PDF file; opened read-only by the constructor. */
    protected RandomAccess inputFile;
    /**
     * Cross-reference entries; the list is looked up by object number in
     * getObject() and when resolving indirect references in readObject().
     */
    List<XrefEntry> xrefEntries;
    /** Byte position in {@link #inputFile} that the read helpers work from. */
    private long currentOffset;
    /** PD-layer wrapper around the COS document; stays null until parse() is called. */
    private ConformingPDDocument doc = null;
    /** When true, structural mismatches found while reading raise an AssertionError. */
    private boolean throwNonConformingException = true;
    /**
     * When true, indirect references are followed and read as soon as they are met;
     * when false, a placeholder object is returned for them instead.
     */
    private boolean recursivlyRead = true;

    /**
     * Constructor.  Opens the given file for random, read-only access; nothing is
     * parsed until {@link #parse()} is called.
     *
     * @param inputFile The file that contains the PDF document.
     *
     * @throws IOException If there is an error opening the file.
     */
    public ConformingPDFParser(File inputFile) throws IOException {
        // "r" = read-only access mode
        this.inputFile = new RandomAccessFile(inputFile, "r");
    }

    /**
     * This will parse the file and populate the COSDocument object (and the
     * ConformingPDDocument that wraps it).
     *
     * Parsing starts at the last byte of the file: the trailer information yields
     * the location of the cross-reference table, the table is read, and finally
     * every object key registered in the document's pool is loaded.
     *
     * NOTE(review): nothing visible in this method closes the input file, although
     * the earlier documentation said it would - confirm who owns the file handle.
     *
     * @throws IOException If there is an error reading from the file or corrupt data
     * is found.
     */
    public void parse() throws IOException {
        document = new COSDocument();
        doc = new ConformingPDDocument(document);
        // the trailer is located by scanning from the very last byte of the file
        currentOffset = inputFile.length()-1;
        long xRefTableLocation = parseTrailerInformation();
        currentOffset = xRefTableLocation;
        parseXrefTable();
        // now that we read the xref table and put null references in the doc,
        // we can dereference those objects now.
        boolean oldValue = recursivlyRead;
        recursivlyRead = false;
        try {
            List<COSObjectKey> keys = doc.getObjectKeysFromPool();
            for(COSObjectKey key : keys) {
                // getObject will put it into the document's object pool for us
                getObject(key.getNumber(), key.getGeneration());
            }
        } finally {
            // restore the caller's setting even when reading an object fails
            recursivlyRead = oldValue;
        }
    }

    /**
     * Returns the COS-level document produced by {@link #parse()}.  The caller is
     * responsible for calling close() on the returned document once finished with
     * it, so that its resources are released.
     *
     * @return The document that was parsed.
     *
     * @throws IOException If parse() has not been called yet.
     */
    public COSDocument getDocument() throws IOException {
        if( document != null ) {
            return document;
        }
        throw new IOException( "You must call parse() before calling getDocument()" );
    }

    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * Unlike getDocument(), this method performs no check: the field is only
     * assigned inside parse(), so null is returned if parse() has not been called.
     *
     * @return The document at the PD layer, or null before parse() has been called.
     *
     * @throws IOException Declared for callers; the visible body never throws it.
     */
    public PDDocument getPDDocument() throws IOException {
        return doc;
    }
    
    private boolean parseXrefTable() throws IOException {
        String currentLine = readLine();
        if(throwNonConformingException) {
            if(!"xref".equals(currentLine))
                throw new AssertionError("xref table not found.\nExpected: xref\nFound: "+currentLine);
        }

        int objectNumber = readInt();
        int entries = readInt();
        xrefEntries = new ArrayList(entries);
        for(int i=0; i")) {
            // string of hex codes
            return COSString.createFromHexString(string.replaceAll("^<", "").replaceAll(">$", ""));
        }
        return null;
    }

    /**
     * Reads the object that ends at the current position, scanning towards the
     * start of the file.
     *
     * Supported forms are indirect references ("n g R"), arrays of hex strings,
     * single hex strings and integers.  Dictionaries and anything else raise a
     * RuntimeException ("Not yet implemented").
     *
     * @return the object that was read
     * @throws IOException if there was an error reading data from the file
     */
    protected COSBase readObjectBackwards() throws IOException {
        COSBase obj = null;
        consumeWhitespaceBackwards();
        String lastSection = readBackwardUntilWhitespace();
        if("R".equals(lastSection)) {
            // indirect reference; reading backwards the generation comes first
            long gen = readLongBackwards();
            long number = readLongBackwards();
            // We just put a placeholder in the pool for now, we'll read the data later
            doc.putObjectInPool(new COSUnread(), number, gen);
            obj = new COSUnread(number, gen, this);
        } else if(">>".equals(lastSection)) {
            // dictionary
            throw new RuntimeException("Not yet implemented");
        } else if(lastSection != null && lastSection.endsWith("]")) {
            // array; elements are met last-to-first, so each one is inserted at the
            // front to keep the order they have in the file
            COSArray array = new COSArray();
            lastSection = lastSection.replaceAll("]$", "");
            while(!lastSection.startsWith("[")) {
                prependIfHexString(array, lastSection);
                lastSection = readBackwardUntilWhitespace();
            }
            // the first element may be glued to the opening bracket, e.g. "[<AB>"
            lastSection = lastSection.replaceAll("^\\[", "");
            prependIfHexString(array, lastSection);
            obj = array;
        } else if(lastSection != null && lastSection.endsWith(">")) {
            // string of hex codes
            obj = processCosObject(lastSection);
        } else {
            // try a number, otherwise fall back on a string
            try {
                // parseLong is only a validity check; COSNumber does the conversion
                Long.parseLong(lastSection);
                obj = COSNumber.get(lastSection);
            } catch(NumberFormatException e) {
                throw new RuntimeException("Not yet implemented", e);
            }
        }

        return obj;
    }

    /**
     * Inserts the section at the front of the array when it is a hex string
     * ("&lt;...&gt;", optionally surrounded by whitespace); other sections are ignored.
     */
    private void prependIfHexString(COSArray array, String section) throws IOException {
        if(section.matches("^\\s*<.*>\\s*$")) {
            array.add(0, COSString.createFromHexString(
                    section.replaceAll("^\\s*<", "").replaceAll(">\\s*$", "")));
        }
    }

    /**
     * Reads the token that precedes the current position and turns it into a name,
     * dropping a single leading '/' if the token has one.
     *
     * @return the name that was read
     * @throws IOException if there was an error reading data from the file
     */
    protected COSName readNameBackwards() throws IOException {
        final String token = readBackwardUntilWhitespace();
        final String bareName = token.startsWith("/") ? token.substring(1) : token;
        return COSName.getPDFName(bareName);
    }

    /**
     * Looks up the cross-reference entry for the given object number, moves the
     * current offset to the entry's byte offset and reads the object found there.
     *
     * @param objectNumber the number of the object to read; used as an index into
     * the cross-reference entries
     * @param generation the generation the object is expected to have
     * @return the object that was read
     * @throws IOException if there was an error reading data from the file
     */
    public COSBase getObject(long objectNumber, long generation) throws IOException {
        // we could optionally, check to see if parse() have been called &
        // throw an exception here, but I don't think that's really necessary
        // explicit cast: keeps this assignment valid even when the list is declared
        // without a type argument
        XrefEntry entry = (XrefEntry) xrefEntries.get((int)objectNumber);
        currentOffset = entry.getByteOffset();
        return readObject(objectNumber, generation);
    }

    /**
     * Reads the indirect object that starts at the current offset: the header
     * "number generation obj" followed by the object data.
     *
     * When a document exists and recursive reading is switched on, the document's
     * object pool is consulted first and a pooled object is returned without
     * touching the file.  If the header does not match the expected values and
     * this parser is set to throw for non-conforming documents, an AssertionError
     * is raised.
     *
     * @param objectNumber the object number you expect to read
     * @param generation the generation you expect this object to be
     * @return the object being read.
     * @throws IOException if there was an error reading data from the file
     */
    public COSBase readObject(long objectNumber, long generation) throws IOException {
        if(recursivlyRead && document != null) {
            // a pooled object saves a trip to the filesystem
            COSBase pooled = doc.getObjectFromPool(objectNumber, generation);
            if(pooled != null) {
                return pooled;
            }
        }

        final int foundNumber = readInt();
        if(throwNonConformingException && objectNumber != foundNumber) {
            throw new AssertionError("Object numer expected was " +
                    objectNumber + " but actual was " + foundNumber);
        }
        consumeWhitespace();

        final int foundGeneration = readInt();
        if(throwNonConformingException && generation != foundGeneration) {
            throw new AssertionError("Generation expected was " +
                    generation + " but actual was " + foundGeneration);
        }
        consumeWhitespace();

        final String keyword = readWord();
        if(throwNonConformingException && !"obj".equals(keyword)) {
            throw new AssertionError("Expected keyword 'obj' but found " + keyword);
        }

        // A placeholder goes into the pool before the body is read so that a cycle
        // (e.g. Root -> child -> /Parent -> Root) ends at the pool lookup above
        // instead of recursing forever.
        doc.putObjectInPool(new COSObject(null), objectNumber, generation);
        final COSBase parsed = readObject();
        doc.putObjectInPool(parsed, objectNumber, generation);
        return parsed;
    }

    /**
     * This actually reads the object data, starting at the current offset.
     *
     * The first word decides what is read: a dictionary ("&lt;&lt;"), a name ("/"),
     * a number, an indirect reference ("n g R"), an array ("[") or a literal
     * string ("(").  A word starting with "]" marks the end of an array and
     * yields null.  Anything else raises a RuntimeException ("Not yet implemented").
     *
     * @return the object which is read, or null at the end of an array
     * @throws IOException if there was an error reading data from the file
     */
    protected COSBase readObject() throws IOException {
        consumeWhitespace();
        String string = readWord();
        if(string.startsWith("<<")) {
            // this is a dictionary
            COSDictionary dictionary = new COSDictionary();
            boolean atEndOfDictionary = false;
            // remove the marker for the beginning of the dictionary
            string = string.replaceAll("^<<", "");
            
            // nothing usable was glued to the "<<", so the first key is the next word
            if("".equals(string) || string.matches("^\\w$"))
                string = readWord().trim();
            while(!atEndOfDictionary) {
                // NOTE(review): the key still carries its leading '/' here, whereas
                // readNameBackwards() strips it - confirm which form is intended.
                COSName name = COSName.getPDFName(string);
                COSBase object = readObject();
                dictionary.setItem(name, object);

                byte singleByte = consumeWhitespace();
                if(singleByte == '>') {
                    readByte(); // get rid of the second '>'
                    atEndOfDictionary = true;
                }
                if(!atEndOfDictionary)
                    string = readWord().trim();
            }
            return dictionary;
        } else if(string.startsWith("/")) {
            // it's a dictionary label. i.e. /Type or /Pages or something similar
            COSBase name = COSName.getPDFName(string);
            return name;
        } else if(string.startsWith("-")) {
            // it's a negative number
            return parseNumber(string);
        } else if(string.length() > 0 && string.charAt(0) >= '0' && string.charAt(0) <= '9' ) {
            // (the length check keeps an empty word away from charAt(0); it falls
            // through to the "Not yet implemented" error below, which reports the
            // location)
            // it's a COSInt or COSFloat, or a weak reference (i.e. "3 0 R")
            // we'll have to peek ahead a little to see if it's a reference or not
            long tempOffset = this.currentOffset;
            consumeWhitespace();
            String tempString = readWord();
            if(tempString.matches("^[0-9]+$")) {
                // it is an int, might be a weak reference...
                tempString = readWord();
                if(!"R".equals(tempString)) {
                    // it's just a number, not a weak reference
                    this.currentOffset = tempOffset;
                    return parseNumber(string);
                }
            } else {
                // it's just a number, not a weak reference
                this.currentOffset = tempOffset;
                return parseNumber(string);
            }

            // it wasn't a plain number, so we need to parse the weak-reference;
            // rewind to just after the object number and read "gen R" for real
            this.currentOffset = tempOffset;
            int number = Integer.parseInt(string);
            int gen = readInt();
            String r = readWord();

            if(!"R".equals(r))
                if(throwNonConformingException)
                    throw new AssertionError("Expected keyword 'R' but found " + r);

            if(recursivlyRead) {
                // seek to the object, read it, seek back to current location
                long tempLocation = this.currentOffset;
                // explicit cast: valid even when the list is declared without a type argument
                this.currentOffset = ((XrefEntry) this.xrefEntries.get(number)).getByteOffset();
                COSBase returnValue = readObject(number, gen);
                this.currentOffset = tempLocation;
                return returnValue;
            } else {
                // Put a COSUnread there as a placeholder
                COSObject obj = new COSObject(new COSUnread());
                obj.setObjectNumber(COSInteger.get(number));
                obj.setGenerationNumber(COSInteger.get(gen));
                return obj;
            }
        } else if(string.startsWith("]")) {
            // end of an array, just return null
            if("]".equals(string))
                return null;
            // more than the bracket was read: step the offset back by the word length
            int oldLength = string.length();
            this.currentOffset -= oldLength;
            return null;
        } else if(string.startsWith("[")) {
            // array of values
            // we'll just pay attention to the first part (this is in case there
            // is no whitespace between the "[" and the first element)
            int oldLength = string.length();
            string = "[";
            this.currentOffset -= (oldLength - string.length() + 1);

            COSArray array = new COSArray();
            // elements are read until readObject() reports the closing bracket with null
            COSBase object = readObject();
            while(object != null) {
                array.add(object);
                object = readObject();
            }
            return array;
        } else if(string.startsWith("(")) {
            // this is a string (not hex encoded), strip off the '(' and read until ')'
            // NOTE(review): the whitespace byte that ended readWord() is not part of
            // the word, so it never reaches the result; escapes and nested
            // parentheses are not handled either.
            StringBuilder sb = new StringBuilder(string.substring(1));
            byte singleByte = readByte();
            while(singleByte != ')') {
                sb.append((char)singleByte);
                singleByte = readByte();
            }
            return new COSString(sb.toString());
        } else {
            throw new RuntimeException("Not yet implemented: " + string
                    + " loation=" + this.currentOffset);
        }
    }

    /**
     * Skips whitespace and then collects characters from pdfSource until an
     * end-of-name character, a closing character or the end of the stream is met.
     * The terminating character is pushed back so it can be read again.
     *
     * @return The string that was read from the stream.
     * @throws IOException If there is an error reading from the stream.
     */
    @Override
    protected String readString() throws IOException {
        consumeWhitespace();
        final StringBuilder text = new StringBuilder();
        int next;
        for(next = pdfSource.read();
                !isEndOfName((char)next) && !isClosing(next) && next != -1;
                next = pdfSource.read()) {
            text.append( (char)next );
        }
        // give the terminator back, unless we simply ran out of data
        if (next != -1) {
            pdfSource.unread(next);
        }
        return text.toString();
    }

    /**
     * Reads the dictionary that ends at the current position, scanning towards the
     * start of the file.  Entries are met last-to-first and are re-inserted in
     * reverse so the returned dictionary lists its keys in file order.
     *
     * @return the dictionary that was read
     * @throws IOException if there was an error reading data from the file
     */
    protected COSDictionary readDictionaryBackwards() throws IOException {
        COSDictionary dict = new COSDictionary();
        
        // consume the last two '>' chars which signify the end of the dictionary
        consumeWhitespaceBackwards();
        byte singleByte = readByteBackwards();
        if(throwNonConformingException) {
            if(singleByte != '>')
                throw new AssertionError("Expected '>' at the end of a dictionary but found '"
                        + (char)singleByte + "'");
        }
        singleByte = readByteBackwards();
        if(throwNonConformingException) {
            if(singleByte != '>')
                throw new AssertionError("Expected '>>' at the end of a dictionary but found '"
                        + (char)singleByte + ">'");
        }
        
        // check to see if we're already at the start of the dictionary (it is empty)
        boolean atEndOfDictionary = isDictionaryStartBackwards(consumeWhitespaceBackwards());

        COSDictionary backwardsDictionary = new COSDictionary();
        // while we have not reached the opening "<<", read in entries
        while(!atEndOfDictionary) {
            // going backwards the value comes before its key
            COSBase object = readObjectBackwards();
            COSName name = readNameBackwards();
            backwardsDictionary.setItem(name, object);
            
            atEndOfDictionary = isDictionaryStartBackwards(consumeWhitespaceBackwards());
        }

        // the dictionaries preserve the order keys were added, as such we shall
        // add them in the proper order, not the reverse order
        Object[] backwardsKeys = backwardsDictionary.keySet().toArray();
        for(int i = backwardsKeys.length-1; i >=0; i--) {
            COSName key = (COSName)backwardsKeys[i];
            dict.setItem(key, backwardsDictionary.getItem(key));
        }
        
        // consume the last two '<' chars
        readByteBackwards();
        readByteBackwards();

        return dict;
    }

    /**
     * Tells whether the byte just seen while scanning backwards is the second '&lt;'
     * of an opening "&lt;&lt;", by also inspecting the byte at currentOffset-1.
     */
    private boolean isDictionaryStartBackwards(byte lastByte) throws IOException {
        if(lastByte != '<') {
            return false;
        }
        inputFile.seek(currentOffset-1);
        return ((byte)inputFile.read()) == '<';
    }

    /**
     * Collects bytes while moving towards the start of the file until a line break
     * is met, and returns them in file order.  Only use this where the data is
     * known to be text rather than binary.
     * 
     * @return the string which was read
     * @throws IOException if there was an error reading data from the file
     */
    protected String readLineBackwards() throws IOException {
        final StringBuilder line = new StringBuilder();

        while(true) {
            final byte current = readByteBackwards();
            if(current == '\r') {
                break;
            }
            if(current == '\n') {
                // a '\r' found at the current offset belongs to this line break too
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\r') {
                    currentOffset--;
                }
                break;
            }
            // bytes arrive last-to-first, so each one goes to the front
            line.insert(0, (char)current);
        }

        return line.toString();
    }

    /**
     * This will read a line starting with the byte at the current offset and going
     * forward until it finds a line break.  The line break itself is consumed and
     * is not part of the result.  A carriage return followed by a line feed counts
     * as one line break.  This should only be used if we are certain that the data
     * will only be text, and not binary data.
     * @return the string which was read
     * @throws IOException if there was an error reading data from the file
     */
    @Override
    protected String readLine() throws IOException {
        StringBuilder sb = new StringBuilder();
        boolean endOfLine = false;

        do {
            byte singleByte = readByte();
            if(singleByte == '\n') {
                // kept from the earlier behaviour: a '\r' directly after the '\n'
                // is swallowed as part of the same line break
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\r')
                    currentOffset++;
                endOfLine = true;
            } else if(singleByte == '\r') {
                // CR LF is a single end-of-line marker: swallow the '\n' as well so
                // the next read does not see an empty line
                inputFile.seek(currentOffset);
                if((byte)inputFile.read() == '\n')
                    currentOffset++;
                endOfLine = true;
            } else {
                sb.append((char)singleByte);
            }
        } while(!endOfLine);

        return sb.toString();
    }

    /**
     * Reads bytes from the current offset until whitespace or the start of the
     * next token.  The terminating whitespace byte is consumed; a terminating
     * '/', '[', ']' or '>' is left unread by stepping the offset back one byte.
     *
     * @return the word that was read (possibly empty)
     * @throws IOException if there was an error reading data from the file
     */
    protected String readWord() throws IOException {
        final StringBuilder word = new StringBuilder();
        while(true) {
            final byte current = readByte();
            if(this.isWhitespace(current)) {
                break;
            }
            // The very first byte always belongs to this word; from the second byte
            // on, these characters open the next token.  A second '>' right after a
            // lone '>' is kept so that ">>" is returned as one word.
            if(word.length() > 0) {
                final boolean startsNextToken = current == '/' || current == '['
                        || current == ']'
                        || (current == '>' && !">".equals(word.toString()));
                if(startsNextToken) {
                    // un-read the byte so the next call sees it
                    this.currentOffset--;
                    break;
                }
            }
            word.append((char)current);
        }

        return word.toString();
    }

    /**
     * Tells whether indirect references are followed and read as soon as they are
     * encountered (true) or replaced by placeholder objects (false).
     *
     * @return the recursivlyRead
     */
    public boolean isRecursivlyRead() {
        return recursivlyRead;
    }

    /**
     * Chooses whether indirect references are followed and read as soon as they
     * are encountered (true) or replaced by placeholder objects (false).
     * parse() switches this off while it loads the pooled objects and puts the
     * previous value back afterwards.
     *
     * @param recursivlyRead the recursivlyRead to set
     */
    public void setRecursivlyRead(boolean recursivlyRead) {
        this.recursivlyRead = recursivlyRead;
    }
}