All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pdfbox.pdfparser.VisualSignatureParser Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdfwriter.COSWriter;
import org.apache.pdfbox.persistence.util.COSObjectKey;

public class VisualSignatureParser extends BaseParser 
{

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(PDFParser.class);

    /**
     * Constructor.
     * 
     * @param input the inputstream to be read.
     * 
     * @throws IOException If something went wrong
     */
    public VisualSignatureParser(InputStream input) throws IOException 
    {
        super(input);
    }

    /**
     * This will parse the tokens making up the visual signature.
     * 
     * @throws IOException If there is an error while parsing the visual signature.
     */
    public void parse() throws IOException 
    {
        document = new COSDocument();
        skipToNextObj();

        boolean wasLastParsedObjectEOF = false;
        try 
        {
            while(!wasLastParsedObjectEOF) 
            {
                if(pdfSource.isEOF()) 
                {
                    break;
                }
                try 
                {
                    wasLastParsedObjectEOF = parseObject();
                } 
                catch(IOException e) 
                {
                    /*
                     * Warning is sent to the PDFBox.log and to the Console that
                     * we skipped over an object
                     */
                    LOG.warn("Parsing Error, Skipping Object", e);
                    skipToNextObj();
                }
                skipSpaces();
            }
        } 
        catch(IOException e) 
        {
            /*
             * PDF files may have random data after the EOF marker. Ignore errors if
             * last object processed is EOF.
             */
            if(!wasLastParsedObjectEOF) 
            {
                throw e;
            }
        }
    }

    private void skipToNextObj() throws IOException 
    {
        byte[] b = new byte[16];
        Pattern p = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);
        /* Read a buffer of data each time to see if it starts with a
         * known keyword. This is not the most efficient design, but we should
         * rarely be needing this function. We could update this to use the
         * circular buffer, like in readUntilEndStream().
         */
        while(!pdfSource.isEOF()) 
        {
            int l = pdfSource.read(b);
            if(l < 1) 
            {
                break;
            }
            String s = new String(b, "US-ASCII");
            if(s.startsWith("trailer")
                    || s.startsWith("xref")
                    || s.startsWith("startxref")
                    || s.startsWith("stream")
                    || p.matcher(s).matches()) 
            {
                pdfSource.unread(b);
                break;
            } 
            else 
            {
                pdfSource.unread(b, 1, l - 1);
            }
        }
    }

    private boolean parseObject() throws IOException 
    {
        boolean isEndOfFile = false;
        skipSpaces();
        //peek at the next character to determine the type of object we are parsing
        char peekedChar = (char) pdfSource.peek();

        //ignore endobj and endstream sections.
        while(peekedChar == 'e') 
        {
            //there are times when there are multiple endobj, so lets
            //just read them and move on.
            readString();
            skipSpaces();
            peekedChar = (char) pdfSource.peek();
        }
        if(pdfSource.isEOF()) 
        {
            // end of file we will return a false and call it a day.
        } 
        else if(peekedChar == 'x') 
        {
            //xref table. Note: The contents of the Xref table are currently ignored
            return true;
        } 
        else if(peekedChar == 't' || peekedChar == 's') 
        {
            // Note: startxref can occur in either a trailer section or by itself
            if(peekedChar == 't') 
            {
                return true;
            }
            if(peekedChar == 's') 
            {
                skipToNextObj();
                //verify that EOF exists
                String eof = readExpectedString("%%EOF");
                if(eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF()) 
                {
                    throw new IOException("expected='%%EOF' actual='" + eof + "' next=" + readString()
                            + " next=" + readString());
                }
                isEndOfFile = true;
            }
        } 
        else 
        {
            //we are going to parse an normal object
            long number = -1;
            int genNum = -1;
            String objectKey = null;
            boolean missingObjectNumber = false;
            try 
            {
                char peeked = (char) pdfSource.peek();
                if(peeked == '<') 
                {
                    missingObjectNumber = true;
                } 
                else 
                {
                    number = readObjectNumber();
                }
            } 
            catch(IOException e) 
            {
                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
                //statements after an object, of course this is nonsense
                //but because we want to support as many PDFs as possible
                //we will simply try again
                number = readObjectNumber();
            }
            if(!missingObjectNumber)
            {
                skipSpaces();
                genNum = readGenerationNumber();

                objectKey = readString(3);
                //System.out.println( "parseObject() num=" + number +
                //" genNumber=" + genNum + " key='" + objectKey + "'" );
                if(!objectKey.equals("obj")) 
                {
                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
                }
            } 
            else 
            {
                number = -1;
                genNum = -1;
            }

            skipSpaces();
            COSBase pb = parseDirObject();
            String endObjectKey = readString();

            if(endObjectKey.equals("stream")) 
            {
                pdfSource.unread(endObjectKey.getBytes());
                pdfSource.unread(' ');
                if(pb instanceof COSDictionary) 
                {
                    pb = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());

                } 
                else 
                {
                    // this is not legal
                    // the combination of a dict and the stream/endstream forms a complete stream object
                    throw new IOException("stream not preceded by dictionary");
                }
                endObjectKey = readString();
            }

            COSObjectKey key = new COSObjectKey(number, genNum);
            COSObject pdfObject = document.getObjectFromPool(key);
            pb.setNeedToBeUpdate(true);
            pdfObject.setObject(pb);

            if(!endObjectKey.equals("endobj")) 
            {
                if(endObjectKey.startsWith("endobj")) 
                {
                    /*
                     * Some PDF files don't contain a new line after endobj so we
                     * need to make sure that the next object number is getting read separately
                     * and not part of the endobj keyword. Ex. Some files would have "endobj28"
                     * instead of "endobj"
                     */
                    pdfSource.unread(endObjectKey.substring(6).getBytes());
                } 
                else if(!pdfSource.isEOF()) 
                {
                    try 
                    {
                        //It is possible that the endobj  is missing, there
                        //are several PDFs out there that do that so skip it and move on.
                        Float.parseFloat(endObjectKey);
                        pdfSource.unread(COSWriter.SPACE);
                        pdfSource.unread(endObjectKey.getBytes());
                    } 
                    catch(NumberFormatException e) 
                    {
                        //we will try again incase there was some garbage which
                        //some writers will leave behind.
                        String secondEndObjectKey = readString();
                        if(!secondEndObjectKey.equals("endobj")) 
                        {
                            if(isClosing()) 
                            {
                                //found a case with 17506.pdf object 41 that was like this
                                //41 0 obj [/Pattern /DeviceGray] ] endobj
                                //notice the second array close, here we are reading it
                                //and ignoring and attempting to continue
                                pdfSource.read();
                            }
                            skipSpaces();
                            String thirdPossibleEndObj = readString();
                            if(!thirdPossibleEndObj.equals("endobj")) 
                            {
                                throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' "
                                        + "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                            }
                        }
                    }
                }
            }
            skipSpaces();
        }
        return isEndOfFile;
    }

    /**
     * Returns the underlying COSDocument.
     * 
     * @return the COSDocument
     * 
     * @throws IOException If something went wrong
     */
    public COSDocument getDocument() throws IOException 
    {
        if(document == null) 
        {
            throw new IOException("You must call parse() before calling getDocument()");
        }
        return document;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy