All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.pdfbox.pdfparser.NonSequentialPDFParser Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.KeyStore;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNull;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.PushBackInputStream;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
 * PDFParser which first reads startxref and xref tables in order to know valid
 * objects and parse only these objects. Thus it is closer to a conforming
 * parser than the sequential reading of {@link PDFParser}.
 * 
 * This class can be used as a {@link PDFParser} replacement. First
 * {@link #parse()} must be called before page objects can be retrieved, e.g.
 * {@link #getPDDocument()}.
 * 
 * This class is a much enhanced version of QuickParser presented
 * in PDFBOX-1104 by
 * Jeremy Villalobos.
 */
public class NonSequentialPDFParser extends PDFParser
{
    /** Byte sequence of the keyword {@code xref}. NOTE(review): not referenced in the visible part of this class. */
    private static final byte[] XREF_TABLE = new byte[] { 'x', 'r', 'e', 'f' };
    /** Byte sequence {@code /XRef}. NOTE(review): not referenced in the visible part of this class. */
    private static final byte[] XREF_STREAM = new byte[] { '/','X', 'R', 'e', 'f' };
    /** NOTE(review): purpose not visible here; presumably a lower bound for offset searches - confirm against its users. */
    private static final long MINIMUM_SEARCH_OFFSET = 6;
    
    /** Character {@code 'x'}; parseXref peeks for it to tell an xref table from an xref stream. */
    private static final int X = 'x';

    /** Name of the system property which, when set to {@code "true"}, enables minimal catalog parsing. */
    public static final String SYSPROP_PARSEMINIMAL = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
    /** Name of the system property read on initialization and handed to {@link #setEOFLookupRange(int)}. */
    public static final String SYSPROP_EOFLOOKUPRANGE = "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";

    /** Empty stream passed to the super constructor by the constructors of this class. */
    private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream(new byte[0]);

    /** Default number of trailing bytes searched for the EOF and startxref markers. */
    protected static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
    /**
     * EOF-marker.
     */
    protected static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
    /**
     * StartXRef-marker.
     */
    protected static final char[] STARTXREF_MARKER = new char[] { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' };
    /**
     * obj-marker.
     */
    protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };

    /**
     * trailer-marker.
     */
    private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
    
    /**
     * File offset at which a trailer is expected. Set while parsing an xref
     * table and, in lenient mode, to the position of the last 'trailer'
     * marker found in the trailing bytes of the file.
     */
    private long trailerOffset;
    /** The file being parsed (a temporary copy if the parser was created from a stream). */
    private final File pdfFile;
    /** Length of {@link #pdfFile}; determined when the startxref offset is looked up. */
    private long fileLen;
    /** Random access stream on {@link #pdfFile}; wrapped by pdfSource on initialization. */
    private final RandomAccessBufferedFileInputStream raStream;

    /**
     * is parser using auto healing capacity ?
     */
    private boolean isLenient = true;

    /**
     * Contains all found objects of a brute force search.
     */
    // NOTE(review): the type parameters of this map are not derivable from the visible code; left raw.
    private HashMap bfSearchObjectOffsets = null;
    /**
     * Offsets found by the brute force search, keyed by object key. The key
     * type is required by the loop in initialParse which iterates the key set
     * as COSObjectKey and registers each offset with the xref resolver.
     */
    private HashMap<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = null;
    // NOTE(review): the element type of this vector is not derivable from the visible code; left raw.
    private Vector bfSearchXRefOffsets = null;

    /**
     * The security handler.
     */
    protected SecurityHandler securityHandler = null;

    // If non-null, decryption loads a PKCS12 key store from this file and uses
    // public key decryption material; otherwise standard (password) material is used.
    // NOTE(review): no assignment of this field is visible in this part of the class.
    private String keyStoreFilename = null;
    // Alias handed to the public key decryption material together with the key store.
    private String alias = null;
    // Decryption password; also used as the key store password. Set on initialization.
    private String password = "";
    private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT; // how many trailing
                                                          // bytes to read for
                                                          // EOF marker

    /**
     * If true object references in catalog are not followed; pro:
     * page objects will be only parsed when needed; cons: some information of
     * catalog might not be available (e.g. outline). Catalog parsing without
     * pages is not an option since a number of entries will also refer to page
     * objects (like OpenAction).
     */
    private boolean parseMinimalCatalog = "true".equals(System.getProperty(SYSPROP_PARSEMINIMAL));

    // Set to true at the end of initialParse(); parse() skips the initial parse
    // and setLenient() refuses changes once it is set.
    private boolean initialParseDone = false;
    // Set to true once all objects reachable from the catalog/pages were parsed;
    // getPage() then skips the per-page object parsing.
    private boolean allPagesParsed = false;

    private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class);

    /**
     * true if the NonSequentialPDFParser is initialized by a
     * InputStream, in this case a temporary file is created. At the end of the
     * {@linkplain #parse()} method,the temporary file will be deleted.
     */
    private boolean isTmpPDFFile = false;

    /** File name prefix of the temporary file created for stream input. */
    public static final String TMP_FILE_PREFIX = "tmpPDF";

    // ------------------------------------------------------------------------
    /**
     * Constructs parser for given file using memory buffer.
     * 
     * Delegates to {@link #NonSequentialPDFParser(File, RandomAccess)} with a
     * <code>null</code> buffer, in which case an in-memory buffer is created.
     * 
     * @param filename the filename of the pdf to be parsed
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(String filename) throws IOException
    {
        this(new File(filename), null);
    }

    /**
     * Constructs parser for given file using given buffer for temporary
     * storage. An empty decryption password is used.
     * 
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing; if <code>null</code> an
     *            in-memory buffer is created
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, RandomAccess raBuf) throws IOException
    {
        this(file, raBuf, "");
    }

    /**
     * Constructs parser for given file using given buffer for temporary
     * storage.
     * 
     * @param file the pdf to be parsed
     * @param raBuf the buffer to be used for parsing; if <code>null</code> an
     *            in-memory buffer is created
     * @param decryptionPassword password to be used for decryption
     * 
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(File file, RandomAccess raBuf, String decryptionPassword) throws IOException
    {
        // the super class gets an empty stream; the real source is installed in init()
        super(EMPTY_INPUT_STREAM, null, false);
        pdfFile = file;
        raStream = new RandomAccessBufferedFileInputStream(pdfFile);
        init(file, raBuf, decryptionPassword);
    }

    /**
     * Shared constructor logic: applies the EOF lookup range system property,
     * creates the COS document and installs the push back source and the
     * decryption password.
     *
     * @param file the pdf to be parsed (not used by this method)
     * @param raBuf the buffer backing the COS document; if <code>null</code> an
     *            in-memory buffer is created
     * @param decryptionPassword password to be used for decryption
     *
     * @throws IOException If something went wrong.
     */
    private void init(File file, RandomAccess raBuf, String decryptionPassword) throws IOException
    {
        final String lookupRangeProperty = System.getProperty(SYSPROP_EOFLOOKUPRANGE);
        if (lookupRangeProperty != null)
        {
            try
            {
                final int lookupRange = Integer.parseInt(lookupRangeProperty);
                setEOFLookupRange(lookupRange);
            }
            catch (NumberFormatException nfe)
            {
                // a malformed property is only reported; the current range stays in effect
                LOG.warn("System property " + SYSPROP_EOFLOOKUPRANGE + " does not contain an integer value, but: '"
                        + lookupRangeProperty + "'");
            }
        }

        final COSDocument cosDocument;
        if (raBuf == null)
        {
            cosDocument = new COSDocument(new RandomAccessBuffer(), false);
        }
        else
        {
            cosDocument = new COSDocument(raBuf, false);
        }
        setDocument(cosDocument);

        pdfSource = new PushBackInputStream(raStream, 4096);

        password = decryptionPassword;
    }

    /**
     * Constructs parser for the given stream using a memory buffer and an
     * empty decryption password.
     * 
     * @param input input stream representing the pdf.
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(InputStream input) throws IOException
    {
        this(input, null, "");
    }

    /**
     * Constructs parser for the given stream. The stream content is first
     * copied to a temporary file (which also closes the stream); parsing then
     * works on that file.
     * 
     * @param input input stream representing the pdf.
     * @param raBuf the buffer to be used for parsing; if <code>null</code> an
     *            in-memory buffer is created
     * @param decryptionPassword password to be used for decryption.
     * @throws IOException If something went wrong.
     */
    public NonSequentialPDFParser(InputStream input, RandomAccess raBuf, String decryptionPassword) throws IOException
    {
        // the super class gets an empty stream; the real source is installed in init()
        super(EMPTY_INPUT_STREAM, null, false);
        pdfFile = createTmpFile(input);
        raStream = new RandomAccessBufferedFileInputStream(pdfFile);
        init(pdfFile, raBuf, decryptionPassword);
    }

    /**
     * Create a temporary file with the input stream. If the creation succeed,
     * the {@linkplain #isTmpPDFFile} is set to true. This Temporary file will
     * be deleted at end of the parse method.
     * 
     * The input stream is always closed. If copying fails the partially
     * written temporary file is removed again, since nobody else would
     * delete it in that case.
     * 
     * @param input the stream to be copied; closed by this method
     * @return the temporary file
     * @throws IOException If something went wrong.
     */
    private File createTmpFile(InputStream input) throws IOException
    {
        File tmpFile = null;
        FileOutputStream fos = null;
        boolean copied = false;
        try
        {
            tmpFile = File.createTempFile(TMP_FILE_PREFIX, ".pdf");
            fos = new FileOutputStream(tmpFile);
            IOUtils.copy(input, fos);
            isTmpPDFFile = true;
            copied = true;
            return tmpFile;
        }
        finally
        {
            IOUtils.closeQuietly(input);
            IOUtils.closeQuietly(fos);
            // on failure the exception propagates and isTmpPDFFile stays false,
            // so the file has to be cleaned up here
            if (!copied && tmpFile != null && !tmpFile.delete())
            {
                LOG.warn("Temporary file '" + tmpFile.getName() + "' can't be deleted");
            }
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Sets how many trailing bytes of PDF file are searched for EOF marker and
     * 'startxref' marker. If not set we use default value
     * {@link #DEFAULT_TRAIL_BYTECOUNT}.
     * 
     * 
     * 
     * 

In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined * this value will be set on initialization but can be overwritten * later.

* * @param byteCount number of trailing bytes */ public void setEOFLookupRange(int byteCount) { if (byteCount > 15) { readTrailBytes = byteCount; } } /** * The initial parse will first parse only the trailer, the xrefstart and * all xref tables to have a pointer (offset) to all the pdf's objects. It * can handle linearized pdfs, which will have an xref at the end pointing * to an xref at the beginning of the file. Last the root object is parsed. * * @throws IOException If something went wrong. */ protected void initialParse() throws IOException { COSDictionary trailer = null; // ---- parse startxref long startXRefOffset = getStartxrefOffset(); if (startXRefOffset > 0) { trailer = parseXref(startXRefOffset); } else if (isFDFDocment || isLenient) { // signal start of new XRef xrefTrailerResolver.nextXrefObj(startXRefOffset); bfSearchForObjects(); for (COSObjectKey objectKey : bfSearchCOSObjectKeyOffsets.keySet()) { xrefTrailerResolver.setXRef(objectKey, bfSearchCOSObjectKeyOffsets.get(objectKey)); } // parse the last trailer. 
pdfSource.seek(trailerOffset); if (!parseTrailer()) { throw new IOException("Expected trailer object at position: " + pdfSource.getOffset()); } xrefTrailerResolver.setStartxref(startXRefOffset); trailer = xrefTrailerResolver.getCurrentTrailer(); document.setTrailer(trailer); } // ---- prepare decryption if necessary prepareDecryption(); // PDFBOX-1557 - ensure that all COSObject are loaded in the trailer // PDFBOX-1606 - after securityHandler has been instantiated for (COSBase trailerEntry : trailer.getValues()) { if (trailerEntry instanceof COSObject) { COSObject tmpObj = (COSObject) trailerEntry; parseObjectDynamically(tmpObj, false); } } // ---- parse catalog or root object COSObject root = (COSObject) xrefTrailerResolver.getTrailer().getItem(COSName.ROOT); if (root == null) { throw new IOException("Missing root object specification in trailer."); } COSBase rootObject = parseObjectDynamically(root, false); // ---- resolve all objects if (isFDFDocment) { // A FDF doesn't have a catalog, all FDF fields are within the root object if (rootObject instanceof COSDictionary) { parseDictObjects((COSDictionary) rootObject, (COSName[]) null); allPagesParsed = true; document.setDecrypted(); } } else { if (!(rootObject instanceof COSDictionary)) { throw new IOException("Expected root dictionary, but got this: " + rootObject); } COSDictionary rootDictionary = (COSDictionary)rootObject; // in some pdfs the type value "Catalog" is missing in the root object if (isLenient() && !rootDictionary.containsKey(COSName.TYPE)) { rootDictionary.setItem(COSName.TYPE, COSName.CATALOG); } if(!parseMinimalCatalog) { COSObject catalogObj = document.getCatalog(); if (catalogObj != null) { if (catalogObj.getObject() instanceof COSDictionary) { parseDictObjects((COSDictionary) catalogObj.getObject(), (COSName[]) null); COSBase infoBase = trailer.getDictionaryObject(COSName.INFO); if (infoBase instanceof COSDictionary) { parseDictObjects((COSDictionary) infoBase, (COSName[]) null); } 
allPagesParsed = true; document.setDecrypted(); } } } } // PDFBOX-1922: read the version again now that all objects have been resolved readVersionInTrailer(trailer); initialParseDone = true; } /** * Resolves all not already parsed objects of a dictionary recursively. * * @param dictionaryObject dictionary to be parsed * @throws IOException if something went wrong * */ private void parseDictionaryRecursive(COSObject dictionaryObject) throws IOException { parseObjectDynamically(dictionaryObject, true); COSDictionary dictionary = (COSDictionary)dictionaryObject.getObject(); for(COSBase value : dictionary.getValues()) { if (value instanceof COSObject) { COSObject object = (COSObject)value; if (object.getObject() == null) { parseDictionaryRecursive(object); } } } } /** * Prepare for decryption. * * @throws IOException if something went wrong */ private void prepareDecryption() throws IOException { COSBase trailerEncryptItem = document.getTrailer().getItem(COSName.ENCRYPT); if (trailerEncryptItem != null && !(trailerEncryptItem instanceof COSNull)) { if (trailerEncryptItem instanceof COSObject) { COSObject trailerEncryptObj = (COSObject) trailerEncryptItem; parseDictionaryRecursive(trailerEncryptObj); } try { PDEncryptionDictionary encParameters = new PDEncryptionDictionary(document.getEncryptionDictionary()); DecryptionMaterial decryptionMaterial = null; if (keyStoreFilename != null) { KeyStore ks = KeyStore.getInstance("PKCS12"); ks.load(new FileInputStream(keyStoreFilename), password.toCharArray()); decryptionMaterial = new PublicKeyDecryptionMaterial(ks, alias, password); } else { decryptionMaterial = new StandardDecryptionMaterial(password); } securityHandler = SecurityHandlersManager.getInstance().getSecurityHandler(encParameters.getFilter()); securityHandler.prepareForDecryption(encParameters, document.getDocumentID(), decryptionMaterial); AccessPermission permission = securityHandler.getCurrentAccessPermission(); if (!permission.canExtractContent()) { 
LOG.warn("PDF file '" + pdfFile.getPath() + "' does not allow extracting content."); } } catch (Exception e) { throw new IOException("Error (" + e.getClass().getSimpleName() + ") while creating security handler for decryption: " + e.getMessage() /* * , e TODO: remove * remark with Java 1.6 */); } } } /** * Parses cross reference tables. * * @param startXRefOffset start offset of the first table * @return the trailer dictionary * @throws IOException if something went wrong */ private COSDictionary parseXref(long startXRefOffset) throws IOException { setPdfSource(startXRefOffset); parseStartXref(); long startXrefOffset = document.getStartXref(); // check the startxref offset long fixedOffset = checkXRefOffset(startXrefOffset); if (fixedOffset > -1) { startXrefOffset = fixedOffset; document.setStartXref(startXrefOffset); } long prev = startXrefOffset; // ---- parse whole chain of xref tables/object streams using PREV // reference Set prevSet = new HashSet(); while (prev > 0) { // seek to xref table setPdfSource(prev); // skip white spaces skipSpaces(); // -- parse xref if (pdfSource.peek() == X) { // xref table and trailer // use existing parser to parse xref table parseXrefTable(prev); // parse the last trailer. 
trailerOffset = pdfSource.getOffset(); //PDFBOX-1739 skip extra xref entries in RegisSTAR documents while (isLenient && pdfSource.peek() != 't') { if (pdfSource.getOffset() == trailerOffset) { // warn only the first time LOG.warn("Expected trailer object at position " + trailerOffset + ", keep trying"); } readLine(); } if (!parseTrailer()) { throw new IOException("Expected trailer object at position: " + pdfSource.getOffset()); } COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); // check for a XRef stream, it may contain some object ids of compressed objects if(trailer.containsKey(COSName.XREF_STM)) { int streamOffset = trailer.getInt(COSName.XREF_STM); // check the xref stream reference fixedOffset = checkXRefOffset(streamOffset); if (fixedOffset > -1 && fixedOffset != streamOffset) { LOG.warn("/XRefStm offset " + streamOffset + " is incorrect, corrected to " + fixedOffset); streamOffset = (int)fixedOffset; trailer.setInt(COSName.XREF_STM, streamOffset); } setPdfSource(streamOffset); skipSpaces(); try { parseXrefObjStream(prev, false); } catch (IOException ex) { if (isLenient) { LOG.error("Failed to parse /XRefStm at offset " + streamOffset, ex); } else { throw ex; } } } prev = trailer.getInt(COSName.PREV); if (prev > 0) { // check the xref table reference fixedOffset = checkXRefOffset(prev); if (fixedOffset > -1 && fixedOffset != prev) { prev = fixedOffset; trailer.setLong(COSName.PREV, prev); } } } else { // parse xref stream prev = parseXrefObjStream(prev, true); if (prev > 0) { // check the xref table reference fixedOffset = checkXRefOffset(prev); if (fixedOffset > -1 && fixedOffset != prev) { prev = fixedOffset; COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer(); trailer.setLong(COSName.PREV, prev); } } } if (prevSet.contains(prev)) { throw new IOException("/Prev loop at offset " + prev); } prevSet.add(prev); } // ---- build valid xrefs out of the xref chain xrefTrailerResolver.setStartxref(startXrefOffset); COSDictionary trailer = 
xrefTrailerResolver.getTrailer(); document.setTrailer(trailer); // check the offsets of all referenced objects checkXrefOffsets(); return trailer; } /** * Parses an xref object stream starting with indirect object id. * * @return value of PREV item in dictionary or -1 if no such * item exists */ private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException { // ---- parse indirect object head readObjectNumber(); readGenerationNumber(); readPattern(OBJ_MARKER); COSDictionary dict = parseCOSDictionary(); COSStream xrefStream = parseCOSStream(dict, getDocument().getScratchFile()); parseXrefStream(xrefStream, (int) objByteOffset,isStandalone); return dict.getLong(COSName.PREV); } // ------------------------------------------------------------------------ /** Get current offset in file at which next byte would be read. */ private final long getPdfSourceOffset() { return pdfSource.getOffset(); } /** * Sets {@link #pdfSource} to start next parsing at given file offset. * * @param fileOffset file offset * @throws IOException If something went wrong. */ protected final void setPdfSource(long fileOffset) throws IOException { pdfSource.seek(fileOffset); // alternative using 'old fashioned' input stream // if ( pdfSource != null ) // pdfSource.close(); // // pdfSource = new PushBackInputStream( // new BufferedInputStream( // new FileInputStream( file ), 16384), 4096); // pdfSource.skip( _fileOffset ); } /** * Enable handling of alternative pdfSource implementation. * @throws IOException If something went wrong. */ protected final void releasePdfSourceInputStream() throws IOException { // if ( pdfSource != null ) // pdfSource.close(); } private final void closeFileStream() throws IOException { if (pdfSource != null) { pdfSource.close(); } } // ------------------------------------------------------------------------ /** * Looks for and parses startxref. 
We first look for last '%%EOF' marker * (within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via * {@link #setEOFLookupRange(int)}) and go back to find * startxref. * * @return the offset of StartXref * @throws IOException If something went wrong. */ protected final long getStartxrefOffset() throws IOException { byte[] buf; long skipBytes; // ---- read trailing bytes into buffer fileLen = pdfFile.length(); FileInputStream fIn = null; try { fIn = new FileInputStream(pdfFile); final int trailByteCount = (fileLen < readTrailBytes) ? (int) fileLen : readTrailBytes; buf = new byte[trailByteCount]; fIn.skip(skipBytes = fileLen - trailByteCount); int off = 0; int readBytes; while (off < trailByteCount) { readBytes = fIn.read(buf, off, trailByteCount - off); // in order to not get stuck in a loop we check readBytes (this // should never happen) if (readBytes < 1) { throw new IOException("No more bytes to read for trailing buffer, but expected: " + (trailByteCount - off)); } off += readBytes; } } finally { if (fIn != null) { try { fIn.close(); } catch (IOException ioe) { } } } // ---- find last '%%EOF' int bufOff = lastIndexOf(EOF_MARKER, buf, buf.length); if (bufOff < 0) { if (isLenient) { // in lenient mode the '%%EOF' isn't needed bufOff = buf.length; LOG.debug("Missing end of file marker '" + (new String(EOF_MARKER)) + "'"); } else { throw new IOException("Missing end of file marker '" + (new String(EOF_MARKER)) + "'"); } } // ---- find last startxref preceding EOF marker bufOff = lastIndexOf(STARTXREF_MARKER, buf, bufOff); if (bufOff < 0) { if (isLenient) { trailerOffset = lastIndexOf(TRAILER_MARKER, buf, buf.length); if (trailerOffset > 0) { trailerOffset += skipBytes; } return -1; } else { throw new IOException("Missing 'startxref' marker."); } } return skipBytes + bufOff; } // ------------------------------------------------------------------------ /** * Searches last appearance of pattern within buffer. Lookup before _lastOff * and goes back until 0. 
* * @param pattern pattern to search for * @param buf buffer to search pattern in * @param endOff offset (exclusive) where lookup starts at * * @return start offset of pattern within buffer or -1 if * pattern could not be found */ protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) { final int lastPatternChOff = pattern.length - 1; int bufOff = endOff; int patOff = lastPatternChOff; char lookupCh = pattern[patOff]; while (--bufOff >= 0) { if (buf[bufOff] == lookupCh) { if (--patOff < 0) { // whole pattern matched return bufOff; } // matched current char, advance to preceding one lookupCh = pattern[patOff]; } else if (patOff < lastPatternChOff) { // no char match but already matched some chars; reset lookupCh = pattern[patOff = lastPatternChOff]; } } return -1; } // ------------------------------------------------------------------------ /** * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start * and end. * * @param pattern pattern to be skipped * @throws IOException if pattern could not be read */ protected final void readPattern(final char[] pattern) throws IOException { skipSpaces(); for (char c : pattern) { if (pdfSource.read() != c) { throw new IOException("Expected pattern '" + new String(pattern) + "' but missed at character '" + c + "' at offset " + pdfSource.getOffset()); } } skipSpaces(); } // ------------------------------------------------------------------------ private COSDictionary pagesDictionary = null; /** * Returns PAGES {@link COSDictionary} object or throws {@link IOException} * if PAGES dictionary does not exist. 
*/ private COSDictionary getPagesObject() throws IOException { if (pagesDictionary != null) { return pagesDictionary; } COSObject pages = (COSObject) document.getCatalog().getItem(COSName.PAGES); if (pages == null) { throw new IOException("Missing PAGES entry in document catalog."); } COSBase object = parseObjectDynamically(pages, false); if (!(object instanceof COSDictionary)) { throw new IOException("PAGES not a dictionary object, but: " + object.getClass().getSimpleName()); } pagesDictionary = (COSDictionary) object; return pagesDictionary; } // ------------------------------------------------------------------------ /** Parses all objects needed by pages and closes input stream. */ /** * {@inheritDoc} */ @Override public void parse() throws IOException { boolean exceptionOccurred = true; // set to false if all is processed try { // PDFBOX-1922 read the version header and rewind // this part copied from the sequential parser parseHeader(); pdfSource.seek(0); if (!initialParseDone) { initialParse(); } // a FDF doesn't have any pages if (!isFDFDocment) { final int pageCount = getPageNumber(); if (!allPagesParsed) { for (int pNr = 0; pNr < pageCount; pNr++) { getPage(pNr); } allPagesParsed = true; document.setDecrypted(); } } exceptionOccurred = false; } finally { try { closeFileStream(); } catch (IOException ioe) { } deleteTempFile(); if (exceptionOccurred && (document != null)) { try { document.close(); document = null; } catch (IOException ioe) { } } } } /** * Return the pdf file. * * @return the pdf file */ protected File getPdfFile() { return this.pdfFile; } /** * Return true if parser is lenient. Meaning auto healing capacity of the parser are used. * * @return true if parser is lenient */ public boolean isLenient () { return isLenient; } /** * Change the parser leniency flag. * * This method can only be called before the parsing of the file. * * @param lenient * * @throws IllegalArgumentException if the method is called after parsing. 
*/ public void setLenient(boolean lenient) throws IllegalArgumentException { if (initialParseDone) { throw new IllegalArgumentException("Cannot change leniency after parsing"); } this.isLenient = lenient; } /** * Remove the temporary file. A temporary file is created if this class is * instantiated with an InputStream */ protected void deleteTempFile() { if (isTmpPDFFile) { try { if (!pdfFile.delete()) { LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted"); } } catch (SecurityException e) { LOG.warn("Temporary file '" + pdfFile.getName() + "' can't be deleted", e); } } } // ------------------------------------------------------------------------ /** * Returns security handler of the document or null if document * is not encrypted or {@link #parse()} wasn't called before. * * @return the security handler. */ public SecurityHandler getSecurityHandler() { return securityHandler; } // ------------------------------------------------------------------------ /** * This will get the PD document that was parsed. When you are done with * this document you must call close() on it to release resources. * * Overwriting super method was necessary in order to set security handler. * * @return The document at the PD layer. * * @throws IOException If there is an error getting the document. */ @Override public PDDocument getPDDocument() throws IOException { PDDocument pdDocument = super.getPDDocument(); if (securityHandler != null) { pdDocument.setSecurityHandler(securityHandler); } return pdDocument; } // ------------------------------------------------------------------------ /** * Returns the number of pages in a document. * * @return the number of pages. 
* * @throws IOException if PAGES or other needed object is missing */ public int getPageNumber() throws IOException { int pageCount = getPagesObject().getInt(COSName.COUNT); if (pageCount < 0) { throw new IOException("No page number specified."); } return pageCount; } // ------------------------------------------------------------------------ /** * Returns the page requested with all the objects loaded into it. * * @param pageNr starts from 0 to the number of pages. * @return the page with the given pagenumber. * @throws IOException If something went wrong. */ public PDPage getPage(int pageNr) throws IOException { getPagesObject(); // ---- get list of top level pages COSArray kids = (COSArray) pagesDictionary.getDictionaryObject(COSName.KIDS); if (kids == null) { throw new IOException("Missing 'Kids' entry in pages dictionary."); } // ---- get page we are looking for (possibly going recursively into // subpages) COSObject pageObj = getPageObject(pageNr, kids, 0); if (pageObj == null) { throw new IOException("Page " + pageNr + " not found."); } // parse all objects necessary to load page. COSDictionary pageDict = (COSDictionary) pageObj.getObject(); // parse all objects necessary to load page. if (parseMinimalCatalog && (!allPagesParsed)) { parseDictObjects(pageDict); } return new PDPage(pageDict); } /** * Returns the object for a specific page. The page tree is made up of kids. * The kids have COSArray with COSObjects inside of them. The COSObject can * be parsed using the dynamic parsing method We want to only parse the * minimum COSObjects and still return a complete page. ready to be used. 
* * @param num the requested page number; numbering starts with 0 * @param startKids Kids array to start with looking up page number * @param startPageCount * * @return page object or null if no such page exists * * @throws IOException */ private COSObject getPageObject(int num, COSArray startKids, int startPageCount) throws IOException { int curPageCount = startPageCount; Iterator kidsIter = startKids.iterator(); while (kidsIter.hasNext()) { COSObject obj = (COSObject) kidsIter.next(); COSBase base = obj.getObject(); if (base == null) { base = parseObjectDynamically(obj, false); obj.setObject(base); } COSDictionary dic = (COSDictionary) base; int count = dic.getInt(COSName.COUNT); if (count >= 0) { // skip this branch if requested page comes later if ((curPageCount + count) <= num) { curPageCount += count; continue; } } COSArray kids = (COSArray) dic.getDictionaryObject(COSName.KIDS); if (kids != null) { // recursively scan subpages COSObject ans = getPageObject(num, kids, curPageCount); // if ans is not null, we got what we were looking for if (ans != null) { return ans; } } else { // found page? if (curPageCount == num) { return obj; } // page has no kids and it is not the page we are looking for curPageCount++; } } return null; } /** * Creates a unique object id using object number and object generation * number. (requires object number < 2^31)) */ private final long getObjectId(final COSObject obj) { return (obj.getObjectNumber().longValue() << 32) | obj.getGenerationNumber().longValue(); } /** * Adds all from newObjects to toBeParsedList if it is not an COSObject or * we didn't add this COSObject already (checked via addedObjects). 
*/ private final void addNewToList(final Queue toBeParsedList, final Collection newObjects, final Set addedObjects) { for (COSBase newObject : newObjects) { if (newObject instanceof COSObject) { final long objId = getObjectId((COSObject) newObject); if (!addedObjects.add(objId)) { continue; } } toBeParsedList.add(newObject); } } /** * Adds newObject to toBeParsedList if it is not an COSObject or we didn't * add this COSObject already (checked via addedObjects). */ private final void addNewToList(final Queue toBeParsedList, final COSBase newObject, final Set addedObjects) { if (newObject instanceof COSObject) { final long objId = getObjectId((COSObject) newObject); if (!addedObjects.add(objId)) { return; } } toBeParsedList.add(newObject); } /** * Will parse every object necessary to load a single page from the pdf * document. We try our best to order objects according to offset in file * before reading to minimize seek operations. * * @param dict the COSObject from the parent pages. * @param excludeObjects dictionary object reference entries with these * names will not be parsed * * @throws IOException */ private void parseDictObjects(COSDictionary dict, COSName... 
excludeObjects) throws IOException { // ---- create queue for objects waiting for further parsing final Queue toBeParsedList = new LinkedList(); // offset ordered object map final TreeMap> objToBeParsed = new TreeMap>(); // in case of compressed objects offset points to stmObj final Set parsedObjects = new HashSet(); final Set addedObjects = new HashSet(); // ---- add objects not to be parsed to list of already parsed objects if (excludeObjects != null) { for (COSName objName : excludeObjects) { COSBase baseObj = dict.getItem(objName); if (baseObj instanceof COSObject) { parsedObjects.add(getObjectId((COSObject) baseObj)); } } } addNewToList(toBeParsedList, dict.getValues(), addedObjects); // ---- go through objects to be parsed while (!(toBeParsedList.isEmpty() && objToBeParsed.isEmpty())) { // -- first get all COSObject from other kind of objects and // put them in objToBeParsed; afterwards toBeParsedList is empty COSBase baseObj; while ((baseObj = toBeParsedList.poll()) != null) { if (baseObj instanceof COSStream) { addNewToList(toBeParsedList, ((COSStream) baseObj).getValues(), addedObjects); } else if (baseObj instanceof COSDictionary) { addNewToList(toBeParsedList, ((COSDictionary) baseObj).getValues(), addedObjects); } else if (baseObj instanceof COSArray) { final Iterator arrIter = ((COSArray) baseObj).iterator(); while (arrIter.hasNext()) { addNewToList(toBeParsedList, arrIter.next(), addedObjects); } } else if (baseObj instanceof COSObject) { COSObject obj = (COSObject) baseObj; long objId = getObjectId(obj); COSObjectKey objKey = new COSObjectKey(obj.getObjectNumber().intValue(), obj.getGenerationNumber() .intValue()); if (!(parsedObjects.contains(objId) /* * || document.hasObjectInPool ( objKey ) */)) { Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey); // it is allowed that object references point to null, // thus we have to test if (fileOffset != null && fileOffset != 0) { if (fileOffset > 0) { objToBeParsed.put(fileOffset, 
Collections.singletonList(obj)); } else { // negative offset means we have a compressed // object within object stream; // get offset of object stream fileOffset = xrefTrailerResolver.getXrefTable().get(new COSObjectKey(-fileOffset, 0)); if ((fileOffset == null) || (fileOffset <= 0)) { throw new IOException( "Invalid object stream xref object reference for key '" + objKey + "': " + fileOffset); } List stmObjects = objToBeParsed.get(fileOffset); if (stmObjects == null) { objToBeParsed.put(fileOffset, stmObjects = new ArrayList()); } stmObjects.add(obj); } } else { // NULL object COSObject pdfObject = document.getObjectFromPool(objKey); pdfObject.setObject(COSNull.NULL); } } } } // ---- read first COSObject with smallest offset; // resulting object will be added to toBeParsedList if (objToBeParsed.isEmpty()) { break; } for (COSObject obj : objToBeParsed.remove(objToBeParsed.firstKey())) { COSBase parsedObj = parseObjectDynamically(obj, false); obj.setObject(parsedObj); addNewToList(toBeParsedList, parsedObj, addedObjects); parsedObjects.add(getObjectId(obj)); } } } /** * This will parse the next object from the stream and add it to the local * state. This is taken from {@link PDFParser} and reduced to parsing an * indirect object. * * @param obj object to be parsed (we only take object number and generation * number for lookup start offset) * @param requireExistingNotCompressedObj if true object to be * parsed must not be contained within compressed stream * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ protected final COSBase parseObjectDynamically(COSObject obj, boolean requireExistingNotCompressedObj) throws IOException { return parseObjectDynamically(obj.getObjectNumber().intValue(), obj.getGenerationNumber().intValue(), requireExistingNotCompressedObj); } /** * This will parse the next object from the stream and add it to the local * state. 
This is taken from {@link PDFParser} and reduced to parsing an * indirect object. * * @param objNr object number of object to be parsed * @param objGenNr object generation number of object to be parsed * @param requireExistingNotCompressedObj if true the object to * be parsed must be defined in xref (comment: null objects may * be missing from xref) and it must not be a compressed object * within object stream (this is used to circumvent being stuck * in a loop in a malicious PDF) * * @return the parsed object (which is also added to document object) * * @throws IOException If an IO error occurs. */ protected COSBase parseObjectDynamically(int objNr, int objGenNr, boolean requireExistingNotCompressedObj) throws IOException { // ---- create object key and get object (container) from pool final COSObjectKey objKey = new COSObjectKey(objNr, objGenNr); final COSObject pdfObject = document.getObjectFromPool(objKey); if (pdfObject.getObject() == null) { // not previously parsed // ---- read offset or object stream object number from xref table Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); // sanity test to circumvent loops with broken documents if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null) || (offsetOrObjstmObNr <= 0))) { throw new IOException("Object must be defined and must not be compressed object: " + objKey.getNumber() + ":" + objKey.getGeneration()); } if (offsetOrObjstmObNr == null) { // not defined object -> NULL object (Spec. 1.7, chap. 
3.2.9) pdfObject.setObject(COSNull.NULL); } else if (offsetOrObjstmObNr > 0) { // offset of indirect object in file // ---- go to object start setPdfSource(offsetOrObjstmObNr); // ---- we must have an indirect object final long readObjNr = readObjectNumber(); final long readObjGen = readGenerationNumber(); readPattern(OBJ_MARKER); // ---- consistency check if ((readObjNr != objKey.getNumber()) || (readObjGen != objKey.getGeneration())) { throw new IOException("XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() + " points to wrong object: " + readObjNr + ":" + readObjGen); } skipSpaces(); COSBase pb = parseDirObject(); String endObjectKey = readString(); if (endObjectKey.equals("stream")) { pdfSource.unread(endObjectKey.getBytes("ISO-8859-1")); pdfSource.unread(' '); if (pb instanceof COSDictionary) { COSStream stream = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile()); if (securityHandler != null) { try { securityHandler.decryptStream(stream, objNr, objGenNr); } catch (CryptographyException ce) { throw new IOException("Error decrypting stream object " + objNr + ": " + ce.getMessage() /* , ce // TODO: remove remark with Java 1.6 */); } } pb = stream; } else { // this is not legal // the combination of a dict and the stream/endstream // forms a complete stream object throw new IOException("Stream not preceded by dictionary (offset: " + offsetOrObjstmObNr + ")."); } skipSpaces(); endObjectKey = readLine(); // we have case with a second 'endstream' before endobj if (!endObjectKey.startsWith("endobj")) { if (endObjectKey.startsWith("endstream")) { endObjectKey = endObjectKey.substring(9).trim(); if (endObjectKey.length() == 0) { // no other characters in extra endstream line endObjectKey = readLine(); // read next line } } } } else if (securityHandler != null) { decrypt(pb, objNr, objGenNr); } pdfObject.setObject(pb); if (!endObjectKey.startsWith("endobj")) { if (isLenient) { LOG.warn("Object (" + readObjNr + ":" + readObjGen + ") at offset 
" + offsetOrObjstmObNr + " does not end with 'endobj' but with '" + endObjectKey + "'"); } else { throw new IOException("Object (" + readObjNr + ":" + readObjGen + ") at offset " + offsetOrObjstmObNr + " does not end with 'endobj' but with '" + endObjectKey + "'"); } } releasePdfSourceInputStream(); } else { // xref value is object nr of object stream containing object to // be parsed; // since our object was not found it means object stream was not // parsed so far final int objstmObjNr = (int) (-offsetOrObjstmObNr); final COSBase objstmBaseObj = parseObjectDynamically(objstmObjNr, 0, true); if (objstmBaseObj instanceof COSStream) { // parse object stream PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document, forceParsing); parser.parse(); // register all objects which are referenced to be contained // in object stream for (COSObject next : parser.getObjects()) { COSObjectKey stmObjKey = new COSObjectKey(next); Long offset = xrefTrailerResolver.getXrefTable().get(stmObjKey); if (offset != null && offset == -objstmObjNr) { COSObject stmObj = document.getObjectFromPool(stmObjKey); stmObj.setObject(next.getObject()); } } } } } return pdfObject.getObject(); } // ------------------------------------------------------------------------ /** * * @param dict the dictionary to be decrypted * @param objNr the object number * @param objGenNr the object generation number * @throws IOException ff something went wrong */ protected final void decryptDictionary(COSDictionary dict, long objNr, long objGenNr) throws IOException { if (dict.getItem(COSName.CF) != null) { // PDFBOX-2936: avoid orphan /CF dictionaries found in US govt "I-" files return; } COSBase type = dict.getDictionaryObject(COSName.TYPE); boolean isSignature = COSName.SIG.equals(type) || COSName.DOC_TIME_STAMP.equals(type); for (Entry entry : dict.entrySet()) { if (isSignature && COSName.CONTENTS.equals(entry.getKey())) { // do not decrypt the signature contents string 
continue; } if (entry.getValue() instanceof COSString) { decryptString((COSString) entry.getValue(), objNr, objGenNr); } else if (entry.getValue() instanceof COSArray) { try { securityHandler.decryptArray((COSArray) entry.getValue(), objNr, objGenNr); } catch (CryptographyException ce) { throw new IOException("Error decrypting stream object " + objNr + ": " + ce.getMessage() /* , ce // TODO: remove remark with Java 1.6 */); } } else if (entry.getValue() instanceof COSDictionary) { decryptDictionary((COSDictionary) entry.getValue(), objNr, objGenNr); } } } /** * Decrypts given COSString. * * @param str the string to be decrypted * @param objNr the object number * @param objGenNr the object generation number * @throws IOException ff something went wrong */ protected final void decryptString(COSString str, long objNr, long objGenNr) throws IOException { try { securityHandler.decryptString(str, objNr, objGenNr); } catch (CryptographyException ce) { throw new IOException("Error decrypting string: " + ce.getMessage() /* , ce // TODO: remove remark with Java 1.6 */); } } /** * Decrypts given object. * * @param pb the object to be decrypted * @param objNr the object number * @param objGenNr the object generation number * @throws IOException ff something went wrong */ protected final void decrypt(COSBase pb, int objNr, int objGenNr) throws IOException { if (pb instanceof COSString) { decryptString((COSString) pb, objNr, objGenNr); } else if (pb instanceof COSDictionary) { decryptDictionary((COSDictionary) pb, objNr, objGenNr); } else if (pb instanceof COSArray) { final COSArray array = (COSArray) pb; for (int aIdx = 0, len = array.size(); aIdx < len; aIdx++) { decrypt(array.get(aIdx), objNr, objGenNr); } } } /** Returns length value referred to or defined in given object. 
*/ private COSNumber getLength(final COSBase lengthBaseObj, final COSBase streamType) throws IOException { if (lengthBaseObj == null) { return null; } COSNumber retVal = null; // ---- maybe length was given directly if (lengthBaseObj instanceof COSNumber) { retVal = (COSNumber) lengthBaseObj; } // ---- length in referenced object else if (lengthBaseObj instanceof COSObject) { COSObject lengthObj = (COSObject) lengthBaseObj; if (lengthObj.getObject() == null) { // not read so far // keep current stream position final long curFileOffset = getPdfSourceOffset(); releasePdfSourceInputStream(); boolean isObjectStream = COSName.OBJ_STM.equals(streamType); parseObjectDynamically(lengthObj, isObjectStream); // reset current stream position setPdfSource(curFileOffset); if (lengthObj.getObject() == null) { throw new IOException("Length object content was not read."); } } if (!(lengthObj.getObject() instanceof COSNumber)) { throw new IOException("Wrong type of referenced length object " + lengthObj + ": " + lengthObj.getObject().getClass().getSimpleName()); } retVal = (COSNumber) lengthObj.getObject(); } else { throw new IOException("Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName()); } return retVal; } // ------------------------------------------------------------------------ private final int streamCopyBufLen = 8192; private final byte[] streamCopyBuf = new byte[streamCopyBufLen]; /** * This will read a COSStream from the input stream using length attribute * within dictionary. If length attribute is a indirect reference it is * first resolved to get the stream length. This means we copy stream data * without testing for 'endstream' or 'endobj' and thus it is no problem if * these keywords occur within stream. We require 'endstream' to be found * after stream data is read. * * @param dic dictionary that goes with this stream. * @param file file to write the stream to when reading. * * @return parsed pdf stream. 
* * @throws IOException if an error occurred reading the stream, like * problems with reading length attribute, stream does not end * with 'endstream' after data read, stream too short etc. */ @Override protected COSStream parseCOSStream(COSDictionary dic, RandomAccess file) throws IOException { final COSStream stream = new COSStream(dic, file); OutputStream out = null; try { readString(); // read 'stream'; this was already tested in // parseObjectsDynamically() // ---- skip whitespaces before start of data // PDF Ref 1.7, chap. 3.2.7: // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF // but nothing else. int whitespace = pdfSource.read(); // see brother_scan_cover.pdf, it adds whitespaces // after the stream but before the start of the // data, so just read those first while (whitespace == 0x20) { whitespace = pdfSource.read(); } boolean hasCR = false; if (whitespace == 0x0D) { hasCR = true; whitespace = pdfSource.read(); if (whitespace != 0x0A) { // the spec says this is invalid but it happens in the // real world so we must support it pdfSource.unread(whitespace); } } else if (whitespace != 0x0A) { // no whitespace after 'stream'; PDF ref. says 'should' so // that is ok pdfSource.unread(whitespace); } /* * This needs to be dic.getItem because when we are parsing, the underlying object might still be null. 
*/ COSNumber streamLengthObj = getLength(dic.getItem(COSName.LENGTH), dic.getItem(COSName.TYPE)); if (streamLengthObj == null) { if (isLenient) { LOG.warn("The stream doesn't provide any stream length, using fallback readUntilEnd"); } else { throw new IOException("Missing length for stream."); } } boolean useReadUntilEnd = false; // ---- get output stream to copy data to if (streamLengthObj != null && validateStreamLength(streamLengthObj.longValue())) { out = stream.createFilteredStream(streamLengthObj); long remainBytes = streamLengthObj.longValue(); int bytesRead = 0; while (remainBytes > 0) { final int readBytes = pdfSource.read(streamCopyBuf, 0, (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes); if (readBytes <= 0) { useReadUntilEnd = true; out.close(); pdfSource.unread(bytesRead); break; } out.write(streamCopyBuf, 0, readBytes); remainBytes -= readBytes; bytesRead += readBytes; } } else { useReadUntilEnd = true; } if (useReadUntilEnd) { out = stream.createFilteredStream(); readUntilEndStream(new EndstreamOutputStream(out, hasCR)); } String endStream = readString(); if (endStream.equals("endobj") && isLenient) { LOG.warn("stream ends with 'endobj' instead of 'endstream' at offset " + pdfSource.getOffset()); // avoid follow-up warning about missing endobj pdfSource.unread("endobj".getBytes("ISO-8859-1")); } else if (endStream.length() > 9 && isLenient && endStream.substring(0,9).equals("endstream")) { LOG.warn("stream ends with '" + endStream + "' instead of 'endstream' at offset " + pdfSource.getOffset()); // unread the "extra" bytes pdfSource.unread(endStream.substring(9).getBytes("ISO-8859-1")); } else if (!endStream.equals("endstream")) { throw new IOException( "Error reading stream, expected='endstream' actual='" + endStream + "' at offset " + pdfSource.getOffset()); } } finally { if (out != null) { out.close(); } } return stream; } private boolean validateStreamLength(long streamLength) throws IOException { boolean 
streamLengthIsValid = true; long originOffset = pdfSource.getOffset(); long expectedEndOfStream = originOffset + streamLength; if (expectedEndOfStream > fileLen) { streamLengthIsValid = false; LOG.error("The end of the stream is out of range, using workaround to read the stream"); LOG.error("Stream start offset: " + originOffset); LOG.error("Expected endofstream offset: " + expectedEndOfStream); } else { pdfSource.seek(expectedEndOfStream); skipSpaces(); if (!checkBytesAtOffset("endstream".getBytes("ISO-8859-1"))) { streamLengthIsValid = false; LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream"); LOG.error("Stream start offset: " + originOffset); LOG.error("Expected endofstream offset: " + expectedEndOfStream); } pdfSource.seek(originOffset); } return streamLengthIsValid; } /** * Check if the cross reference table/stream can be found at the current offset. * * @param startXRefOffset * @return the revised offset * @throws IOException */ private long checkXRefOffset(long startXRefOffset) throws IOException { // repair mode isn't available in non-lenient mode if (!isLenient) { return startXRefOffset; } setPdfSource(startXRefOffset-1); // save th previous character int previous = pdfSource.read(); if (pdfSource.peek() == X && checkBytesAtOffset(XREF_TABLE)) { return startXRefOffset; } // the previous character has to be a whitespace if (isWhitespace(previous)) { int nextValue = pdfSource.peek(); // maybe there isn't a xref table but a xref stream // is the next character a digit? 
if (nextValue > 47 && nextValue < 58) { try { // Maybe it's a XRef stream readObjectNumber(); readGenerationNumber(); readPattern(OBJ_MARKER); setPdfSource(startXRefOffset); return startXRefOffset; } catch (IOException exception) { // there wasn't an object of a xref stream // try to repair the offset pdfSource.seek(startXRefOffset); } } } // try to find a fixed offset return calculateXRefFixedOffset(startXRefOffset); } /** * Check if the given bytes can be found at the current offset. * * @param string the bytes to look for * @return true if the bytes are in place, false if not * @throws IOException if something went wrong */ private boolean checkBytesAtOffset(byte[] string) throws IOException { boolean bytesMatching = false; if (pdfSource.peek() == string[0]) { int length = string.length; byte[] bytesRead = new byte[length]; int numberOfBytes = pdfSource.read(bytesRead, 0, length); while (numberOfBytes < length) { int readMore = pdfSource.read(bytesRead, numberOfBytes, length - numberOfBytes); if (readMore < 0) { break; } numberOfBytes += readMore; } if (Arrays.equals(string, bytesRead)) { bytesMatching = true; } pdfSource.unread(bytesRead, 0, numberOfBytes); } return bytesMatching; } /** * Try to find a fixed offset for the given xref table/stream. 
* * @param objectOffset the given offset where to look at * @return the fixed offset * * @throws IOException if something went wrong */ private long calculateXRefFixedOffset(long objectOffset) throws IOException { if (objectOffset < 0) { LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream"); return 0; } // start a brute force search for all xref tables and try to find the offset we are looking for long newOffset = bfSearchForXRef(objectOffset); if (newOffset > -1) { LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset); return newOffset; } LOG.error("Can't find the object axref table/stream at offset " + objectOffset); return 0; } /** * Check the XRef table by dereferencing all objects and fixing * the offset if necessary. * * @throws IOException if something went wrong. */ private void checkXrefOffsets() throws IOException { // repair mode isn't available in non-lenient mode if (!isLenient) { return; } Map xrefOffset = xrefTrailerResolver.getXrefTable(); if (xrefOffset != null) { for (COSObjectKey objectKey : xrefOffset.keySet()) { Long objectOffset = xrefOffset.get(objectKey); // a negative offset number represents a object number itself // see type 2 entry in xref stream if (objectOffset != null && objectOffset >= 0) { long objectNr = objectKey.getNumber(); long objectGen = objectKey.getGeneration(); String objectString = createObjectString(objectNr, objectGen); if (!checkObjectId(objectString, objectOffset)) { long newOffset = bfSearchForObject(objectString); if (newOffset > -1) { xrefOffset.put(objectKey, newOffset); LOG.debug("Fixed reference for object " + objectNr + " " + objectGen + " " + objectOffset + " -> " + newOffset); } else { LOG.error("Can't find the object " + objectNr + " " + objectGen + " (origin offset " + objectOffset + ")"); } } } } } } /** * Check if the given string can be found at the given offset. 
* * @param objectString the string we are looking for * @param offset the given where to look * @return returns true if the given string can be found at the givwen offset * @throws IOException if something went wrong */ private boolean checkObjectId(String objectString, long offset) throws IOException { boolean objectFound = false; long originOffset = pdfSource.getOffset(); pdfSource.seek(offset); objectFound = checkBytesAtOffset(objectString.getBytes("ISO-8859-1")); pdfSource.seek(originOffset); return objectFound; } /** * Create a string for the given object id. * * @param objectID the object id * @param genID the generation id * @return the generated string */ private String createObjectString(long objectID, long genID) { return Long.toString(objectID) + " " + Long.toString(genID) + " obj"; } /** * Search for the offset of the given object among the objects found by a brute force search. * * @param objectString the object we are looking for * @return the offset of the object * @throws IOException if something went wrong */ private long bfSearchForObject(String objectString) throws IOException { long newOffset = -1; bfSearchForObjects(); if (bfSearchObjectOffsets.containsKey(objectString)) { newOffset = bfSearchObjectOffsets.get(objectString); } return newOffset; } /** * Brute force search for every object in the pdf. * * @throws IOException if something went wrong */ private void bfSearchForObjects() throws IOException { if (bfSearchObjectOffsets == null) { bfSearchObjectOffsets = new HashMap(); bfSearchCOSObjectKeyOffsets = new HashMap(); long originOffset = pdfSource.getOffset(); long currentOffset = MINIMUM_SEARCH_OFFSET; String objString = " obj"; byte[] string = objString.getBytes("ISO-8859-1"); do { pdfSource.seek(currentOffset); if (checkBytesAtOffset(string)) { long tempOffset = currentOffset - 1; pdfSource.seek(tempOffset); int genID = pdfSource.peek(); // is the next char a digit? 
if (genID > 47 && genID < 58) { genID -= 48; tempOffset--; pdfSource.seek(tempOffset); if (pdfSource.peek() == 32) { while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() == 32) { pdfSource.seek(--tempOffset); } int length = 0; while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() > 47 && pdfSource.peek() < 58) { pdfSource.seek(--tempOffset); length++; } if (length > 0) { pdfSource.read(); byte[] objIDBytes = pdfSource.readFully(length); String objIdString = new String(objIDBytes, 0, objIDBytes.length, "ISO-8859-1"); Long objectID = null; try { objectID = Long.valueOf(objIdString); } catch (NumberFormatException excpetion) { objectID = null; } if (objectID != null) { bfSearchObjectOffsets.put( createObjectString(objectID, genID), ++tempOffset); bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(objectID, genID), tempOffset); } } } } } currentOffset++; } while (!pdfSource.isEOF()); // reestablish origin position pdfSource.seek(originOffset); } } /** * Search for the offset of the given xref table/stream among those found by a brute force search. * * @return the offset of the xref entry * @throws IOException if something went wrong */ private long bfSearchForXRef(long xrefOffset) throws IOException { long newOffset = -1; bfSearchForXRefs(); if (bfSearchXRefOffsets != null) { long currentDifference = -1; int currentOffsetIndex = -1; int numberOfOffsets = bfSearchXRefOffsets.size(); // find the most likely value // TODO to be optimized, this won't work in every case for (int i = 0; i < numberOfOffsets; i++) { long newDifference = xrefOffset - bfSearchXRefOffsets.get(i); // find the nearest offset if (currentDifference == -1 || (Math.abs(currentDifference) > Math.abs(newDifference))) { currentDifference = newDifference; currentOffsetIndex = i; } } if (currentOffsetIndex > -1) { newOffset = bfSearchXRefOffsets.remove(currentOffsetIndex); } } return newOffset; } /** * Brute force search for all xref entries. 
* * @throws IOException if something went wrong */ private void bfSearchForXRefs() throws IOException { if (bfSearchXRefOffsets == null) { // a pdf may contain more than one xref entry bfSearchXRefOffsets = new Vector(); long originOffset = pdfSource.getOffset(); pdfSource.seek(MINIMUM_SEARCH_OFFSET); // search for xref tables while (!pdfSource.isEOF()) { if (checkBytesAtOffset(XREF_TABLE)) { long newOffset = pdfSource.getOffset(); pdfSource.seek(newOffset - 1); // ensure that we don't read "startxref" instead of "xref" if (isWhitespace()) { bfSearchXRefOffsets.add(newOffset); } pdfSource.seek(newOffset + 4); } pdfSource.read(); } pdfSource.seek(MINIMUM_SEARCH_OFFSET); // search for XRef streams String objString = " obj"; byte[] string = objString.getBytes("ISO-8859-1"); while (!pdfSource.isEOF()) { if (checkBytesAtOffset(XREF_STREAM)) { // search backwards for the beginning of the stream long newOffset = -1; long xrefOffset = pdfSource.getOffset(); long currentOffset = xrefOffset; boolean objFound = false; for (int i = 1; i < 30 && !objFound; i++) { currentOffset = xrefOffset - (i * 10); if (currentOffset > 0) { pdfSource.seek(currentOffset); for (int j = 0; j < 10; j++) { if (checkBytesAtOffset(string)) { long tempOffset = currentOffset - 1; pdfSource.seek(tempOffset); int genID = pdfSource.peek(); // is the next char a digit? 
if (genID > 47 && genID < 58) { genID -= 48; tempOffset--; pdfSource.seek(tempOffset); if (pdfSource.peek() == 32) { int length = 0; pdfSource.seek(--tempOffset); while (tempOffset > MINIMUM_SEARCH_OFFSET && pdfSource.peek() > 47 && pdfSource.peek() < 58) { pdfSource.seek(--tempOffset); length++; } if (length > 0) { pdfSource.read(); newOffset = pdfSource.getOffset(); } } } LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> " + newOffset); objFound = true; break; } else { currentOffset++; pdfSource.read(); } } } } if (newOffset > -1) { bfSearchXRefOffsets.add(newOffset); } pdfSource.seek(xrefOffset + 5); } pdfSource.read(); } pdfSource.seek(originOffset); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy