// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.apache.pdfbox.cos.COSDocument Maven / Gradle / Ivy
//
// Go to download
//
// The Apache PDFBox library is an open source Java tool for working with PDF documents.
//
// There is a newer version: 3.0.2
// Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.cos;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFObjectStreamParser;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface;
import org.apache.pdfbox.persistence.util.COSObjectKey;

/**
 * This is the in-memory representation of the PDF document.  You need to call
 * {@link #close()} on this object when you are done using it!!
 *
 * The document owns an object pool (object key to {@link COSObject}), the
 * cross reference table, the trailer dictionary and the scratch storage that
 * is handed to the streams it creates.
 *
 * @author Ben Litchfield
 * 
 */
public class COSDocument extends COSBase implements Closeable
{

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(COSDocument.class);

    /**
     * PDF version of this document; 1.4 until {@link #setVersion(float)} is called.
     */
    private float version = 1.4f;

    /**
     * First header string ever passed to {@link #setHeaderString(String)}, or null
     * if that method has not been called yet.
     */
    private String originalHeaderString = null;

    /**
     * Maps ObjectKeys to a COSObject. Note that references to these objects
     * are also stored in COSDictionary objects that map a name to a specific object.
     */
    private final Map<COSObjectKey, COSObject> objectPool =
        new HashMap<COSObjectKey, COSObject>();

    /**
     * Maps object and generation id to object byte offsets.
     */
    private final Map<COSObjectKey, Long> xrefTable =
        new HashMap<COSObjectKey, Long>();

    /**
     * Document trailer dictionary. Null until set and again after {@link #close()}.
     */
    private COSDictionary trailer;
    
    /**
     * Signature interface.
     */
    private SignatureInterface signatureInterface;

    /**
     * This file will store the streams in order to conserve memory.
     */
    private final RandomAccess scratchFile;

    /**
     * Temporary file backing the scratch file, or null if the scratch storage was
     * supplied by the caller. Only a non-null file is deleted on {@link #close()}.
     */
    private final File tmpFile;

    /**
     * Current header string, initially built from the default version.
     */
    private String headerString = "%PDF-" + version;

    /**
     * Whether the finalizer logs a warning if the document was never closed.
     */
    private boolean warnMissingClose = true;
    
    /** signal that document is already decrypted, e.g. with {@link org.apache.pdfbox.pdfparser.NonSequentialPDFParser} */
    private boolean isDecrypted = false;
    
    /**
     * The startxref value of the parsed document.
     */
    private long startXref;
    
    /**
     * Set once {@link #close()} has completed.
     */
    private boolean closed = false;

    /**
     * Flag to skip malformed or otherwise unparseable input where possible.
     */
    private final boolean forceParsing;

    /**
     * Constructor that will use the given random access file for storage
     * of the PDF streams. The client of this method is responsible for
     * deleting the storage if necessary that this file will write to. The
     * close method will close the file though.
     *
     * @param scratchFileValue the random access file to use for storage
     * @param forceParsingValue flag to skip malformed or otherwise unparseable
     *                     document content where possible
     */
    public COSDocument(RandomAccess scratchFileValue, boolean forceParsingValue) 
    {
        scratchFile = scratchFileValue;
        tmpFile = null;
        forceParsing = forceParsingValue;
    }

    /**
     * Constructor that will use a temporary file in the given directory
     * for storage of the PDF streams. The temporary file is automatically
     * removed when this document gets closed.
     *
     * @param scratchDir directory for the temporary file,
     *                   or null to use the system default
     * @param forceParsingValue flag to skip malformed or otherwise unparseable
     *                     document content where possible
     * @throws IOException if something went wrong
     */
    public COSDocument(File scratchDir, boolean forceParsingValue) throws IOException 
    {
        tmpFile = File.createTempFile("pdfbox-", ".tmp", scratchDir);
        scratchFile = new RandomAccessFile(tmpFile, "rw");
        forceParsing = forceParsingValue;
    }

    /**
     * Constructor.  Uses memory to store stream.
     */
    public COSDocument()
    {
        this(new RandomAccessBuffer(), false);
    }

    /**
     * Constructor that will create a scratch file in the
     * following directory.
     *
     * @param scratchDir The directory to store a scratch file.
     *
     * @throws IOException If there is an error creating the tmp file.
     */
    public COSDocument(File scratchDir) throws IOException 
    {
        this(scratchDir, false);
    }

    /**
     * Constructor that will use the following random access file for storage
     * of the PDF streams.  The client of this method is responsible for deleting
     * the storage if necessary that this file will write to.  The close method
     * will close the file though.
     *
     * @param file The random access file to use for storage.
     */
    public COSDocument(RandomAccess file) 
    {
        this(file, false);
    }

    /**
     * This will get the scratch file for this document.
     *
     * @return The scratch file, or null (after logging an error) if the
     * document has already been closed.
     * 
     * @deprecated direct access to the scratch file will be removed
     */
    @Deprecated
    public RandomAccess getScratchFile()
    {
        // TODO the direct access to the scratch file should be removed.
        if (closed)
        {
            LOG.error("Can't access the scratch file as it is already closed!");
            return null;
        }
        return scratchFile;
    }

    /**
     * Create a new COSStream using the underlying scratch file.
     * 
     * @return the new COSStream
     */
    public COSStream createCOSStream()
    {
        return new COSStream( getScratchFile() );
    }

    /**
     * Create a new COSStream using the underlying scratch file.
     *
     * @param dictionary the corresponding dictionary
     * 
     * @return the new COSStream
     */
    public COSStream createCOSStream(COSDictionary dictionary)
    {
        return new COSStream( dictionary, getScratchFile() );
    }

    /**
     * This will get the first dictionary object by type.
     *
     * @param type The type of the object.
     *
     * @return The first pooled object with the specified type, or null if there is none.
     * @throws IOException If there is an error getting the object
     * 
     * @deprecated use {@link COSDocument#getObjectByType(COSName)} instead
     */
    @Deprecated
    public COSObject getObjectByType( String type ) throws IOException
    {
        return getObjectByType( COSName.getPDFName( type ) );
    }

    /**
     * This will get the first dictionary object by type.
     *
     * @param type The type of the object.
     *
     * @return The first pooled object with the specified type, or null if there is none.
     * @throws IOException If there is an error getting the object
     */
    public COSObject getObjectByType( COSName type ) throws IOException
    {
        for( COSObject object : objectPool.values() )
        {
            if( hasType( object, type ) )
            {
                return object;
            }
        }
        return null;
    }

    /**
     * This will get all dictionary objects by type.
     *
     * @param type The type of the objects.
     *
     * @return A list of all pooled objects with the specified type, possibly empty.
     * @throws IOException If there is an error getting the objects
     */
    public List<COSObject> getObjectsByType( String type ) throws IOException
    {
        return getObjectsByType( COSName.getPDFName( type ) );
    }

    /**
     * This will get all dictionary objects by type.
     *
     * @param type The type of the objects.
     *
     * @return A list of all pooled objects with the specified type, possibly empty.
     * @throws IOException If there is an error getting the objects
     */
    public List<COSObject> getObjectsByType( COSName type ) throws IOException
    {
        List<COSObject> retval = new ArrayList<COSObject>();
        for( COSObject object : objectPool.values() )
        {
            if( hasType( object, type ) )
            {
                retval.add( object );
            }
        }
        return retval;
    }

    /**
     * Tells whether the given pooled object wraps a dictionary whose /Type
     * entry is a name equal to the given type. A /Type entry that is present
     * but not a name is reported at debug level and treated as "no match".
     *
     * @param object the pooled object to inspect
     * @param type the type to compare with
     * @return true if the object is a dictionary of the given type
     * @throws IOException If there is an error getting the object
     */
    private static boolean hasType( COSObject object, COSName type ) throws IOException
    {
        COSBase realObject = object.getObject();
        if( !(realObject instanceof COSDictionary) )
        {
            return false;
        }
        COSBase typeItem = ((COSDictionary)realObject).getItem(COSName.TYPE);
        if (typeItem instanceof COSName)
        {
            return ((COSName) typeItem).equals(type);
        }
        if (typeItem != null)
        {
            LOG.debug("Expected a /Name object after /Type, got '" + typeItem + "' instead");
        }
        return false;
    }

    /**
     * This will print contents to stdout.
     */
    public void print()
    {
        for( COSObject object : objectPool.values() )
        {
            System.out.println( object);
        }
    }

    /**
     * This will set the version of this PDF document and update the header string.
     *
     * @param versionValue The version of the PDF document.
     */
    public void setVersion( float versionValue )
    {
        // update header string
        if (versionValue != version) 
        {
            // Replace the first literal occurrence of the old version. A regex based
            // replacement would treat the '.' of e.g. "1.4" as "any character".
            String oldVersion = String.valueOf(version);
            int index = headerString.indexOf(oldVersion);
            if (index >= 0)
            {
                headerString = headerString.substring(0, index)
                        + String.valueOf(versionValue)
                        + headerString.substring(index + oldVersion.length());
            }
        }
        version = versionValue;
    }

    /**
     * This will get the version of this PDF document.
     *
     * @return This documents version.
     */
    public float getVersion()
    {
        return version;
    }

    /** 
     * Signals that the document is decrypted completely.
     *  Needed e.g. by {@link org.apache.pdfbox.pdfparser.NonSequentialPDFParser} to circumvent
     *  additional decryption later on.
     */
    public void setDecrypted()
    {
        isDecrypted = true;
    }

    /** 
     * Indicates if an encrypted pdf is already decrypted after parsing.
     * Does make sense only if the {@link org.apache.pdfbox.pdfparser.NonSequentialPDFParser} is used.
     * 
     *  @return true indicates that the pdf is decrypted.
     */
    public boolean isDecrypted()
    {
        return isDecrypted;
    }
    
    /**
     * This will tell if this is an encrypted document.
     *
     * @return true If this document is encrypted.
     */
    public boolean isEncrypted()
    {
        return trailer != null && trailer.getDictionaryObject( COSName.ENCRYPT ) != null;
    }

    /**
     * This will get the encryption dictionary if the document is encrypted or null
     * if the document is not encrypted or has no trailer.
     *
     * @return The encryption dictionary.
     */
    public COSDictionary getEncryptionDictionary()
    {
        if( trailer == null )
        {
            // same guard as isEncrypted(): no trailer means nothing to look up
            return null;
        }
        return (COSDictionary)trailer.getDictionaryObject( COSName.ENCRYPT );
    }

    /**
     * This will return the signature interface.
     * @return the signature interface 
     */
    public SignatureInterface getSignatureInterface() 
    {
        return signatureInterface;
    }
    
    /**
     * This will set the encryption dictionary, this should only be called when
     * encrypting the document.
     *
     * @param encDictionary The encryption dictionary.
     */
    public void setEncryptionDictionary( COSDictionary encDictionary )
    {
        trailer.setItem( COSName.ENCRYPT, encDictionary );
    }

    /**
     * This will return a list of signature dictionaries as COSDictionary.
     *
     * @return list of signature dictionaries as COSDictionary
     * @throws IOException if no document catalog can be found
     */
    public List<COSDictionary> getSignatureDictionaries() throws IOException
    {
        List<COSDictionary> signatureFields = getSignatureFields(false);
        List<COSDictionary> signatures = new LinkedList<COSDictionary>();
        for ( COSDictionary dict : signatureFields )
        {
            // only fields that carry a value (/V) contribute a signature dictionary
            COSBase dictionaryObject = dict.getDictionaryObject(COSName.V);
            if (dictionaryObject != null)
            {
                signatures.add((COSDictionary)dictionaryObject);
            }
        }
        return signatures;
    }

    /**
     * This will return a list of signature fields.
     *
     * @param onlyEmptyFields only empty signature fields will be returned
     * @return list of signature fields as COSDictionary, possibly empty
     * @throws IOException if no document catalog can be found
     */
    public List<COSDictionary> getSignatureFields(boolean onlyEmptyFields) throws IOException
    {
        COSObject documentCatalog = getCatalog();
        if (documentCatalog != null)
        {
            COSDictionary acroForm = (COSDictionary)documentCatalog.getDictionaryObject(COSName.ACRO_FORM);
            if (acroForm != null)
            {
                COSArray fields = (COSArray)acroForm.getDictionaryObject(COSName.FIELDS);
                if (fields != null)
                {
                    // Some fields may contain twice references to a single field. 
                    // This will prevent such double entries.
                    HashMap<COSObjectKey, COSDictionary> signatures = new HashMap<COSObjectKey, COSDictionary>();
                    for ( Object object : fields )
                    {
                        COSObject dict = (COSObject)object;
                        if (COSName.SIG.equals(dict.getItem(COSName.FT)))
                        {
                            COSBase dictionaryObject = dict.getDictionaryObject(COSName.V);
                            // empty fields are always returned, filled ones only on request
                            if (dictionaryObject == null || !onlyEmptyFields)
                            {
                                signatures.put(new COSObjectKey(dict), (COSDictionary)dict.getObject());
                            }
                        }
                    }
                    return new LinkedList<COSDictionary>(signatures.values());
                }
            }
        }
        return Collections.emptyList();
    }
    
    /**
     * This will get the document ID.
     *
     * @return The document id, or null if there is no trailer or no ID entry.
     */
    public COSArray getDocumentID()
    {
        COSDictionary currentTrailer = getTrailer();
        if( currentTrailer == null )
        {
            return null;
        }
        return (COSArray) currentTrailer.getDictionaryObject(COSName.ID);
    }

    /**
     * This will set the document ID.
     *
     * @param id The document id.
     */
    public void setDocumentID( COSArray id )
    {
        getTrailer().setItem(COSName.ID, id);
    }
    
    /**
     * Set the signature interface to the given value.
     * @param sigInterface the signature interface
     */
    public void setSignatureInterface(SignatureInterface sigInterface) 
    {
        signatureInterface = sigInterface;
    }

    /**
     * This will get the document catalog.
     *
     * Maybe this should move to an object at PDFEdit level
     *
     * @return catalog is the root of all document activities
     *
     * @throws IOException If no catalog can be found.
     */
    public COSObject getCatalog() throws IOException
    {
        COSObject catalog = getObjectByType( COSName.CATALOG );
        if( catalog == null )
        {
            throw new IOException( "Catalog cannot be found" );
        }
        return catalog;
    }

    /**
     * This will get a list of all available objects.
     *
     * @return A new list holding all objects currently in the pool.
     */
    public List<COSObject> getObjects()
    {
        return new ArrayList<COSObject>(objectPool.values());
    }

    /**
     * This will get the document trailer.
     *
     * @return the document trailer dict
     */
    public COSDictionary getTrailer()
    {
        return trailer;
    }

    /**
     * // MIT added, maybe this should not be supported as trailer is a persistence construct.
     * This will set the document trailer.
     *
     * @param newTrailer the document trailer dictionary
     */
    public void setTrailer(COSDictionary newTrailer)
    {
        trailer = newTrailer;
    }

    /**
     * visitor pattern double dispatch method.
     *
     * @param visitor The object to notify when visiting this object.
     * @return any object, depending on the visitor implementation, or null
     * @throws COSVisitorException If an error occurs while visiting this object.
     */
    @Override
    public Object accept(ICOSVisitor visitor) throws COSVisitorException
    {
        return visitor.visitFromDocument( this );
    }

    /**
     * This will close all storage and delete the tmp files. Calling it again
     * after a successful close has no effect.
     *
     *  @throws IOException If there is an error close resources.
     */
    public void close() throws IOException
    {
        if (closed)
        {
            return;
        }
        scratchFile.close();
        if (tmpFile != null && !tmpFile.delete() && tmpFile.exists())
        {
            // deletion is best effort, but an abandoned temp file should not go unnoticed
            LOG.warn("Could not delete temporary file " + tmpFile.getAbsolutePath());
        }
        if (trailer != null)
        {
            trailer.clear();
            trailer = null;
        }
        // Clear object pool
        for (COSObject object : objectPool.values()) 
        {
            COSBase cosObject = object.getObject();
            // clear the resources of the pooled objects
            if (cosObject instanceof COSStream)
            {
                ((COSStream)cosObject).close();
            }
            else if (cosObject instanceof COSDictionary)
            {
                ((COSDictionary)cosObject).clear();
            }
            else if (cosObject instanceof COSArray)
            {
                ((COSArray)cosObject).clear();
            }
            // TODO are there other kind of COSObjects to be cleared?
        }
        objectPool.clear();
        closed = true;
    }

    /**
     * Warn the user in the finalizer if he didn't close the PDF document. The method also
     * closes the document just in case, to avoid abandoned temporary files. It's still a good
     * idea for the user to close the PDF document at the earliest possible to conserve resources.
     * @throws IOException if an error occurs while closing the temporary files
     */
    @Override
    protected void finalize() throws IOException
    {
        if (!closed) 
        {
            if (warnMissingClose) 
            {
                LOG.warn( "Warning: You did not close a PDF Document" );
            }
            close();
        }
    }

    /**
     * Controls whether this instance shall issue a warning if the PDF document wasn't closed
     * properly through a call to the {@link #close()} method. If the PDF document is held in
     * a cache governed by soft references it is impossible to reliably close the document
     * before the warning is raised. By default, the warning is enabled.
     * @param warn true enables the warning, false disables it.
     */
    public void setWarnMissingClose(boolean warn)
    {
        this.warnMissingClose = warn;
    }

    /**
     * @return Returns the current headerString. (It may have been updated by calls to 
     * {@link #setVersion(float) })
     */
    public String getHeaderString()
    {
        return headerString;
    }

    /**
     * Sets the header string. The first value ever passed in is also remembered
     * as the original header string.
     *
     * @param header The headerString to set.
     */
    public void setHeaderString(String header)
    {
        headerString = header;
        if (originalHeaderString == null)
        {
            originalHeaderString = header;
        }
    }

    /**
     * Get the original headerString from the PDF file. Unlike {@link #getHeaderString()}, the
     * value is not changed by files that have another header value in the document catalog.
     *
     * @return the original header string.
     */
    public String getOriginalHeaderString()
    {
        return originalHeaderString;
    }

    /**
     * This method will search the list of objects for types of ObjStm.  If it finds
     * them then it will parse out all of the objects from the stream that it contains.
     *
     * @throws IOException If there is an error parsing the stream.
     */
    public void dereferenceObjectStreams() throws IOException
    {
        for( COSObject objStream : getObjectsByType( COSName.OBJ_STM ) )
        {
            COSStream stream = (COSStream)objStream.getObject();
            PDFObjectStreamParser parser =
                new PDFObjectStreamParser(stream, this, forceParsing);
            parser.parse();
            for( COSObject next : parser.getObjects() )
            {
                COSObjectKey key = new COSObjectKey( next );
                COSObject pooled = objectPool.get(key);
                Long xrefValue = xrefTable.get(key);
                // Take the parsed object if the pool has nothing usable for the key, or if
                // the xref table says the object lives in this very object stream.
                if ( pooled == null || pooled.getObject() == null ||
                     // xrefTable stores negated objNr of objStream for objects in objStreams
                     (xrefValue != null
                         && xrefValue.longValue() == -objStream.getObjectNumber().longValue()) )
                {
                    COSObject obj = getObjectFromPool(key);
                    obj.setObject(next.getObject());
                }
            }
        }
    }

    /**
     * This will get an object from the pool.
     *
     * @param key The object key.
     *
     * @return The object in the pool or a new one if it has not been parsed yet.
     *
     * @throws IOException If there is an error getting the proxy object.
     */
    public COSObject getObjectFromPool(COSObjectKey key) throws IOException
    {
        COSObject obj = null;
        if( key != null )
        {
            obj = objectPool.get(key);
        }
        if (obj == null)
        {
            // this was a forward reference, make "proxy" object
            obj = new COSObject(null);
            if( key != null )
            {
                obj.setObjectNumber( COSInteger.get( key.getNumber() ) );
                obj.setGenerationNumber( COSInteger.get( key.getGeneration() ) );
                objectPool.put(key, obj);
            }
        }
        return obj;
    }

    /**
     * Removes an object from the object pool.
     * @param key the object key
     * @return the object that was removed or null if the object was not found
     */
    public COSObject removeObject(COSObjectKey key)
    {
        return objectPool.remove(key);
    }

    /**
     * Populate XRef HashMap with given values.
     * Each entry maps ObjectKeys to byte offsets in the file.
     * @param xrefTableValues  xref table entries to be added
     */
    public void addXRefTable( Map<COSObjectKey, Long> xrefTableValues )
    {
        xrefTable.putAll( xrefTableValues );
    }

    /**
     * Returns the xrefTable which is a mapping of ObjectKeys
     * to byte offsets in the file.
     * @return mapping of ObjectsKeys to byte offsets
     */
    public Map<COSObjectKey, Long> getXrefTable()
    {
        return xrefTable;
    }

    /**
     * This method set the startxref value of the document. This will only 
     * be needed for incremental updates.
     * 
     * @param startXrefValue the value for startXref
     */
    public void setStartXref(long startXrefValue)
    {
        startXref = startXrefValue;
    }

    /**
     * Return the startXref Position of the parsed document. This will only be needed for incremental updates.
     * 
     * @return a long with the old position of the startxref
     */
    public long getStartXref()
    {
        return startXref;
    }

    /**
     * Determines if the trailer is a XRef stream or not.
     * 
     * @return true if the trailer is a XRef stream
     */
    public boolean isXRefStream()
    {
        if (trailer != null)
        {
            return COSName.XREF.equals(trailer.getItem(COSName.TYPE));
        }
        return false;
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy