All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.pdfbox.pdfparser.XrefTrailerResolver Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObjectKey;

/**
 * Collects all XRef/trailer objects and creates the correct
 * xref/trailer information after all objects are read, using the startxref
 * and 'Prev' information (unused XRef/trailer objects are discarded).
 *
 * In case of a missing startxref or a wrong startxref pointer all
 * XRef/trailer objects are used to create the xref table / trailer dictionary
 * in the order of their byte positions.
 *
 * For each new xref object/XRef stream the method {@link #nextXrefObj(long, XRefType)}
 * must be called with its start byte position. All following calls to
 * {@link #setXRef(COSObjectKey, long)} or {@link #setTrailer(COSDictionary)}
 * will add the data for this byte position.
 *
 * After all objects are parsed the startxref position must be provided
 * using {@link #setStartxref(long)}. This is used to build the chain of
 * active xref/trailer objects used for creating the document trailer and xref table.
 *
 * @author Timo Böhme
 */
public class XrefTrailerResolver
{

    /**
     * A class which represents a single xref/trailer object: its trailer dictionary
     * (may be null), its type and the xref entries collected for it.
     */
    private static class XrefTrailerObj
    {
        protected COSDictionary trailer = null;

        private XRefType xrefType;

        /** Maps object keys to the offsets registered via setXRef. */
        private final Map<COSObjectKey, Long> xrefTable = new HashMap<COSObjectKey, Long>();

        /**
         *  Default constructor; the type defaults to {@link XRefType#TABLE}.
         */
        private XrefTrailerObj()
        {
            xrefType = XRefType.TABLE;
        }

        /**
         * Removes all collected xref entries; trailer and type are left untouched.
         */
        public void reset()
        {
            xrefTable.clear();
        }
    }

    /**
     * The XRefType of a trailer.
     */
    public enum XRefType
    {
        /**
         * XRef table type.
         */
        TABLE,
        /**
         * XRef stream type.
         */
        STREAM
    }

    /** All signalled xref/trailer objects, keyed by their start byte position. */
    private final Map<Long, XrefTrailerObj> bytePosToXrefMap = new HashMap<Long, XrefTrailerObj>();
    /** The object that currently receives xref entries and trailer data. */
    private XrefTrailerObj curXrefTrailerObj   = null;
    /** The merged result; null until {@link #setStartxref(long)} was called. */
    private XrefTrailerObj resolvedXrefTrailer = null;

    /** Log instance. */
    private static final Log LOG = LogFactory.getLog( XrefTrailerResolver.class );

    /**
     * Returns the first trailer if at least one exists.
     * "First" means the trailer of the xref object with the smallest start byte position.
     *
     * @return the first trailer or null
     */
    public final COSDictionary getFirstTrailer()
    {
        if (bytePosToXrefMap.isEmpty())
        {
            return null;
        }
        SortedSet<Long> sortedOffsets = new TreeSet<Long>(bytePosToXrefMap.keySet());
        return bytePosToXrefMap.get(sortedOffsets.first()).trailer;
    }

    /**
     * Returns the last trailer if at least one exists.
     * "Last" means the trailer of the xref object with the largest start byte position.
     *
     * @return the last trailer or null
     */
    public final COSDictionary getLastTrailer()
    {
        if (bytePosToXrefMap.isEmpty())
        {
            return null;
        }
        SortedSet<Long> sortedOffsets = new TreeSet<Long>(bytePosToXrefMap.keySet());
        return bytePosToXrefMap.get(sortedOffsets.last()).trailer;
    }

    /**
     * Returns the count of trailers, i.e. the number of xref objects signalled
     * via {@link #nextXrefObj(long, XRefType)} with distinct byte positions.
     *
     * @return the count of trailers.
     */
    public final int getTrailerCount()
    {
        return bytePosToXrefMap.size();
    }

    /**
     * Signals that a new XRef object (table or stream) starts.
     * The new object becomes the current one; an object previously registered
     * for the same byte position is replaced.
     *
     * @param startBytePos the offset to start at
     * @param type the type of the Xref object
     */
    public void nextXrefObj( final long startBytePos, XRefType type )
    {
        curXrefTrailerObj = new XrefTrailerObj();
        curXrefTrailerObj.xrefType = type;
        bytePosToXrefMap.put( startBytePos, curXrefTrailerObj );
    }

    /**
     * Returns the XRefType of the resolved trailer.
     *
     * @return the XRefType or null if {@link #setStartxref(long)} was not called before.
     */
    public XRefType getXrefType()
    {
        return ( resolvedXrefTrailer == null ) ? null : resolvedXrefTrailer.xrefType;
    }

    /**
     * Populate XRef HashMap of current XRef object.
     * Will add an Xreftable entry that maps ObjectKeys to byte offsets in the file.
     * An entry that already exists for the given key is kept (first one wins).
     *
     * @param objKey The objkey, with id and gen numbers
     * @param offset The byte offset in this file
     */
    public void setXRef( COSObjectKey objKey, long offset )
    {
        if ( curXrefTrailerObj == null )
        {
            // should not happen...
            LOG.warn( "Cannot add XRef entry for '" + objKey.getNumber() + "' because XRef start was not signalled." );
            return;
        }
        // PDFBOX-3506 check before adding to the map, to avoid entries from the table being
        // overwritten by obsolete entries in hybrid files (/XRefStm entry)
        if ( !curXrefTrailerObj.xrefTable.containsKey( objKey ) )
        {
            curXrefTrailerObj.xrefTable.put( objKey, offset );
        }
    }

    /**
     * Adds trailer information for current XRef object.
     *
     * @param trailer the current document trailer dictionary
     */
    public void setTrailer( COSDictionary trailer )
    {
        if ( curXrefTrailerObj == null )
        {
            // should not happen...
            LOG.warn( "Cannot add trailer because XRef start was not signalled." );
            return;
        }
        curXrefTrailerObj.trailer = trailer;
    }

    /**
     * Returns the trailer last set by {@link #setTrailer(COSDictionary)}.
     *
     * @return the current trailer, or null if no XRef object is current
     *         (no XRef start was signalled yet, or the resolver was reset)
     *         or no trailer was set for it.
     */
    public COSDictionary getCurrentTrailer()
    {
        // guard like the other accessors instead of failing with a NullPointerException
        return ( curXrefTrailerObj == null ) ? null : curXrefTrailerObj.trailer;
    }

    /**
     * Sets the byte position of the first XRef
     * (has to be called after very last startxref was read).
     * This is used to resolve chain of active XRef/trailer.
     *
     * In case startxref position is not found we output a
     * warning and use all XRef/trailer objects combined
     * in byte position order.
     * Thus for incomplete PDF documents with missing
     * startxref one could call this method with parameter value -1.
     *
     * Calling this method again after a result was resolved only logs a warning.
     *
     * @param startxrefBytePosValue starting position of the first XRef
     *
     */
    public void setStartxref( long startxrefBytePosValue )
    {
        if ( resolvedXrefTrailer != null )
        {
            LOG.warn( "Method must be called only ones with last startxref value." );
            return;
        }

        resolvedXrefTrailer = new XrefTrailerObj();
        resolvedXrefTrailer.trailer = new COSDictionary();

        XrefTrailerObj curObj = bytePosToXrefMap.get( startxrefBytePosValue );
        // byte positions of the xref objects to merge, oldest first once the list is complete
        List<Long> xrefSeqBytePos = new ArrayList<Long>();

        if ( curObj == null )
        {
            // no XRef at given position
            LOG.warn( "Did not found XRef object at specified startxref position " + startxrefBytePosValue );

            // use all objects in byte position order (last entries overwrite previous ones)
            xrefSeqBytePos.addAll( bytePosToXrefMap.keySet() );
            Collections.sort( xrefSeqBytePos );
        }
        else
        {
            // copy xref type
            resolvedXrefTrailer.xrefType = curObj.xrefType;
            // found starting Xref object
            // add this and follow chain defined by 'Prev' keys
            xrefSeqBytePos.add( startxrefBytePosValue );
            while ( curObj.trailer != null )
            {
                long prevBytePos = curObj.trailer.getLong( COSName.PREV, -1L );
                if ( prevBytePos == -1 )
                {
                    break;
                }

                curObj = bytePosToXrefMap.get( prevBytePos );
                if ( curObj == null )
                {
                    LOG.warn( "Did not found XRef object pointed to by 'Prev' key at position " + prevBytePos );
                    break;
                }
                xrefSeqBytePos.add( prevBytePos );

                // sanity check to prevent infinite loops:
                // the chain can never be longer than the number of known xref objects
                if ( xrefSeqBytePos.size() >= bytePosToXrefMap.size() )
                {
                    break;
                }
            }
            // have to reverse order so that later XRefs will overwrite previous ones
            Collections.reverse( xrefSeqBytePos );
        }

        // merge used and sorted XRef/trailer
        for ( Long bPos : xrefSeqBytePos )
        {
            curObj = bytePosToXrefMap.get( bPos );
            if ( curObj.trailer != null )
            {
                resolvedXrefTrailer.trailer.addAll( curObj.trailer );
            }
            resolvedXrefTrailer.xrefTable.putAll( curObj.xrefTable );
        }

    }

    /**
     * Gets the resolved trailer. Might return null in case
     * {@link #setStartxref(long)} was not called before.
     *
     * @return the trailer if available
     */
    public COSDictionary getTrailer()
    {
        return ( resolvedXrefTrailer == null ) ? null : resolvedXrefTrailer.trailer;
    }

    /**
     * Gets the resolved xref table. Might return null in case
     *  {@link #setStartxref(long)} was not called before.
     *  The returned map is the resolver's internal table, not a copy.
     *
     * @return the xrefTable if available
     */
    public Map<COSObjectKey, Long> getXrefTable()
    {
        return ( resolvedXrefTrailer == null ) ? null : resolvedXrefTrailer.xrefTable;
    }

    /** Returns object numbers which are referenced as contained
     *  in object stream with specified object number.
     *
     *  This will scan resolved xref table for all entries having negated
     *  stream object number as value.
     *
     *  @param objstmObjNr  object number of object stream for which contained object numbers
     *                      should be returned
     *
     *  @return set of object numbers referenced for given object stream
     *          or null if {@link #setStartxref(long)} was not
     *          called before so that no resolved xref table exists
     */
    public Set<Long> getContainedObjectNumbers( final int objstmObjNr )
    {
        if ( resolvedXrefTrailer == null )
        {
            return null;
        }
        final Set<Long> refObjNrs = new HashSet<Long>();
        // widen before negating so that Integer.MIN_VALUE cannot overflow
        final long cmpVal = -(long) objstmObjNr;

        for ( Entry<COSObjectKey, Long> xrefEntry : resolvedXrefTrailer.xrefTable.entrySet() )
        {
            if ( xrefEntry.getValue() == cmpVal )
            {
                refObjNrs.add( xrefEntry.getKey().getNumber() );
            }
        }
        return refObjNrs;
    }

    /**
     * Reset all data so that it can be used to rebuild the trailer.
     * The xref entries of every known xref object are cleared and the current and
     * resolved objects are dropped; the registered byte positions and their trailer
     * dictionaries are kept.
     */
    protected void reset()
    {
        for ( XrefTrailerObj trailerObj : bytePosToXrefMap.values() )
        {
            trailerObj.reset();
        }
        curXrefTrailerObj = null;
        resolvedXrefTrailer = null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy