All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.harvard.hul.ois.jhove.module.pdf.Parser Maven / Gradle / Ivy

There is a newer version: 1.12.7
Show newest version
/**********************************************************************
 * Jhove - JSTOR/Harvard Object Validation Environment
 * Copyright 2003 by JSTOR and the President and Fellows of Harvard College
 **********************************************************************/

package edu.harvard.hul.ois.jhove.module.pdf;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

import edu.harvard.hul.ois.jhove.messages.JhoveMessage;
import edu.harvard.hul.ois.jhove.messages.JhoveMessages;
import edu.harvard.hul.ois.jhove.module.PdfModule;

/**
 * The Parser class implements some limited syntactic analysis for PDF.
 * It isn't by any means intended to be a full parser.  Its main job is
 * to track nesting of syntactic elements such as dictionary and array
 * beginnings and ends.
 */
public class Parser
{
    /** The Tokenizer which the parser will use. */
    private Tokenizer _tokenizer;
    /** The number of dictionary starts on stack. */
    private int _dictDepth;
    /** The number of array starts on stack. */
    private int _arrayDepth;
    /** The file's object map. */
    private Map _objectMap;
    /** PDF/A compliance flag. */
    private boolean _pdfACompliant;
	protected PdfModule _module;


    /**
     * Constructor.  A Parser works with a Tokenizer that feeds it tokens.
     *
     * @param tokenizer  The Tokenizer which the parser will use
     */
    public Parser (Tokenizer tokenizer)
    {
        _tokenizer = tokenizer;
        _pdfACompliant = true;
        reset ();
    }

    /** Sets the object map on which the parser will work. */
    public void setObjectMap (Map objectMap)
    {
        _objectMap = objectMap;
    }

    /**
     * Clears the state of the parser so that it can start
     * reading at a different place in the file.  Clears the
     * stack and the dictionary and array depth counters.
     */
    public void reset () {
        _dictDepth = 0;
        _arrayDepth = 0;
    }

    /**
     * Clears the state of the parser so that it can start
     * reading at a different place in the file and ignore
     * any nesting errors.  Sets the stack and the dictionary
     * and array depth counters to a large number so that
     * nesting exceptions won't be thrown.
     */
    public void resetLoose () {
        _dictDepth = 1000000;
        _arrayDepth = 1000000;
    }

    /**
     * Gets a token.  Uses Tokenizer.getNext, and keeps track
     * of the depth of dictionary and array nesting.
     */
    public Token getNext () throws IOException, PdfException
    {
        return getNext (0L);
    }

    /**
     * Gets a token.  Uses Tokenizer.getNext, and keeps track
     * of the depth of dictionary and array nesting.
     *
     * @param max  Maximum allowable size of the token
     */
    public Token getNext (long max) throws IOException, PdfException
    {
        Token tok = _tokenizer.getNext (max);
        if (tok instanceof DictionaryStart) {
            ++_dictDepth;
        }
        else if (tok instanceof DictionaryEnd) {
            --_dictDepth;
            if (_dictDepth < 0) {
                throw new PdfMalformedException (MessageConstants.PDF_HUL_33); // PDF-HUL-33
            }
        }
        if (tok instanceof ArrayStart) {
            ++_arrayDepth;
        }
        else if (tok instanceof ArrayEnd) {
            --_arrayDepth;
            if (_arrayDepth < 0) {
                throw new PdfMalformedException (MessageConstants.PDF_HUL_34); // PDF-HUL-34
            }
        }
        return tok;
    }

    /**
     * A class-sensitive version of getNext.  The token which is
     * obtained must be of the specified class (or a subclass thereof),
     * or a PdfInvalidException with message errMsg
     * will be thrown.
     */
    public Token getNext (Class clas, JhoveMessage errMsg)
            throws IOException, PdfException
    {
        Token tok = getNext ();
        if (!clas.isInstance (tok)) {
            throw new PdfInvalidException (errMsg);
        }
        if (!tok.isPdfACompliant()) {
            _pdfACompliant = false;
        }
        return tok;
    }

    /**
     * Returns the number of dictionary starts not yet matched by
     * dictionary ends.
     */
    public int getDictDepth ()
    {
        return _dictDepth;
    }

    /**
     * Returns the number of array starts not yet matched by array ends.
     */
    public int getArrayDepth ()
    {
        return _arrayDepth;
    }

    /** Returns the Tokenizer's current whitespace string. */
    public String getWSString () {
        return _tokenizer.getWSString ();
    }

    /** Returns the language code set from the Tokenizer. */
    public Set getLanguageCodes ()
    {
        return _tokenizer.getLanguageCodes ();
    }

    /**
     * Returns false if either the parser or the tokenizer has detected
     * non-compliance with PDF/A restrictions.  A value of true
     * is no guarantee that the file is compliant.
     */
    public boolean getPDFACompliant ()
    {
        if (!_tokenizer.getPDFACompliant ()) {
            _pdfACompliant = false;
        }
        return _pdfACompliant;
    }

    /**
     * Sets the value of the pdfACompliant flag.  This may be used to
     * clear previous detection of noncompliance.  If the parameter
     * has a value of true, the tokenizer's pdfACompliant
     * flag is also set to true.
     */
    public void setPDFACompliant (boolean pdfACompliant)
    {
        _pdfACompliant = pdfACompliant;
        if (pdfACompliant) {
            _tokenizer.setPDFACompliant (true);
        }
    }

    /**
     * Reads an object definition, from wherever we are in the stream to
     * the completion of one full object after the obj keyword.
     */
    public PdfObject readObjectDef () throws IOException, PdfException
    {
        Numeric objNumTok = (Numeric) getNext 
            (Numeric.class, MessageConstants.PDF_HUL_35); // PDF-HUL-35
        return readObjectDef (objNumTok);
    }

    /**
     * Reads an object definition, given the first numeric object, which
     * has already been read and is passed as an argument.  This is called
     * by the no-argument readObjectDef; the only other case in which it
     * will be called is for a cross-reference stream, which can be distinguished
     * from a cross-reference table only once the first token is read.
     */
    public PdfObject readObjectDef (Numeric objNumTok)
            throws IOException, PdfException
    {
        reset ();
        // The start of an object must be   obj
        //Numeric objNumTok = (Numeric) getNext (Numeric.class, invDef);
        Numeric genNumTok = (Numeric) getNext (Numeric.class, MessageConstants.PDF_HUL_36); // PDF-HUL-36
        Keyword objKey = (Keyword) getNext (Keyword.class, MessageConstants.PDF_HUL_37); // PDF-HUL-37
        if (!"obj".equals (objKey.getValue ())) {
            throw new PdfMalformedException (MessageConstants.PDF_HUL_38); // PDF-HUL-38
        }
        if (_tokenizer.getWSString ().length () > 1) {
            _pdfACompliant = false;
        }
        PdfObject obj = readObject (false);
        
         // skip comment
        if (obj instanceof PdfSimpleObject
                && ((PdfSimpleObject) obj).getToken() instanceof Comment) {
        	obj = readObject (false);
        	  
        }

        // Now a special-case check to read a stream object, which
        // consists of a dictionary followed by a stream token.
        if (obj instanceof PdfDictionary) {
            Stream strm = null;
            try {
                strm = (Stream) getNext (Stream.class, JhoveMessages.DEFAULT_MESSAGE);
            }
            catch (Exception e) {
                // If we get an exception, it just means it wasn't a stream
            }
            if (strm != null) {
                // Assimilate the dictionary and the stream token into the
                // object to be returned
                PdfStream strmObj = new PdfStream ((PdfDictionary) obj, strm, _module);
                if (!strmObj.isPdfaCompliant()) {
                    _pdfACompliant = false;
                }
                obj = strmObj;
            }
        }

        obj.setObjNumber (objNumTok.getIntegerValue ());
        obj.setGenNumber (genNumTok.getIntegerValue ());
        return obj;
    }

    /**
     * Reads an object.  By design, this reader has a number
     * of limitations:
     * 
    *
  • It doesn't retain the contents of streams
  • *
  • It doesn't recognize a stream when it's pointing at * the stream's dictionary; it will just read the * dictionary
  • *
* Functions which it uses may call it recursively to build up structures. * If it encounters a token inappropriate for an object start, it * throws a PdfException on which getToken * may be called to retrieve that token. */ public PdfObject readObject (boolean allowPseudo) throws IOException, PdfException { Token tok = getNext (); if (tok instanceof ArrayStart) { return readArray (); } else if (tok instanceof DictionaryStart) { return readDictionary (); } // For the end of a dictionary or array, retu else if (allowPseudo && tok instanceof ArrayEnd) { return new PdfArrayEnd(tok); } else if (allowPseudo && tok instanceof DictionaryEnd) { return new PdfDictionaryEnd(tok); } else if (tok.isSimpleToken ()) { return new PdfSimpleObject (tok); } else { throw new PdfMalformedException (MessageConstants.PDF_HUL_39, getOffset(), tok); // PDF-HUL-39 } } /** * Reads a PDF array. When this is called we have already read the * ArrayStart token and arrayDepth has been incremented to reflect this. */ public PdfArray readArray () throws IOException, PdfException { PdfArray arr = new PdfArray (); for (;;) { PdfObject obj = readObject (true); if (!(obj instanceof PdfPseudoObject)) { arr.add (obj); } else if (obj instanceof PdfArrayEnd) { // We detect the end of an array by returning a PdfArrayEnd. // When we get the end of the array, collapse the vector // before returning the object. PdfArrayEnd eobj = (PdfArrayEnd) obj; Token tok = eobj.getToken(); if (tok instanceof ArrayEnd) { collapseObjectVector (arr.getContent ()); if (!arr.isPdfACompliant()) { _pdfACompliant = false; } return arr; } throw new PdfMalformedException (MessageConstants.PDF_HUL_40, getOffset()); // PDF-HUL-40 } } } /** * Reads a PDF dictionary. When this is called, we have already read the * DictionaryStart token, and dictDepth has been incremented to reflect * this. Only for use in this special case, where we're picking up * a dictionary in midstream. */ public PdfDictionary readDictionary () throws IOException, PdfException { PdfDictionary dict = new PdfDictionary (); // Create a vector as a temporary holding place for the objects Vector vec = new Vector<> (); for (;;) { PdfObject obj = readObject (true); // Comments within a dictionary need to be ignored. if (obj instanceof PdfSimpleObject && ((PdfSimpleObject) obj).getToken() instanceof Comment) { continue; } if (!(obj instanceof PdfPseudoObject)) { vec.add (obj); } else if (obj instanceof PdfDictionaryEnd) { // When we get the end of the dictionary, // collapse the vector before returning the object. PdfDictionaryEnd eobj = (PdfDictionaryEnd) obj; Token tok = eobj.getToken (); if (tok instanceof DictionaryEnd) { collapseObjectVector (vec); // The collapsed vector must contain an even number of objects int vecSize = vec.size (); if ((vecSize % 2) != 0) { String mess = MessageFormat.format(MessageConstants.PDF_HUL_41.getMessage(), Integer.valueOf(vecSize)); JhoveMessage message = JhoveMessages.getMessageInstance(MessageConstants.PDF_HUL_41.getId(), mess); throw new PdfMalformedException(message, getOffset ()); // PDF-HUL-41 } for (int i = 0; i < vecSize; i += 2) { try { Name key = (Name) ((PdfSimpleObject) vec.elementAt (i)).getToken (); PdfObject value = vec.elementAt (i + 1); dict.add (key.getValue (), value); } catch (Exception f) { throw new PdfMalformedException (MessageConstants.PDF_HUL_42, getOffset ()); // PDF-HUL-42 } } if (!dict.isPdfACompliant()) { _pdfACompliant = false; // Exceeds implementation limit for PDF/A } return dict; } throw new PdfMalformedException (MessageConstants.PDF_HUL_43, getOffset()); // PDF-HUL-43 } } } /** Returns the current offset into the file. */ public long getOffset () { return _tokenizer.getOffset (); } /** * Positions the file to the specified offset, and * resets the state for a new token stream. */ public void seek (long offset) throws IOException, PdfException { _tokenizer.seek (offset); reset (); } /** * PDF has a wacky grammar which must be a legacy of * PostScript's postfix syntax. A keyword of R means that * the two previous objects are really part of an indirect object * reference. This means that when a vector of objects is complete, * it has to be read backwards so that indirect object references can * be collapsed out. In the case of a dictionary, this has to be done * before the content can be interpreted as key-value pairs. */ private void collapseObjectVector (Vector v) throws PdfException { int lowestChanged = -1; for (int i = v.size() - 1; i >= 2; i--) { PdfObject obj = v.elementAt (i); if (obj instanceof PdfSimpleObject) { Token tok = ((PdfSimpleObject) obj).getToken (); if (tok instanceof Keyword && "R".equals (((Keyword)tok).getValue ())) { // We're in the key of 'R'. The two previous tokens // had better be Numerics. Three objects in the Vector // are replaced by one. try { PdfSimpleObject nobj = (PdfSimpleObject) v.elementAt (i - 2); Numeric ntok = (Numeric) nobj.getToken (); int objNum = ntok.getIntegerValue (); nobj = (PdfSimpleObject) v.elementAt (i - 1); ntok = (Numeric) nobj.getToken (); int genNum = ntok.getIntegerValue (); v.set (i - 2, new PdfIndirectObj (objNum, genNum, _objectMap)); //v.removeElementAt (i); //v.removeElementAt (i - 1); // Put in null as placeholder, to be removed below v.set(i, null); v.set(i - 1, null); lowestChanged = i - 1; i -= 2; } catch (Exception e) { throw new PdfMalformedException (MessageConstants.PDF_HUL_44); // PDF-HUL-44 } } } } // Now remove all the positioned that were nulled. if (lowestChanged > 0) { int i; int j; for (i = lowestChanged, j = lowestChanged; i < v.size(); i++) { PdfObject elem = v.elementAt(i); if (elem != null) { v.set(j++, elem); } } v.setSize(j); } } /** * If true, do not attempt to parse non-whitespace * delimited tokens, e.g., literal and hexadecimal strings. * * @param flag Scan mode flag */ public void scanMode (boolean flag) { _tokenizer.scanMode (flag); } public PdfObject readObjectDef(PdfModule pdfModule) throws IOException, PdfException { _module = pdfModule; return this.readObjectDef(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy