// edu.harvard.hul.ois.jhove.module.pdf.Parser Maven / Gradle / Ivy
/**********************************************************************
* Jhove - JSTOR/Harvard Object Validation Environment
* Copyright 2003 by JSTOR and the President and Fellows of Harvard College
**********************************************************************/
package edu.harvard.hul.ois.jhove.module.pdf;
import java.io.*;
import java.util.*;
/**
* The Parser class implements some limited syntactic analysis
* for PDF. It isn't by any means intended to be a full
* parser. Its main job is to track nesting of syntactic
* elements such as dictionary and array beginnings and
* ends.
*/
public class Parser
{
private Tokenizer _tokenizer;
private int _dictDepth; // number of dictionary starts on stack
private int _arrayDepth; // number of array starts on stack
private Map _objectMap; // the object map for the file
private boolean _encrypted; // true if the document is encrypted
/* PDF/A compliance flag. */
private boolean _pdfACompliant;
/**
* Constructor. A Parser works with a Tokenizer that feeds
* it tokens.
*
* @param tokenizer The Tokenizer which the parser will use
*/
public Parser (Tokenizer tokenizer)
{
_tokenizer = tokenizer;
_pdfACompliant = true;
reset ();
}
/**
* Set the object map on which the parser will work.
*/
public void setObjectMap (Map objectMap)
{
_objectMap = objectMap;
}
/**
* Clear the state of the parser so that it can start
* reading at a different place in the file. Clears the
* stack and the dictionary and array depth counters.
*/
public void reset () {
_dictDepth = 0;
_arrayDepth = 0;
}
/**
* Clear the state of the parser so that it can start
* reading at a different place in the file and ignore
* any nesting errors. Sets the
* stack and the dictionary and array depth counters to
* a large number so that nesting exceptions won't be thrown.
*/
public void resetLoose () {
_dictDepth = 1000000;
_arrayDepth = 1000000;
}
/**
* Gets a token. Uses Tokenizer.getNext, and keeps track
* of the depth of dictionary and array nesting.
*/
public Token getNext ()
throws IOException, PdfException
{
return getNext (0L);
}
/**
* Gets a token. Uses Tokenizer.getNext, and keeps track
* of the depth of dictionary and array nesting.
* @param max Maximum allowable size of the token
*/
public Token getNext (long max)
throws IOException, PdfException
{
Token tok = _tokenizer.getNext (max);
if (tok instanceof DictionaryStart) {
++_dictDepth;
}
else if (tok instanceof DictionaryEnd) {
--_dictDepth;
if (_dictDepth < 0) {
throw new PdfMalformedException ("Improperly nested dictionary delimiters");
}
}
if (tok instanceof ArrayStart) {
++_arrayDepth;
}
else if (tok instanceof ArrayEnd) {
--_arrayDepth;
if (_arrayDepth < 0) {
throw new PdfMalformedException ("Improperly nested array delimiters");
}
}
return tok;
}
/**
* A class-sensitive version of getNext. The token
* which is obtained must be of the specified class
* (or a subclass thereof), or a PdfInvalidException with
* message errMsg will be thrown.
*/
public Token getNext (Class> clas, String errMsg)
throws IOException, PdfException
{
Token tok = getNext ();
if (!clas.isInstance (tok)) {
throw new PdfInvalidException (errMsg);
}
if (!tok.isPdfACompliant())
_pdfACompliant = false;
return tok;
}
/**
* Returns the number of dictionary starts not yet matched by
* dictionary ends.
*/
public int getDictDepth ()
{
return _dictDepth;
}
/**
* Tells this Parser, and its Tokenizer, whether the file
* is encrypted.
*/
public void setEncrypted (boolean encrypted)
{
_encrypted = encrypted;
_tokenizer.setEncrypted (encrypted);
}
/**
* Returns the number of array starts not yet matched by
* array ends.
*/
public int getArrayDepth ()
{
return _arrayDepth;
}
/**
* Returns the Tokenizer's current whitespace string.
*/
public String getWSString () {
return _tokenizer.getWSString ();
}
/**
* Returns the language code set from the Tokenizer.
*/
public Set getLanguageCodes ()
{
return _tokenizer.getLanguageCodes ();
}
/**
* Returns false if either the parser or the tokenizer has detected
* non-compliance with PDF/A restrictions. A value of true
* is no guarantee that the file is compliant.
*/
public boolean getPDFACompliant ()
{
if (!_tokenizer.getPDFACompliant ()) {
_pdfACompliant = false;
}
return _pdfACompliant;
}
/**
* Set the value of the pdfACompliant flag. This may be used to
* clear previous detection of noncompliance. If the parameter
* has a value of true
, the tokenizer's pdfACompliant
* flag is also set to true
.
*/
public void setPDFACompliant (boolean pdfACompliant)
{
_pdfACompliant = pdfACompliant;
if (pdfACompliant) {
_tokenizer.setPDFACompliant (true);
}
}
/**
* Reads an object definition, from wherever we are in the stream to
* the completion of one full object after the obj keyword.
*/
public PdfObject readObjectDef () throws IOException, PdfException
{
Numeric objNumTok = (Numeric) getNext
(Numeric.class, "Invalid object definition");
return readObjectDef (objNumTok);
}
/** Reads an object definition, given the first numeric object, which
* has already been read and is passed as an argument. This is called
* by the no-argument readObjectDef; the only other case in which it
* will be called is for a cross-reference stream, which can be distinguished
* from a cross-reference table only once the first token is read.
*/
public PdfObject readObjectDef (Numeric objNumTok)
throws IOException, PdfException
{
String invDef = "Invalid object definition";
reset ();
// The start of an object must be obj
//Numeric objNumTok = (Numeric) getNext (Numeric.class, invDef);
Numeric genNumTok = (Numeric) getNext (Numeric.class, invDef);
Keyword objKey = (Keyword) getNext (Keyword.class, invDef);
if (!"obj".equals (objKey.getValue ())) {
throw new PdfMalformedException (invDef);
}
if (_tokenizer.getWSString ().length () > 1) {
_pdfACompliant = false;
}
PdfObject obj = readObject (false);
// Now a special-case check to read a stream object, which
// consists of a dictionary followed by a stream token.
if (obj instanceof PdfDictionary) {
Stream strm = null;
try {
strm = (Stream) getNext (Stream.class, "");
}
catch (Exception e) {
// if we get an exception, it just means it wasn't a stream
}
if (strm != null) {
// Assimilate the dictionary and the stream token into the
// object to be returned
PdfStream strmObj = new PdfStream ((PdfDictionary) obj, strm);
if (!strmObj.isPdfaCompliant()) {
_pdfACompliant = false;
}
obj = strmObj;
}
}
obj.setObjNumber (objNumTok.getIntegerValue ());
obj.setGenNumber (genNumTok.getIntegerValue ());
return obj;
}
/**
* Reads an object. By design, this reader has a number
* of limitations.
*
* - It doesn't retain the contents of streams
* - It doesn't recognize a stream when it's pointing at
* the stream's dictionary; it will just read the
* dictionary
*
* Functions which it uses may call it recursively to build up structures.
* If it encounters a token inappropriate for an object start, it
* throws a PdfException on which getToken() may be called to retrieve
* that token.
*/
public PdfObject readObject (boolean allowPseudo) throws IOException, PdfException
{
Token tok = getNext ();
if (tok instanceof ArrayStart) {
return readArray ();
}
else if (tok instanceof DictionaryStart) {
return readDictionary ();
}
// For the end of a dictionary or array, retu
else if (allowPseudo && tok instanceof ArrayEnd) {
return new PdfArrayEnd(tok);
}
else if (allowPseudo && tok instanceof DictionaryEnd) {
return new PdfDictionaryEnd(tok);
}
else if (tok.isSimpleToken ()) {
return new PdfSimpleObject (tok);
}
else {
throw new PdfMalformedException
("Cannot parse object", getOffset(), tok);
}
}
/**
* Reads an array. When this is called, we have already read the
* ArrayStart token, and arrayDepth has been incremented to reflect this.
*/
public PdfArray readArray () throws IOException, PdfException
{
PdfArray arr = new PdfArray ();
for (;;) {
PdfObject obj = null;
obj = readObject (true);
if (!(obj instanceof PdfPseudoObject)) {
arr.add (obj);
}
else if (obj instanceof PdfArrayEnd) {
// We detect the end of an array by returning a PdfArrayEnd. When we get
// the end of the array, collapse the vector before returning the object.
PdfArrayEnd eobj = (PdfArrayEnd) obj;
Token tok = eobj.getToken();
if (tok instanceof ArrayEnd) {
collapseObjectVector (arr.getContent ());
if (!arr.isPdfACompliant()) {
_pdfACompliant = false;
}
return arr;
}
throw new PdfMalformedException
("Unexpected token in array", getOffset());
}
}
}
/** Reads a dictionary. When this is called, we have already read the
* DictionaryStart token, and dictDepth has been incremented to reflect this.
* Only for use in this special case, where we're picking up
* a dictionary in midstream.
*/
public PdfDictionary readDictionary () throws IOException, PdfException
{
PdfDictionary dict = new PdfDictionary ();
// Create a vector as a temporary holding place for the objects
Vector vec = new Vector ();
for (;;) {
PdfObject obj = null;
obj = readObject (true);
// Comments within a dictionary need to be ignored.
if (obj instanceof PdfSimpleObject &&
((PdfSimpleObject) obj).getToken() instanceof Comment) {
continue;
}
if (!(obj instanceof PdfPseudoObject)) {
vec.add (obj);
}
else if (obj instanceof PdfDictionaryEnd) {
// When we get
// the end of the array, collapse the vector before returning the object.
PdfDictionaryEnd eobj = (PdfDictionaryEnd) obj;
Token tok = eobj.getToken ();
if (tok instanceof DictionaryEnd) {
collapseObjectVector (vec);
String invalDict = "Malformed dictionary";
// The collapsed vector must contain an even number of objects
int vecSize = vec.size ();
if ((vecSize % 2) != 0) {
throw new PdfMalformedException (invalDict + ": Vector must contain an even number of objects, but has " + vecSize, getOffset ());
}
for (int i = 0; i < vecSize; i += 2) {
try {
Name key = (Name) ((PdfSimpleObject)
vec.elementAt (i)).getToken ();
PdfObject value = vec.elementAt (i + 1);
dict.add (key.getValue (), value);
}
catch (Exception f) {
throw new PdfMalformedException (invalDict, getOffset ());
}
}
if (!dict.isPdfACompliant()) {
_pdfACompliant = false; // exceeds implementation limit for PDF/A
}
return dict;
}
throw new PdfMalformedException
("Unexpected token in dictionary", getOffset());
}
}
}
/**
* Returns the current offset into the file.
*/
public long getOffset ()
{
return _tokenizer.getOffset ();
}
/**
* Positions the file to the specified offset, and
* resets the state for a new token stream.
*/
public void seek (long offset)
throws IOException, PdfException
{
_tokenizer.seek (offset);
reset ();
}
/**
* PDF has a wacky grammar which must be a legacy of
* PostScript's postfix syntax. A keyword of R means that
* the two previous objects are really part of an indirect object
* reference. This means that when a vector of objects is complete,
* it has to be read backwards so that indirect object references can
* be collapsed out. In the case of a dictionary, this has to be done
* before the content can be interpreted as key-value pairs.
*/
private void collapseObjectVector (Vector v) throws PdfException
{
int lowestChanged = -1;
for (int i = v.size() - 1; i >= 2; i--) {
PdfObject obj = v.elementAt (i);
if (obj instanceof PdfSimpleObject) {
Token tok = ((PdfSimpleObject) obj).getToken ();
if (tok instanceof Keyword) {
if ("R".equals (((Keyword)tok).getValue ())) {
// We're in the key of 'R'. The two previous tokens
// had better be Numerics. Three objects in the Vector
// are replaced by one.
try {
PdfSimpleObject nobj =
(PdfSimpleObject) v.elementAt (i - 2);
Numeric ntok = (Numeric) nobj.getToken ();
int objNum = ntok.getIntegerValue ();
nobj = (PdfSimpleObject) v.elementAt (i - 1);
ntok = (Numeric) nobj.getToken ();
int genNum = ntok.getIntegerValue ();
v.set (i - 2, new PdfIndirectObj
(objNum, genNum, _objectMap));
//v.removeElementAt (i);
//v.removeElementAt (i - 1);
// Put in null as placeholder, to be removed below
v.set(i, null);
v.set(i - 1, null);
lowestChanged = i - 1;
i -= 2;
}
catch (Exception e) {
throw new PdfMalformedException
("Malformed indirect object reference");
}
}
}
}
}
// Now remove all the positioned that were nulled.
if (lowestChanged > 0) {
int i;
int j;
for (i = lowestChanged, j = lowestChanged; i < v.size(); i++) {
PdfObject elem = v.elementAt(i);
if (elem != null) {
v.set(j++, elem);
}
}
v.setSize(j);
}
}
/**
* If true, do not attempt to parse non-whitespace delimited tokens, e.g.,
* literal and hexadecimal strings.
* @param flag Scan mode flag
*/
public void scanMode (boolean flag)
{
_tokenizer.scanMode (flag);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy