// edu.harvard.hul.ois.jhove.module.pdf.PageTreeNode
/**********************************************************************
* Jhove - JSTOR/Harvard Object Validation Environment
* Copyright 2003 by JSTOR and the President and Fellows of Harvard College
**********************************************************************/
package edu.harvard.hul.ois.jhove.module.pdf;
import edu.harvard.hul.ois.jhove.module.*;
import java.util.*;
/**
* Class encapsulating a PDF page tree node.
* The page tree is built such that callers can walk through
* it by calling startWalk and then calling nextDocNode
* (for all nodes) or nextPageObject (for pages only) repeatedly.
*/
public class PageTreeNode extends DocNode
{
    /** The direct children of this node; an empty list until buildSubtree populates it. */
    private List<DocNode> _descendants;

    /** Iterator over {@link #_descendants} for the walk in progress; created by startWalk. */
    private ListIterator<DocNode> _descendantsIter;

    /** The child currently being walked, or null if no child has been entered yet. */
    private DocNode _currentDescendant;

    /** True until nextDocNode has returned this node itself during the current walk. */
    private boolean _walkFirst;

    /** Object numbers of the descendant nodes already returned by the current walk. */
    private Set<Integer> _visitedNodes;

    /**
     * Creates a page tree node with an empty descendant list.
     * Call {@link #buildSubtree} to populate the descendants.
     *
     * @param module  The PDFModule under which we're operating
     * @param parent  The parent node in the document tree;
     *                may be null only for the root node
     * @param dict    The dictionary object on which this node
     *                is based
     * @throws PdfMalformedException if the superclass constructor throws it
     */
    public PageTreeNode (PdfModule module,
                         PageTreeNode parent,
                         PdfDictionary dict) throws PdfMalformedException
    {
        super (module, parent, dict);
        _pageObjectFlag = false;
        // Empty list in case the subtree never gets built.
        _descendants = new ArrayList<> (1);
    }

    /**
     * Builds the subtree of descendants of this node, using
     * the Kids entry in the dictionary.
     *
     * @param toplevel  true if this node is the root of the page tree; only
     *                  then is a missing Kids entry tolerated
     * @param recGuard  remaining recursion depth; decremented by one for each
     *                  nested Pages node
     * @throws PdfException a PdfMalformedException (PDF_HUL_32) when recGuard
     *                  is exhausted; a PdfInvalidException (PDF_HUL_147 or
     *                  PDF_HUL_29) when the tree cannot be interpreted; or any
     *                  PdfException raised while resolving or loading a kid
     */
    public void buildSubtree (boolean toplevel, int recGuard) throws PdfException
    {
        /* Guard against infinite recursion */
        if (recGuard <= 0) {
            throw new PdfMalformedException (MessageConstants.PDF_HUL_32); // PDF-HUL-32
        }
        try {
            /* Section 3.6.2 of the PDF 1.6 doc says:
             *   "Applications should be prepared
             *   to handle any form of tree structure built of such nodes
             *   [page tree nodes and page nodes]. The simplest structure
             *   would consist of a single page tree node that references
             *   all of the document's page objects directly."
             * But actually, the simplest structure would be a single
             * page node.  And it appears that Acrobat 7 will indeed
             * generate such.
             */
            /* Note that the Kids array can be an indirect object. */
            PdfObject obj = _dict.get ("Kids");
            PdfArray kids;
            if (obj instanceof PdfIndirectObj) {
                kids = (PdfArray) (((PdfIndirectObj) obj).getObject ());
            }
            else {
                kids = (PdfArray) obj;
            }

            if (kids == null) {
                if (!toplevel) {
                    // A nested node without Kids cannot be interpreted.
                    // Report it explicitly instead of relying on a
                    // NullPointerException reaching the catch-all below.
                    throw new PdfInvalidException (MessageConstants.PDF_HUL_29); // PDF-HUL-29
                }
                // The single page node case, maybe.
                PdfSimpleObject type = (PdfSimpleObject) _dict.get ("Type");
                if (type != null && "Page".equals (type.getStringValue ())) {
                    // NOTE(review): unlike the Kids loop below, loadContent is
                    // not called on this page object -- confirm that is intended.
                    PageObject pageObj = new PageObject (_module, this, _dict);
                    _descendants = new ArrayList<> (1);
                    _descendants.add (pageObj);
                }
                return;
            }

            Vector<?> kidsVec = kids.getContent ();
            _descendants = new ArrayList<> (kidsVec.size ());
            for (Object element : kidsVec) {
                // Cycles among the kids are bounded by recGuard rather than
                // detected here.  Kids of an unexpected shape (not an indirect
                // reference, not a dictionary, no Type entry) fail with a
                // runtime exception that the catch-all below converts.
                PdfIndirectObj kidRef = (PdfIndirectObj) element;
                PdfDictionary kid = (PdfDictionary)
                        _module.resolveIndirectObject (kidRef);
                PdfSimpleObject kidtype = (PdfSimpleObject) kid.get ("Type");
                String kidtypeStr = kidtype.getStringValue ();
                if ("Page".equals (kidtypeStr)) {
                    PageObject pageObj = new PageObject (_module, this, kid);
                    pageObj.loadContent (_module);
                    _descendants.add (pageObj);
                }
                else if ("Pages".equals (kidtypeStr)) {
                    PageTreeNode nodeObj = new PageTreeNode (_module, this, kid);
                    nodeObj.buildSubtree (false, recGuard - 1);
                    _descendants.add (nodeObj);
                }
                // Kids of any other Type are skipped.
            }
        }
        catch (PdfException ee) {
            // Pass PDF exceptions through untouched so the catch-all below
            // does not replace them.
            throw ee;
        }
        catch (ArrayIndexOutOfBoundsException excep) {
            throw new PdfInvalidException (MessageConstants.PDF_HUL_147); // PDF-HUL-147
        }
        catch (Exception e) {
            throw new PdfInvalidException (MessageConstants.PDF_HUL_29); // PDF-HUL-29
        }
    }

    /**
     * Initialize an iterator through the descendants of this node.
     * Must be called before nextPageObject or nextDocNode.
     */
    @Override
    public void startWalk ()
    {
        _descendantsIter = _descendants.listIterator ();
        _currentDescendant = null;
        _walkFirst = true;
        _walkFinished = false;
        _visitedNodes = new HashSet<> ();  // Track self-recursion
    }

    /**
     * Get the next PageObject which is under this node.  This function
     * is designed such that calling startWalk() and then repeatedly
     * calling nextPageObject() will return all the PageObjects in the tree
     * under this node, and finally will return null when there are no more.
     *
     * @return the next page object, or null when the walk is finished
     * @throws PdfMalformedException (PDF_HUL_30) if a page object with an
     *         already-seen object number is encountered, or if a descendant
     *         throws it
     */
    @Override
    public PageObject nextPageObject () throws PdfMalformedException
    {
        if (_walkFinished) {
            return null;
        }
        // _currentDescendant == null and _walkFinished == false indicates
        // we're at the start.
        if (_currentDescendant == null) {
            if (!_descendantsIter.hasNext ()) {
                _walkFinished = true;
                return null;
            }
            enterNextDescendant ();
        }

        PageObject retval = _currentDescendant.nextPageObject ();
        // A descendant may yield nothing at all (e.g. a Pages node with no
        // kids), so keep advancing instead of ending the walk while
        // siblings remain.
        while (retval == null && _descendantsIter.hasNext ()) {
            enterNextDescendant ();
            retval = _currentDescendant.nextPageObject ();
        }
        if (retval == null) {
            // We've gone through all our descendants.
            _walkFinished = true;
            return null;
        }
        if (!markVisited (retval)) {
            throw new PdfMalformedException (MessageConstants.PDF_HUL_30); // PDF-HUL-30
        }
        return retval;
    }

    /**
     * Get the next DocNode which is under this node.  This function
     * is designed such that calling startWalk() and then repeatedly
     * calling nextDocNode() will return first this node,
     * then all the DocNodes in the tree
     * under this node.  It finally will return null when there
     * are no more.
     *
     * @return the next node, or null when the walk is finished
     * @throws PdfMalformedException (PDF_HUL_31) if a node with an
     *         already-seen object number is encountered, or if a descendant
     *         throws it
     */
    @Override
    public DocNode nextDocNode () throws PdfMalformedException
    {
        if (_walkFinished) {
            return null;
        }
        // _walkFinished == false and _walkFirst == true indicates
        // we need to return "this".
        if (_walkFirst) {
            _walkFirst = false;
            return this;
        }
        // _currentDescendant == null and _walkFinished == false indicates
        // we're at the start.  Same logic as nextPageObject.
        if (_currentDescendant == null) {
            if (!_descendantsIter.hasNext ()) {
                _walkFinished = true;
                return null;
            }
            enterNextDescendant ();
        }

        DocNode retval = _currentDescendant.nextDocNode ();
        // Keep advancing past exhausted descendants until one yields a node.
        while (retval == null && _descendantsIter.hasNext ()) {
            enterNextDescendant ();
            retval = _currentDescendant.nextDocNode ();
        }
        if (retval == null) {
            // We've gone through all our descendants.
            _walkFinished = true;
            return null;
        }
        if (!markVisited (retval)) {
            throw new PdfMalformedException (MessageConstants.PDF_HUL_31); // PDF-HUL-31
        }
        return retval;
    }

    /**
     * Makes the next child the current descendant and starts its walk.
     * The caller must have checked that the iterator has a next element.
     */
    private void enterNextDescendant () throws PdfMalformedException
    {
        _currentDescendant = _descendantsIter.next ();
        _currentDescendant.startWalk ();
    }

    /**
     * Records the object number of a node returned by the current walk.
     *
     * @return false if that object number had already been recorded
     */
    private boolean markVisited (DocNode node)
    {
        Integer objnum = Integer.valueOf (node.getDict ().getObjNumber ());
        return _visitedNodes.add (objnum);
    }
}