All downloads are free. Search and download functionality uses the official Maven repository.

org.sejda.sambox.pdmodel.PDPageTree Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sejda.sambox.pdmodel;

import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
import static java.util.Optional.ofNullable;
import static java.util.stream.Collectors.toList;
import static org.sejda.commons.util.RequireUtils.require;
import static org.sejda.commons.util.RequireUtils.requireNotNullArg;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Queue;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.sejda.sambox.cos.COSArray;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSInteger;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.cos.COSNull;
import org.sejda.sambox.cos.COSObjectable;
import org.sejda.sambox.util.ObjectIdUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The page tree, which defines the ordering of pages in the document in an efficient manner.
 *
 * @author John Hewson
 */
public class PDPageTree implements COSObjectable, Iterable
{
    private static final Logger LOG = LoggerFactory.getLogger(PDPageTree.class);

    private final COSDictionary root;
    private final PDDocument document;

    /**
     * Creates an empty page tree that is not attached to any document: the root is a new
     * dictionary of type {@code Pages} with an empty {@code Kids} array and a {@code Count} of
     * zero.
     */
    public PDPageTree()
    {
        COSDictionary emptyRoot = new COSDictionary();
        emptyRoot.setItem(COSName.TYPE, COSName.PAGES);
        emptyRoot.setItem(COSName.KIDS, new COSArray());
        emptyRoot.setItem(COSName.COUNT, COSInteger.ZERO);
        this.root = emptyRoot;
        this.document = null;
    }

    /**
     * Constructor for reading a page tree that is not associated with a document. Delegates to
     * the two-argument constructor passing {@code null} as the document.
     *
     * @param root A page tree root.
     */
    public PDPageTree(COSDictionary root)
    {
        this(root, null);
    }

    /**
     * Constructor for reading.
     * <p>
     * If {@code root} is itself a {@code Page} dictionary instead of a page tree node (bad PDFs,
     * see PDFBOX-3154), it is wrapped into a newly created root whose {@code Kids} array holds only
     * that page and whose {@code Count} is 1. In both cases the resulting tree root is tagged with
     * type {@code Pages}.
     *
     * @param root     A page tree root.
     * @param document The document which contains "root", may be {@code null}.
     */
    PDPageTree(COSDictionary root, PDDocument document)
    {
        requireNotNullArg(root, "Page tree root cannot be null");
        // repair bad PDFs which contain a Page dict instead of a page tree, see PDFBOX-3154
        if (COSName.PAGE.equals(root.getCOSName(COSName.TYPE)))
        {
            COSArray kids = new COSArray();
            kids.add(root);
            this.root = new COSDictionary();
            this.root.setItem(COSName.KIDS, kids);
            this.root.setInt(COSName.COUNT, 1);
        }
        else
        {
            this.root = root;
        }
        // tag the tree root (this.root), not the "root" argument: in the repair branch the argument
        // is the wrapped page, and retyping it as Pages would make isPageTreeNode() treat it as an
        // intermediate node without kids
        this.root.setItem(COSName.TYPE, COSName.PAGES);
        this.document = document;
    }

    /**
     * Similar to {@link #getInheritableAttribute(COSDictionary, COSName)} but also checks that the
     * returned element matches the expected type class.
     *
     * @param node  page object where the lookup starts
     * @param key   the key to look up
     * @param clazz expected type class
     * @param <T>   expected type of the value
     * @return the value found for the given key, or {@code null} if nothing was found or the value
     * is not an instance of {@code clazz}
     */
    public static <T extends COSBase> T getInheritableAttribute(COSDictionary node, COSName key,
            Class<T> clazz)
    {
        COSBase result = getInheritableAttribute(node, key);
        if (clazz.isInstance(result))
        {
            return clazz.cast(result);
        }
        return null;
    }

    /**
     * Looks up an attribute on a page object, walking up through the parent tree nodes when the
     * page itself does not define it. The lookup starts with an empty set of visited object ids.
     *
     * @param node page object
     * @param key  the key to look up
     * @return COS value for the given key, or {@code null} when no node on the way up defines it
     */
    public static COSBase getInheritableAttribute(COSDictionary node, COSName key)
    {
        Set<String> visitedObjectIds = new HashSet<>();
        return getInheritableAttribute(node, key, visitedObjectIds);
    }
    
    /**
     * Returns the given attribute, inheriting from parent tree nodes if necessary, while recording
     * the object ids of the nodes already walked so that a cyclic parent chain ends the lookup
     * instead of recursing forever.
     *
     * @param node             page object (or tree node) where the lookup starts
     * @param key              the key to look up
     * @param visitedObjectIds ids of the nodes already inspected; updated by this method
     * @return COS value for the given key, or {@code null} if it is not found or a loop in the
     * parent chain is detected
     */
    public static COSBase getInheritableAttribute(COSDictionary node, COSName key,
            Set<String> visitedObjectIds)
    {
        COSBase value = node.getDictionaryObject(key);
        if (value != null)
        {
            return value;
        }

        COSDictionary parent = node.getDictionaryObject(COSName.PARENT, COSName.P,
                COSDictionary.class);

        // prevent infinite recursion: a node that is its own parent
        if (parent == node)
        {
            return null;
        }

        if (parent != null)
        {
            String objId = ObjectIdUtils.getObjectIdOf(node);
            // a blank id cannot identify a node, so it never counts as "already visited"
            if (!objId.isBlank() && visitedObjectIds.contains(objId))
            {
                // prevent infinite recursion
                return null;
            }

            visitedObjectIds.add(objId);
            return getInheritableAttribute(parent, key, visitedObjectIds);
        }

        return null;
    }

    /**
     * Returns an iterator which walks all pages in the tree, in order.
     * <p>
     * When this tree belongs to a document, the number of pages collected by the iterator is
     * compared with the number of pages reported by the document, because the iterator silently
     * skips broken pages that could not be loaded. A tree without a document (see the constructors
     * that leave the document unset) has nothing to validate against and is returned as is.
     *
     * @return an iterator over the pages
     * @throws InvalidNumberOfPagesException if the number of pages found differs from the number
     *                                       reported by the document and no specific missing page
     *                                       could be identified
     */
    @Override
    public Iterator<PDPage> iterator()
    {
        PageIterator iterator = new PageIterator(root);
        if (document == null)
        {
            // no owning document: there is no expected page count to check
            return iterator;
        }

        // there's a problem when the expected number of pages is different than the actual number
        // of pages loaded because iterating will silently skip broken pages
        int expected = document.getNumberOfPages();
        if (iterator.size() != expected)
        {
            // try to throw a specific PageNotFound exception, identify which page is missing
            for (int i = 0; i < expected; i++)
            {
                get(i);
            }

            // throw a generic "something's wrong" exception
            throw new InvalidNumberOfPagesException(iterator.size(), expected);
        }

        return iterator;
    }

    /**
     * The stream is backed by {@link #iterator()} and sized with {@link #getCount()}.
     *
     * @return a sequential {@code Stream} over the pages of this page tree.
     */
    public Stream<PDPage> stream()
    {
        Iterator<PDPage> pages = iterator();
        return StreamSupport.stream(Spliterators.spliterator(pages, getCount(),
                Spliterator.ORDERED | Spliterator.NONNULL), false);
    }

    /**
     * The stream starts with the root and includes both intermediate nodes and pages.
     *
     * @return a sequential {@code Stream} over the nodes of this page tree.
     */
    public Stream<COSDictionary> streamNodes()
    {
        Iterator<COSDictionary> nodes = new NodesIterator(root);
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(nodes,
                Spliterator.ORDERED | Spliterator.NONNULL), false);
    }

    /**
     * Helper to get kids from malformed PDFs. Only the kids that resolve to a dictionary are
     * returned; null entries, {@link COSNull} and any other kind of object are dropped.
     *
     * @param node page tree node
     * @return list of kids, empty if the node has no {@code Kids} array
     */
    private List<COSDictionary> getKids(COSDictionary node)
    {
        COSArray kids = node.getDictionaryObject(COSName.KIDS, COSArray.class);
        if (isNull(kids))
        {
            return new ArrayList<>();
        }
        // we collect only non null, non COSNull COSDictionary kids
        return kids.stream().map(COSBase::getCOSObject).filter(kid -> kid != COSNull.NULL)
                .filter(Objects::nonNull).filter(COSDictionary.class::isInstance)
                .map(COSDictionary.class::cast).collect(toList());
    }

    /**
     * Iterator which walks all pages in the tree, in order. The whole tree is flattened when the
     * iterator is created: every node that is not a page tree node is queued as a page.
     */
    private final class PageIterator implements Iterator<PDPage>
    {
        // page dictionaries still to be returned, in document order
        private final Queue<COSDictionary> queue = new ArrayDeque<>();

        private PageIterator(COSDictionary node)
        {
            enqueueKids(node);
        }

        /**
         * Queues {@code node} if it is a page, otherwise descends into its kids.
         */
        private void enqueueKids(COSDictionary node)
        {
            if (isPageTreeNode(node))
            {
                getKids(node).forEach(this::enqueueKids);
            }
            else
            {
                queue.add(node);
            }
        }

        @Override
        public boolean hasNext()
        {
            return !queue.isEmpty();
        }

        /**
         * @return the next page, after making sure its dictionary is typed as {@code Page}
         * @throws NoSuchElementException if there are no more pages
         */
        @Override
        public PDPage next()
        {
            if (!hasNext())
            {
                throw new NoSuchElementException();
            }
            COSDictionary next = queue.poll();

            sanitizeType(next);

            ResourceCache resourceCache = document != null ? document.getResourceCache() : null;
            return new PDPage(next, resourceCache);
        }

        /**
         * @return the number of pages not yet returned by this iterator
         */
        public int size()
        {
            return queue.size();
        }

        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Iterator which walks all the nodes in the tree. The tree is flattened when the iterator is
     * created; each node is queued before its kids.
     */
    private final class NodesIterator implements Iterator<COSDictionary>
    {
        // nodes still to be returned
        private final Queue<COSDictionary> queue = new ArrayDeque<>();

        private NodesIterator(COSDictionary node)
        {
            enqueueKids(node);
        }

        /**
         * Queues {@code node} and, if it is a page tree node, all of its descendants.
         */
        private void enqueueKids(COSDictionary node)
        {
            queue.add(node);
            if (isPageTreeNode(node))
            {
                getKids(node).forEach(this::enqueueKids);
            }
        }

        @Override
        public boolean hasNext()
        {
            return !queue.isEmpty();
        }

        /**
         * @return the next node
         * @throws NoSuchElementException if there are no more nodes
         */
        @Override
        public COSDictionary next()
        {
            if (!hasNext())
            {
                throw new NoSuchElementException();
            }
            return queue.poll();
        }

        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Looks up a page by its position in the tree. The page dictionary is checked to be typed as
     * {@code Page} before it is wrapped.
     *
     * @param index zero-based index
     * @return the page at the given index, carrying the parent found while traversing the tree
     */
    public PDPage get(int index)
    {
        PageAndPageTreeParent located = get(index + 1, root, 0, null, new HashSet<>());
        sanitizeType(located.node);

        ResourceCache resourceCache = null;
        if (document != null)
        {
            resourceCache = document.getResourceCache();
        }
        return new PDPage(located.node, resourceCache, located.parent);
    }

    /**
     * Makes sure the given page dictionary is typed as {@code Page}: a missing type is logged as a
     * warning and set, any other type is logged as an error and overwritten.
     *
     * @param dictionary the page dictionary to check
     */
    private static void sanitizeType(COSDictionary dictionary)
    {
        COSName type = dictionary.getCOSName(COSName.TYPE);
        if (type == null)
        {
            LOG.warn("Missing required 'Page' type for page");
            dictionary.setName(COSName.TYPE, COSName.PAGE.getName());
            // read the type back so the check below works on the stored value
            type = dictionary.getCOSName(COSName.TYPE);
        }
        if (COSName.PAGE.equals(type))
        {
            return;
        }
        LOG.error("Expected 'Page' but found '{}'", type.getName());
        dictionary.setName(COSName.TYPE, COSName.PAGE.getName());
    }

    /**
     * Returns the given COS page using a depth-first search.
     *
     * @param pageNum        1-based page number
     * @param node           page tree node to search
     * @param encountered    number of pages encountered so far
     * @param pageTreeParent the parent node, as determined traversing the page tree top -> down
     * @param visited        nodes already entered during this search, used to detect loops
     * @return COS dictionary of the Page object together with its parent in the page tree
     * @throws PageNotFoundException if the page number is out of bounds or the page cannot be found
     * @throws IllegalStateException if a node is entered twice during the same search
     */
    private PageAndPageTreeParent get(int pageNum, COSDictionary node, int encountered,
            COSDictionary pageTreeParent, Set<COSDictionary> visited)
    {
        // page numbers are 1-based, so 0 (index -1) is already out of bounds
        require(pageNum > 0, () -> new PageNotFoundException(
                "Index out of bounds: " + pageNum + " in " + getSourcePath(), pageNum,
                getSourcePath()));
        require(visited.add(node), () -> new IllegalStateException(
                "Possible recursion found when searching for page " + pageNum));

        if (isPageTreeNode(node))
        {
            int count = node.getInt(COSName.COUNT, 0);
            if (pageNum <= encountered + count)
            {
                // it's a kid of this node
                for (COSDictionary kid : getKids(node))
                {
                    // which kid?
                    if (isPageTreeNode(kid))
                    {
                        int kidCount = kid.getInt(COSName.COUNT, 0);
                        if (pageNum <= encountered + kidCount)
                        {
                            // it's this kid
                            return get(pageNum, kid, encountered, node, visited);
                        }
                        // the page is further on: skip all the pages below this kid
                        encountered += kidCount;
                    }
                    else
                    {
                        // single page
                        encountered++;
                        if (pageNum == encountered)
                        {
                            // it's this page
                            return get(pageNum, kid, encountered, node, visited);
                        }
                    }
                }
                // the declared Count promised the page but the kids did not contain it
                throw new PageNotFoundException(
                        "Unable to find page " + pageNum + " in " + getSourcePath(), pageNum,
                        getSourcePath());
            }
            throw new PageNotFoundException(
                    "Index out of bounds: " + pageNum + " in " + getSourcePath(), pageNum,
                    getSourcePath());
        }
        // leaf: it is the searched page only if the running counter matches
        if (encountered == pageNum)
        {
            return new PageAndPageTreeParent(node, pageTreeParent);
        }

        throw new PageNotFoundException("Unable to find page " + pageNum + " in " + getSourcePath(),
                pageNum, getSourcePath());
    }

    /**
     * @return the owner identifier of the root object id, used in error messages, or "Unknown" if
     * the root has no id or the id has no owner identifier
     */
    private String getSourcePath()
    {
        var identifier = getCOSObject().id();
        if (identifier == null || identifier.ownerIdentifier == null)
        {
            return "Unknown";
        }
        return identifier.ownerIdentifier;
    }

    /**
     * Tells whether a dictionary is a page tree node rather than a page.
     *
     * @param node the dictionary to check, may be {@code null}
     * @return true if the node is a page tree node (i.e. an intermediate node): it is not
     * {@code null} and either is typed as {@code Pages} or has a {@code Kids} entry.
     */
    public static boolean isPageTreeNode(COSDictionary node)
    {
        if (isNull(node))
        {
            return false;
        }
        // some files such as PDFBOX-2250-229205.pdf don't have Pages set as the Type, so we have
        // to check for the presence of Kids too
        return COSName.PAGES.equals(node.getCOSName(COSName.TYPE)) || node.containsKey(
                COSName.KIDS);
    }

    /**
     * Searches the tree for the given page, counting the pages met along the way.
     *
     * @param page The page to search for.
     * @return the zero-based index of the given page, or -1 if the page is not found.
     */
    public int indexOf(PDPage page)
    {
        SearchContext search = new SearchContext(page);
        boolean located = findPage(search, root);
        return located ? search.index : -1;
    }

    /**
     * Walks the kids of {@code node} in order, descending into intermediate nodes and reporting
     * every page to the search context, until the searched page is found or the kids are
     * exhausted.
     *
     * @param context search state, updated while walking
     * @param node    the page tree node whose kids are walked
     * @return true if the searched page has been found
     */
    private boolean findPage(SearchContext context, COSDictionary node)
    {
        Iterator<COSDictionary> kids = getKids(node).iterator();
        while (!context.found && kids.hasNext())
        {
            COSDictionary kid = kids.next();
            if (!isPageTreeNode(kid))
            {
                context.visitPage(kid);
                continue;
            }
            findPage(context, kid);
        }
        return context.found;
    }

    /**
     * Mutable state of a page search: the dictionary being searched, the zero-based index of the
     * last page visited (-1 before any visit) and whether that page was the searched one.
     */
    private static final class SearchContext
    {
        private final COSDictionary searched;
        private int index;
        private boolean found;

        private SearchContext(PDPage page)
        {
            this.searched = page.getCOSObject();
            this.index = -1;
            this.found = false;
        }

        /**
         * Counts the given page and records whether it equals the searched one.
         */
        private void visitPage(COSDictionary current)
        {
            this.index += 1;
            this.found = this.searched.equals(current);
        }
    }

    /**
     * Returns the number of leaf nodes (page objects) that are descendants of this root, as
     * declared by the {@code Count} entry of the root dictionary; 0 is passed as the default to
     * the lookup.
     *
     * @return the declared number of pages
     */
    public int getCount()
    {
        int declaredPages = root.getInt(COSName.COUNT, 0);
        return declaredPages;
    }

    /**
     * @return the root dictionary of this page tree
     */
    @Override
    public COSDictionary getCOSObject()
    {
        return this.root;
    }

    /**
     * Removes the page with the given index from the page tree. The page is first located by
     * traversing the tree, so that its parent in the tree is known even if the page dictionary
     * does not declare one.
     *
     * @param index zero-based page index
     */
    public void remove(int index)
    {
        int pageNum = index + 1;
        PageAndPageTreeParent located = get(pageNum, root, 0, null, new HashSet<>());
        remove(located.node, located.parent);
    }

    /**
     * Removes the given page from the page tree, relying on the parent declared by the page
     * dictionary since no parent is known from a tree traversal.
     *
     * @param page The page to remove.
     */
    public void remove(PDPage page)
    {
        COSDictionary pageDictionary = page.getCOSObject();
        remove(pageDictionary, null);
    }

    /**
     * Removes the given COS page when no parent is known from a tree traversal.
     *
     * @param node the page dictionary to remove
     */
    private void remove(COSDictionary node)
    {
        COSDictionary noKnownParent = null;
        remove(node, noKnownParent);
    }

    /**
     * Removes the given COS page from the kids of its parent and, if it was actually removed,
     * decrements the {@code Count} of the parent and of every ancestor.
     * <p>
     * Nothing happens if no parent can be determined, if the parent has no {@code Kids} array or if
     * the kids do not contain the page.
     *
     * @param node        the page dictionary to remove
     * @param knownParent the parent found traversing the page tree, used when the page does not
     *                    declare one; may be {@code null}
     */
    private void remove(COSDictionary node, COSDictionary knownParent)
    {
        // remove from parent's kids
        COSDictionary parent = node.getDictionaryObject(COSName.PARENT, COSName.P,
                COSDictionary.class);
        if (parent == null)
        {
            // broken node with missing PARENT, use the one known from traversing the page tree
            parent = knownParent;
        }
        if (parent == null)
        {
            // orphan page: it is not a kid of any node, there is nothing to remove
            return;
        }

        COSArray kids = parent.getDictionaryObject(COSName.KIDS, COSArray.class);
        if (kids == null || !kids.removeObject(node))
        {
            return;
        }

        // update the parent and all the ancestor counts
        COSDictionary ancestor = parent;
        while (ancestor != null)
        {
            ancestor.setInt(COSName.COUNT, ancestor.getInt(COSName.COUNT) - 1);
            ancestor = ancestor.getDictionaryObject(COSName.PARENT, COSName.P,
                    COSDictionary.class);
        }
    }

    /**
     * Adds the given page to this page tree as the last kid of the root, makes the root its parent
     * and increments the {@code Count} of the root and of every ancestor of the root.
     *
     * @param page The page to add.
     */
    public void add(PDPage page)
    {
        // set parent
        COSDictionary node = page.getCOSObject();
        node.setItem(COSName.PARENT, root);

        // todo: re-balance tree? (or at least group new pages into tree nodes of e.g. 20)

        // add to parent's kids, creating the array for roots that come without one
        COSArray kids = root.getDictionaryObject(COSName.KIDS, COSArray.class);
        if (kids == null)
        {
            kids = new COSArray();
            root.setItem(COSName.KIDS, kids);
        }
        kids.add(node);

        // update ancestor counts; the typed lookup ends the walk on a parent entry that is not a
        // dictionary instead of failing with a ClassCastException
        COSDictionary ancestor = node.getDictionaryObject(COSName.PARENT, COSName.P,
                COSDictionary.class);
        while (ancestor != null)
        {
            ancestor.setInt(COSName.COUNT, ancestor.getInt(COSName.COUNT) + 1);
            ancestor = ancestor.getDictionaryObject(COSName.PARENT, COSName.P,
                    COSDictionary.class);
        }
    }

    /**
     * Insert a page before another page within a page tree. The parent found while traversing the
     * page tree, when available on {@code nextPage}, is preferred over the parent declared by its
     * dictionary.
     *
     * @param newPage  the page to be inserted.
     * @param nextPage the page that is to be after the new page.
     * @throws IllegalArgumentException if {@code nextPage} isn't part of a page tree: it has no
     *                                  parent, or the kids of its parent do not contain it.
     */
    public void insertBefore(PDPage newPage, PDPage nextPage)
    {
        COSDictionary nextPageDict = nextPage.getCOSObject();
        COSDictionary parentDict = nextPage.getPageTreeParent();
        if (parentDict == null)
        {
            parentDict = nextPageDict.getDictionaryObject(COSName.PARENT, COSDictionary.class);
        }

        COSArray kids = null;
        if (parentDict != null)
        {
            kids = parentDict.getDictionaryObject(COSName.KIDS, COSArray.class);
        }

        boolean found = false;
        if (kids != null)
        {
            for (int i = 0; i < kids.size(); ++i)
            {
                // compare from the non-null side: kids may hold nulls or non dictionary entries
                if (nextPageDict.equals(kids.getObject(i)))
                {
                    kids.add(i, newPage.getCOSObject());
                    newPage.getCOSObject().setItem(COSName.PARENT, parentDict);
                    found = true;
                    break;
                }
            }
        }
        if (!found)
        {
            throw new IllegalArgumentException("attempted to insert before orphan page");
        }
        increaseParents(parentDict);
    }

    /**
     * Insert a page after another page within a page tree. The parent found while traversing the
     * page tree, when available on {@code prevPage}, is preferred over the parent declared by its
     * dictionary.
     *
     * @param newPage  the page to be inserted.
     * @param prevPage the page that is to be before the new page.
     * @throws IllegalArgumentException if {@code prevPage} isn't part of a page tree: it has no
     *                                  parent, or the kids of its parent do not contain it.
     */
    public void insertAfter(PDPage newPage, PDPage prevPage)
    {
        COSDictionary prevPageDict = prevPage.getCOSObject();
        COSDictionary parentDict = prevPage.getPageTreeParent();
        if (parentDict == null)
        {
            parentDict = prevPageDict.getDictionaryObject(COSName.PARENT, COSDictionary.class);
        }

        COSArray kids = null;
        if (parentDict != null)
        {
            kids = parentDict.getDictionaryObject(COSName.KIDS, COSArray.class);
        }

        boolean found = false;
        if (kids != null)
        {
            for (int i = 0; i < kids.size(); ++i)
            {
                // compare from the non-null side: kids may hold nulls or non dictionary entries
                if (prevPageDict.equals(kids.getObject(i)))
                {
                    kids.add(i + 1, newPage.getCOSObject());
                    newPage.getCOSObject().setItem(COSName.PARENT, parentDict);
                    found = true;
                    break;
                }
            }
        }
        if (!found)
        {
            throw new IllegalArgumentException("attempted to insert after orphan page");
        }
        increaseParents(parentDict);
    }

    /**
     * Increments by one the {@code Count} of the given node and of every ancestor reachable through
     * the {@code Parent} entries.
     *
     * @param parentDict the node where the walk starts
     */
    private void increaseParents(COSDictionary parentDict)
    {
        COSDictionary current = parentDict;
        while (current != null)
        {
            current.setInt(COSName.COUNT, current.getInt(COSName.COUNT) + 1);
            // typed lookup: a parent entry that is not a dictionary ends the walk instead of
            // failing with a ClassCastException
            current = current.getDictionaryObject(COSName.PARENT, COSDictionary.class);
        }
    }

    /**
     * Pairs a page dictionary with the parent node met when walking the page tree from the top
     * down. NOTE: for invalid documents that parent can differ from the one the page declares in
     * its own PARENT entry, which is why it is carried along and used when adding new pages and
     * updating the kids counters of the page tree.
     */
    public static class PageAndPageTreeParent
    {
        /** The page dictionary. */
        public final COSDictionary node;
        /** The parent of the page as found traversing the page tree. */
        public final COSDictionary parent;

        /**
         * @param node   the page dictionary
         * @param parent the parent of the page as found traversing the page tree
         */
        public PageAndPageTreeParent(COSDictionary node, COSDictionary parent)
        {
            this.parent = parent;
            this.node = node;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy