org.apache.fop.render.pdf.PDFLogicalStructureHandler Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of fop Show documentation
Apache FOP (Formatting Objects Processor) is the world's first print formatter driven by XSL formatting objects (XSL-FO) and the world's first output independent formatter. It is a Java application that reads a formatting object (FO) tree and renders the resulting pages to a specified output. Output formats currently supported include PDF, PCL, PS, AFP, TIFF, PNG, SVG, XML (area tree representation), Print, AWT and TXT. The primary output target is PDF.
There is a newer version: 2.9
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id: PDFLogicalStructureHandler.java 898840 2010-01-13 17:05:59Z vhennebert $ */

package org.apache.fop.render.pdf;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import org.apache.fop.events.EventBroadcaster;
import org.apache.fop.fo.extensions.ExtensionElementMapping;
import org.apache.fop.fo.extensions.InternalElementMapping;
import org.apache.fop.pdf.PDFArray;
import org.apache.fop.pdf.PDFDictionary;
import org.apache.fop.pdf.PDFDocument;
import org.apache.fop.pdf.PDFLink;
import org.apache.fop.pdf.PDFName;
import org.apache.fop.pdf.PDFPage;
import org.apache.fop.pdf.PDFParentTree;
import org.apache.fop.pdf.PDFStructElem;
import org.apache.fop.pdf.PDFStructTreeRoot;


/**
 * Handles the creation of the logical structure in the PDF document.
 */
class PDFLogicalStructureHandler {

    private static final PDFName MCR = new PDFName("MCR");

    private static final PDFName OBJR = new PDFName("OBJR");

    private static final MarkedContentInfo ARTIFACT = new MarkedContentInfo(null, -1, null);

    private final PDFDocument pdfDoc;

    private final EventBroadcaster eventBroadcaster;

    /**
     * Map of references to the corresponding structure elements.
     */
    private final Map structTreeMap = new HashMap();

    private final PDFParentTree parentTree = new PDFParentTree();

    private int parentTreeKey;

    private PDFPage currentPage;

    /**
     * The array of references, from marked-content sequences in the current
     * page, to their parent structure elements. This will be a value in the
     * structure parent tree, whose corresponding key will be the page's
     * StructParents entry.
     */
    private PDFArray pageParentTreeArray;

    private PDFStructElem rootStructureElement;

    /**
     * Class providing the necessary information for bracketing content
     * associated to a structure element as a marked-content sequence.
     */
    static final class MarkedContentInfo {

        /**
         * A value that can be used for the tag operand of a marked-content
         * operator. This is the structure type of the corresponding structure
         * element.
         */
        final String tag;

        /**
         * The value for the MCID entry of the marked-content sequence's property list.
         */
        final int mcid;

        private final PDFStructElem parent;

        private MarkedContentInfo(String tag, int mcid, PDFStructElem parent) {
            this.tag = tag;
            this.mcid = mcid;
            this.parent = parent;
        }
    }

    /**
     * Creates a new instance for handling the logical structure of the given document.
     *
     * @param pdfDoc a document
     */
    PDFLogicalStructureHandler(PDFDocument pdfDoc, EventBroadcaster eventBroadcaster) {
        this.pdfDoc = pdfDoc;
        this.eventBroadcaster = eventBroadcaster;
        PDFStructTreeRoot structTreeRoot = pdfDoc.getFactory().makeStructTreeRoot(parentTree);
        rootStructureElement = pdfDoc.getFactory().makeStructureElement(
                FOToPDFRoleMap.mapFormattingObject("root", structTreeRoot), structTreeRoot);
        structTreeRoot.addKid(rootStructureElement);
    }

    /**
     * Converts the given structure tree into PDF.
     *
     * @param structureTree the structure tree of the current page sequence
     * @param language language set on the page sequence
     */
    void processStructureTree(NodeList structureTree, Locale language) {
        pdfDoc.enforceLanguageOnRoot();
        PDFStructElem structElemPart = pdfDoc.getFactory().makeStructureElement(
                FOToPDFRoleMap.mapFormattingObject("page-sequence", rootStructureElement),
                rootStructureElement);
        rootStructureElement.addKid(structElemPart);
        if (language != null) {
            structElemPart.setLanguage(language);
        }

        for (int i = 0, n = structureTree.getLength(); i < n; i++) {
            Node node = structureTree.item(i);
            assert node.getLocalName().equals("flow")
                    || node.getLocalName().equals("static-content");
            PDFStructElem structElemSect = pdfDoc.getFactory().makeStructureElement(
                    FOToPDFRoleMap.mapFormattingObject(node.getLocalName(), structElemPart),
                    structElemPart);
            structElemPart.addKid(structElemSect);
            NodeList childNodes = node.getChildNodes();
            for (int j = 0, m = childNodes.getLength(); j < m; j++) {
                processNode(childNodes.item(j), structElemSect, true);
            }
        }
    }

    private void processNode(Node node, PDFStructElem parent, boolean addKid) {
        Node attr = node.getAttributes().getNamedItemNS(InternalElementMapping.URI, "ptr");
        assert attr != null;
        String ptr = attr.getNodeValue();
        PDFStructElem structElem = pdfDoc.getFactory().makeStructureElement(
                FOToPDFRoleMap.mapFormattingObject(node, parent, eventBroadcaster), parent);
        // TODO necessary? If a page-sequence is empty (e.g., contains a single
        // empty fo:block), should the block still be added to the structure
        // tree? This is not being done for descendant empty elements...
        if (addKid) {
            parent.addKid(structElem);
        }
        String nodeName = node.getLocalName();
        if (nodeName.equals("external-graphic") || nodeName.equals("instream-foreign-object")) {
            Node altTextNode = node.getAttributes().getNamedItemNS(
                    ExtensionElementMapping.URI, "alt-text");
            if (altTextNode != null) {
                structElem.put("Alt", altTextNode.getNodeValue());
            } else {
                structElem.put("Alt", "No alternate text specified");
            }
        }
        structTreeMap.put(ptr, structElem);
        NodeList nodes = node.getChildNodes();
        for (int i = 0, n = nodes.getLength(); i < n; i++) {
            processNode(nodes.item(i), structElem, false);
        }
    }

    private int getNextParentTreeKey() {
        return parentTreeKey++;
    }

    /**
     * Receive notification of the beginning of a new page.
     *
     * @param page the page that will be rendered in PDF
     */
    void startPage(PDFPage page) {
        currentPage = page;
        currentPage.setStructParents(getNextParentTreeKey());
        pageParentTreeArray = new PDFArray();
    }

    /**
     * Receive notification of the end of the current page.
     */
    void endPage() {
        // TODO
        // Values in a number tree must be indirect references to the PDF
        // objects associated to the keys. To enforce that the array is
        // registered to the PDF document. Unfortunately that can't be done
        // earlier since a call to PDFContentGenerator.flushPDFDoc can be made
        // before the array is complete, which would result in only part of it
        // being output to the PDF.
        // This should really be handled by PDFNumsArray
        pdfDoc.registerObject(pageParentTreeArray);
        parentTree.getNums().put(currentPage.getStructParents(), pageParentTreeArray);
    }

    private MarkedContentInfo addToParentTree(String structurePointer) {
        PDFStructElem parent = (PDFStructElem) structTreeMap.get(structurePointer);
        if (parent == null) {
            return ARTIFACT;
        } else {
            pageParentTreeArray.add(parent);
            String type = parent.getStructureType().toString();
            int mcid = pageParentTreeArray.length() - 1;
            return new MarkedContentInfo(type, mcid, parent);
        }
    }

    /**
     * Adds a content item corresponding to text into the structure tree, if
     * there is a structure element associated to it.
     *
     * @param structurePointer reference to the parent structure element of the
     * piece of text
     * @return the necessary information for bracketing the content as a
     * marked-content sequence. If there is no element in the structure tree
     * associated to that content, returns an instance whose
     * {@link MarkedContentInfo#tag} value is null. The content
     * must then be treated as an artifact.
     */
    MarkedContentInfo addTextContentItem(String structurePointer) {
        MarkedContentInfo mci = addToParentTree(structurePointer);
        if (mci != ARTIFACT) {
            PDFDictionary contentItem = new PDFDictionary();
            contentItem.put("Type", MCR);
            contentItem.put("Pg", this.currentPage);
            contentItem.put("MCID", mci.mcid);
            mci.parent.addKid(contentItem);
        }
        return mci;
    }

    /**
     * Adds a content item corresponding to an image into the structure tree, if
     * there is a structure element associated to it.
     *
     * @param structurePointer reference to the parent structure element of the
     * image
     * @return the necessary information for bracketing the content as a
     * marked-content sequence. If there is no element in the structure tree
     * associated to that image, returns an instance whose
     * {@link MarkedContentInfo#tag} value is null. The image
     * must then be treated as an artifact.
     */
    MarkedContentInfo addImageContentItem(String structurePointer) {
        MarkedContentInfo mci = addToParentTree(structurePointer);
        if (mci != ARTIFACT) {
            mci.parent.setMCIDKid(mci.mcid);
            mci.parent.setPage(this.currentPage);
        }
        return mci;
    }

    // While the PDF spec allows images to be referred as PDF objects, this
    // makes the Acrobat Pro checker complain that the image is not accessible.
    // Its alt-text is still read aloud though. Using marked-content sequences
    // like for text works.
//    MarkedContentInfo addImageObject(String parentReference) {
//        MarkedContentInfo mci = addToParentTree(parentReference);
//        if (mci != ARTIFACT) {
//            PDFDictionary contentItem = new PDFDictionary();
//            contentItem.put("Type", OBJR);
//            contentItem.put("Pg", this.currentPage);
//            contentItem.put("Obj", null);
//            mci.parent.addKid(contentItem);
//        }
//        return mci;
//    }

    /**
     * Adds a content item corresponding to the given link into the structure
     * tree.
     *
     * @param link a link
     * @param structurePointer reference to the corresponding parent structure element
     */
    void addLinkContentItem(PDFLink link, String structurePointer) {
        int structParent = getNextParentTreeKey();
        link.setStructParent(structParent);
        parentTree.getNums().put(structParent, link);
        PDFDictionary contentItem = new PDFDictionary();
        contentItem.put("Type", OBJR);
        contentItem.put("Pg", this.currentPage);
        contentItem.put("Obj", link);
        PDFStructElem parent = (PDFStructElem) structTreeMap.get(structurePointer);
        parent.addKid(contentItem);
    }

}