org.xhtmlrenderer.pdf.HTMLOutline Maven / Gradle / Ivy
/*
* {{{ header & license
* Copyright (c) 2016 Stanimir Stamenkov
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
* }}}
*/
package org.xhtmlrenderer.pdf;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xhtmlrenderer.pdf.ITextOutputDevice.Bookmark;
import org.xhtmlrenderer.render.Box;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Pattern.CASE_INSENSITIVE;
class HTMLOutline {
private static final Pattern HEADING = Pattern.compile("h(\\d+)", CASE_INSENSITIVE);
/** sectioning roots */
private static final Pattern ROOT = Pattern.compile("blockquote|details|fieldset|figure|td", CASE_INSENSITIVE);
private static final Pattern WS = Pattern.compile("\\s+");
private static final int MAX_NAME_LENGTH = 200;
private final HTMLOutline parent;
private final int level;
private final Bookmark bookmark;
private HTMLOutline() {
this(0, "root", null);
}
private HTMLOutline(int level, String name, HTMLOutline parent) {
this.level = level;
this.bookmark = new Bookmark(name, "");
this.parent = parent;
if (parent != null) {
parent.bookmark.addChild(bookmark);
}
}
/**
* Creates a bookmark list of the document outline generated for the given
* element context (usually the root document element).
*
* The current algorithm is more simple than the one suggested in the HTML5
* specification such as it is not affected by
* sectioning
* content but just the heading level. For
* example:
*
* <body>
* <h1>Foo</h1>
* <h3>Bar</h3>
* <blockquote>
* <h5>Bla</h5>
* </blockquote>
* <p>Baz</p>
* <h2>Quux</h2>
* <section>
* <h3>Thud</h3>
* </section>
* <h4>Grunt</h4>
* </body>
*
* Should generate outline as:
*
* - Foo
*
* - Bar
* - Quux
* - Thud
* - Grunt
*
*
*
* But it generates outline as:
*
* - Foo
*
* - Bar
* - Quux
*
* - Thud
*
* - Grunt
*
*
*
*
*
* Example document customizations
*
* Include non-heading element as bookmark (level 4)
*
* <strong data-pdf-bookmark="4">Foo bar</strong>
*
* Specify bookmark name
*
* <tr data-pdf-bookmark="5" data-pdf-bookmark-name="Bar baz">...</tr>
*
* Exclude individual heading from bookmarks
*
* <h3 data-pdf-bookmark="none">Baz qux</h3>
*
* Prevent automatic bookmarks for the whole of the document
*
* <html data-pdf-bookmark="exclude">...</html>
*
* @param context the top element a sectioning outline would be generated for;
* @param box box hierarchy the outline bookmarks would get mapped into.
* @return Bookmarks of the outline generated for the given element context.
* @see Creating an outline
*/
public static List generate(Element context, Box box) {
NodeIterator iterator = NestedSectioningFilter.iterator(context);
if (iterator == null) {
return Collections.emptyList();
}
HTMLOutline root = new HTMLOutline();
HTMLOutline current = root;
Map map = new IdentityHashMap<>();
for (Element element = (Element) iterator.nextNode();
element != null; element = (Element) iterator.nextNode()) {
int level;
try {
level = Integer.parseInt(getOutlineLevel(element));
if (level < 1) {
continue; // Illegal value
}
} catch (NumberFormatException ignore) {
continue; // Invalid value
}
String name = getBookmarkName(element);
while (current.level >= level) {
current = current.parent;
}
current = new HTMLOutline(level, name, current);
map.put(element, current.bookmark);
}
initBoxRefs(map, box);
return root.bookmark.getChildren();
}
private static void initBoxRefs(Map map, Box box) {
Bookmark bookmark = map.get(box.getElement());
if (bookmark != null) {
bookmark.setBox(box);
}
for (int i = 0, len = box.getChildCount(); i < len; i++) {
initBoxRefs(map, box.getChild(i));
}
}
private static String getBookmarkName(Element element) {
String name = element.getAttribute("data-pdf-bookmark-name").trim();
if (name.isEmpty()) {
name = element.getTextContent();
}
name = WS.matcher(name.trim()).replaceAll(" ");
if (name.length() > MAX_NAME_LENGTH) {
name = name.substring(0, MAX_NAME_LENGTH);
}
return name;
}
private static String getOutlineLevel(Element element) {
String bookmark = element.getAttribute("data-pdf-bookmark").trim();
return bookmark.isEmpty() ?
getOutlineLevelFromTagName(element.getTagName()) :
bookmark;
}
static String getOutlineLevelFromTagName(String tagName) {
Matcher heading = HEADING.matcher(tagName);
if (heading.matches()) {
return heading.group(1);
} else if (ROOT.matcher(tagName).matches()) {
return "exclude";
} else {
return "none";
}
}
private static class NestedSectioningFilter implements NodeFilter {
private static final NestedSectioningFilter INSTANCE = new NestedSectioningFilter();
private static NodeIterator iterator(Element root) {
Document ownerDocument = root.getOwnerDocument();
return (ownerDocument instanceof DocumentTraversal)
? ((DocumentTraversal) ownerDocument).createNodeIterator(root, SHOW_ELEMENT, INSTANCE, true)
: null;
}
@Override
public short acceptNode(Node n) {
String outlineLevel = getOutlineLevel((Element) n);
if (outlineLevel.equalsIgnoreCase("none")) {
return FILTER_SKIP;
}
return outlineLevel.equalsIgnoreCase("exclude")
? FILTER_REJECT
: FILTER_ACCEPT;
}
}
}