All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.sax.XHTMLContentHandler Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

/**
 * Content handler decorator that simplifies the task of producing XHTML
 * events for Tika content parsers.
 */
public class XHTMLContentHandler extends SafeContentHandler {

    /**
     * The XHTML namespace URI
     */
    public static final String XHTML = "http://www.w3.org/1999/xhtml";
    /**
     * The elements that get appended with the {@link #NL} character.
     */
    public static final Set ENDLINE =
            unmodifiableSet("p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre",
                    "hr", "blockquote", "address", "fieldset", "table", "form", "noscript", "li",
                    "dt", "dd", "noframes", "br", "tr", "select", "option", "link", "script");
    /**
     * The newline character that gets inserted after block elements.
     */
    private static final char[] NL = new char[]{'\n'};
    /**
     * The tab character gets inserted before table cells and list items.
     */
    private static final char[] TAB = new char[]{'\t'};
    /**
     * The elements that are in the  section.
     */
    private static final Set HEAD =
            unmodifiableSet("title", "link", "base", "meta", "script");
    /**
     * The elements that are automatically emitted by lazyStartHead, so
     * skip them if they get sent to startElement/endElement by mistake.
     */
    private static final Set AUTO = unmodifiableSet("head", "frameset");
    /**
     * The elements that get prepended with the {@link #TAB} character.
     */
    private static final Set INDENT =
            unmodifiableSet("li", "dd", "dt", "td", "th", "frame");
    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
    /**
     * Metadata associated with the document. Used to fill in the
     * <head/> section.
     */
    private final Metadata metadata;
    /**
     * Flag to indicate whether the document has been started.
     */
    private boolean documentStarted = false;
    /**
     * Flags to indicate whether the document head element has been started/ended.
     */
    private boolean headStarted = false;
    private boolean headEnded = false;
    private boolean useFrameset = false;
    public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
        super(handler);
        this.metadata = metadata;
    }

    private static Set unmodifiableSet(String... elements) {
        return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(elements)));
    }

    /**
     * Starts an XHTML document by setting up the namespace mappings
     * when called for the first time.
     * The standard XHTML prefix is generated lazily when the first
     * element is started.
     */
    @Override
    public void startDocument() throws SAXException {
        if (!documentStarted) {
            documentStarted = true;
            super.startDocument();
            startPrefixMapping("", XHTML);
        }
    }

    /**
     * Generates the following XHTML prefix when called for the first time:
     * 
     * <html>
     *   <head>
     *     <title>...</title>
     *   </head>
     *   <body>
     * 
*/ private void lazyStartHead() throws SAXException { if (!headStarted) { headStarted = true; // Call directly, so we don't go through our startElement(), which will // ignore these elements. AttributesImpl htmlAttrs = new AttributesImpl(); String lang = metadata.get(Metadata.CONTENT_LANGUAGE); if (lang != null) { htmlAttrs.addAttribute("", "lang", "lang", "CDATA", lang); } super.startElement(XHTML, "html", "html", htmlAttrs); newline(); super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES); newline(); } } /** * Generates the following XHTML prefix when called for the first time: *
     * <html>
     *   <head>
     *     <title>...</title>
     *   </head>
     *   <body> (or <frameset>
     * 
*/ private void lazyEndHead(boolean isFrameset) throws SAXException { lazyStartHead(); if (!headEnded) { headEnded = true; useFrameset = isFrameset; // TIKA-478: Emit all metadata values (other than title). We have to call // startElement() and characters() directly to avoid recursive problems. for (String name : metadata.names()) { if (name.equals("title")) { continue; } for (String value : metadata.getValues(name)) { // Putting null values into attributes causes problems, but is // allowed by Metadata, so guard against that. if (value != null) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "name", "name", "CDATA", name); attributes.addAttribute("", "content", "content", "CDATA", value); super.startElement(XHTML, "meta", "meta", attributes); super.endElement(XHTML, "meta", "meta"); newline(); } } } super.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES); String title = metadata.get(TikaCoreProperties.TITLE); if (title != null && title.length() > 0) { char[] titleChars = title.toCharArray(); super.characters(titleChars, 0, titleChars.length); } else { // TIKA-725: Prefer over super.characters(new char[0], 0, 0); } super.endElement(XHTML, "title", "title"); newline(); super.endElement(XHTML, "head", "head"); newline(); if (useFrameset) { super.startElement(XHTML, "frameset", "frameset", EMPTY_ATTRIBUTES); } else { super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES); } } } /** * Ends the XHTML document by writing the following footer and * clearing the namespace mappings: * <pre> * </body> * </html> * </pre> */ @Override public void endDocument() throws SAXException { lazyEndHead(useFrameset); if (useFrameset) { super.endElement(XHTML, "frameset", "frameset"); } else { super.endElement(XHTML, "body", "body"); } super.endElement(XHTML, "html", "html"); endPrefixMapping(""); super.endDocument(); } /** * Starts the given element. Table cells and list items are automatically * indented by emitting a tab character as ignorable whitespace. */ @Override public void startElement(String uri, String local, String name, Attributes attributes) throws SAXException { if (name.equals("frameset")) { lazyEndHead(true); } else if (!AUTO.contains(name)) { if (HEAD.contains(name)) { lazyStartHead(); } else { lazyEndHead(false); } if (XHTML.equals(uri) && INDENT.contains(name)) { ignorableWhitespace(TAB, 0, TAB.length); } super.startElement(uri, local, name, attributes); } } /** * Ends the given element. Block elements are automatically followed * by a newline character. */ @Override public void endElement(String uri, String local, String name) throws SAXException { if (!AUTO.contains(name)) { super.endElement(uri, local, name); if (XHTML.equals(uri) && ENDLINE.contains(name)) { newline(); } } } /** * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> */ @Override public void characters(char[] ch, int start, int length) throws SAXException { lazyEndHead(useFrameset); super.characters(ch, start, length); } //------------------------------------------< public convenience methods > public void startElement(String name) throws SAXException { startElement(XHTML, name, name, EMPTY_ATTRIBUTES); } public void startElement(String name, String attribute, String value) throws SAXException { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", attribute, attribute, "CDATA", value); startElement(XHTML, name, name, attributes); } public void startElement(String name, AttributesImpl attributes) throws SAXException { startElement(XHTML, name, name, attributes); } public void endElement(String name) throws SAXException { endElement(XHTML, name, name); } public void characters(String characters) throws SAXException { if (characters != null && characters.length() > 0) { characters(characters.toCharArray(), 0, characters.length()); } } public void newline() throws SAXException { ignorableWhitespace(NL, 0, NL.length); } /** * Emits an XHTML element with the given text content. If the given * text value is null or empty, then the element is not written. * * @param name XHTML element name * @param value element value, possibly <code>null</code> * @throws SAXException if the content element could not be written */ public void element(String name, String value) throws SAXException { if (value != null && value.length() > 0) { startElement(name); characters(value); endElement(name); } } @Override protected boolean isInvalid(int ch) { if (super.isInvalid(ch)) { return true; } // These control chars are invalid in XHTML. return 0x7F <= ch && ch <= 0x9F; } } </code></pre> <br/> <br/> <!--<div id="right-banner">--> <!--</div>--> <!--<div id="left-banner">--> <!--</div>--> <div class='clear'></div> </main> </div> <br/><br/> <div class="align-center">© 2015 - 2024 <a href="/legal-notice.php">Weber Informatics LLC</a> | <a href="/data-protection.php">Privacy Policy</a></div> <br/><br/><br/><br/><br/><br/> </body> </html>