All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.sax.XHTMLContentHandler Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Content handler decorator that simplifies the task of producing XHTML
 * events for Tika content parsers.
 */
public class XHTMLContentHandler extends SafeContentHandler {

    /**
     * The XHTML namespace URI
     */
    public static final String XHTML = "http://www.w3.org/1999/xhtml";

    /**
     * The newline character that gets inserted after block elements.
     */
    private static final char[] NL = new char[] { '\n' };

    /**
     * The tab character gets inserted before table cells and list items.
     */
    private static final char[] TAB = new char[] { '\t' };

    /**
     * The elements that are in the  section.
     */
    private static final Set HEAD =
        unmodifiableSet("title", "link", "base", "meta", "script");

    /**
     * The elements that are automatically emitted by lazyStartHead, so
     * skip them if they get sent to startElement/endElement by mistake.
     */
    private static final Set AUTO =
        unmodifiableSet("head", "frameset");

    /**
     * The elements that get prepended with the {@link #TAB} character.
     */
    private static final Set INDENT =
        unmodifiableSet("li", "dd", "dt", "td", "th", "frame");

    /**
     * The elements that get appended with the {@link #NL} character.
     */
    public static final Set ENDLINE = unmodifiableSet(
            "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
            "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
            "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select",
            "option", "link", "script");

    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();

    private static Set unmodifiableSet(String... elements) {
        return Collections.unmodifiableSet(
                new HashSet(Arrays.asList(elements)));
    }

    /**
     * Metadata associated with the document. Used to fill in the
     * <head/> section.
     */
    private final Metadata metadata;

    /**
     * Flag to indicate whether the document has been started.
     */
    private boolean documentStarted = false;
    
    /**
     * Flags to indicate whether the document head element has been started/ended.
     */
    private boolean headStarted = false;
    private boolean headEnded = false;
    private boolean useFrameset = false;
    
    public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
        super(handler);
        this.metadata = metadata;
    }

    /**
     * Starts an XHTML document by setting up the namespace mappings 
     * when called for the first time.
     * The standard XHTML prefix is generated lazily when the first
     * element is started.
     */
    @Override
    public void startDocument() throws SAXException {
    	if(!documentStarted){
    		documentStarted = true;
            super.startDocument();
            startPrefixMapping("", XHTML);
    	}
    }

    /**
     * Generates the following XHTML prefix when called for the first time:
     * 
     * <html>
     *   <head>
     *     <title>...</title>
     *   </head>
     *   <body>
     * 
*/ private void lazyStartHead() throws SAXException { if (!headStarted) { headStarted = true; // Call directly, so we don't go through our startElement(), which will // ignore these elements. super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES); newline(); super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES); newline(); } } /** * Generates the following XHTML prefix when called for the first time: *
     * <html>
     *   <head>
     *     <title>...</title>
     *   </head>
     *   <body> (or <frameset>
     * 
*/ private void lazyEndHead(boolean isFrameset) throws SAXException { lazyStartHead(); if (!headEnded) { headEnded = true; useFrameset = isFrameset; // TIKA-478: Emit all metadata values (other than title). We have to call // startElement() and characters() directly to avoid recursive problems. for (String name : metadata.names()) { if (name.equals("title")) { continue; } for (String value : metadata.getValues(name)) { // Putting null values into attributes causes problems, but is // allowed by Metadata, so guard against that. if (value != null) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "name", "name", "CDATA", name); attributes.addAttribute("", "content", "content", "CDATA", value); super.startElement(XHTML, "meta", "meta", attributes); super.endElement(XHTML, "meta", "meta"); newline(); } } } super.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES); String title = metadata.get(TikaCoreProperties.TITLE); if (title != null && title.length() > 0) { char[] titleChars = title.toCharArray(); super.characters(titleChars, 0, titleChars.length); } else { // TIKA-725: Prefer over super.characters(new char[0], 0, 0); } super.endElement(XHTML, "title", "title"); newline(); super.endElement(XHTML, "head", "head"); newline(); if (useFrameset) { super.startElement(XHTML, "frameset", "frameset", EMPTY_ATTRIBUTES); } else { super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES); } } } /** * Ends the XHTML document by writing the following footer and * clearing the namespace mappings: * <pre> * </body> * </html> * </pre> */ @Override public void endDocument() throws SAXException { lazyEndHead(useFrameset); if (useFrameset) { super.endElement(XHTML, "frameset", "frameset"); } else { super.endElement(XHTML, "body", "body"); } super.endElement(XHTML, "html", "html"); endPrefixMapping(""); super.endDocument(); } /** * Starts the given element. Table cells and list items are automatically * indented by emitting a tab character as ignorable whitespace. */ @Override public void startElement( String uri, String local, String name, Attributes attributes) throws SAXException { if (name.equals("frameset")) { lazyEndHead(true); } else if (!AUTO.contains(name)) { if (HEAD.contains(name)) { lazyStartHead(); } else { lazyEndHead(false); } if (XHTML.equals(uri) && INDENT.contains(name)) { ignorableWhitespace(TAB, 0, TAB.length); } super.startElement(uri, local, name, attributes); } } /** * Ends the given element. Block elements are automatically followed * by a newline character. */ @Override public void endElement(String uri, String local, String name) throws SAXException { if (!AUTO.contains(name)) { super.endElement(uri, local, name); if (XHTML.equals(uri) && ENDLINE.contains(name)) { newline(); } } } /** * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> */ @Override public void characters(char[] ch, int start, int length) throws SAXException { lazyEndHead(useFrameset); super.characters(ch, start, length); } //------------------------------------------< public convenience methods > public void startElement(String name) throws SAXException { startElement(XHTML, name, name, EMPTY_ATTRIBUTES); } public void startElement(String name, String attribute, String value) throws SAXException { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", attribute, attribute, "CDATA", value); startElement(XHTML, name, name, attributes); } public void startElement(String name, AttributesImpl attributes) throws SAXException { startElement(XHTML, name, name, attributes); } public void endElement(String name) throws SAXException { endElement(XHTML, name, name); } public void characters(String characters) throws SAXException { if (characters != null && characters.length() > 0) { characters(characters.toCharArray(), 0, characters.length()); } } public void newline() throws SAXException { ignorableWhitespace(NL, 0, NL.length); } /** * Emits an XHTML element with the given text content. If the given * text value is null or empty, then the element is not written. * * @param name XHTML element name * @param value element value, possibly <code>null</code> * @throws SAXException if the content element could not be written */ public void element(String name, String value) throws SAXException { if (value != null && value.length() > 0) { startElement(name); characters(value); endElement(name); } } } </code></pre> <br/> <br/> <div id="right-banner"> </div> <div id="left-banner"> </div> <div class='clear'></div> <aside class="related-items"> <section> <div class="panel panel-primary"> <div class="panel-heading margin-bottom">Related Artifacts</div> <div class=""> <a title='This artifact is from the group mysql' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/mysql/mysql-connector-java' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> mysql-connector-java <small class='group-info' >mysql</small></a><br/><a title='This artifact is from the group com.github.codedrinker' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.github.codedrinker/facebook-messenger' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> facebook-messenger <small class='group-info' >com.github.codedrinker</small></a><br/><a title='This artifact is from the group org.seleniumhq.selenium' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.seleniumhq.selenium/selenium-java' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> selenium-java <small class='group-info' >org.seleniumhq.selenium</small></a><br/><a title='This artifact is from the group com.github.sola92' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.github.sola92/instagram-java' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> instagram-java <small class='group-info' >com.github.sola92</small></a><br/><a title='This artifact is from the group com.google.code.gson' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.google.code.gson/gson' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> gson <small class='group-info' >com.google.code.gson</small></a><br/><a title='This artifact is from the group org.apache.poi' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.poi/poi' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> poi <small class='group-info' >org.apache.poi</small></a><br/><a title='This artifact is from the group org.apache.httpcomponents' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.httpcomponents/httpclient' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> httpclient <small class='group-info' >org.apache.httpcomponents</small></a><br/><a title='This artifact is from the group org.json' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.json/json' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> json <small class='group-info' >org.json</small></a><br/><a title='This artifact is from the group com.google.code.facebook-java-api' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.google.code.facebook-java-api/facebook-java-api' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> facebook-java-api <small class='group-info' >com.google.code.facebook-java-api</small></a><br/><a title='This artifact is from the group org.apache.poi' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.poi/poi-ooxml' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> poi-ooxml <small class='group-info' >org.apache.poi</small></a><br/><a title='This artifact is from the group com.fasterxml.jackson.core' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.fasterxml.jackson.core/jackson-databind' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> jackson-databind <small class='group-info' >com.fasterxml.jackson.core</small></a><br/><a title='This artifact is from the group junit' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/junit/junit' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> junit <small class='group-info' >junit</small></a><br/><a title='This artifact is from the group org.primefaces' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.primefaces/primefaces' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> primefaces <small class='group-info' >org.primefaces</small></a><br/><a title='This artifact is from the group com.github.noraui' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.github.noraui/ojdbc7' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> ojdbc7 <small class='group-info' >com.github.noraui</small></a><br/><a title='This artifact is from the group com.jfoenix' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.jfoenix/jfoenix' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> jfoenix <small class='group-info' >com.jfoenix</small></a><br/><a title='This artifact is from the group org.testng' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.testng/testng' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> testng <small class='group-info' >org.testng</small></a><br/><a title='This artifact is from the group com.googlecode.json-simple' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.googlecode.json-simple/json-simple' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> json-simple <small class='group-info' >com.googlecode.json-simple</small></a><br/><a title='This artifact is from the group org.seleniumhq.selenium' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.seleniumhq.selenium/selenium-server' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> selenium-server <small class='group-info' >org.seleniumhq.selenium</small></a><br/><a title='This artifact is from the group com.itextpdf' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.itextpdf/itextpdf' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> itextpdf <small class='group-info' >com.itextpdf</small></a><br/><a title='This artifact is from the group org.springframework' class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.springframework/spring-core' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> spring-core <small class='group-info' >org.springframework</small></a><br/> </div> </div> </section> <section> <div class="panel panel-primary"> <div class="panel-heading margin-bottom">Related Groups</div> <div class=""> <a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.springframework' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.springframework</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.poi' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.apache.poi</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.hibernate' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.hibernate</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.springframework.boot' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.springframework.boot</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.fasterxml.jackson.core' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> com.fasterxml.jackson.core</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.itextpdf' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> com.itextpdf</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.seleniumhq.selenium' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.seleniumhq.selenium</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/mysql' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> mysql</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.finos.legend.engine' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.finos.legend.engine</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.httpcomponents' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.apache.httpcomponents</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.logging.log4j' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.apache.logging.log4j</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.openjfx' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.openjfx</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.apache.commons' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.apache.commons</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/org.json' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> org.json</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.google.guava' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> com.google.guava</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.google.zxing' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> com.google.zxing</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/net.sf.jasperreports' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> net.sf.jasperreports</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/javax.xml.bind' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> javax.xml.bind</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/ojdbc' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> ojdbc</a><br/><a class='btn btn-default btn-xs small-margin-bottom ellipsis sidebar-btn' href='/artifacts/com.google.code.facebook-java-api' ><i class="fa fa-arrow-circle-right" aria-hidden="true"></i> com.google.code.facebook-java-api</a><br/> </div> </div> </section> </aside> <div class='clear'></div> </main> </div> <br/><br/> <div class="align-center">© 2015 - 2024 <a href="/legal-notice.php">Weber Informatics LLC</a> | <a href="/data-protection.php">Privacy Policy</a></div> <br/><br/><br/><br/><br/><br/> </body> </html>