org.apache.tika.sax.XHTMLContentHandler Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-core Show documentation
Show all versions of tika-core Show documentation
This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also
includes the core facades for the Tika API.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
* Content handler decorator that simplifies the task of producing XHTML
* events for Tika content parsers.
*/
public class XHTMLContentHandler extends SafeContentHandler {
/**
* The XHTML namespace URI
*/
public static final String XHTML = "http://www.w3.org/1999/xhtml";
/**
* The newline character that gets inserted after block elements.
*/
private static final char[] NL = new char[] { '\n' };
/**
* The tab character gets inserted before table cells and list items.
*/
private static final char[] TAB = new char[] { '\t' };
/**
* The elements that are in the section.
*/
private static final Set HEAD =
unmodifiableSet("title", "link", "base", "meta", "script");
/**
* The elements that are automatically emitted by lazyStartHead, so
* skip them if they get sent to startElement/endElement by mistake.
*/
private static final Set AUTO =
unmodifiableSet("head", "frameset");
/**
* The elements that get prepended with the {@link #TAB} character.
*/
private static final Set INDENT =
unmodifiableSet("li", "dd", "dt", "td", "th", "frame");
/**
* The elements that get appended with the {@link #NL} character.
*/
public static final Set ENDLINE = unmodifiableSet(
"p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
"pre", "hr", "blockquote", "address", "fieldset", "table", "form",
"noscript", "li", "dt", "dd", "noframes", "br", "tr", "select",
"option", "link", "script");
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
private static Set unmodifiableSet(String... elements) {
return Collections.unmodifiableSet(
new HashSet(Arrays.asList(elements)));
}
/**
* Metadata associated with the document. Used to fill in the
* <head/> section.
*/
private final Metadata metadata;
/**
* Flag to indicate whether the document has been started.
*/
private boolean documentStarted = false;
/**
* Flags to indicate whether the document head element has been started/ended.
*/
private boolean headStarted = false;
private boolean headEnded = false;
private boolean useFrameset = false;
public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
super(handler);
this.metadata = metadata;
}
/**
* Starts an XHTML document by setting up the namespace mappings
* when called for the first time.
* The standard XHTML prefix is generated lazily when the first
* element is started.
*/
@Override
public void startDocument() throws SAXException {
if(!documentStarted){
documentStarted = true;
super.startDocument();
startPrefixMapping("", XHTML);
}
}
/**
* Generates the following XHTML prefix when called for the first time:
*
* <html>
* <head>
* <title>...</title>
* </head>
* <body>
*
*/
private void lazyStartHead() throws SAXException {
if (!headStarted) {
headStarted = true;
// Call directly, so we don't go through our startElement(), which will
// ignore these elements.
super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
newline();
super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
newline();
}
}
/**
* Generates the following XHTML prefix when called for the first time:
*
* <html>
* <head>
* <title>...</title>
* </head>
* <body> (or <frameset>
*
*/
private void lazyEndHead(boolean isFrameset) throws SAXException {
lazyStartHead();
if (!headEnded) {
headEnded = true;
useFrameset = isFrameset;
// TIKA-478: Emit all metadata values (other than title). We have to call
// startElement() and characters() directly to avoid recursive problems.
for (String name : metadata.names()) {
if (name.equals("title")) {
continue;
}
for (String value : metadata.getValues(name)) {
// Putting null values into attributes causes problems, but is
// allowed by Metadata, so guard against that.
if (value != null) {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "name", "name", "CDATA", name);
attributes.addAttribute("", "content", "content", "CDATA", value);
super.startElement(XHTML, "meta", "meta", attributes);
super.endElement(XHTML, "meta", "meta");
newline();
}
}
}
super.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES);
String title = metadata.get(TikaCoreProperties.TITLE);
if (title != null && title.length() > 0) {
char[] titleChars = title.toCharArray();
super.characters(titleChars, 0, titleChars.length);
} else {
// TIKA-725: Prefer over
super.characters(new char[0], 0, 0);
}
super.endElement(XHTML, "title", "title");
newline();
super.endElement(XHTML, "head", "head");
newline();
if (useFrameset) {
super.startElement(XHTML, "frameset", "frameset", EMPTY_ATTRIBUTES);
} else {
super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
}
}
}
/**
* Ends the XHTML document by writing the following footer and
* clearing the namespace mappings:
*
* </body>
* </html>
*
*/
@Override
public void endDocument() throws SAXException {
lazyEndHead(useFrameset);
if (useFrameset) {
super.endElement(XHTML, "frameset", "frameset");
} else {
super.endElement(XHTML, "body", "body");
}
super.endElement(XHTML, "html", "html");
endPrefixMapping("");
super.endDocument();
}
/**
* Starts the given element. Table cells and list items are automatically
* indented by emitting a tab character as ignorable whitespace.
*/
@Override
public void startElement(
String uri, String local, String name, Attributes attributes)
throws SAXException {
if (name.equals("frameset")) {
lazyEndHead(true);
} else if (!AUTO.contains(name)) {
if (HEAD.contains(name)) {
lazyStartHead();
} else {
lazyEndHead(false);
}
if (XHTML.equals(uri) && INDENT.contains(name)) {
ignorableWhitespace(TAB, 0, TAB.length);
}
super.startElement(uri, local, name, attributes);
}
}
/**
* Ends the given element. Block elements are automatically followed
* by a newline character.
*/
@Override
public void endElement(String uri, String local, String name) throws SAXException {
if (!AUTO.contains(name)) {
super.endElement(uri, local, name);
if (XHTML.equals(uri) && ENDLINE.contains(name)) {
newline();
}
}
}
/**
* @see TIKA-210
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
lazyEndHead(useFrameset);
super.characters(ch, start, length);
}
//------------------------------------------< public convenience methods >
public void startElement(String name) throws SAXException {
startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
}
public void startElement(String name, String attribute, String value)
throws SAXException {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", attribute, attribute, "CDATA", value);
startElement(XHTML, name, name, attributes);
}
public void startElement(String name, AttributesImpl attributes)
throws SAXException {
startElement(XHTML, name, name, attributes);
}
public void endElement(String name) throws SAXException {
endElement(XHTML, name, name);
}
public void characters(String characters) throws SAXException {
if (characters != null && characters.length() > 0) {
characters(characters.toCharArray(), 0, characters.length());
}
}
public void newline() throws SAXException {
ignorableWhitespace(NL, 0, NL.length);
}
/**
* Emits an XHTML element with the given text content. If the given
* text value is null or empty, then the element is not written.
*
* @param name XHTML element name
* @param value element value, possibly null
* @throws SAXException if the content element could not be written
*/
public void element(String name, String value) throws SAXException {
if (value != null && value.length() > 0) {
startElement(name);
characters(value);
endElement(name);
}
}
}