org.apache.tika.parser.html.HtmlParser Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
* and post-processes the events to produce XHTML and metadata expected by
* Tika clients.
*/
public class HtmlParser extends AbstractEncodingDetectorParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = 7895315240498733128L;
private static final Logger LOG = LoggerFactory.getLogger(HtmlParser.class);
private static final MediaType XHTML = MediaType.application("xhtml+xml");
private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
private static final MediaType X_ASP = MediaType.application("x-asp");
private static final Set SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
MediaType.text("html"),
XHTML,
WAP_XHTML,
X_ASP)));
/**
* HTML schema singleton used to amortise the heavy instantiation time.
*/
private static final Schema HTML_SCHEMA = new HTMLSchema();
@Field
private boolean extractScripts = false;
public Set getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public HtmlParser() {
super();
}
public HtmlParser(EncodingDetector encodingDetector) {
super(encodingDetector);
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Automatically detect the character encoding
try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
metadata, getEncodingDetector(context))) {
Charset charset = reader.getCharset();
String previous = metadata.get(Metadata.CONTENT_TYPE);
MediaType contentType = null;
if (previous == null || previous.startsWith("text/html")) {
contentType = new MediaType(MediaType.TEXT_HTML, charset);
} else if (previous.startsWith("application/xhtml+xml")) {
contentType = new MediaType(XHTML, charset);
} else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
contentType = new MediaType(WAP_XHTML, charset);
} else if (previous.startsWith("application/x-asp")) {
contentType = new MediaType(X_ASP, charset);
}
if (contentType != null) {
metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
}
// deprecated, see TIKA-431
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
// Get the HTML mapper from the parse context
HtmlMapper mapper =
context.get(HtmlMapper.class, new HtmlParserMapper());
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
new org.ccil.cowan.tagsoup.Parser();
// Use schema from context or default
Schema schema = context.get(Schema.class, HTML_SCHEMA);
// TIKA-528: Reuse share schema to avoid heavy instantiation
parser.setProperty(
org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
// TIKA-599: Shared schema is thread-safe only if bogons are ignored
parser.setFeature(
org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
parser.setContentHandler(new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata, context, extractScripts)));
parser.parse(reader.asInputSource());
}
}
/**
* Maps "safe" HTML element names to semantic XHTML equivalents. If the
* given element is unknown or deemed unsafe for inclusion in the parse
* output, then this method returns null
and the element
* will be ignored but the content inside it is still processed. See
* the {@link #isDiscardElement(String)} method for a way to discard
* the entire contents of an element.
*
* Subclasses can override this method to customize the default mapping.
*
* @param name HTML element name (upper case)
* @return XHTML element name (lower case), or
* null
if the element is unsafe
* @since Apache Tika 0.5
* @deprecated Use the {@link HtmlMapper} mechanism to customize
* the HTML mapping. This method will be removed in Tika 1.0.
*/
protected String mapSafeElement(String name) {
return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
}
/**
* Checks whether all content within the given HTML element should be
* discarded instead of including it in the parse output. Subclasses
* can override this method to customize the set of discarded elements.
*
* @param name HTML element name (upper case)
* @return true
if content inside the named element
* should be ignored, false
otherwise
* @since Apache Tika 0.5
* @deprecated Use the {@link HtmlMapper} mechanism to customize
* the HTML mapping. This method will be removed in Tika 1.0.
*/
protected boolean isDiscardElement(String name) {
return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
}
/**
* @deprecated Use the {@link HtmlMapper} mechanism to customize
* the HTML mapping. This method will be removed in Tika 1.0.
*/
public String mapSafeAttribute(String elementName, String attributeName) {
return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
}
/**
* Adapter class that maintains backwards compatibility with the
* protected HtmlParser methods. Making HtmlParser implement HtmlMapper
* directly would require those methods to be public, which would break
* backwards compatibility with subclasses.
*
* @deprecated Use the {@link HtmlMapper} mechanism to customize
* the HTML mapping. This class will be removed in Tika 1.0.
*/
private class HtmlParserMapper implements HtmlMapper {
public String mapSafeElement(String name) {
return HtmlParser.this.mapSafeElement(name);
}
public boolean isDiscardElement(String name) {
return HtmlParser.this.isDiscardElement(name);
}
public String mapSafeAttribute(String elementName, String attributeName) {
return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
}
}
/**
* Whether or not to extract contents in script entities.
* Default is false
*
* @param extractScripts
*/
@Field
public void setExtractScripts(boolean extractScripts) {
this.extractScripts = extractScripts;
}
public boolean getExtractScripts() {
return extractScripts;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy