All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.html.HtmlParser Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.utils.CharsetUtils;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
 * and post-processes the events to produce XHTML and metadata expected by
 * Tika clients.
 */
public class HtmlParser implements Parser {

    private static final Set SUPPORTED_TYPES =
        Collections.unmodifiableSet(new HashSet(Arrays.asList(
                MediaType.text("html"),
                MediaType.application("xhtml+xml"),
                MediaType.application("vnd.wap.xhtml+xml"),
                MediaType.application("x-asp"))));

    // Use the widest, most common charset as our default.
    private static final String DEFAULT_CHARSET = "windows-1252";
    // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
    private static final int META_TAG_BUFFER_SIZE = 8192;
    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
                    "(?is) getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * TIKA-332: Check for meta http-equiv tag with charset info in
     * HTML content.
     * 

* TODO: Move this into core, along with CharsetDetector */ private String getEncoding(InputStream stream, Metadata metadata) throws IOException { stream.mark(META_TAG_BUFFER_SIZE); char[] buffer = new char[META_TAG_BUFFER_SIZE]; InputStreamReader isr = new InputStreamReader(stream, "us-ascii"); int bufferSize = isr.read(buffer); stream.reset(); if (bufferSize != -1) { String metaString = new String(buffer, 0, bufferSize); Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString); if (m.find()) { // TIKA-349: flexible handling of attributes // We have one or more x or x=y attributes, separated by ';' String[] attrs = m.group(1).split(";"); for (String attr : attrs) { String[] keyValue = attr.trim().split("="); if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) { // TIKA-459: improve charset handling. String charset = CharsetUtils.clean(keyValue[1]); if (CharsetUtils.isSupported(charset)) { metadata.set(Metadata.CONTENT_ENCODING, charset); return charset; } } } } } // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding // hint, or the passed content-type hint. CharsetDetector detector = new CharsetDetector(); String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); String incomingType = metadata.get(Metadata.CONTENT_TYPE); if (incomingCharset == null && incomingType != null) { // TIKA-341: Use charset in content-type MediaType mt = MediaType.parse(incomingType); if (mt != null) { String charset = mt.getParameters().get("charset"); if ((charset != null) && Charset.isSupported(charset)) { incomingCharset = charset; } } } if (incomingCharset != null) { detector.setDeclaredEncoding(incomingCharset); } // TIKA-341 without enabling input filtering (stripping of tags) the // short HTML tests don't work well. 
detector.enableInputFilter(true); detector.setText(stream); for (CharsetMatch match : detector.detectAll()) { if (Charset.isSupported(match.getName())) { metadata.set(Metadata.CONTENT_ENCODING, match.getName()); // TIKA-339: Don't set language, as it's typically not a very good // guess, and it can create ambiguity if another (better) language // value is specified by a meta tag in the HTML (or via HTTP response // header). /* String language = match.getLanguage(); if (language != null) { metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); metadata.set(Metadata.LANGUAGE, match.getLanguage()); } */ break; } } String encoding = metadata.get(Metadata.CONTENT_ENCODING); if (encoding == null) { if (Charset.isSupported(DEFAULT_CHARSET)) { encoding = DEFAULT_CHARSET; } else { encoding = Charset.defaultCharset().name(); } metadata.set(Metadata.CONTENT_ENCODING, encoding); } return encoding; } public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // The getEncoding() method depends on the mark feature if (!stream.markSupported()) { stream = new BufferedInputStream(stream); } // Protect the stream from being closed by CyberNeko // TODO: Is this still needed, given our use of TagSoup? 
stream = new CloseShieldInputStream(stream); // Prepare the input source using the encoding hint if available InputSource source = new InputSource(stream); source.setEncoding(getEncoding(stream, metadata)); // Get the HTML mapper from the parse context HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper()); // Parse the HTML document org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); // Instantiating HTMLSchema is heavy, therefore reuse a cached instance parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA); parser.setContentHandler(new XHTMLDowngradeHandler( new HtmlHandler(mapper, handler, metadata))); parser.parse(source); } /** * @deprecated This method will be removed in Apache Tika 1.0. */ public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { parse(stream, handler, metadata, new ParseContext()); } /** * Maps "safe" HTML element names to semantic XHTML equivalents. If the * given element is unknown or deemed unsafe for inclusion in the parse * output, then this method returns null and the element * will be ignored but the content inside it is still processed. See * the {@link #isDiscardElement(String)} method for a way to discard * the entire contents of an element. *

* Subclasses can override this method to customize the default mapping. * * @deprecated Use the {@link HtmlMapper} mechanism to customize * the HTML mapping. This method will be removed in Tika 1.0. * @since Apache Tika 0.5 * @param name HTML element name (upper case) * @return XHTML element name (lower case), or * null if the element is unsafe */ protected String mapSafeElement(String name) { return DefaultHtmlMapper.INSTANCE.mapSafeElement(name); } /** * Checks whether all content within the given HTML element should be * discarded instead of including it in the parse output. Subclasses * can override this method to customize the set of discarded elements. * * @deprecated Use the {@link HtmlMapper} mechanism to customize * the HTML mapping. This method will be removed in Tika 1.0. * @since Apache Tika 0.5 * @param name HTML element name (upper case) * @return true if content inside the named element * should be ignored, false otherwise */ protected boolean isDiscardElement(String name) { return DefaultHtmlMapper.INSTANCE.isDiscardElement(name); } /** * @deprecated Use the {@link HtmlMapper} mechanism to customize * the HTML mapping. This method will be removed in Tika 1.0. **/ public String mapSafeAttribute(String elementName, String attributeName) { return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,attributeName) ; } /** * Adapter class that maintains backwards compatibility with the * protected HtmlParser methods. Making HtmlParser implement HtmlMapper * directly would require those methods to be public, which would break * backwards compatibility with subclasses. * * @deprecated Use the {@link HtmlMapper} mechanism to customize * the HTML mapping. This class will be removed in Tika 1.0. 
*/ private class HtmlParserMapper implements HtmlMapper { public String mapSafeElement(String name) { return HtmlParser.this.mapSafeElement(name); } public boolean isDiscardElement(String name) { return HtmlParser.this.isDiscardElement(name); } public String mapSafeAttribute(String elementName, String attributeName){ return HtmlParser.this.mapSafeAttribute(elementName,attributeName); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy