All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.html.DefaultHtmlMapper Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * The default HTML mapping rules in Tika.
 *
 * @since Apache Tika 0.6
 */
@SuppressWarnings("serial")
public class DefaultHtmlMapper implements HtmlMapper {

    /**
     * @since Apache Tika 0.8
     */
    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
    private static final Map SAFE_ELEMENTS = new HashMap() {{
            put("H1", "h1");
            put("H2", "h2");
            put("H3", "h3");
            put("H4", "h4");
            put("H5", "h5");
            put("H6", "h6");

            put("P", "p");
            put("PRE", "pre");
            put("BLOCKQUOTE", "blockquote");
            put("Q", "q");

            put("UL", "ul");
            put("OL", "ol");
            put("MENU", "ul");
            put("LI", "li");
            put("DL", "dl");
            put("DT", "dt");
            put("DD", "dd");

            put("TABLE", "table");
            put("THEAD", "thead");
            put("TBODY", "tbody");
            put("TR", "tr");
            put("TH", "th");
            put("TD", "td");

            put("ADDRESS", "address");

            // TIKA-460 - add anchors
            put("A", "a");

            // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
            put("MAP", "map");
            put("AREA", "area");
            put("IMG", "img");
            put("FRAMESET", "frameset");
            put("FRAME", "frame");
            put("IFRAME", "iframe");
            put("OBJECT", "object");
            put("PARAM", "param");
            put("INS", "ins");
            put("DEL", "del");
        }};
    private static final Set DISCARDABLE_ELEMENTS = new HashSet() {{
            add("STYLE");
            add("SCRIPT");
        }};
    // For information on tags & attributes, see:
    // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
    // http://www.w3schools.com/TAGS/
    private static final Map> SAFE_ATTRIBUTES =
            new HashMap>() {{
                    put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev",
                            "shape", "coords"));
                    put("img", attrSet("src", "alt",  "longdesc", "height", "width", "usemap",
                            "ismap"));
                    put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth",
                            "marginheight", "noresize", "scrolling"));
                    put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth",
                            "marginheight", "scrolling", "align", "height", "width"));
                    put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
                    put("map", attrSet("id", "class", "style", "title", "name"));
                    put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
                    put("object", attrSet("declare", "classid", "codebase", "data", "type",
                            "codetype", "archive", "standby", "height", "width",
                            "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
                    put("param", attrSet("id", "name", "value", "valuetype", "type"));
                    put("blockquote", attrSet("cite"));
                    put("ins", attrSet("cite", "datetime"));
                    put("del", attrSet("cite", "datetime"));
                    put("q", attrSet("cite"));

                // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
                }};

    private static Set attrSet(String... attrs) {
        return new HashSet<>(Arrays.asList(attrs));
    }

    public String mapSafeElement(String name) {
        return SAFE_ELEMENTS.get(name);
    }

    /**
     * Normalizes an attribute name. Assumes that the element name
     * is valid and normalized
     */
    public String mapSafeAttribute(String elementName, String attributeName) {
        Set safeAttrs = SAFE_ATTRIBUTES.get(elementName);
        if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
            return attributeName;
        } else {
            return null;
        }
    }

    public boolean isDiscardElement(String name) {
        return DISCARDABLE_ELEMENTS.contains(name);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy