org.apache.tika.parser.html.HtmlMapper Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;
/**
* HTML mapper used to make incoming HTML documents easier to handle by
* Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
* the parse context and uses it to map parsed HTML to "safe" XHTML. A client
* that wants to customize this mapping can place a custom HtmlMapper instance
* into the parse context.
*
* @since Apache Tika 0.6
*/
public interface HtmlMapper {
/**
* Maps "safe" HTML element names to semantic XHTML equivalents. If the
* given element is unknown or deemed unsafe for inclusion in the parse
* output, then this method returns null
and the element
* will be ignored but the content inside it is still processed. See
* the {@link #isDiscardElement(String)} method for a way to discard
* the entire contents of an element.
*
* @param name HTML element name (upper case)
* @return XHTML element name (lower case), or
* null
if the element is unsafe
*/
String mapSafeElement(String name);
/**
* Checks whether all content within the given HTML element should be
* discarded instead of including it in the parse output.
*
* @param name HTML element name (upper case)
* @return true
if content inside the named element
* should be ignored, false
otherwise
*/
boolean isDiscardElement(String name);
/**
* Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
* given attribute is unknown or deemed unsafe for inclusion in the parse
* output, then this method returns null
and the attribute
* will be ignored. This method assumes that the element name
* is valid and normalised.
*
* @param elementName HTML element name (lower case)
* @param attributeName HTML attribute name (lower case)
* @return XHTML attribute name (lower case), or
* null
if the element is unsafe
*/
String mapSafeAttribute(String elementName, String attributeName);
}