// org.apache.xml.serialize.HTMLdtd Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Aug 21, 2000:
// Fixed bug in isElement and made HTMLdtd public.
// Contributed by Eric SCHAEFFER
package org.apache.xml.serialize;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Hashtable;
import java.util.Locale;
import org.apache.xerces.dom.DOMMessageFormatter;
/**
 * Utility class for accessing information specific to HTML documents.
 * The HTML DTD is expressed as three groups of static utility methods:
 * <ul>
 * <li>element predicates ({@link #isEmptyTag}, {@link #isElementContent},
 *     {@link #isPreserveSpace}, {@link #isOptionalClosing},
 *     {@link #isOnlyOpening}, {@link #isClosing});</li>
 * <li>attribute predicates ({@link #isURI}, {@link #isBoolean});</li>
 * <li>character reference lookups that translate from name to value
 *     ({@link #charFromName}) and from value to name ({@link #fromChar}).</li>
 * </ul>
 *
 * The element table and the boolean attribute table are built, and the
 * entities resource is loaded into memory, by the static initializer of
 * this class, so all tables are populated before any method can be called.
 *
 * @deprecated This class was deprecated in Xerces 2.9.0. It is recommended
 * that new applications use JAXP's Transformation API for XML (TrAX) for
 * serializing HTML. See the Xerces documentation for more information.
 * @version $Revision: 699902 $ $Date: 2008-09-28 14:40:49 -0700 (Sun, 28 Sep 2008) $
 * @author Assaf Arkin
 */
public final class HTMLdtd
{


    /**
     * Public identifier for HTML 4.01 (Strict) document type.
     */
    public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";

    /**
     * System identifier for HTML 4.01 (Strict) document type.
     */
    public static final String HTMLSystemId =
        "http://www.w3.org/TR/html4/strict.dtd";

    /**
     * Public identifier for XHTML 1.0 (Strict) document type.
     */
    public static final String XHTMLPublicId =
        "-//W3C//DTD XHTML 1.0 Strict//EN";

    /**
     * System identifier for XHTML 1.0 (Strict) document type.
     */
    public static final String XHTMLSystemId =
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";


    /**
     * Table of reverse character reference mapping. Keys are character
     * codes held as {@link Integer} objects, values are the reference
     * names (strings). Remains null until {@link #initialize} has loaded
     * the entities resource.
     */
    private static Hashtable _byChar;


    /**
     * Table of character reference name to value mapping. Keys are the
     * reference names (strings), values are character codes held as
     * {@link Integer} objects. Also serves as the "already initialized"
     * marker for {@link #initialize}.
     */
    private static Hashtable _byName;


    /**
     * Table of boolean attributes. Keys are upper case element names,
     * values are <code>String[]</code> arrays listing the attribute names
     * that are boolean for that element.
     */
    private static final Hashtable _boolAttrs = new Hashtable();


    /**
     * Holds element definitions. Keys are upper case element names, values
     * are {@link Integer} objects holding the OR-ed element flags.
     */
    private static final Hashtable _elemDefs = new Hashtable();


    /**
     * Locates the HTML entities file that is loaded upon initialization.
     * This file is a resource loaded with the default class loader.
     */
    private static final String ENTITIES_RESOURCE = "HTMLEntities.res";


    /**
     * Only opening tag should be printed.
     */
    private static final int ONLY_OPENING = 0x0001;

    /**
     * Element contains element content only.
     */
    private static final int ELEM_CONTENT = 0x0002;

    /**
     * Element preserve spaces.
     */
    private static final int PRESERVE     = 0x0004;

    /**
     * Optional closing tag.
     */
    private static final int OPT_CLOSING  = 0x0008;

    /**
     * Element is empty (also means only opening tag)
     */
    private static final int EMPTY        = 0x0010 | ONLY_OPENING;

    /**
     * Allowed to appear in head.
     */
    private static final int ALLOWED_HEAD = 0x0020;

    /**
     * When opened, closes P.
     */
    private static final int CLOSE_P      = 0x0040;

    /**
     * When opened, closes DD or DT.
     */
    private static final int CLOSE_DD_DT  = 0x0080;

    /**
     * When opened, closes itself.
     */
    private static final int CLOSE_SELF   = 0x0100;

    /**
     * When opened, closes another table section.
     */
    private static final int CLOSE_TABLE  = 0x0200;

    /**
     * When opened, closes TH or TD. Uses the next free bit after
     * {@link #CLOSE_TABLE} so the flags form a contiguous sequence.
     */
    private static final int CLOSE_TH_TD  = 0x0400;


    /**
     * Returns true if element is declared to be empty. HTML elements are
     * defined as empty in the DTD, not by the document syntax.
     *
     * @param tagName The element tag name (matched case-insensitively)
     * @return True if element is empty
     */
    public static boolean isEmptyTag( String tagName )
    {
        return isElement( tagName, EMPTY );
    }


    /**
     * Returns true if element is declared to have element content.
     * Whitespaces appearing inside element content will be ignored,
     * other text will simply report an error.
     *
     * @param tagName The element tag name (matched case-insensitively)
     * @return True if element content
     */
    public static boolean isElementContent( String tagName )
    {
        return isElement( tagName, ELEM_CONTENT );
    }


    /**
     * Returns true if element's textual contents preserves spaces.
     * In the element table this is set for PRE, TEXTAREA, SCRIPT,
     * NOSCRIPT and STYLE; all other HTML elements do not preserve space.
     *
     * @param tagName The element tag name (matched case-insensitively)
     * @return True if element's text content preserves spaces
     */
    public static boolean isPreserveSpace( String tagName )
    {
        return isElement( tagName, PRESERVE );
    }


    /**
     * Returns true if element's closing tag is optional and need not
     * exist. An error will not be reported for such elements if they
     * are not closed. For example, <tt>LI</tt> is most often not closed.
     *
     * @param tagName The element tag name (matched case-insensitively)
     * @return True if closing tag implied
     */
    public static boolean isOptionalClosing( String tagName )
    {
        return isElement( tagName, OPT_CLOSING );
    }


    /**
     * Returns true if element's closing tag is generally not printed.
     * For example, <tt>LI</tt> should not print the closing tag.
     *
     * @param tagName The element tag name (matched case-insensitively)
     * @return True if only opening tag should be printed
     */
    public static boolean isOnlyOpening( String tagName )
    {
        return isElement( tagName, ONLY_OPENING );
    }


    /**
     * Returns true if the opening of one element (<tt>tagName</tt>) implies
     * the closing of another open element (<tt>openTag</tt>). For example,
     * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
     * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
     *
     * @param tagName The newly opened element
     * @param openTag The already opened element
     * @return True if closing tag closes opening tag
     */
    public static boolean isClosing( String tagName, String openTag )
    {
        // HEAD is closed by every element that is not explicitly allowed
        // inside it (this includes elements missing from the table).
        if ( openTag.equalsIgnoreCase( "HEAD" ) ) {
            return ! isElement( tagName, ALLOWED_HEAD );
        }
        // P is closed by itself and by the other elements flagged CLOSE_P
        if ( openTag.equalsIgnoreCase( "P" ) ) {
            return isElement( tagName, CLOSE_P );
        }
        // DT closes DD, DD closes DT
        if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) ) {
            return isElement( tagName, CLOSE_DD_DT );
        }
        // LI and OPTION close themselves
        if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) ) {
            return isElement( tagName, CLOSE_SELF );
        }
        // Each of these table sections closes all the others
        if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
             openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
             openTag.equalsIgnoreCase( "COLGROUP" ) ) {
            return isElement( tagName, CLOSE_TABLE );
        }
        // TD closes TH and TH closes TD
        if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) ) {
            return isElement( tagName, CLOSE_TH_TD );
        }
        return false;
    }


    /**
     * Returns true if the specified attribute is a URI and should be
     * escaped appropriately. In HTML URIs are escaped differently
     * than normal attributes. Only the attribute name is examined
     * (<tt>href</tt> and <tt>src</tt>, case-insensitively); the tag
     * name is not consulted.
     *
     * @param tagName The element's tag name (currently unused)
     * @param attrName The attribute's name
     * @return True if the attribute is named <tt>href</tt> or <tt>src</tt>
     */
    public static boolean isURI( String tagName, String attrName )
    {
        return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
    }


    /**
     * Returns true if the specified attribute is a boolean and should be
     * printed without the value. This applies to attributes that are true
     * if they exist, such as selected (OPTION/INPUT).
     *
     * @param tagName The element's tag name (matched case-insensitively)
     * @param attrName The attribute's name (matched case-insensitively)
     * @return True if the attribute is registered as boolean for the element
     */
    public static boolean isBoolean( String tagName, String attrName )
    {
        String[] attrNames;

        attrNames = (String[]) _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
        if ( attrNames == null ) {
            return false;
        }
        for ( int i = 0 ; i < attrNames.length ; ++i ) {
            if ( attrNames[ i ].equalsIgnoreCase( attrName ) ) {
                return true;
            }
        }
        return false;
    }


    /**
     * Returns the value of an HTML character reference by its name. If the
     * reference is not found or was not defined as a character reference,
     * returns EOF (-1).
     *
     * @param name Name of character reference
     * @return Character code or EOF (-1)
     */
    public static int charFromName( String name )
    {
        Object value;

        initialize();
        value = _byName.get( name );
        // instanceof is false for null, so no separate null check is needed
        if ( value instanceof Integer ) {
            return ( (Integer) value ).intValue();
        }
        return -1;
    }


    /**
     * Returns the name of an HTML character reference based on its character
     * value. Only valid for entities defined from character references. If no
     * such character value was defined, or the value is above 0xFFFF,
     * return null.
     *
     * @param value Character value of entity
     * @return Entity's name or null
     */
    public static String fromChar(int value )
    {
        // Entities are stored by their char value, so nothing above
        // 0xFFFF can ever be present in the table.
        if ( value > 0xffff ) {
            return null;
        }
        initialize();
        return (String) _byChar.get( new Integer( value ) );
    }


    /**
     * Initialize upon first access. Will load all the HTML character references
     * into a list that is accessible by name or character value and is optimized
     * for character substitution. This method may be called any number of times
     * but will execute only once.
     *
     * If loading fails for any reason (including the resource not being
     * found), the tables are reset to null and a RuntimeException carrying
     * the "ResourceNotLoaded" message is thrown.
     *
     * @throws RuntimeException If the entities resource cannot be found,
     *   read or parsed
     */
    private static void initialize()
    {
        InputStream     is = null;
        BufferedReader  reader;
        String          line;
        boolean         loaded = false;

        // Make sure not to initialize twice.
        if ( _byName != null ) {
            return;
        }
        try {
            _byName = new Hashtable();
            _byChar = new Hashtable();
            is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
            if ( is == null ) {
                // Caught by the handler below and reported as part of the
                // "ResourceNotLoaded" message.
                throw new RuntimeException(
                    DOMMessageFormatter.formatMessage(
                        DOMMessageFormatter.SERIALIZER_DOMAIN,
                        "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
            }
            reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
            while ( ( line = reader.readLine() ) != null ) {
                // Skip blank lines and '#' comment lines
                if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
                    continue;
                }
                parseEntityLine( line );
            }
            // Closing the reader also closes the underlying stream; a
            // failure to close is reported like any other load failure.
            reader.close();
            loaded = true;
        } catch ( Exception except ) {
            throw new RuntimeException(
                DOMMessageFormatter.formatMessage(
                    DOMMessageFormatter.SERIALIZER_DOMAIN,
                    "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
        } finally {
            if ( ! loaded ) {
                // Do not leave half-filled tables behind: a non-null _byName
                // would make later calls believe loading had succeeded.
                _byName = null;
                _byChar = null;
                if ( is != null ) {
                    try {
                        is.close();
                    } catch ( Exception ignored ) {
                        // Best effort only; the original failure is already
                        // being propagated to the caller.
                    }
                }
            }
        }
    }


    /**
     * Parses one non-comment line of the entities resource and defines the
     * entity it describes. The expected layout is
     * <code>name code [anything]</code>, separated by single spaces, where
     * <code>code</code> is a decimal integer. Lines whose name is shorter
     * than two characters, or that have nothing after the first space, are
     * ignored.
     *
     * @param line The line to parse
     * @throws NumberFormatException If the code is not a decimal integer
     */
    private static void parseEntityLine( String line )
    {
        int     nameEnd;
        int     codeEnd;
        String  name;
        String  value;

        nameEnd = line.indexOf( ' ' );
        if ( nameEnd <= 1 || nameEnd + 1 >= line.length() ) {
            return;
        }
        name = line.substring( 0, nameEnd );
        value = line.substring( nameEnd + 1 );
        // Drop anything that follows the code on the same line
        codeEnd = value.indexOf( ' ' );
        if ( codeEnd > 0 ) {
            value = value.substring( 0, codeEnd );
        }
        defineEntity( name, (char) Integer.parseInt( value ) );
    }


    /**
     * Defines a new character reference. The reference's name and value are
     * supplied. Nothing happens if the character reference is already defined.
     * <P>
     * Unlike internal entities, character references are a string to single
     * character mapping. They are used to map non-ASCII characters both on
     * parsing and printing, primarily for HTML documents. '&amp;lt;' is an
     * example of a character reference.
     *
     * @param name The entity's name
     * @param value The entity's value
     */
    private static void defineEntity( String name, char value )
    {
        if ( _byName.get( name ) == null ) {
            _byName.put( name, new Integer( value ) );
            _byChar.put( new Integer( value ), name );
        }
    }


    /**
     * Registers an element and its OR-ed flags in the element table,
     * replacing any previous definition of the same name.
     *
     * @param name The element name (upper case, as looked up by isElement)
     * @param flags The OR-ed element flags
     */
    private static void defineElement( String name, int flags )
    {
        _elemDefs.put( name, new Integer( flags ) );
    }


    /**
     * Registers a single boolean attribute for an element.
     *
     * @param tagName The element name (upper case)
     * @param attrName The boolean attribute's name
     */
    private static void defineBoolean( String tagName, String attrName )
    {
        defineBoolean( tagName, new String[] { attrName } );
    }


    /**
     * Registers the boolean attributes of an element, replacing any
     * previous registration for the same element.
     *
     * @param tagName The element name (upper case)
     * @param attrNames The boolean attribute names
     */
    private static void defineBoolean( String tagName, String[] attrNames )
    {
        _boolAttrs.put( tagName, attrNames );
    }


    /**
     * Returns true if the named element is defined and has all the bits
     * of the given flag set. The name is converted to upper case using
     * the English locale before the lookup.
     *
     * @param name The element name
     * @param flag The flag (or combination of flags) to test for
     * @return True if the element is defined with every bit of the flag
     */
    private static boolean isElement( String name, int flag )
    {
        Integer flags;

        flags = (Integer) _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
        if ( flags == null ) {
            return false;
        }
        return ( ( flags.intValue() & flag ) == flag );
    }


    static
    {
        defineElement( "ADDRESS", CLOSE_P );
        defineElement( "AREA", EMPTY );
        defineElement( "BASE",  EMPTY | ALLOWED_HEAD );
        defineElement( "BASEFONT", EMPTY );
        defineElement( "BLOCKQUOTE", CLOSE_P );
        defineElement( "BODY", OPT_CLOSING );
        defineElement( "BR", EMPTY );
        defineElement( "COL", EMPTY );
        defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
        defineElement( "DIV", CLOSE_P );
        defineElement( "DL", ELEM_CONTENT | CLOSE_P );
        defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
        defineElement( "FIELDSET", CLOSE_P );
        defineElement( "FORM", CLOSE_P );
        defineElement( "FRAME", EMPTY | OPT_CLOSING );
        defineElement( "H1", CLOSE_P );
        defineElement( "H2", CLOSE_P );
        defineElement( "H3", CLOSE_P );
        defineElement( "H4", CLOSE_P );
        defineElement( "H5", CLOSE_P );
        defineElement( "H6", CLOSE_P );
        defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
        defineElement( "HR", EMPTY | CLOSE_P );
        defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
        defineElement( "IMG", EMPTY );
        defineElement( "INPUT", EMPTY );
        defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
        defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
        defineElement( "LINK", EMPTY | ALLOWED_HEAD );
        defineElement( "MAP", ALLOWED_HEAD );
        defineElement( "META", EMPTY | ALLOWED_HEAD );
        defineElement( "OL", ELEM_CONTENT | CLOSE_P );
        defineElement( "OPTGROUP", ELEM_CONTENT );
        defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
        defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
        defineElement( "PARAM", EMPTY );
        defineElement( "PRE", PRESERVE | CLOSE_P );
        defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
        defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
        defineElement( "SELECT", ELEM_CONTENT );
        defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
        defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
        defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
        defineElement( "TEXTAREA", PRESERVE );
        defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
        defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "TITLE", ALLOWED_HEAD );
        defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
        defineElement( "UL", ELEM_CONTENT | CLOSE_P );

        // HTML 4.01 declares "nohref" (not "href") as the boolean attribute
        // of AREA. "href" is kept so that existing lookups are unchanged.
        defineBoolean( "AREA", new String[] { "href", "nohref" } );
        defineBoolean( "BUTTON", "disabled" );
        defineBoolean( "DIR", "compact" );
        defineBoolean( "DL", "compact" );
        defineBoolean( "FRAME", "noresize" );
        defineBoolean( "HR", "noshade" );
        // The HTML 4.01 element carrying "ismap" is IMG. The historical
        // "IMAGE" key is kept so that existing lookups are unchanged.
        defineBoolean( "IMAGE", "ismap" );
        defineBoolean( "IMG", "ismap" );
        defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
        // NOTE(review): HTML 4.01 defines no "link" attribute on LINK;
        // left as is because it is harmless - confirm before removing.
        defineBoolean( "LINK", "link" );
        defineBoolean( "MENU", "compact" );
        defineBoolean( "OBJECT", "declare" );
        defineBoolean( "OL", "compact" );
        defineBoolean( "OPTGROUP", "disabled" );
        defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
        defineBoolean( "SCRIPT", "defer" );
        defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
        defineBoolean( "STYLE", "disabled" );
        defineBoolean( "TD", "nowrap" );
        defineBoolean( "TH", "nowrap" );
        defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
        defineBoolean( "UL", "compact" );

        // Load the character references eagerly so that a missing or
        // malformed resource is reported when the class is initialized.
        initialize();
    }


}