// au.id.jericho.lib.html.HTMLElements Maven / Gradle / Ivy
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.4
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package au.id.jericho.lib.html;
import java.util.*;
/**
* Contains static methods which group {@linkplain HTMLElementName HTML element names} by the characteristics of their associated
* elements.
*
* An HTML element is a normal element with a
* {@linkplain Element#getName() name} that matches one of the {@linkplain HTMLElementName HTML element names} (ignoring case).
* This type of element spans the logical HTML element as described in the
* HTML 4.01 specification section 3.2.1,
* which may be implicitly terminated if it specifies an
* {@linkplain #getEndTagOptionalElementNames() optional end tag}.
*
* The term Non-HTML element refers to a normal element
* with a {@linkplain Element#getName() name} that does not match one of the {@linkplain HTMLElementName HTML element names}.
* This type of element must be either a single tag element or
* explicitly terminated.
*
* All of the sets returned by the methods in this class may be modified to customise the behaviour of the parser.
* Care must be taken however to ensure that the sets only contain tag names in lower case.
*
* Below is a table summarising the default characteristics of each HTML element. See also the
* index of elements in the HTML 4.01 specification
* for the official table containing similar information.
*
*
*
* {@linkplain Element#getName() Name} Box Type {@linkplain #getStartTagOptionalElementNames() Start Tag} {@linkplain #getEndTagOptionalElementNames() End Tag} {@linkplain #getNestingForbiddenElementNames() Nest} {@linkplain #getDeprecatedElementNames() Depr.} Description / Specification
* {@link HTMLElementName#A A} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} anchor
* {@link HTMLElementName#ABBR ABBR} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} abbreviated form (e.g., WWW, HTTP, etc.)
* {@link HTMLElementName#ACRONYM ACRONYM} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} acronym
* {@link HTMLElementName#ADDRESS ADDRESS} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} information on author
* {@link HTMLElementName#APPLET APPLET} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} D Java applet
* {@link HTMLElementName#AREA AREA} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} client-side image map area
* {@link HTMLElementName#B B} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} bold text style
* {@link HTMLElementName#BASE BASE} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} document base URI
* {@link HTMLElementName#BASEFONT BASEFONT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} D base font size
* {@link HTMLElementName#BDO BDO} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} I18N BiDi over-ride
* {@link HTMLElementName#BIG BIG} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} large text style
* {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} long quotation
* {@link HTMLElementName#BODY BODY} {@linkplain #getStartTagOptionalElementNames() Optional} {@linkplain #getEndTagOptionalElementNames() Optional} (details) {@linkplain #getNestingForbiddenElementNames() NF} document body
* {@link HTMLElementName#BR BR} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} forced line break
* {@link HTMLElementName#BUTTON BUTTON} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} push button
* {@link HTMLElementName#CAPTION CAPTION} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} table caption
* {@link HTMLElementName#CENTER CENTER} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} D shorthand for DIV align=center
* {@link HTMLElementName#CITE CITE} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} citation
* {@link HTMLElementName#CODE CODE} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} computer code fragment
* {@link HTMLElementName#COL COL} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} table column
* {@link HTMLElementName#COLGROUP COLGROUP} {@linkplain #getEndTagOptionalElementNames() Optional} (details) {@linkplain #getNestingForbiddenElementNames() NF} table column group
* {@link HTMLElementName#DD DD} {@linkplain #getEndTagOptionalElementNames() Optional} (details) definition description
* {@link HTMLElementName#DEL DEL} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} deleted text
* {@link HTMLElementName#DFN DFN} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} instance definition
* {@link HTMLElementName#DIR DIR} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} D directory list
* {@link HTMLElementName#DIV DIV} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} generic language/style container
* {@link HTMLElementName#DL DL} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} definition list
* {@link HTMLElementName#DT DT} {@linkplain #getEndTagOptionalElementNames() Optional} (details) definition term
* {@link HTMLElementName#EM EM} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} emphasis
* {@link HTMLElementName#FIELDSET FIELDSET} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} form control group
* {@link HTMLElementName#FONT FONT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} D local change to font
* {@link HTMLElementName#FORM FORM} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} interactive form
* {@link HTMLElementName#FRAME FRAME} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} subwindow
* {@link HTMLElementName#FRAMESET FRAMESET} {@linkplain #getEndTagRequiredElementNames() Required} window subdivision
* {@link HTMLElementName#H1 H1} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} heading
* {@link HTMLElementName#H2 H2} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} heading
* {@link HTMLElementName#H3 H3} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} heading
* {@link HTMLElementName#H4 H4} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} heading
* {@link HTMLElementName#H5 H5} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} heading
* {@link HTMLElementName#H6 H6} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} heading
* {@link HTMLElementName#HEAD HEAD} {@linkplain #getStartTagOptionalElementNames() Optional} {@linkplain #getEndTagOptionalElementNames() Optional} (details) {@linkplain #getNestingForbiddenElementNames() NF} document head
* {@link HTMLElementName#HR HR} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} horizontal rule
* {@link HTMLElementName#HTML HTML} {@linkplain #getStartTagOptionalElementNames() Optional} {@linkplain #getEndTagOptionalElementNames() Optional} (details) {@linkplain #getNestingForbiddenElementNames() NF} document root element
* {@link HTMLElementName#I I} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} italic text style
* {@link HTMLElementName#IFRAME IFRAME} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} inline subwindow
* {@link HTMLElementName#IMG IMG} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} Embedded image
* {@link HTMLElementName#INPUT INPUT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} form control
* {@link HTMLElementName#INS INS} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} inserted text
* {@link HTMLElementName#ISINDEX ISINDEX} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} D single line prompt
* {@link HTMLElementName#KBD KBD} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} text to be entered by the user
* {@link HTMLElementName#LABEL LABEL} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} form field label text
* {@link HTMLElementName#LEGEND LEGEND} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} fieldset legend
* {@link HTMLElementName#LI LI} {@linkplain #getEndTagOptionalElementNames() Optional} (details) list item
* {@link HTMLElementName#LINK LINK} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} a media-independent link
* {@link HTMLElementName#MAP MAP} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} client-side image map
* {@link HTMLElementName#MENU MENU} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} D menu list
* {@link HTMLElementName#META META} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} generic metainformation
* {@link HTMLElementName#NOFRAMES NOFRAMES} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} alternate content container for non frame-based rendering
* {@link HTMLElementName#NOSCRIPT NOSCRIPT} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} alternate content container for non script-based rendering
* {@link HTMLElementName#OBJECT OBJECT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} generic embedded object
* {@link HTMLElementName#OL OL} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} ordered list
* {@link HTMLElementName#OPTGROUP OPTGROUP} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} option group
* {@link HTMLElementName#OPTION OPTION} {@linkplain #getEndTagOptionalElementNames() Optional} (details) {@linkplain #getNestingForbiddenElementNames() NF} selectable choice
* {@link HTMLElementName#P P} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagOptionalElementNames() Optional} (details) {@linkplain #getNestingForbiddenElementNames() NF} paragraph
* {@link HTMLElementName#PARAM PARAM} {@linkplain #getEndTagForbiddenElementNames() Forbidden} {@linkplain #getNestingForbiddenElementNames() NF} named property value
* {@link HTMLElementName#PRE PRE} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} preformatted text
* {@link HTMLElementName#Q Q} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} short inline quotation
* {@link HTMLElementName#S S} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} D strike-through text style
* {@link HTMLElementName#SAMP SAMP} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} sample program output, scripts, etc.
* {@link HTMLElementName#SCRIPT SCRIPT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} script statements
* {@link HTMLElementName#SELECT SELECT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} option selector
* {@link HTMLElementName#SMALL SMALL} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} small text style
* {@link HTMLElementName#SPAN SPAN} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} generic language/style container
* {@link HTMLElementName#STRIKE STRIKE} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} D strike-through text
* {@link HTMLElementName#STRONG STRONG} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} strong emphasis
* {@link HTMLElementName#STYLE STYLE} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} style info
* {@link HTMLElementName#SUB SUB} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} subscript
* {@link HTMLElementName#SUP SUP} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} superscript
* {@link HTMLElementName#TABLE TABLE} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} table
* {@link HTMLElementName#TBODY TBODY} {@linkplain #getStartTagOptionalElementNames() Optional} {@linkplain #getEndTagOptionalElementNames() Optional} (details) table body
* {@link HTMLElementName#TD TD} {@linkplain #getEndTagOptionalElementNames() Optional} (details) table data cell
* {@link HTMLElementName#TEXTAREA TEXTAREA} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} multi-line text field
* {@link HTMLElementName#TFOOT TFOOT} {@linkplain #getEndTagOptionalElementNames() Optional} (details) table footer
* {@link HTMLElementName#TH TH} {@linkplain #getEndTagOptionalElementNames() Optional} (details) table header cell
* {@link HTMLElementName#THEAD THEAD} {@linkplain #getEndTagOptionalElementNames() Optional} (details) table header
* {@link HTMLElementName#TITLE TITLE} {@linkplain #getEndTagRequiredElementNames() Required} {@linkplain #getNestingForbiddenElementNames() NF} document title
* {@link HTMLElementName#TR TR} {@linkplain #getEndTagOptionalElementNames() Optional} (details) table row
* {@link HTMLElementName#TT TT} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} teletype or monospaced text style
* {@link HTMLElementName#U U} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} D underlined text style
* {@link HTMLElementName#UL UL} {@linkplain #getBlockLevelElementNames() Block} {@linkplain #getEndTagRequiredElementNames() Required} unordered list
* {@link HTMLElementName#VAR VAR} {@linkplain #getInlineLevelElementNames() Inline} {@linkplain #getEndTagRequiredElementNames() Required} instance of a variable or program argument
*
*
* @see HTMLElementName
* @see Element
*/
public final class HTMLElements implements HTMLElementName {
// NOTE: these static fields are initialised in textual order, and several of the later ones are
// computed from earlier ones (or from helper methods defined elsewhere in this class), so the
// declaration order below must be preserved.

// Every HTML element name, listed in alphabetical order; returned as-is by getElementNames().
private static final List ALL=new ArrayList(Arrays.asList(new String[] {A,ABBR,ACRONYM,ADDRESS,APPLET,AREA,B,BASE,BASEFONT,BDO,BIG,BLOCKQUOTE,BODY,BR,BUTTON,CAPTION,CENTER,CITE,CODE,COL,COLGROUP,DD,DEL,DFN,DIR,DIV,DL,DT,EM,FIELDSET,FONT,FORM,FRAME,FRAMESET,H1,H2,H3,H4,H5,H6,HEAD,HR,HTML,I,IFRAME,IMG,INPUT,INS,ISINDEX,KBD,LABEL,LEGEND,LI,LINK,MAP,MENU,META,NOFRAMES,NOSCRIPT,OBJECT,OL,OPTGROUP,OPTION,P,PARAM,PRE,Q,S,SAMP,SCRIPT,SELECT,SMALL,SPAN,STRIKE,STRONG,STYLE,SUB,SUP,TABLE,TBODY,TD,TEXTAREA,TFOOT,TH,THEAD,TITLE,TR,TT,U,UL,VAR}));
// Names of the block-level elements; returned by getBlockLevelElementNames().
private static final HTMLElementNameSet BLOCK=new HTMLElementNameSet(new String[] {P,H1,H2,H3,H4,H5,H6,UL,OL,DIR,MENU,PRE,DL,DIV,CENTER,NOSCRIPT,NOFRAMES,BLOCKQUOTE,FORM,ISINDEX,HR,TABLE,FIELDSET,ADDRESS});
// Names of the inline-level elements; returned by getInlineLevelElementNames().
private static final HTMLElementNameSet INLINE=new HTMLElementNameSet(new String[] {TT,I,B,U,S,STRIKE,BIG,SMALL,EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,ABBR,ACRONYM,A,IMG,APPLET,OBJECT,FONT,BASEFONT,BR,SCRIPT,MAP,Q,SUB,SUP,SPAN,BDO,IFRAME,INPUT,SELECT,TEXTAREA,LABEL,BUTTON,INS,DEL});
// Names of the elements whose end tag is forbidden; returned by getEndTagForbiddenElementNames().
private static final HTMLElementNameSet END_TAG_FORBIDDEN_SET=new HTMLElementNameSet(new String[] {AREA,BASE,BASEFONT,BR,COL,FRAME,HR,IMG,INPUT,ISINDEX,LINK,META,PARAM});
// Small reusable name groups. They are not referenced in the portion of the file reviewed here;
// presumably they are consumed by buildTerminatingTagNameSetsMap() below - TODO confirm.
// They are declared before TERMINATING_TAG_NAME_SETS_MAP so that they are initialised before that builder runs.
private static final HTMLElementNameSet _UL_OL=new HTMLElementNameSet(UL).union(OL);
private static final HTMLElementNameSet _DD_DT=new HTMLElementNameSet(DD).union(DT);
private static final HTMLElementNameSet _THEAD_TBODY_TFOOT_TR=new HTMLElementNameSet(THEAD).union(TBODY).union(TFOOT).union(TR);
private static final HTMLElementNameSet _THEAD_TBODY_TFOOT_TR_TD_TH=new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR).union(TD).union(TH);
// Names of the deprecated elements; returned by getDeprecatedElementNames().
private static final HTMLElementNameSet DEPRECATED=new HTMLElementNameSet().union(APPLET).union(BASEFONT).union(CENTER).union(DIR).union(FONT).union(ISINDEX).union(MENU).union(S).union(STRIKE).union(U);
// Names of the elements whose start tag is optional; returned by getStartTagOptionalElementNames().
private static final HTMLElementNameSet START_TAG_OPTIONAL_SET=new HTMLElementNameSet().union(BODY).union(HEAD).union(HTML).union(TBODY);
private static final HashMap CONSTANT_NAME_MAP=buildTagMap(); // contains a map of tag names to the tag constants, allowing standard tags to be tested using == operator instead of equals()
private static final HashMap TERMINATING_TAG_NAME_SETS_MAP=buildTerminatingTagNameSetsMap(); // contains a map of tags having optional end tags to the HTMLElementTerminatingTagNameSets that can terminate the element if the end tag is not present
// Names of the elements whose end tag is optional. This is the key set of TERMINATING_TAG_NAME_SETS_MAP,
// i.e. a view backed by that map rather than a copy: removing a name here removes the map entry,
// and HashMap key-set views do not support add().
private static final Set END_TAG_OPTIONAL_SET=TERMINATING_TAG_NAME_SETS_MAP.keySet();
// Names of the elements whose end tag is required: ALL minus the forbidden and the optional sets.
private static final HTMLElementNameSet END_TAG_REQUIRED_SET=new HTMLElementNameSet().union(ALL).minus(END_TAG_FORBIDDEN_SET).minus(END_TAG_OPTIONAL_SET);
// Union of the end-tag-optional and end-tag-required sets. Not referenced in the portion of the file
// reviewed here; going by the name, presumably the elements on which a closing slash in the start tag is ignored - TODO confirm.
private static final HTMLElementNameSet CLOSING_SLASH_IGNORED_SET=new HTMLElementNameSet().union(END_TAG_OPTIONAL_SET).union(END_TAG_REQUIRED_SET);
// Elements with a required end tag that must not contain an element of the same name.
// Unlike the other constants here this one is package-private (no access modifier).
static final HTMLElementNameSet END_TAG_REQUIRED_NESTING_FORBIDDEN_SET=new HTMLElementNameSet().union(A).union(ADDRESS).union(APPLET).union(BUTTON).union(CAPTION).union(FORM).union(IFRAME).union(LABEL).union(LEGEND).union(OPTGROUP).union(SCRIPT).union(SELECT).union(STYLE).union(TEXTAREA).union(TITLE);
// Elements with an optional end tag that must not contain an element of the same name.
private static final HTMLElementNameSet END_TAG_OPTIONAL_NESTING_FORBIDDEN_SET=new HTMLElementNameSet().union(BODY).union(COLGROUP).union(HEAD).union(HTML).union(OPTION).union(P);
// All nesting-forbidden elements: the two sets above plus every element whose end tag is forbidden.
private static final HTMLElementNameSet NESTING_FORBIDDEN_SET=new HTMLElementNameSet().union(END_TAG_REQUIRED_NESTING_FORBIDDEN_SET).union(END_TAG_OPTIONAL_NESTING_FORBIDDEN_SET).union(END_TAG_FORBIDDEN_SET);
// Private constructor: prevents this class from being instantiated.
private HTMLElements() {}
/**
 * Returns a list containing all of the {@linkplain HTMLElementName HTML element names}.
 * <p>
 * The returned list is in alphabetical order.
 * It is the list held internally by this class, not a copy.
 *
 * @return a list containing all of the {@linkplain HTMLElementName HTML element names}.
 */
public static List getElementNames() {
    // The "final" modifier was dropped: it is redundant on a static method of a final class.
    return ALL;
}
/**
 * Gets the set of {@linkplain Element#getName() names} belonging to the
 * block-level elements.
 * <p>
 * The set contains the following element names:
 * {@link #ADDRESS ADDRESS}, {@link #BLOCKQUOTE BLOCKQUOTE}, {@link #CENTER CENTER}, {@link #DIR DIR}, {@link #DIV DIV},
 * {@link #DL DL}, {@link #FIELDSET FIELDSET}, {@link #FORM FORM},
 * {@link #H1 H1}, {@link #H2 H2}, {@link #H3 H3}, {@link #H4 H4}, {@link #H5 H5}, {@link #H6 H6},
 * {@link #HR HR}, {@link #ISINDEX ISINDEX}, {@link #MENU MENU}, {@link #NOFRAMES NOFRAMES}, {@link #NOSCRIPT NOSCRIPT},
 * {@link #OL OL}, {@link #P P}, {@link #PRE PRE}, {@link #TABLE TABLE}, {@link #UL UL}
 * <p>
 * The HTML 4.01 Transitional DTD defines this set. Further detail is available in the
 * HTML 4.01 specification section 7.5.3 - Block-level and inline elements
 * and the CSS2 specification section 9.2.1 - Block-level elements and block boxes.
 * <p>
 * An element's normal box type can be overridden using the CSS2 display property.
 *
 * @return the set of {@linkplain Element#getName() names} of all the block-level elements.
 * @see #getInlineLevelElementNames()
 */
public static Set getBlockLevelElementNames() {
    return BLOCK;
}
/**
 * Gets the set of {@linkplain Element#getName() names} belonging to the
 * inline-level elements.
 * <p>
 * The set contains the following element names:
 * {@link #A A}, {@link #ABBR ABBR}, {@link #ACRONYM ACRONYM}, {@link #APPLET APPLET}, {@link #B B}, {@link #BASEFONT BASEFONT},
 * {@link #BDO BDO}, {@link #BIG BIG}, {@link #BR BR}, {@link #BUTTON BUTTON}, {@link #CITE CITE}, {@link #CODE CODE},
 * {@link #DEL DEL}, {@link #DFN DFN}, {@link #EM EM}, {@link #FONT FONT}, {@link #I I}, {@link #IFRAME IFRAME}, {@link #IMG IMG},
 * {@link #INPUT INPUT}, {@link #INS INS}, {@link #KBD KBD}, {@link #LABEL LABEL}, {@link #MAP MAP}, {@link #OBJECT OBJECT},
 * {@link #Q Q}, {@link #S S}, {@link #SAMP SAMP}, {@link #SCRIPT SCRIPT}, {@link #SELECT SELECT}, {@link #SMALL SMALL},
 * {@link #SPAN SPAN}, {@link #STRIKE STRIKE}, {@link #STRONG STRONG}, {@link #SUB SUB}, {@link #SUP SUP}, {@link #TEXTAREA TEXTAREA},
 * {@link #TT TT}, {@link #U U}, {@link #VAR VAR}
 * <p>
 * The HTML 4.01 Transitional DTD defines this set. Further detail is available in the
 * HTML 4.01 specification section 7.5.3 - Block-level and inline elements
 * and the CSS2 specification section 9.2.2 - Inline-level elements and inline boxes.
 * <p>
 * An element's normal box type can be overridden using the CSS2 display property.
 * <p>
 * Although the HTML Document Type Definitions do not permit
 * {@linkplain #getBlockLevelElementNames() block-level elements} inside inline-level elements,
 * all popular browsers tolerate it in various situations, even in XHTML documents.
 * The most notorious example is the common practice of placing block-level elements inside
 * {@link HTMLElementName#FONT FONT} elements.
 *
 * @return the set of {@linkplain Element#getName() names} of all the inline-level elements.
 * @see #getBlockLevelElementNames()
 */
public static Set getInlineLevelElementNames() {
    return INLINE;
}
/**
 * Gets the set of {@linkplain Element#getName() names} of every element
 * that is deprecated in HTML 4.01.
 *
 * @return the set of {@linkplain Element#getName() names} of all deprecated elements in HTML 4.01.
 */
public static Set getDeprecatedElementNames() {
    return DEPRECATED;
}
/**
 * Gets the set of {@linkplain Element#getName() names} of every HTML element
 * whose {@linkplain Element#getEndTag() end tag} is forbidden.
 * <p>
 * More information can be found in the element parsing rules for HTML elements with forbidden end tags.
 * <p>
 * In the index of elements in the HTML 4.01 specification, these elements are
 * marked with the letter 'F' in the "End Tag" column.
 *
 * @return the set of {@linkplain Element#getName() names} of all of the HTML elements for which the {@linkplain Element#getEndTag() end tag} is forbidden.
 * @see #getEndTagOptionalElementNames()
 * @see #getEndTagRequiredElementNames()
 */
public static Set getEndTagForbiddenElementNames() {
    return END_TAG_FORBIDDEN_SET;
}
/**
 * Gets the set of {@linkplain Element#getName() names} of every HTML element
 * whose {@linkplain Element#getEndTag() end tag} is optional.
 * <p>
 * An element with one of these names may be implicitly terminated by a subsequent
 * {@linkplain #getTerminatingStartTagNames(String) terminating start tag} or
 * {@linkplain #getTerminatingEndTagNames(String) terminating end tag}.
 * The documentation of each relevant element in the {@link HTMLElementName} class lists these terminating tags,
 * together with the names of the {@linkplain #getNonterminatingElementNames(String) non-terminating elements}
 * that can be nested within the element.
 * <p>
 * More information can be found in the element parsing rules for HTML elements with optional end tags.
 * <p>
 * In the index of elements in the HTML 4.01 specification, these elements are
 * marked with the letter 'O' in the "End Tag" column.
 *
 * @return the set of {@linkplain Element#getName() names} of all of the HTML elements for which the {@linkplain Element#getEndTag() end tag} is optional.
 * @see #getEndTagForbiddenElementNames()
 * @see #getEndTagRequiredElementNames()
 */
public static Set getEndTagOptionalElementNames() {
    return END_TAG_OPTIONAL_SET;
}
/**
 * Gets the set of {@linkplain Element#getName() names} of every HTML element
 * whose {@linkplain Element#getEndTag() end tag} is required.
 * <p>
 * More information can be found in the element parsing rules for HTML elements with required end tags.
 * <p>
 * In the index of elements in the HTML 4.01 specification, these elements are
 * the ones whose "End Tag" column is left blank.
 *
 * @return the set of {@linkplain Element#getName() names} of all of the HTML elements for which the {@linkplain Element#getEndTag() end tag} is required.
 * @see #getEndTagForbiddenElementNames()
 * @see #getEndTagOptionalElementNames()
 */
public static Set getEndTagRequiredElementNames() {
    return END_TAG_REQUIRED_SET;
}
/**
 * Gets the set of {@linkplain Element#getName() names} of every HTML element
 * whose {@linkplain Element#getStartTag() start tag} is optional.
 * <p>
 * An element with an optional start tag must be present in the document object model (DOM)
 * in certain locations. It either forms part of the structure of the HTML document as a whole
 * (e.g. the {@link HTMLElementName#HTML HTML}, {@link HTMLElementName#HEAD HEAD}, and {@link HTMLElementName#BODY BODY} elements),
 * or forms part of the structure of a {@link HTMLElementName#TABLE TABLE} element (e.g. the {@link HTMLElementName#TBODY TBODY} element).
 * Where a start tag has been omitted, its location in the document's object model
 * can be inferred from the surrounding elements.
 * <p>
 * This library makes no use of this property when parsing documents and does not build a document object model from the
 * source, so no implied element is created where an optional start tag is omitted.
 * <p>
 * If the start tag has been omitted in the document text, the corresponding end tag should be omitted as well.
 * <p>
 * In the index of elements in the HTML 4.01 specification, these elements are
 * marked with the letter 'O' in the "Start Tag" column.
 *
 * @return the set of {@linkplain Element#getName() names} of all of the HTML elements for which the {@linkplain Element#getStartTag() start tag} is optional.
 */
public static Set getStartTagOptionalElementNames() {
    return START_TAG_OPTIONAL_SET;
}
/**
 * Returns the {@linkplain StartTag#getName() names} of start tags that implicitly terminate
 * an HTML element with the specified name.
 * <p>
 * This method is only relevant to HTML elements for which the
 * {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
 * It returns <code>null</code> if
 * <code>{@link #getEndTagOptionalElementNames()}.contains(endTagOptionalElementName.toLowerCase())==false</code>.
 *
 * @param endTagOptionalElementName the {@linkplain Element#getName() name} of an element for which the {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
 * @return the {@linkplain StartTag#getName() names} of start tags that implicitly terminate an HTML element with the specified name,
 *         or <code>null</code> if the name does not identify an element for which the {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
 * @see #getTerminatingEndTagNames(String endTagOptionalElementName)
 * @see #getNonterminatingElementNames(String endTagOptionalElementName)
 */
public static Set getTerminatingStartTagNames(final String endTagOptionalElementName) {
    final HTMLElementTerminatingTagNameSets terminatingTagNameSets=getTerminatingTagNameSets(endTagOptionalElementName);
    // The lookup helper (defined elsewhere in this class) yields null when it has nothing for this name; pass that on as null.
    if (terminatingTagNameSets==null) return null;
    return terminatingTagNameSets.TerminatingStartTagNameSet;
}
/**
* Returns the {@linkplain EndTag#getName() names} of end tags that implicitly terminate
* an HTML element with the specified name.
*
* This method is only relevant to HTML elements for which the
* {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
* It returns null
if
*
{@link #getEndTagOptionalElementNames()}.contains(endTagOptionalElementName.toLowerCase())==null
.
*
* Note that removing the tag name matching the specified element has no effect on the behaviour of the parser,
* as it is always assumed that a start tag is terminated by an end tag with a matching name.
*
* @param endTagOptionalElementName the {@linkplain Element#getName() name} of an element for which the {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
* @return the {@linkplain StartTag#getName() names} of end tags that implicitly terminate an HTML element with the specified name, or null
if the name does not identify an element for which the {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
* @see #getTerminatingStartTagNames(String endTagOptionalElementName)
* @see #getNonterminatingElementNames(String endTagOptionalElementName)
*/
public static Set getTerminatingEndTagNames(final String endTagOptionalElementName) {
	// Exact lookup first, so every name that resolved before still resolves to the same entry.
	HTMLElementTerminatingTagNameSets terminatingTagNameSets=getTerminatingTagNameSets(endTagOptionalElementName);
	if (terminatingTagNameSets==null && endTagOptionalElementName!=null) {
		// The documented null condition tests endTagOptionalElementName.toLowerCase(), so the name is
		// meant to be matched ignoring case; retry with a lower-cased key before giving up.
		// Locale.ENGLISH keeps the conversion independent of the default locale (e.g. the Turkish dotless i).
		terminatingTagNameSets=getTerminatingTagNameSets(endTagOptionalElementName.toLowerCase(Locale.ENGLISH));
	}
	// No entry (or a null name) means the name does not identify an element with an optional end tag.
	if (terminatingTagNameSets==null) return null;
	return terminatingTagNameSets.TerminatingEndTagNameSet;
}
/**
* Returns the {@linkplain Element#getName() names} of elements that do NOT implicitly terminate
* an HTML element with the specified name.
* Neither can any tag nested inside any of these elements implicitly terminate the specified element,
* even if it is listed as one of the {@linkplain #getTerminatingStartTagNames(String) terminating start tags} or
* {@linkplain #getTerminatingEndTagNames(String) terminating end tags}.
*
* This method is only relevant to HTML elements for which the
* {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
* It returns <code>null</code> if
* <code>{@link #getEndTagOptionalElementNames()}.contains(endTagOptionalElementName.toLowerCase())==false</code>.
*
* @param endTagOptionalElementName the {@linkplain Element#getName() name} of an element for which the {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
* @return the {@linkplain Element#getName() names} of elements that do NOT implicitly terminate an HTML element with the specified name, or null
if the name does not identify an element for which the {@linkplain #getEndTagOptionalElementNames() end tag is optional}.
* @see #getTerminatingStartTagNames(String endTagOptionalElementName)
* @see #getTerminatingEndTagNames(String endTagOptionalElementName)
*/
public static Set getNonterminatingElementNames(final String endTagOptionalElementName) {
	// Exact lookup first, so every name that resolved before still resolves to the same entry.
	HTMLElementTerminatingTagNameSets terminatingTagNameSets=getTerminatingTagNameSets(endTagOptionalElementName);
	if (terminatingTagNameSets==null && endTagOptionalElementName!=null) {
		// The documented null condition tests endTagOptionalElementName.toLowerCase(), so the name is
		// meant to be matched ignoring case; retry with a lower-cased key before giving up.
		// Locale.ENGLISH keeps the conversion independent of the default locale (e.g. the Turkish dotless i).
		terminatingTagNameSets=getTerminatingTagNameSets(endTagOptionalElementName.toLowerCase(Locale.ENGLISH));
	}
	// No entry (or a null name) means the name does not identify an element with an optional end tag.
	if (terminatingTagNameSets==null) return null;
	return terminatingTagNameSets.NonterminatingElementNameSet;
}
/**
* Returns a set containing the {@linkplain Element#getName() names} of all of the HTML elements
* which should never contain elements of the same name, either as direct or indirect descendants.
*
* @return a set containing the {@linkplain Element#getName() names} of all of the HTML elements which should never contain elements of the same name.
*/
public static Set getNestingForbiddenElementNames() {
	// The shared set itself is handed back, not a copy, so changes made by the caller are visible here.
	final Set nestingForbiddenElementNames=NESTING_FORBIDDEN_SET;
	return nestingForbiddenElementNames;
}
/**
 * Returns the value registered in <code>CONSTANT_NAME_MAP</code> for the specified element name,
 * or the specified name itself if the map has no value for it.
 * <p>
 * NOTE(review): presumably used to substitute the shared constant instance for an equal name string; confirm against callers.
 *
 * @param elementName the element name to look up, exactly as given (no case conversion is performed here).
 * @return the mapped constant, or <code>elementName</code> if there is none.
 */
static final String getConstantElementName(final String elementName) {
	final Object registeredConstant=CONSTANT_NAME_MAP.get(elementName);
	// Unknown names pass through unchanged.
	if (registeredConstant==null) return elementName;
	return (String)registeredConstant;
}
/**
 * Indicates whether the specified element name is a member of <code>CLOSING_SLASH_IGNORED_SET</code>.
 *
 * @param elementName the element name to test, exactly as given (no case conversion is performed here).
 * @return <code>true</code> if the set contains the name, otherwise <code>false</code>.
 */
static final boolean isClosingSlashIgnored(final String elementName) {
	final boolean listedAsIgnored=CLOSING_SLASH_IGNORED_SET.contains(elementName);
	return listedAsIgnored;
}
/**
 * Looks up the terminating tag name sets registered in <code>TERMINATING_TAG_NAME_SETS_MAP</code>
 * for the specified element name.
 *
 * @param endTagOptionalElementName the element name used as the map key, exactly as given (no case conversion is performed here).
 * @return the value stored under the name, or <code>null</code> if the map returns none.
 */
static final HTMLElementTerminatingTagNameSets getTerminatingTagNameSets(final String endTagOptionalElementName) {
	final Object registeredSets=TERMINATING_TAG_NAME_SETS_MAP.get(endTagOptionalElementName);
	return (HTMLElementTerminatingTagNameSets)registeredSets;
}
/**
 * Builds the map that associates an element name with its <code>HTMLElementTerminatingTagNameSets</code>.
 * <p>
 * Each entry is created from three freshly constructed name sets.
 * NOTE(review): the constructor is not visible here; the argument order appears to be
 * (terminating start tag names, terminating end tag names, nonterminating element names) - confirm.
 *
 * @return a new map containing one entry for each of the 15 element names registered below.
 */
private static HashMap buildTerminatingTagNameSetsMap() {
	// HTML is included in the NonterminatingElementNameSet of BODY and HTML in case the source contains (illegally) nested HTML documents
	final HashMap setsByElementName=new HashMap(20,1.0F); // 15 entries in total

	setsByElementName.put(BODY,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(),
		new HTMLElementNameSet(HTML).union(BODY),
		new HTMLElementNameSet(HTML)
	));
	setsByElementName.put(COLGROUP,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR).union(COLGROUP),
		new HTMLElementNameSet(TABLE).union(COLGROUP),
		new HTMLElementNameSet(TABLE)
	));
	setsByElementName.put(DD,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(_DD_DT),
		new HTMLElementNameSet(DL).union(DD),
		new HTMLElementNameSet(DL)
	));
	setsByElementName.put(DT,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(_DD_DT),
		new HTMLElementNameSet(DL).union(DT),
		new HTMLElementNameSet(DL)
	));
	setsByElementName.put(HEAD,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(BODY).union(FRAMESET),
		new HTMLElementNameSet(HTML).union(HEAD),
		new HTMLElementNameSet()
	));
	setsByElementName.put(HTML,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(),
		new HTMLElementNameSet(HTML),
		new HTMLElementNameSet(HTML)
	));
	setsByElementName.put(LI,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(LI),
		new HTMLElementNameSet(_UL_OL).union(LI),
		new HTMLElementNameSet(_UL_OL)
	));
	setsByElementName.put(OPTION,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(OPTION).union(OPTGROUP),
		new HTMLElementNameSet(SELECT).union(OPTION),
		new HTMLElementNameSet()
	));
	setsByElementName.put(P,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(BLOCK).union(_DD_DT).union(TH).union(TD).union(LI),
		new HTMLElementNameSet(BLOCK).union(_DD_DT).union(BODY).union(HTML).union(_THEAD_TBODY_TFOOT_TR_TD_TH).union(CAPTION).union(LEGEND),
		new HTMLElementNameSet()
	));
	setsByElementName.put(TBODY,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(TBODY).union(TFOOT).union(THEAD),
		new HTMLElementNameSet(TABLE).union(TBODY),
		new HTMLElementNameSet(TABLE)
	));
	setsByElementName.put(TD,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR_TD_TH),
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR).union(TABLE).union(TD),
		new HTMLElementNameSet(TABLE)
	));
	setsByElementName.put(TFOOT,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(TBODY).union(TFOOT).union(THEAD),
		new HTMLElementNameSet(TABLE).union(TFOOT),
		new HTMLElementNameSet(TABLE)
	));
	setsByElementName.put(TH,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR_TD_TH),
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR).union(TABLE).union(TH),
		new HTMLElementNameSet(TABLE)
	));
	setsByElementName.put(THEAD,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(TBODY).union(TFOOT).union(THEAD),
		new HTMLElementNameSet(TABLE).union(THEAD),
		new HTMLElementNameSet(TABLE)
	));
	setsByElementName.put(TR,new HTMLElementTerminatingTagNameSets(
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR),
		new HTMLElementNameSet(_THEAD_TBODY_TFOOT_TR).union(TABLE),
		new HTMLElementNameSet(TABLE)
	));

	return setsByElementName;
}
/**
 * Builds a map in which every key is mapped to itself.
 * <p>
 * The keys are all of the names in <code>ALL</code>, followed by the <code>ELEMENT</code>, <code>ATTLIST</code>,
 * <code>ENTITY</code> and <code>NOTATION</code> constants of <code>StartTagTypeMarkupDeclaration</code>.
 *
 * @return a new map from each of these names to the same object.
 */
private static HashMap buildTagMap() {
	final HashMap tagMap=new HashMap(132,1.0F); // 99 tags total
	final Iterator nameIterator=ALL.iterator();
	while (nameIterator.hasNext()) {
		final String name=(String)nameIterator.next();
		tagMap.put(name,name);
	}
	// The markup declaration names are registered after the element names, in this fixed order.
	final Object[] markupDeclarationNames={
		StartTagTypeMarkupDeclaration.ELEMENT,
		StartTagTypeMarkupDeclaration.ATTLIST,
		StartTagTypeMarkupDeclaration.ENTITY,
		StartTagTypeMarkupDeclaration.NOTATION
	};
	for (int index=0; index<markupDeclarationNames.length; index++) {
		final Object declarationName=markupDeclarationNames[index];
		tagMap.put(declarationName,declarationName);
	}
	return tagMap;
}
}