// net.htmlparser.jericho.Element Maven / Gradle / Ivy
// Show all versions of jericho-html Show documentation
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.3
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
/**
* Represents an element
* in a specific {@linkplain Source source} document, which encompasses a {@linkplain #getStartTag() start tag},
* an optional {@linkplain #getEndTag() end tag} and all {@linkplain #getContent() content} in between.
*
* Take the following HTML segment as an example:
*
* {@code <p>This is a sample paragraph.</p>}
*
* The whole segment is represented by an {@code Element} object.
* It comprises the {@link StartTag} "{@code <p>}",
* the {@link EndTag} "{@code </p>}", as well as the text in between.
* An element may also contain other elements between its start and end tags.
*
* The term normal element refers to an element having a {@linkplain #getStartTag() start tag}
* with a {@linkplain StartTag#getStartTagType() type} of {@link StartTagType#NORMAL}.
* This comprises all {@linkplain HTMLElements HTML elements} and non-HTML elements.
*
* {@code Element} instances are obtained using one of the following methods:
* <ul>
*  <li>{@link StartTag#getElement()}
*  <li>{@link EndTag#getElement()}
*  <li>{@link Segment#getAllElements()}
*  <li>{@link Segment#getAllElements(String name)}
*  <li>{@link Segment#getAllElements(StartTagType)}
* </ul>
* See also the {@link HTMLElements} class, and the
* <a href="http://www.w3.org/TR/REC-xml#dt-element">XML 1.0 specification for elements</a>.
* Element Structure
*
* The three possible structures of an element are listed below:
*
* - Single Tag Element:
*
* Example:
* {@code <img src="mypicture.jpg">}
*
* The element consists only of a single {@linkplain #getStartTag() start tag} and has no {@linkplain #getContent() element content}
* (although the start tag itself may have {@linkplain StartTag#getTagContent() tag content}).
* <ul>
*  <li><code>{@link #getEndTag()}==null</code>
*  <li><code>{@link #isEmpty()}==true</code>
*  <li><code>{@link #getEnd() getEnd()}=={@link #getStartTag()}.{@link #getEnd() getEnd()}</code>
* </ul>
*
* This occurs in the following situations:
*
* - An HTML element for which the {@linkplain HTMLElements#getEndTagForbiddenElementNames() end tag is forbidden}.
*
- An HTML element for which the {@linkplain HTMLElements#getEndTagRequiredElementNames() end tag is required},
* but the end tag is not present in the source document.
*
- An HTML element for which the {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional},
* where the implicitly terminating tag is situated immediately after the element's
* {@linkplain #getStartTag() start tag}.
*
- An {@linkplain #isEmptyElementTag() empty element tag}
*
- A non-HTML element that is not an {@linkplain #isEmptyElementTag() empty element tag} but is missing its end tag.
*
- An element with a start tag of a {@linkplain StartTag#getStartTagType() type} that does not define a
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}.
*
- An element with a start tag of a {@linkplain StartTag#getStartTagType() type} that does define a
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type} but is missing its end tag.
*
* - Explicitly Terminated Element:
*
* Example:
* {@code <p>This is a sample paragraph.</p>}
*
* The element consists of a {@linkplain #getStartTag() start tag}, {@linkplain #getContent() content},
* and an {@linkplain #getEndTag() end tag}.
* <ul>
*  <li><code>{@link #getEndTag()}!=null</code>.
*  <li><code>{@link #isEmpty()}==false</code> (provided the end tag doesn't immediately follow the start tag)
*  <li><code>{@link #getEnd() getEnd()}=={@link #getEndTag()}.{@link #getEnd() getEnd()}</code>.
* </ul>
*
* This occurs in the following situations, assuming the start tag's matching end tag is present in the source document:
*
* - An HTML element for which the end tag is either
* {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or {@linkplain HTMLElements#getEndTagOptionalElementNames() optional}.
*
- A non-HTML element that is not an {@linkplain #isEmptyElementTag() empty element tag}.
*
- An element with a start tag of a {@linkplain StartTag#getStartTagType() type} that defines a
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}.
*
* - Implicitly Terminated Element:
*
* Example:
* {@code <p>This text is included in the paragraph element even though no end tag is present.}
* {@code <p>This is the next paragraph.}
*
* The element consists of a {@linkplain #getStartTag() start tag} and {@linkplain #getContent() content},
* but no {@linkplain #getEndTag() end tag}.
* <ul>
*  <li><code>{@link #getEndTag()}==null</code>.
*  <li><code>{@link #isEmpty()}==false</code>
*  <li><code>{@link #getEnd() getEnd()}!={@link #getStartTag()}.{@link #getEnd() getEnd()}</code>.
* </ul>
*
* This only occurs in an HTML element for which the
* {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional}.
*
* The element ends at the start of a tag which implies the termination of the element, called the implicitly terminating tag.
* If the implicitly terminating tag is situated immediately after the element's {@linkplain #getStartTag() start tag},
* the element is classed as a single tag element.
*
* See the element parsing rules for HTML elements with optional end tags
* for details on which tags can implicitly terminate a given element.
*
* See also the documentation of the {@link HTMLElements#getEndTagOptionalElementNames()} method.
*
* Element Parsing Rules
* The following rules describe the algorithm used in the {@link StartTag#getElement()} method to construct an element.
* The detection of the start tag's matching end tag or other terminating tags always takes into account the possible nesting of elements.
*
*
* -
* If the start tag has a {@linkplain StartTag#getStartTagType() type} of {@link StartTagType#NORMAL}:
*
* -
* If the {@linkplain StartTag#getName() name} of the start tag matches one of the
* recognised {@linkplain HTMLElementName HTML element names} (indicating an HTML element):
*
* -
*
* If the end tag for an element of this {@linkplain StartTag#getName() name} is
* {@linkplain HTMLElements#getEndTagForbiddenElementNames() forbidden},
* the parser does not conduct any search for an end tag and a single tag element is created.
*
-
*
* If the end tag for an element of this {@linkplain StartTag#getName() name} is
* {@linkplain HTMLElements#getEndTagRequiredElementNames() required}, the parser searches for the start tag's matching end tag.
*
* -
* If the matching end tag is found, an explicitly terminated element is created.
*
-
* If no matching end tag is found, the source document is not valid HTML and the incident is
* {@linkplain Source#getLogger() logged} as a missing required end tag.
* In this situation a single tag element is created.
*
* -
*
* If the end tag for an element of this {@linkplain StartTag#getName() name} is
* {@linkplain HTMLElements#getEndTagOptionalElementNames() optional}, the parser searches not only for the start tag's matching end tag,
* but also for any other tag that implicitly terminates the element.
*
For each tag (T2) following the start tag (ST1) of this element (E1):
*
* -
* If T2 is a start tag:
*
* -
* If the {@linkplain StartTag#getName() name} of T2 is in the list of
* {@linkplain HTMLElements#getNonterminatingElementNames(String) non-terminating element names} for E1,
* then continue evaluating tags from the {@linkplain Element#getEnd() end} of T2's corresponding
* {@linkplain StartTag#getElement() element}.
*
-
* If the {@linkplain StartTag#getName() name} of T2 is in the list of
* {@linkplain HTMLElements#getTerminatingStartTagNames(String) terminating start tag names} for E1,
* then E1 ends at the {@linkplain StartTag#getBegin() beginning} of T2.
* If T2 follows immediately after ST1, a single tag element is created,
* otherwise an implicitly terminated element is created.
*
* -
* If T2 is an end tag:
*
* -
* If the {@linkplain EndTag#getName() name} of T2 is the same as that of ST1,
* an explicitly terminated element is created.
*
-
* If the {@linkplain EndTag#getName() name} of T2 is in the list of
* {@linkplain HTMLElements#getTerminatingEndTagNames(String) terminating end tag names} for E1,
* then E1 ends at the {@linkplain EndTag#getBegin() beginning} of T2.
* If T2 follows immediately after ST1, a single tag element is created,
* otherwise an implicitly terminated element is created.
*
* -
* If no more tags are present in the source document, then E1 ends at the end of the file, and an
* implicitly terminated element is created.
*
*
* Note that the syntactical indication of an {@linkplain StartTag#isSyntacticalEmptyElementTag() empty-element tag} in the start tag
* is ignored when determining the end of HTML elements.
* See the documentation of the {@link #isEmptyElementTag()} method for more information.
* -
* If the {@linkplain StartTag#getName() name} of the start tag does not match one of the
* recognised {@linkplain HTMLElementName HTML element names} (indicating a non-HTML element):
*
* -
* If the start tag is {@linkplain StartTag#isSyntacticalEmptyElementTag() syntactically an empty-element tag},
* the parser does not conduct any search for an end tag and a single tag element is created.
*
-
* Otherwise, section 3.1
* of the XML 1.0 specification states that a matching end tag MUST be present, and
* the parser searches for the start tag's matching end tag.
*
* -
* If the matching end tag is found, an explicitly terminated element is created.
*
-
* If no matching end tag is found, the source document is not valid XML and the incident is
* {@linkplain Source#getLogger() logged} as a missing required end tag.
* In this situation a single tag element is created.
*
*
*
* -
* If the start tag has any {@linkplain StartTag#getStartTagType() type} other than {@link StartTagType#NORMAL}:
*
* -
* If the start tag's type does not define a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type},
* the parser does not conduct any search for an end tag and a single tag element is created.
*
-
* If the start tag's type does define a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type},
* the parser assumes that a matching end tag is required and searches for it.
*
* -
* If the matching end tag is found, an explicitly terminated element is created.
*
-
* If no matching end tag is found, the missing required end tag is {@linkplain Source#getLogger() logged}
* and a single tag element is created.
*
*
*
* @see HTMLElements
*/
public final class Element extends Segment {
	/** Start tag that begins this element; <code>null</code> only in the {@link #NOT_CACHED} sentinel. */
	private final StartTag startTag;
	/** End tag that closes this element, or <code>null</code> if the element has no (non-empty) end tag. */
	private final EndTag endTag;
	/** Content segment, created lazily on the first call to {@link #getContent()}. */
	private Segment content=null;
	/** Cached parent element; holds {@link #NOT_CACHED} until the parent has been resolved. */
	Element parentElement=Element.NOT_CACHED;
	/** Cached nesting depth; <code>-1</code> means the depth has not been determined yet. */
	private int depth=-1;
	/** Cached immediate children, all of type {@link Element}; <code>null</code> until first requested. */
	private List<Element> childElements=null;
	/** Sentinel stored in {@link #parentElement} to mean "not yet resolved", as distinct from <code>null</code> ("no parent"). */
	static final Element NOT_CACHED=new Element();
	/** When <code>true</code>, a child that extends beyond the end of its parent is still added to the hierarchy (after being logged). */
	private static final boolean INCLUDE_INCORRECTLY_NESTED_CHILDREN_IN_HIERARCHY=true;
/**
 * Creates an element that begins at the start of the given start tag and ends at the
 * end of the given end tag, or at the end of the start tag when no end tag is supplied.
 * <p>
 * A zero-length end tag is treated as absent and stored as <code>null</code>,
 * although its end position still determines where the element ends.
 *
 * @param source  the source document containing the element.
 * @param startTag  the start tag of the element, must not be <code>null</code>.
 * @param endTag  the end tag of the element, or <code>null</code> if there is none.
 * @throws UnsupportedOperationException if the source is streamed.
 */
Element(final Source source, final StartTag startTag, final EndTag endTag) {
	super(source, startTag.begin, endTag!=null ? endTag.end : startTag.end);
	if (source.isStreamed()) {
		throw new UnsupportedOperationException("Elements are not supported when using StreamedSource");
	}
	this.startTag=startTag;
	if (endTag!=null && endTag.length()!=0) {
		this.endTag=endTag;
	} else {
		this.endTag=null;
	}
}
/**
 * Creates an element with no tags.
 * Used only to construct the {@link #NOT_CACHED} sentinel.
 */
private Element() {
	this.startTag=null;
	this.endTag=null;
}
/**
 * Returns the parent of this element in the document element hierarchy.
 * <p>
 * The {@link Source#fullSequentialParse()} method must be called (either explicitly or implicitly)
 * immediately after construction of the <code>Source</code> object if this method is to be used.
 * An <code>IllegalStateException</code> is thrown if a full sequential parse has not been performed
 * or if it was performed after this element was found.
 * <p>
 * This method returns <code>null</code> for a top-level element,
 * as well as any element formed from a {@linkplain TagType#isServerTag() server tag},
 * regardless of whether it is nested inside a normal element.
 * <p>
 * See the {@link Source#getChildElements()} method for more details.
 *
 * @return the parent of this element in the document element hierarchy, or <code>null</code> if this element is a top-level element.
 * @throws IllegalStateException if a {@linkplain Source#fullSequentialParse() full sequential parse} has not been performed or if it was performed after this element was found.
 * @see #getChildElements()
 */
public Element getParentElement() {
	// Fast path: the parent has already been resolved (possibly to null).
	if (parentElement!=Element.NOT_CACHED) {
		return parentElement;
	}
	if (!source.wasFullSequentialParseCalled()) {
		throw new IllegalStateException("This operation is only possible after a full sequential parse has been performed");
	}
	if (startTag.isOrphaned()) {
		throw new IllegalStateException("This operation is only possible if a full sequential parse was performed immediately after construction of the Source object");
	}
	// Ask the source for its child elements; if that leaves this element's parent
	// unresolved, the element is treated as having no parent.
	source.getChildElements();
	if (parentElement==Element.NOT_CACHED) {
		parentElement=null;
	}
	return parentElement;
}
/**
 * Returns a list of the immediate children of this element in the document element hierarchy.
 * <p>
 * The objects in the list are all of type {@link Element}.
 * <p>
 * The list is built on the first call and cached for subsequent calls.
 * <p>
 * See the {@link Source#getChildElements()} method for more details.
 *
 * @return a list of the immediate children of this element in the document element hierarchy, guaranteed not <code>null</code>.
 * @see #getParentElement()
 */
@Override public final List<Element> getChildElements() {
	// A depth of -1 tells the builder that this element's depth is not known.
	return childElements!=null ? childElements : getChildElements(-1);
}
final List getChildElements(int depth) {
if (depth!=-1) this.depth=depth;
if (childElements==null) {
if (!Config.IncludeServerTagsInElementHierarchy && end==startTag.end) {
childElements=Collections.emptyList();
} else {
final int childDepth=(depth==-1 ? -1 : depth+1);
childElements=new ArrayList();
int pos=Config.IncludeServerTagsInElementHierarchy ? begin+1 : startTag.end;
final int maxChildBegin=(Config.IncludeServerTagsInElementHierarchy || endTag==null) ? end : endTag.begin;
while (true) {
final StartTag childStartTag=source.getNextStartTag(pos);
if (childStartTag==null || childStartTag.begin>=maxChildBegin) break;
if (Config.IncludeServerTagsInElementHierarchy) {
if (childStartTag.beginend) {
if (source.logger.isErrorEnabled()) source.logger.error("Child "+childElement.getDebugInfo()+" extends beyond end of parent "+getDebugInfo());
if (!INCLUDE_INCORRECTLY_NESTED_CHILDREN_IN_HIERARCHY) {
pos=childElement.end;
continue;
}
}
childElement.getChildElements(childDepth);
if (childElement.parentElement==Element.NOT_CACHED) { // make sure element was not added as a child of a descendent element (can happen with overlapping elements)
childElement.parentElement=this;
childElements.add(childElement);
}
pos=childElement.end;
}
}
}
return childElements;
}
/**
 * Returns the nesting depth of this element in the document element hierarchy.
 * <p>
 * The {@link Source#fullSequentialParse()} method must be called (either explicitly or implicitly)
 * after construction of the <code>Source</code> object if this method is to be used.
 * An <code>IllegalStateException</code> is thrown if a full sequential parse has not been performed
 * or if it was performed after this element was found.
 * <p>
 * A top-level element has a nesting depth of <code>0</code>.
 * <p>
 * An element formed from a {@linkplain TagType#isServerTag() server tag} always has a nesting depth of <code>0</code>,
 * regardless of whether it is nested inside a normal element.
 * <p>
 * See the {@link Source#getChildElements()} method for more details.
 *
 * @return the nesting depth of this element in the document element hierarchy.
 * @throws IllegalStateException if a {@linkplain Source#fullSequentialParse() full sequential parse} has not been performed or if it was performed after this element was found.
 * @see #getParentElement()
 */
public int getDepth() {
	if (depth!=-1) {
		return depth;
	}
	// Resolving the parent may record this element's depth as a side effect.
	getParentElement();
	if (depth==-1) {
		depth=0;
	}
	return depth;
}
/**
 * Returns the segment representing the content of the element.
 * <p>
 * This segment spans between the end of the start tag and the start of the end tag.
 * If the end tag is not present, the content reaches to the end of the element.
 * <p>
 * A zero-length segment is returned if the element is {@linkplain #isEmpty() empty}.
 *
 * @return the segment representing the content of the element, guaranteed not <code>null</code>.
 */
public Segment getContent() {
	if (content!=null) {
		return content;
	}
	// Created on first request and cached for later calls.
	content=new Segment(source,startTag.end,getContentEnd());
	return content;
}
/**
 * Returns the start tag of the element.
 *
 * @return the start tag of the element.
 */
public StartTag getStartTag() {
	return this.startTag;
}
/**
 * Returns the end tag of the element.
 * <p>
 * If the element has no end tag this method returns <code>null</code>.
 *
 * @return the end tag of the element, or <code>null</code> if the element has no end tag.
 */
public EndTag getEndTag() {
	return this.endTag;
}
/**
 * Returns the {@linkplain StartTag#getName() name} of the {@linkplain #getStartTag() start tag} of this element, always in lower case.
 * <p>
 * This is equivalent to <code>{@link #getStartTag()}.{@link StartTag#getName() getName()}</code>.
 * <p>
 * See the {@link Tag#getName()} method for more information.
 *
 * @return the name of the {@linkplain #getStartTag() start tag} of this element, always in lower case.
 */
public String getName() {
	final String tagName=startTag.getName();
	return tagName;
}
/**
 * Indicates whether this element has zero-length {@linkplain #getContent() content}.
 * <p>
 * This is equivalent to <code>{@link #getContent()}.{@link Segment#length() length()}==0</code>.
 * <p>
 * Note that this is a broader definition than that of both the
 * HTML definition of an empty element,
 * which is only those elements whose end tag is {@linkplain HTMLElements#getEndTagForbiddenElementNames() forbidden}, and the
 * XML definition of an empty element,
 * which is "either a start-tag immediately followed by an end-tag, or an {@linkplain #isEmptyElementTag() empty-element tag}".
 * The other possibility covered by this property is the case of an HTML element with an
 * {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag that is immediately followed by another tag that implicitly
 * terminates the element.
 *
 * @return <code>true</code> if this element has zero-length {@linkplain #getContent() content}, otherwise <code>false</code>.
 * @see #isEmptyElementTag()
 */
public boolean isEmpty() {
	// The content is empty exactly when it ends where the start tag ends.
	final int contentBegin=startTag.end;
	return getContentEnd()==contentBegin;
}
/**
 * Indicates whether this element is an empty-element tag.
 * <p>
 * This is equivalent to <code>{@link #getStartTag()}.{@link StartTag#isEmptyElementTag() isEmptyElementTag()}</code>.
 *
 * @return <code>true</code> if this element is an empty-element tag, otherwise <code>false</code>.
 */
public boolean isEmptyElementTag() {
	final boolean emptyElementTag=startTag.isEmptyElementTag();
	return emptyElementTag;
}
/**
 * Returns the attributes specified in this element's start tag.
 * <p>
 * This is equivalent to <code>{@link #getStartTag()}.{@link StartTag#getAttributes() getAttributes()}</code>.
 *
 * @return the attributes specified in this element's start tag.
 * @see StartTag#getAttributes()
 */
public Attributes getAttributes() {
	final StartTag tag=getStartTag();
	return tag.getAttributes();
}
/**
 * Returns the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name (case insensitive).
 * <p>
 * Returns <code>null</code> if the {@linkplain #getStartTag() start tag of this element} does not
 * {@linkplain StartTagType#hasAttributes() have attributes},
 * no attribute with the specified name exists or the attribute {@linkplain Attribute#hasValue() has no value}.
 * <p>
 * This is equivalent to <code>{@link #getStartTag()}.{@link StartTag#getAttributeValue(String) getAttributeValue(attributeName)}</code>.
 *
 * @param attributeName  the name of the attribute to get.
 * @return the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name, or <code>null</code> if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}.
 */
public String getAttributeValue(final String attributeName) {
	final StartTag tag=getStartTag();
	return tag.getAttributeValue(attributeName);
}
/**
 * Returns the {@link FormControl} defined by this element.
 *
 * @return the {@link FormControl} defined by this element, or <code>null</code> if it is not a control.
 */
public FormControl getFormControl() {
	final FormControl control=FormControl.construct(this);
	return control;
}
/**
 * Returns a string describing this element, intended for debugging.
 * <p>
 * The text consists of the word "Element", the start tag, a hyphen if the element has content,
 * the end tag if present, the start tag type, and finally the debug information of the
 * underlying segment. The {@link #NOT_CACHED} sentinel is reported simply as "NOT_CACHED".
 *
 * @return a string describing this element for debugging purposes.
 */
public String getDebugInfo() {
	if (this==NOT_CACHED) {
		return "NOT_CACHED";
	}
	final StringBuilder text=new StringBuilder("Element ");
	startTag.appendDebugTag(text);
	if (!isEmpty()) {
		text.append('-');
	}
	if (endTag!=null) {
		text.append(endTag);
	}
	text.append(' ');
	startTag.appendDebugTagType(text);
	text.append(super.getDebugInfo());
	return text.toString();
}
/**
 * Returns the position at which the content of this element ends.
 *
 * @return the beginning of the end tag if the element has one, otherwise the end of the element.
 */
int getContentEnd() {
	if (endTag==null) {
		return end;
	}
	return endTag.begin;
}
}