net.htmlparser.jericho.Segment Maven / Gradle / Ivy
Show all versions of jericho-html Show documentation
// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 3.3 // Copyright (C) 2004-2009 Martin Jericho // http://jericho.htmlparser.net/ // // This library is free software; you can redistribute it and/or // modify it under the terms of either one of the following licences: // // 1. The Eclipse Public License (EPL) version 1.0, // included in this distribution in the file licence-epl-1.0.html // or available at http://www.eclipse.org/legal/epl-v10.html // // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, // included in this distribution in the file licence-lgpl-2.1.txt // or available at http://www.gnu.org/licenses/lgpl.txt // // This library is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the individual licence texts for more details. package net.htmlparser.jericho; import java.util.Iterator; import java.util.List; import java.util.Collections; import java.util.ArrayList; import java.util.regex.Pattern; /** * Represents a segment of a {@link Source} document. *
. ** Many of the tag search methods are defined in this class. *
* The span of a segment is defined by the combination of its begin and end character positions. */ public class Segment implements Comparable
, CharSequence { final int begin; final int end; final Source source; private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method /** * Constructs a new Segment
within the specified {@linkplain Source source} document with the specified begin and end character positions.
 * @param source  the {@link Source} document, must not be <code>null</code>
. * @param begin the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive. * @param end the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive. */ public Segment(final Source source, final int begin, final int end) { if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException(); this.begin=begin; this.end=end; if (source==null) throw new IllegalArgumentException("source argument must not be null"); this.source=source; } // Only called from Source constructor Segment(final int length) { begin=0; this.end=length; source=(Source)this; } // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED) Segment() { this(0,0); } // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT) Segment(final int begin, final int end) { this.begin=begin; this.end=end; source=null; } /** * Returns the {@link Source} document containing this segment. ** If a {@link StreamedSource} is in use, this method throws an
UnsupportedOperationException
.
 *
 * @return the {@link Source} document containing this segment.
 */
public final Source getSource() {
	final Source owner=source;
	if (!owner.isStreamed()) return owner;
	throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
}

/**
 * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
 * <p>
 * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
 *
 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
 */
public final int getBegin() {
	return begin;
}

/**
 * Returns the character position in the {@link Source} document immediately after the end of this segment.
 * <p>
 * The character at the position specified by this property is not included in the segment.
 *
 * @return the character position in the {@link Source} document immediately after the end of this segment.
 * @see #getBegin()
 */
public final int getEnd() {
	return end;
}

/**
 * Compares the specified object with this
<code>Segment</code> for equality.
 * <p>
 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
 * and both segments have the same {@link Source}, and the same begin and end positions.
 * @param object  the object to be compared for equality with this <code>Segment</code>.
 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>
.
 */
public final boolean equals(final Object object) {
	if (this==object) return true;
	// instanceof is false for null, so a null argument is rejected by this test as well.
	if (!(object instanceof Segment)) return false;
	final Segment other=(Segment)object;
	// The source is compared by identity, the positions by value.
	return other.source==source && other.begin==begin && other.end==end;
}

/**
 * Returns a hash code value for the segment.
 * <p>
 * The current implementation returns the sum of the begin and end positions, although this is not
 * guaranteed in future versions.
 *
 * @return a hash code value for the segment.
 */
public int hashCode() {
	return begin+end;
}

/**
 * Returns the length of the segment.
 * This is defined as the number of characters between the begin and end positions.
 * @return the length of the segment.
 */
public int length() {
	return end-begin;
}

/**
 * Indicates whether this
<code>Segment</code> encloses the specified <code>Segment</code>.
 * <p>
 * This is the case if {@link #getBegin()}<code> &lt;= segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code> &gt;= segment.</code>{@link #getEnd()}.
 * <p>
 * Note that a segment encloses itself.
 *
 * @param segment  the segment to be tested for being enclosed by this segment.
 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>
.
 */
public final boolean encloses(final Segment segment) {
	// The other segment must not start before this one...
	if (segment.begin<begin) return false;
	// ...and must not finish after it.
	return segment.end<=end;
}

/**
 * Indicates whether this segment encloses the specified character position in the source document.
 * <p>
 * This is the case if {@link #getBegin()}
<= pos <
{@link #getEnd()}.
 *
 * @param pos  the position in the {@link Source} document.
 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>
. */ public final boolean encloses(final int pos) { return begin<=pos && posString * The returned
<code>String</code> is newly created with every call to this method, unless this
 * segment is itself an instance of {@link Source}.
 *
 * @return the source text of this segment as a <code>String</code>
.
 */
public String toString() {
	// Take this segment's span out of the source text, then convert it to a String.
	final CharSequence text=source.subSequence(begin,end);
	return text.toString();
}

/**
 * Performs a simple rendering of the HTML markup in this segment into text.
 * <p>
 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
 * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
 *
 * @return an instance of {@link Renderer} based on this segment.
 * @see #getTextExtractor()
 */
public Renderer getRenderer() {
	return new Renderer(this);
}

/**
 * Extracts the textual content from the HTML markup of this segment.
 * <p>
 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
 * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
 *
 * @return an instance of {@link TextExtractor} based on this segment.
 * @see #getRenderer()
 */
public TextExtractor getTextExtractor() {
	return new TextExtractor(this);
}

/**
 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within this segment.
 *
* See the {@link Source#iterator()} method for a detailed description. *
*
-
*
- Example: *
-
*
* The following code demonstrates the typical usage of this method to make an exact copy of this segment to
*writer
(assuming no server tags are present): ** for (Iterator<Segment> nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) { * Segment nodeSegment=nodeIterator.next(); * if (nodeSegment instanceof Tag) { * Tag tag=(Tag)nodeSegment; * // HANDLE TAG * // Uncomment the following line to ensure each tag is valid XML: * // writer.write(tag.tidy()); continue; * } else if (nodeSegment instanceof CharacterReference) { * CharacterReference characterReference=(CharacterReference)nodeSegment; * // HANDLE CHARACTER REFERENCE * // Uncomment the following line to decode all character references instead of copying them verbatim: * // characterReference.appendCharTo(writer); continue; * } else { * // HANDLE PLAIN TEXT * } * // unless specific handling has prevented getting to here, simply output the segment as is: * writer.write(nodeSegment.toString()); * }
*
*
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object * if this method is to be used on a large proportion of the source. * It is called automatically if this method is called on the {@link Source} object itself. *
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* Specifying a
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
* if this method is to be used on a large proportion of the source.
* It is called automatically if this method is called on the {@link Source} object itself.
*
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* Specifying a
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* Specifying a
* This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
*
* @param name the {@linkplain StartTag#getName() name} of the start tags to get.
* @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* Specifying a
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* This matches start tags with a
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @param className the class name (case sensitive) to search for, must not be
* The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
*
* An element found at the start of this segment is included in the list.
* Note however that if this segment is an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
* which only returns the children of the element.
*
* Calling
* The objects in the list are all of type {@link Element}.
*
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
* if this method is to be used on a large proportion of the source.
* It is called automatically if this method is called on the {@link Source} object itself.
*
* See the {@link Source#getChildElements()} method for more details.
*
 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
* if this method is to be used on a large proportion of the source.
* It is called automatically if this method is called on the {@link Source} object itself.
*
* The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
*
* If this segment is itself an {@link Element}, the result includes this element in the list.
*
* @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List
* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method,
* except that elements which are not entirely enclosed by this segment are excluded.
*
* Specifying a
* This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
*
* If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
*
* @param name the {@linkplain Element#getName() name} of the elements to get.
* @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List
* The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method,
* except that elements which are not entirely enclosed by this segment are excluded.
*
* If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
*
* @param startTagType the {@linkplain StartTagType type} of start tags to get, must not be
* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method,
* except that elements which are not entirely enclosed by this segment are excluded.
*
* If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method,
* except that elements which are not entirely enclosed by this segment are excluded.
*
* Specifying a
* If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* This matches elements with a
* The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method,
* except that elements which are not entirely enclosed by this segment are excluded.
*
* If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
*
* @param className the class name (case sensitive) to search for, must not be
* According to the HTML 4.01 specification, the following attributes have URI values:
*
* Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
*
* This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
*
* The attributes are returned in order of appearance.
*
* @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have URI values.
* @see #getStyleURISegments()
*/
public List
* If this segment does not contain any tags, the entire segment is assumed to be CSS.
*
* The URI segments are found by searching the CSS for the functional notation "
* The segments are returned in order of appearance.
*
* @return a list of all URI {@linkplain Segment segments} inside {@link HTMLElementName#STYLE STYLE} elements and
* This is functionally equivalent to {@link #getAllStartTags()}
* This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}
* This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}
* Specifying a
* This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}
* This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}
* This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}
* This is functionally equivalent to {@link #getAllElements()}
* If this segment is itself an {@link Element}, this element is returned, not the first child element.
*
* @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or
* This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}
* Specifying a
* If this segment is itself an {@link Element} with the specified name, this element is returned.
*
* @param name the {@linkplain Element#getName() name} of the element to search for.
* @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or
* This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}
* If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}
* If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}
* If this segment is itself an {@link Element} with the specified class, this element is returned.
*
* @param className the class name (case sensitive) to search for, must not be
* This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}
* This is equivalent to
* Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
*
* This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
* {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
* (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
* as well as preventing non-server tags from being recognised inside server tags.
*
* It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags,
* as the attributes parser automatically ignores any server tags.
*
* It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
* as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
*
* This leaves only very few scenarios where calling this method still provides a significant benefit.
*
* One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
* Here is an example using an XML-style JSP tag:
*
* It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
* Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
* tools such as {@link TextExtractor} and {@link Renderer}.
*
* To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
* {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
* Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
* and perform the desired operations on this new source object.
*
* Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an
* Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
* and so will continue to be found by the tag search methods.
* If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
* Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
*
* For best performance, this method should be called on all segments that need to be ignored without calling
* any of the tag search methods in between.
*
* @see Source#ignoreWhenParsing(Collection segments)
*/
public void ignoreWhenParsing() {
	// Hand this segment's span to the owning source, which does the actual work.
	final Source owner=source;
	owner.ignoreWhenParsing(begin,end);
}
/**
* Compares this
* If the argument is not a
* A segment is considered to be before another segment if its begin position is earlier,
* or in the case that both segments begin at the same position, its end position is earlier.
*
* Segments that begin and end at the same position are considered equal for
* the purposes of this comparison, even if they relate to different source documents.
*
* Note: this class has a natural ordering that is inconsistent with equals.
* This means that this method may return zero in some cases where calling the
* {@link #equals(Object)} method with the same argument returns
* The usefulness of this method is debatable as a
* It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application
* and other factors.
*
* Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the
* {@link Element#getDepth()} method on the most nested element.
*
* @return an indication of the maximum depth of nested elements within this segment.
*/
public int getMaxDepthIndicator() {
	int deepest=0;
	int current=0;
	for (final Tag tag : getAllTags()) {
		if (!(tag instanceof StartTag)) {
			// Every tag that is not a start tag is counted as closing one level.
			current--;
			continue;
		}
		final StartTag startTag=(StartTag)tag;
		// A start tag opens a level only if its type has a corresponding end tag type,
		// its element name is not in the end-tag-forbidden set, and it is not an empty-element tag.
		final boolean opensLevel=
			startTag.getStartTagType().getCorrespondingEndTagType()!=null
			&& !HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())
			&& !startTag.isEmptyElementTag();
		if (!opensLevel) continue;
		current++;
		// current never exceeds deepest by more than one, so this tracks the running maximum.
		deepest=Math.max(deepest,current);
	}
	return deepest;
}
/**
* Indicates whether the specified character is white space.
*
* The HTML 4.01 specification section 9.1
* specifies the following white space characters:
*
* Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
* recognise them as white space and renders them as an unprintable character (empty square).
* Even zero-width spaces included using the numeric character reference
* This is logically equivalent to
* However because this implementation works directly on the underlying document source string,
* it should not be assumed that an
* This is logically equivalent to
* However because this implementation works directly on the underlying document source text,
* it should not be assumed that an null
argument to the tagType
parameter is equivalent to {@link #getAllTags()}.
*
* @param tagType the {@linkplain TagType type} of tags to get.
* @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getAllStartTags(StartTagType)
*/
public Listnull
argument to the startTagType
parameter is equivalent to {@link #getAllStartTags()}.
*
* @param startTagType the {@linkplain StartTagType type} of tags to get.
* @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public Listnull
argument to the name
parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
* null
.
* @param value the value of the specified attribute to search for, must not be null
.
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
* @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
*/
public Listnull
argument to the valueRegexPattern
parameter performs the search on the attribute name only,
* without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
* null
.
* @param valueRegexPattern the regular expression pattern that must match the attribute value, may be null
.
* @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
*/
public Listclass
attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
* class names separated by white space in the attribute value.
* null
.
* @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public ListgetChildElements()
on an Element
is much more efficient than calling it on a Segment
.
* null
.
* @see Element#getParentElement()
*/
public Listnull
argument to the name
parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
* null
.
* @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public Listnull
.
* @param value the value of the specified attribute to search for, must not be null
.
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
* @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getAllElements(String attributeName, Pattern valueRegexPattern)
*/
public Listnull
argument to the valueRegexPattern
parameter performs the search on the attribute name only,
* without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
* null
.
* @param valueRegexPattern the regular expression pattern that must match the attribute value, may be null
.
* @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
*/
public Listclass
attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
* class names separated by white space in the attribute value.
* null
.
* @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List
*
* HTML element name Attribute name
* {@link HTMLElementName#A A} href
* {@link HTMLElementName#APPLET APPLET} codebase
* {@link HTMLElementName#APPLET APPLET} archive
* {@link HTMLElementName#AREA AREA} href
* {@link HTMLElementName#BASE BASE} href
* {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} cite
* {@link HTMLElementName#BODY BODY} background
* {@link HTMLElementName#FORM FORM} action
* {@link HTMLElementName#FRAME FRAME} longdesc
* {@link HTMLElementName#FRAME FRAME} src
* {@link HTMLElementName#DEL DEL} cite
* {@link HTMLElementName#HEAD HEAD} profile
* {@link HTMLElementName#IFRAME IFRAME} longdesc
* {@link HTMLElementName#IFRAME IFRAME} src
* {@link HTMLElementName#IMG IMG} longdesc
* {@link HTMLElementName#IMG IMG} src
* {@link HTMLElementName#IMG IMG} usemap
* {@link HTMLElementName#INPUT INPUT} src
* {@link HTMLElementName#INPUT INPUT} usemap
* {@link HTMLElementName#INS INS} cite
* {@link HTMLElementName#LINK LINK} href
* {@link HTMLElementName#OBJECT OBJECT} archive
* {@link HTMLElementName#OBJECT OBJECT} classid
* {@link HTMLElementName#OBJECT OBJECT} codebase
* {@link HTMLElementName#OBJECT OBJECT} data
* {@link HTMLElementName#OBJECT OBJECT} usemap
* {@link HTMLElementName#Q Q} cite
* {@link HTMLElementName#SCRIPT SCRIPT} src
* style
attribute values
* {@linkplain #encloses(Segment) enclosed} by this segment.
* url()
" as described in
* section 4.3.4 of the CSS2 specification.
* style
attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getURIAttributes()
*/
public List.iterator().next()
,
* but does not search beyond the first start tag and returns null
if no such start tag exists.
*
* @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or null
if none exists.
*/
public final StartTag getFirstStartTag() {
	// Ask the source for the next start tag from this segment's begin position,
	// then pass it through checkEnclosure (defined elsewhere in this class).
	final StartTag firstMatch=source.getNextStartTag(begin);
	return checkEnclosure(firstMatch);
}
/**
 * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>,
 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
 *
 * @param startTagType  the <code>StartTagType</code> to search for.
 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final StartTag getFirstStartTag(StartTagType startTagType) {
	// Search the source from this segment's begin position, then pass the match through checkEnclosure.
	final StartTag firstMatch=source.getNextStartTag(begin,startTagType);
	return checkEnclosure(firstMatch);
}
/**
 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>,
 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
 * <p>
 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
 *
 * @param name  the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final StartTag getFirstStartTag(String name) {
	// Search the source from this segment's begin position, then pass the match through checkEnclosure.
	final StartTag firstMatch=source.getNextStartTag(begin,name);
	return checkEnclosure(firstMatch);
}
/**
 * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
 *
 * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param value  the value of the specified attribute to search for, must not be <code>null</code>.
 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
 */
public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) {
	// Search the source from this segment's begin position, then pass the match through checkEnclosure.
	final StartTag firstMatch=source.getNextStartTag(begin,attributeName,value,valueCaseSensitive);
	return checkEnclosure(firstMatch);
}
/**
 * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such start tags,
 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
 *
 * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param valueRegexPattern  the regular expression pattern that must match the attribute value, may be <code>null</code>.
 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
 */
public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) {
	// The search starts at the first character position of this segment.
	final StartTag nextTag=source.getNextStartTag(begin,attributeName,valueRegexPattern);
	return checkEnclosure(nextTag);
}
/**
 * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such start tags,
 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
 *
 * @param className  the class name (case sensitive) to search for, must not be <code>null</code>.
 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final StartTag getFirstStartTagByClass(final String className) {
	// The search starts at the first character position of this segment.
	final StartTag nextTag=source.getNextStartTagByClass(begin,className);
	return checkEnclosure(nextTag);
}
/**
 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such elements,
 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
 *
 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final Element getFirstElement() {
	// Walk the start tags from the beginning of this segment; the loop stops as soon as checkEnclosure yields null.
	for (StartTag candidate=checkEnclosure(StartTag.getNext(source,begin)); candidate!=null; candidate=checkEnclosure(candidate.getNextStartTag())) {
		final Element candidateElement=candidate.getElement();
		// Only accept an element that ends at or before the end of this segment; otherwise try the next start tag.
		if (candidateElement.end<=end) return candidateElement;
	}
	return null;
}
/**
 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such elements,
 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
 * <p>
 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
 * <p>
 * The name is converted to lower case using a fixed locale, so the result does not depend on the JVM's default locale.
 *
 * @param name  the name of the element to search for, may be <code>null</code>.
 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified name {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final Element getFirstElement(String name) {
	if (name==null) return getFirstElement();
	// The XML-name check is made on the name as supplied, before it is lower-cased.
	final boolean isXMLTagName=Tag.isXMLName(name);
	// Use a fixed locale: with the default locale a name such as "TITLE" becomes "t\u0131tle" under a Turkish locale and can never match.
	name=name.toLowerCase(java.util.Locale.ENGLISH);
	StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
	while (startTag!=null) {
		final Element element=startTag.getElement();
		// Only accept an element that ends at or before the end of this segment.
		if (element.end<=end) return element;
		// Resume the search one character past the start tag that was just rejected.
		startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
	}
	return null;
}
/**
 * Returns the first {@link Element} with the specified attribute name/value pair that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such elements,
 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
 *
 * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param value  the value of the specified attribute to search for, must not be <code>null</code>.
 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
 */
public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
	// Each iteration resumes the search one character past the previously rejected start tag.
	for (StartTag candidate=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive)); candidate!=null; candidate=checkEnclosure(source.getNextStartTag(candidate.begin+1,attributeName,value,valueCaseSensitive))) {
		final Element candidateElement=candidate.getElement();
		// Only accept an element that ends at or before the end of this segment.
		if (candidateElement.end<=end) return candidateElement;
	}
	return null;
}
/**
 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such elements,
 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
 *
 * @param attributeName  the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param valueRegexPattern  the regular expression pattern that must match the attribute value, may be <code>null</code>.
 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
 */
public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
	// Each iteration resumes the search one character past the previously rejected start tag.
	for (StartTag candidate=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern)); candidate!=null; candidate=checkEnclosure(source.getNextStartTag(candidate.begin+1,attributeName,valueRegexPattern))) {
		final Element candidateElement=candidate.getElement();
		// Only accept an element that ends at or before the end of this segment.
		if (candidateElement.end<=end) return candidateElement;
	}
	return null;
}
/**
 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to calling <code>.iterator().next()</code> on the list of all such elements,
 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
 *
 * @param className  the class name (case sensitive) to search for, must not be <code>null</code>.
 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final Element getFirstElementByClass(final String className) {
	// Each iteration resumes the search one character past the previously rejected start tag.
	for (StartTag candidate=checkEnclosure(source.getNextStartTagByClass(begin,className)); candidate!=null; candidate=checkEnclosure(source.getNextStartTagByClass(candidate.begin+1,className))) {
		final Element candidateElement=candidate.getElement();
		// Only accept an element that ends at or before the end of this segment.
		if (candidateElement.end<=end) return candidateElement;
	}
	return null;
}
/**
* Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List(
{@link #getFormControls()})
.
*
* @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @see #getFormControls()
*/
public FormFields getFormFields() {
// A new FormFields object is constructed on every call from the result of getFormControls().
return new FormFields(getFormControls());
}
/**
 * Parses any {@link Attributes} within this segment.
 * This method is only used in the unusual situation where attributes exist outside of a start tag.
 * The {@link StartTag#getAttributes()} method should be used in normal situations.
 * <p>
 * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
 *
 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
 */
public Attributes parseAttributes() {
	// Delegate to the source document, passing the span covered by this segment.
	final Attributes parsedAttributes=source.parseAttributes(begin,end);
	return parsedAttributes;
}
/**
* Causes the this segment to be ignored when parsing.
*
* The first double-quote of <a href="<i18n:resource path="/Portal"/>?BACK=TRUE">back</a>
"/Portal"
will be interpreted as the end quote for the href
attribute,
* as there is no way for the parser to recognise the il8n:resource
element as a server tag.
* Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
* but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
* find these server tags first and call the ignoreWhenParsing
method to ignore them before parsing the rest of the document.
* IllegalStateException
.
* Segment
object to another object.
* Segment
, a ClassCastException
is thrown.
* false
.
*
* @param segment the segment to be compared
* @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
* @throws ClassCastException if the argument is not a Segment
*/
public int compareTo(final Segment segment) {
if (this==segment) return 0;
if (begintrue
if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise false
.
*/
public final boolean isWhiteSpace() {
for (int i=begin; iStackOverflowException
* if its content is parsed.
* StackOverflowException
is a recoverable error that can be easily caught.
* The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling
* this method to check every segment or document will very often exceed any benefit.
*
*
* ​
are rendered this way.
*
* @param ch the character to test.
* @return true
if the specified character is white space, otherwise false
.
*/
public static final boolean isWhiteSpace(final char ch) {
	// Linear scan of the WHITESPACE table; a character is white space only if it appears there.
	for (int i=0; i<WHITESPACE.length; i++) {
		if (WHITESPACE[i]==ch) return true;
	}
	return false;
}
/**
 * Returns a {@link RowColumnVector} object representing the row and column number of the {@linkplain #getBegin() start} of this segment in the source document.
 *
 * @return a {@link RowColumnVector} object representing the row and column number of the {@linkplain #getBegin() start} of this segment in the source document.
 * @see Source#getRowColumnVector(int pos)
 */
public RowColumnVector getRowColumnVector() {
	// The position is looked up in the source document using this segment's begin position.
	final RowColumnVector startPosition=source.getRowColumnVector(begin);
	return startPosition;
}
/**
 * Returns a string representation of this object useful for debugging purposes.
 * <p>
 * The result has the form <code>(</code><i>begin</i><code>-</code><i>end</i><code>)</code>, where each position is
 * written by the {@link RowColumnVector} obtained from the source for that position.
 *
 * @return a string representation of this object useful for debugging purposes.
 */
public String getDebugInfo() {
	final StringBuilder debugInfo=new StringBuilder(50).append('(');
	source.getRowColumnVector(begin).appendTo(debugInfo);
	debugInfo.append('-');
	source.getRowColumnVector(end).appendTo(debugInfo);
	return debugInfo.append(')').toString();
}
/**
 * Returns the character at the specified index.
 * <p>
 * This is logically equivalent to <code>toString().charAt(index)</code>
 * for valid argument values <code>0 &lt;= index &lt; length()</code>.
 * <p>
 * This implementation offsets the index by the begin position of this segment and reads directly from the source document.
 * The index is not checked against the bounds of this segment here, so it should not be assumed that
 * an <code>IndexOutOfBoundsException</code> is thrown for an invalid argument value.
 *
 * @param index  the index of the character.
 * @return the character at the specified index.
 */
public char charAt(final int index) {
	// Translate the segment-relative index into a position in the source document.
	final int sourcePos=begin+index;
	return source.charAt(sourcePos);
}
/**
 * Returns a new character sequence that is a subsequence of this sequence.
 * <p>
 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
 * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
 * <p>
 * This implementation offsets both indices by the begin position of this segment and delegates to the source document.
 * The indices are not checked against the bounds of this segment here, so it should not be assumed that
 * an <code>IndexOutOfBoundsException</code> is thrown for invalid argument values as described in the
 * <code>String.subSequence(int,int)</code> method.
 *
 * @param beginIndex  the begin index, inclusive.
 * @param endIndex  the end index, exclusive.
 * @return a new character sequence that is a subsequence of this sequence.
 */
public CharSequence subSequence(final int beginIndex, final int endIndex) {
	// Translate both segment-relative indices into positions in the source document.
	final int sourceBegin=begin+beginIndex;
	final int sourceEnd=begin+endIndex;
	return source.subSequence(sourceBegin,sourceEnd);
}
/**
* Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
* All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
*/
static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) {
final int textLength=text.length();
int i=0;
boolean lastWasWhiteSpace=false;
while (true) {
if (i>=textLength) return sb;
if (!isWhiteSpace(text.charAt(i))) break;
i++;
}
do {
final char ch=text.charAt(i++);
if (isWhiteSpace(ch)) {
lastWasWhiteSpace=true;
} else {
if (lastWasWhiteSpace) {
sb.append(' ');
lastWasWhiteSpace=false;
}
sb.append(ch);
}
} while (i