![JAR search and dependency download from the Maven repository](/logo.png)
au.id.jericho.lib.html.Segment Maven / Gradle / Ivy
// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 2.3 // Copyright (C) 2006 Martin Jericho // http://sourceforge.net/projects/jerichohtml/ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // http://www.gnu.org/copyleft/lesser.html // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package au.id.jericho.lib.html; import java.util.*; /** * Represents a segment of a {@link Source} document. *
. ** The span of a segment is defined by the combination of its begin and end character positions. */ public class Segment implements Comparable, CharSequence { final int begin; final int end; final Source source; List childElements=null; private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method /** * Constructs a new
Segment
within the specified {@linkplain Source source} document with the specified begin and end character positions. * @param source the {@link Source} document, must not benull
. * @param begin the character position in the source where this segment begins. * @param end the character position in the source where this segment ends. */ public Segment(final Source source, final int begin, final int end) { if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException(); this.begin=begin; this.end=end; if (source==null) throw new IllegalArgumentException("source argument must not be null"); this.source=source; } // Only called from Source constructor Segment(final int length) { begin=0; this.end=length; source=(Source)this; } // Only used for creating dummy flag instances of this type (see Element.NOT_CACHED) Segment() { begin=0; end=0; source=null; } /** * Returns the character position in the {@link Source} document at which this segment begins. * @return the character position in the {@link Source} document at which this segment begins. */ public final int getBegin() { return begin; } /** * Returns the character position in the {@link Source} document immediately after the end of this segment. ** The character at the position specified by this property is not included in the segment. * * @return the character position in the {@link Source} document immediately after the end of this segment. */ public final int getEnd() { return end; } /** * Compares the specified object with this
Segment
for equality. ** Returns
true
if and only if the specified object is also aSegment
, * and both segments have the same {@link Source}, and the same begin and end positions. * @param object the object to be compared for equality with thisSegment
. * @returntrue
if the specified object is equal to thisSegment
, otherwisefalse
. */ public final boolean equals(final Object object) { if (object==null || !(object instanceof Segment)) return false; final Segment segment=(Segment)object; return segment.begin==begin && segment.end==end && segment.source==source; } /** * Returns a hash code value for the segment. ** The current implementation returns the sum of the begin and end positions, although this is not * guaranteed in future versions. * * @return a hash code value for the segment. */ public int hashCode() { return begin+end; } /** * Returns the length of the segment. * This is defined as the number of characters between the begin and end positions. * @return the length of the segment. */ public final int length() { return end-begin; } /** * Indicates whether this
Segment
encloses the specifiedSegment
. ** This is the case if {@link #getBegin()}
<=segment.
{@link #getBegin()}&&
{@link #getEnd()}>=segment.
{@link #getEnd()}. * * @param segment the segment to be tested for being enclosed by this segment. * @returntrue
if thisSegment
encloses the specifiedSegment
, otherwisefalse
. */ public final boolean encloses(final Segment segment) { return begin<=segment.begin && end>=segment.end; } /** * Indicates whether this segment encloses the specified character position in the source document. ** This is the case if {@link #getBegin()}
<= pos <
{@link #getEnd()}. * * @param pos the position in the {@link Source} document. * @returntrue
if this segment encloses the specified character position in the source document, otherwisefalse
. */ public final boolean encloses(final int pos) { return begin<=pos && posString * The returned
String
is newly created with every call to this method, unless this * segment is itself an instance of {@link Source}. ** Note that before version 2.0 this returned a representation of this object useful for debugging purposes, * which can now be obtained via the {@link #getDebugInfo()} method. * * @return the source text of this segment as a
String
. */ public String toString() { return source.string.substring(begin,end).toString(); } /** * Extracts the text content of this segment. ** This method removes all of the tags from the segment and * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}. *
* See the documentation of the {@link #extractText(boolean includeAttributes)} method for more details. *
* This is equivalent to calling {@link #extractText(boolean) extractText(false)}. * * @return the text content of this segment. */ public String extractText() { return extractText(false); } /** * Extracts the text content of this segment. *
* This method removes all of the tags from the segment and * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}. * Tags are also converted to whitespace unless they belong to an * {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element. * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to whitespace despite being an inline-level element. *
* Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment * is ignored. *
* Specifying a value of
true
as an argument to theincludeAttributes
parameter causes the values of * title, * alt, * label, and * summary * attributes of {@linkplain StartTagType#NORMAL normal} tags to be included in the extracted text. **
-
*
- Example: *
- source segment "
<div><b>O</b>ne</div><div><b>T</b><script>//a script </script>wo</div>
" * produces the text "One Two
". *
* Note that in version 2.1, no tags were converted to whitespace and text inside {@link HTMLElementName#SCRIPT SCRIPT} and
* {@link HTMLElementName#STYLE STYLE} elements was included. The example above produced the text "
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* Specifying a
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List findAllStartTags() {
	// Delegates to the name-based search; a null name is the documented way of asking
	// for every enclosed start tag regardless of its name.
	final String anyName=null;
	return findAllStartTags(anyName);
}
/**
 * Returns a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * See the {@link Tag} class documentation for more details about the behaviour of this method.
 * <p>
 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllStartTags()}.
 * <p>
 * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
 *
 * @param name  the {@linkplain StartTag#getName() name} of the start tags to find; it is converted to lower case before searching.
 * @return a list of all {@link StartTag} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List findAllStartTags(String name) {
	final String searchName=(name==null) ? null : name.toLowerCase();
	final boolean xmlTagName=Tag.isXMLName(searchName);
	// The first search starts at the beginning of this segment.
	StartTag current=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,begin,searchName,xmlTagName,false));
	if (current==null) return Collections.EMPTY_LIST;
	final ArrayList matches=new ArrayList();
	while (current!=null) {
		matches.add(current);
		// Resume one character past the start of the tag just recorded so the same tag is not found again;
		// the scan ends as soon as checkEnclosure yields null.
		current=(StartTag)checkEnclosure(StartTag.findPreviousOrNext(source,current.begin+1,searchName,xmlTagName,false));
	}
	return matches;
}
/**
* Returns a list of all {@link StartTag} objects with the specified attribute name/value pair
* that are {@linkplain #encloses(Segment) enclosed} by this segment.
*
* See the {@link Tag} class documentation for more details about the behaviour of this method.
*
* @param attributeName the attribute name (case insensitive) to search for, must not be
* The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
*
* The objects in the list are all of type {@link Element}.
*
* See the {@link Source#getChildElements()} method for more details.
*
* @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not
* The elements returned correspond exactly with the start tags returned in the {@link #findAllStartTags()} method.
*
* @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List findAllElements() {
	// The declared type of the variable selects the String overload; a bare null literal
	// would not distinguish findAllElements(String) from findAllElements(StartTagType).
	final String anyName=null;
	return findAllElements(anyName);
}
/**
 * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * The elements returned are those belonging to the start tags returned in the {@link #findAllStartTags(String name)} method,
 * excluding any element that ends after the end of this segment.
 * <p>
 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #findAllElements()}.
 * <p>
 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
 *
 * @param name  the {@linkplain Element#getName() name} of the elements to find.
 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List findAllElements(String name) {
	if (name!=null) name=name.toLowerCase();
	final List startTags=findAllStartTags(name);
	if (startTags.isEmpty()) return Collections.EMPTY_LIST;
	final ArrayList elements=new ArrayList(startTags.size());
	for (final Iterator i=startTags.iterator(); i.hasNext();) {
		final StartTag startTag=(StartTag)i.next();
		final Element element=startTag.getElement();
		// An element that runs past the end of this segment is skipped, but the scan must go on:
		// the start tags are visited in order of position, so a later start tag (for example one nested
		// inside the overrunning element) can still belong to an element that ends within this segment.
		// Stopping here would silently drop those enclosed elements.
		if (element.end>end) continue;
		elements.add(element);
	}
	return elements;
}
/**
* Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
*
* The elements returned correspond exactly with the start tags returned in the {@link #findAllTags(TagType)} method.
*
* @param startTagType the {@linkplain StartTagType type} of start tags to find, must not be
* This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}
* This is equivalent to
* This method is usually used to exclude {@linkplain TagType#isServerTag() server tags} or other non-HTML segments from the source text
* so that they do not interfere with the parsing of the surrounding HTML.
*
* This is necessary because many server tags are used as attribute values and in other places within
* HTML tags, and very often contain characters that prevent the parser from recognising the surrounding tag.
*
* Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
* and so will continue to be found by the tag search methods.
* If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
* Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
*
* For efficiency reasons, this method should be called on all segments that need to be ignored without calling
* any of the tag search methods in between.
*
* @see Source#ignoreWhenParsing(Collection segments)
*/
public void ignoreWhenParsing() {
	// Hands the span of this segment to the owning source document,
	// which is asked to ignore that range when parsing.
	source.ignoreWhenParsing(getBegin(),getEnd());
}
/**
* Compares this
* If the argument is not a
* A segment is considered to be before another segment if its begin position is earlier,
* or in the case that both segments begin at the same position, its end position is earlier.
*
* Segments that begin and end at the same position are considered equal for
* the purposes of this comparison, even if they relate to different source documents.
*
* Note: this class has a natural ordering that is inconsistent with equals.
* This means that this method may return zero in some cases where calling the
* {@link #equals(Object)} method with the same argument returns
* The HTML 4.01 specification section 9.1
* specifies the following white space characters:
*
* Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
* recognise them as whitespace and renders them as an unprintable character (empty square).
* Even zero-width spaces included using the numeric character reference
* However because this implementation works directly on the underlying document source string,
* it should not be assumed that an
* This is logically equivalent to
* However because this implementation works directly on the underlying document source string,
* it should not be assumed that an
* This method has been deprecated as of version 2.0 as it is not a robust method of checking whether an HTML comment spans this segment.
*
* @return
* This method has been deprecated as of version 2.0 in favour of the more generic {@link #findAllTags(TagType)} method.
*
* @return a list of all {@link StartTag} objects representing HTML {@linkplain StartTagType#COMMENT comments} that are {@linkplain #encloses(Segment) enclosed} by this segment.
* @deprecated Use {@link #findAllTags(TagType) findAllTags}
* This method has been deprecated as of version 2.0 as it now duplicates the functionality of the {@link #toString()} method.
*
* @return the source text of this segment.
* @deprecated Use {@link #toString() toString()} instead.
*/
public String getSourceText() {
	// Deprecated alias: the source text is exactly what toString() produces.
	final String sourceText=toString();
	return sourceText;
}
/**
 * Returns the source text of this segment without {@linkplain #isWhiteSpace(char) white space}.
 * <p>
 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
 * <p>
 * This method has been deprecated as of version 2.0 as it is no longer used internally and
 * has no practical use as a public method.
 * It is similar to the new {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method, but
 * does not {@linkplain CharacterReference#decode(CharSequence) decode} the text after collapsing the white space.
 *
 * @return the source text of this segment without white space.
 * @deprecated  Use the more useful {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method instead.
 */
public final String getSourceTextNoWhitespace() {
	// The buffer is sized to the segment length, an upper bound since collapsing only removes characters.
	final StringBuffer collapsed=new StringBuffer(length());
	return appendCollapseWhiteSpace(collapsed,this).toString();
}
/**
* Returns a list of
* This method has been deprecated as of version 2.0 as it has no discernable use.
*
* @return a list of OneT//a script wo
".
*
* @param includeAttributes indicates whether the values of title, alt, label, and summary attributes are included.
* @return the text content of this segment.
*/
public String extractText(final boolean includeAttributes) {
final StringBuffer sb=new StringBuffer(length());
int textBegin=begin;
// use findAllTags().iterator() instead of source.findNextTag(textBegin) to take advantage of allTags cache in Source object
for (final Iterator i=findAllTags().iterator(); i.hasNext();) {
final Tag tag=(Tag)i.next();
final int textEnd=tag.begin;
if (textEndnull
argument to the tagType
parameter is equivalent to {@link #findAllTags()}.
*
* @param tagType the {@linkplain TagType type} of tags to find.
* @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List findAllTags(final TagType tagType) {
	// A null tagType places no restriction on the type, which makes this equivalent to findAllTags().
	final Tag first=checkEnclosure(Tag.findPreviousOrNextTag(source,begin,tagType,false));
	if (first==null) return Collections.EMPTY_LIST;
	final ArrayList enclosedTags=new ArrayList();
	// Each follow-up search resumes one character past the start of the tag just recorded;
	// the loop ends as soon as checkEnclosure yields null.
	for (Tag current=first; current!=null; current=checkEnclosure(Tag.findPreviousOrNextTag(source,current.begin+1,tagType,false))) {
		enclosedTags.add(current);
	}
	return enclosedTags;
}
/**
* Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
* null
argument to the name
parameter is equivalent to {@link #findAllStartTags()}.
* null
.
* @param value the value of the specified attribute to search for, must not be null
.
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
* @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
*/
public List findAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) {
	// attributeName is matched case insensitively; neither attributeName nor value may be null.
	final StartTag first=(StartTag)checkEnclosure(source.findNextStartTag(begin,attributeName,value,valueCaseSensitive));
	if (first==null) return Collections.EMPTY_LIST;
	final ArrayList matches=new ArrayList();
	// Each follow-up search resumes one character past the start of the tag just recorded;
	// the loop ends as soon as checkEnclosure yields null.
	for (StartTag current=first; current!=null; current=(StartTag)checkEnclosure(source.findNextStartTag(current.begin+1,attributeName,value,valueCaseSensitive))) {
		matches.add(current);
	}
	return matches;
}
/**
 * Returns a list of the immediate children of this segment in the document element hierarchy.
 * <p>
 * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
 * <p>
 * The objects in the list are all of type {@link Element}.
 * <p>
 * The list is built on the first call and the same list is returned by subsequent calls.
 * <p>
 * See the {@link Source#getChildElements()} method for more details.
 *
 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
 * @see Element#getParentElement()
 */
public List getChildElements() {
	if (childElements!=null) return childElements; // already built by an earlier call
	if (length()==0) {
		// An empty segment cannot contain any elements.
		childElements=Collections.EMPTY_LIST;
		return childElements;
	}
	childElements=new ArrayList();
	int pos=begin;
	// Walk the start tags that begin inside this segment, jumping over each child element as it is recorded.
	for (StartTag next=source.findNextStartTag(pos); next!=null && next.begin<end; next=source.findNextStartTag(pos)) {
		if (!Config.IncludeServerTagsInElementHierarchy && next.getTagType().isServerTag()) {
			// Server tags are left out of the hierarchy; resume straight after the tag itself.
			pos=next.end;
		} else {
			final Element child=next.getElement();
			childElements.add(child);
			// Result deliberately discarded: presumably called so the child builds its own
			// part of the hierarchy eagerly -- confirm against Element.
			child.getChildElements();
			// Resuming after the child's end keeps its descendants out of this list.
			pos=child.end;
		}
	}
	return childElements;
}
/**
 * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * The elements returned are those belonging to the start tags returned in the {@link #findAllTags(TagType)} method,
 * excluding any element that ends after the end of this segment.
 *
 * @param startTagType  the {@linkplain StartTagType type} of start tags to find, must not be <code>null</code>.
 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List findAllElements(final StartTagType startTagType) {
	final List startTags=findAllTags(startTagType);
	if (startTags.isEmpty()) return Collections.EMPTY_LIST;
	final ArrayList elements=new ArrayList(startTags.size());
	for (final Iterator i=startTags.iterator(); i.hasNext();) {
		final StartTag startTag=(StartTag)i.next();
		final Element element=startTag.getElement();
		// An element that runs past the end of this segment is skipped, but the scan must go on:
		// the tags are visited in order of position, so a later start tag can still belong to an
		// element that ends within this segment. Stopping here would silently drop those elements.
		if (element.end>end) continue;
		elements.add(element);
	}
	return elements;
}
/**
 * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 *
 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List findAllCharacterReferences() {
	final CharacterReference first=findNextCharacterReference(begin);
	if (first==null) return Collections.EMPTY_LIST;
	final ArrayList references=new ArrayList();
	// Each follow-up search resumes at the end of the reference just recorded.
	for (CharacterReference current=first; current!=null; current=findNextCharacterReference(current.end)) {
		references.add(current);
	}
	return references;
}
/**
 * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 *
 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
 */
public List findFormControls() {
	// The search itself is implemented by FormControl; this segment is the search scope.
	final List formControls=FormControl.findAll(this);
	return formControls;
}
/**
 * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #findFormControls()}<code>)</code>.
 *
 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * @see #findFormControls()
 */
public FormFields findFormFields() {
	final List formControls=findFormControls();
	return new FormFields(formControls);
}
/**
 * Parses any {@link Attributes} within this segment.
 * This method is only used in the unusual situation where attributes exist outside of a start tag.
 * The {@link StartTag#getAttributes()} method should be used in normal situations.
 * <p>
 * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
 *
 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
 */
public Attributes parseAttributes() {
	return source.parseAttributes(getBegin(),getEnd());
}
/**
* Causes this segment to be ignored when parsing.
* Segment
object to another object.
* Segment
, a ClassCastException
is thrown.
* false
.
*
* @param o the segment to be compared
* @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
* @throws ClassCastException if the argument is not a Segment
*/
public int compareTo(final Object o) {
if (this==o) return 0;
final Segment segment=(Segment)o;
if (begintrue
if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise false
.
*/
public final boolean isWhiteSpace() {
for (int i=begin; i
*
* ​
are rendered this way.
*
* @param ch the character to test.
* @return true
if the specified character is white space, otherwise false
.
*/
public static final boolean isWhiteSpace(final char ch) {
for (int i=0; itoString().charAt(index)
* for valid argument values 0 <= index < length()
.
* IndexOutOfBoundsException
is thrown
* for an invalid argument value.
*
* @param index the index of the character.
* @return the character at the specified index.
*/
public final char charAt(final int index) {
	// The index is relative to this segment, so it is shifted by the segment's begin position
	// before reading from the underlying source text. No check against the segment's own
	// bounds is made here.
	final int sourceIndex=begin+index;
	return source.string.charAt(sourceIndex);
}
/**
 * Returns a new character sequence that is a subsequence of this sequence.
 * <p>
 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
 * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
 * <p>
 * However because this implementation works directly on the underlying document source string,
 * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
 * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
 *
 * @param beginIndex  the begin index, inclusive.
 * @param endIndex  the end index, exclusive.
 * @return a new character sequence that is a subsequence of this sequence.
 */
public final CharSequence subSequence(final int beginIndex, final int endIndex) {
	// Both indices are relative to this segment, so each is shifted by the segment's begin position.
	final int sourceBegin=begin+beginIndex;
	final int sourceEnd=begin+endIndex;
	return source.string.subSequence(sourceBegin,sourceEnd);
}
/**
 * Indicates whether this segment is a {@link Tag} of type {@link StartTagType#COMMENT}.
 * <p>
 * This method has been deprecated as of version 2.0 as it is not a robust method of checking whether an HTML comment spans this segment.
 * <p>
 * This base implementation always returns <code>false</code>.
 *
 * @return <code>true</code> if this segment is a {@link Tag} of type {@link StartTagType#COMMENT}, otherwise <code>false</code>.
 * @deprecated  Use <code>this instanceof </code>{@link Tag}<code> &amp;&amp; ((Tag)this).</code>{@link Tag#getTagType() getTagType()}<code>==</code>{@link StartTagType#COMMENT} instead.
 */
public boolean isComment() {
return false; // overridden in StartTag
}
/**
 * Returns a list of all {@link StartTag} objects representing HTML {@linkplain StartTagType#COMMENT comments} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This method has been deprecated as of version 2.0 in favour of the more generic {@link #findAllTags(TagType)} method.
 *
 * @return a list of all {@link StartTag} objects representing HTML {@linkplain StartTagType#COMMENT comments} that are {@linkplain #encloses(Segment) enclosed} by this segment.
 * @deprecated  Use {@link #findAllTags(TagType) findAllTags}<code>(</code>{@link StartTagType#COMMENT}<code>)</code> instead.
 */
public List findAllComments() {
	final List comments=findAllTags(StartTagType.COMMENT);
	return comments;
}
/**
* Returns the source text of this segment.
* Segment
objects representing every word in this segment separated by {@linkplain #isWhiteSpace(char) white space}.
* Note that any markup contained in this segment is regarded as normal text for the purposes of this method.
* Segment
objects representing every word in this segment separated by white space.
* @deprecated no replacement
*/
public final List findWords() {
final ArrayList words=new ArrayList();
int wordBegin=-1;
for (int i=begin; i