src.java.net.htmlparser.jericho.Segment Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of com.liferay.wiki.engine.html
Liferay Wiki Engine HTML
There is a newer version: 6.0.13
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.1
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.Iterator;
import java.util.List;
import java.util.Collections;
import java.util.ArrayList;
import java.util.regex.Pattern;

/**
 * Represents a segment of a {@link Source} document.
 * 
 * Many of the tag search methods are defined in this class.
 * 

 * The span of a segment is defined by the combination of its begin and end character positions.
 */
public class Segment implements Comparable, CharSequence {
	// Character position in the source document at which this segment begins (inclusive).
	final int begin;
	// Character position in the source document immediately after the end of this segment (exclusive).
	final int end;
	// The document this segment belongs to. The Segment(int) constructor sets it to this object itself,
	// and the dummy-flag constructors leave it null.
	final Source source;
	
	private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method

	/**
	 * Constructs a new Segment within the specified {@linkplain Source source} document with the specified begin and end character positions.
	 * @param source  the {@link Source} document, must not be null.
	 * @param begin  the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
	 * @param end  the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
	 */
	public Segment(final Source source, final int begin, final int end) {
		// Validate every argument before any field is assigned.
		// A begin or end of -1, or a begin that lies after end, is rejected; the message reports the offending values.
		if (begin==-1 || end==-1 || begin>end) {
			throw new IllegalArgumentException("invalid segment span: begin="+begin+", end="+end);
		}
		if (source==null) throw new IllegalArgumentException("source argument must not be null");
		this.source=source;
		this.begin=begin;
		this.end=end;
	}

	// Only called from Source constructor
	Segment(final int length) {
		// The segment is its own source document and spans it from position 0 up to the given length.
		this.source=(Source)this;
		this.begin=0;
		this.end=length;
	}

	// Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
	Segment() {
		// Delegates to the two-argument flag constructor: an empty span at position 0 with no source document.
		this(0,0);
	}

	// Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
	Segment(final int begin, final int end) {
		// No validation is performed here and no source document is attached.
		this.source=null;
		this.begin=begin;
		this.end=end;
	}

	/**
	 * Returns the {@link Source} document containing this segment.
	 * 

 	 * If a {@link StreamedSource} is in use, this method throws an UnsupportedOperationException.
	 *
	 * @return the {@link Source} document containing this segment.
	 */
	public final Source getSource() {
		// The Source object is only handed out when the source is not streamed.
		final boolean streamed=source.isStreamed();
		if (!streamed) return source;
		throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
	}

	/**
	 * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
	 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
	 */
	public final int getBegin() {
		// Plain accessor for the inclusive begin position.
		return this.begin;
	}

	/**
	 * Returns the character position in the {@link Source} document immediately after the end of this segment.
	 * 

	 * The character at the position specified by this property is not included in the segment.
	 *
	 * @return the character position in the {@link Source} document immediately after the end of this segment.
	 */
	public final int getEnd() {
		// Plain accessor for the exclusive end position.
		return this.end;
	}

	/**
	 * Compares the specified object with this Segment for equality.
	 * 

	 * Returns true if and only if the specified object is also a Segment,
	 * and both segments have the same {@link Source}, and the same begin and end positions.
	 * @param object  the object to be compared for equality with this Segment.
	 * @return true if the specified object is equal to this Segment, otherwise false.
	 */
	public final boolean equals(final Object object) {
		// Identity short-cut.
		if (object==this) return true;
		// instanceof evaluates to false for null, so this also rejects a null argument.
		if (!(object instanceof Segment)) return false;
		final Segment other=(Segment)object;
		// Sources are compared by identity, positions by value.
		if (other.source!=source) return false;
		return other.begin==begin && other.end==end;
	}

	/**
	 * Returns a hash code value for the segment.
	 * 

	 * The current implementation returns the sum of the begin and end positions, although this is not
	 * guaranteed in future versions.
	 *
	 * @return a hash code value for the segment.
	 */
	public int hashCode() {
		// Derived from the two position fields that equals(Object) also compares.
		final int positionSum=end+begin;
		return positionSum;
	}

	/**
	 * Returns the length of the segment.
	 * This is defined as the number of characters between the begin and end positions.
	 * @return the length of the segment.
	 */
	public int length() {
		// end is exclusive, so the difference is the number of characters spanned.
		final int charCount=end-begin;
		return charCount;
	}

	/**
	 * Indicates whether this Segment encloses the specified Segment.
	 * 

	 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
	 * 

	 * Note that a segment encloses itself.
	 *
	 * @param segment  the segment to be tested for being enclosed by this segment.
	 * @return true if this Segment encloses the specified Segment, otherwise false.
	 */
	public final boolean encloses(final Segment segment) {
		// Guard-clause form: fail as soon as the argument starts before this span begins,
		// otherwise the answer depends only on where the argument ends.
		if (segment.begin<begin) return false;
		return segment.end<=end;
	}

	/**
	 * Indicates whether this segment encloses the specified character position in the source document.
	 * 
	 * This is the case if {@link #getBegin()} <code>&lt;=</code> <code>pos</code> <code>&lt;</code> {@link #getEnd()}.
	 *
	 * @param pos  the position in the {@link Source} document.
	 * @return true if this segment encloses the specified character position in the source document, otherwise false.
	 */
	public final boolean encloses(final int pos) {
		return begin<=pos && posString.
	 * 

	 * The returned String is newly created with every call to this method, unless this
	 * segment is itself an instance of {@link Source}.
	 *
	 * @return the source text of this segment as a String.
	 */
	public String toString() {
		return source.subSequence(begin,end).toString();
	}

	/**
	 * Performs a simple rendering of the HTML markup in this segment into text.
	 * 

	 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
	 * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
	 * 
	 * @return an instance of {@link Renderer} based on this segment.
	 * @see #getTextExtractor()
	 */
	public Renderer getRenderer() {
		// A new Renderer is created on every call; it is constructed over this segment.
		final Renderer renderer=new Renderer(this);
		return renderer;
	}

	/**
	 * Extracts the textual content from the HTML markup of this segment.
	 * 

	 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
	 * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
	 * 

	 * @return an instance of {@link TextExtractor} based on this segment.
	 * @see #getRenderer()
	 */
	public TextExtractor getTextExtractor() {
		// A new TextExtractor is created on every call; it is constructed over this segment.
		final TextExtractor textExtractor=new TextExtractor(this);
		return textExtractor;
	}

	/**
	 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within this segment.
	 * 

	 * See the {@link Source#iterator()} method for a detailed description.
	 * 

	 * 

	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>
	 *   <p>
	 *    The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
	 *   </p>
	 * <pre>
	 * for (Iterator&lt;Segment&gt; nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) {
	 *   Segment nodeSegment=nodeIterator.next();
	 *   if (nodeSegment instanceof Tag) {
	 *     Tag tag=(Tag)nodeSegment;
	 *     // HANDLE TAG
	 *     // Uncomment the following line to ensure each tag is valid XML:
	 *     // writer.write(tag.tidy()); continue;
	 *   } else if (nodeSegment instanceof CharacterReference) {
	 *     CharacterReference characterReference=(CharacterReference)nodeSegment;
	 *     // HANDLE CHARACTER REFERENCE
	 *     // Uncomment the following line to decode all character references instead of copying them verbatim:
	 *     // characterReference.appendCharTo(writer); continue;
	 *   } else {
	 *     // HANDLE PLAIN TEXT
	 *   }
	 *   // unless specific handling has prevented getting to here, simply output the segment as is:
	 *   writer.write(nodeSegment.toString());
	 * }</pre>
	 *  </dd>
	 * </dl>
	 * 
	 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within this segment.
	 */
	public Iterator<Segment> getNodeIterator() {
		// Parameterised return type so callers receive Segment nodes without casting.
		// A new iterator over this segment is created on every call.
		return new NodeIterator(this);
	}

	/**
	 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 
	 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
	 * if this method is to be used on a large proportion of the source.
	 * It is called automatically if this method is called on the {@link Source} object itself.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<Tag> getAllTags() {
		// A null tag type places no restriction on the type of tag returned.
		return getAllTags(null);
	}

	/**
	 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 * 

	 * Specifying a null argument to the tagType parameter is equivalent to {@link #getAllTags()}.
	 *
	 * @param tagType  the {@linkplain TagType type} of tags to get.
	 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getAllStartTags(StartTagType)
	 */
	public List<Tag> getAllTags(final TagType tagType) {
		// Find the first matching tag at or after the beginning of this segment.
		Tag tag=checkTagEnclosure(Tag.getNextTag(source,begin,tagType));
		// No list is allocated when there is nothing to return.
		if (tag==null) return Collections.emptyList();
		final List<Tag> list=new ArrayList<Tag>();
		do {
			list.add(tag);
			// Continue the search from the tag just collected; a null result ends the loop.
			tag=checkTagEnclosure(tag.getNextTag(tagType));
		} while (tag!=null);
		return list;
	}

	/**
	 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
	 * if this method is to be used on a large proportion of the source.
	 * It is called automatically if this method is called on the {@link Source} object itself.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<StartTag> getAllStartTags() {
		// Find the first start tag at or after the beginning of this segment.
		StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
		// No list is allocated when there is nothing to return.
		if (startTag==null) return Collections.emptyList();
		final List<StartTag> list=new ArrayList<StartTag>();
		do {
			list.add(startTag);
			// Continue from the start tag just collected; a null result ends the loop.
			startTag=checkEnclosure(startTag.getNextStartTag());
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 * 

	 * Specifying a null argument to the startTagType parameter is equivalent to {@link #getAllStartTags()}.
	 *
	 * @param startTagType  the {@linkplain StartTagType type} of tags to get.
	 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<StartTag> getAllStartTags(final StartTagType startTagType) {
		// A null type means "any start tag", which the no-argument overload handles.
		if (startTagType==null) return getAllStartTags();
		// The generic tag search returns a Tag, so the result is cast to StartTag.
		StartTag startTag=(StartTag)checkTagEnclosure(Tag.getNextTag(source,begin,startTagType));
		if (startTag==null) return Collections.emptyList();
		final List<StartTag> list=new ArrayList<StartTag>();
		do {
			list.add(startTag);
			// Continue from the start tag just collected; a null result ends the loop.
			startTag=(StartTag)checkTagEnclosure(startTag.getNextTag(startTagType));
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 * 

	 * Specifying a null argument to the name parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
	 * 

	 * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
	 *
	 * @param name  the {@linkplain StartTag#getName() name} of the start tags to get.
	 * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<StartTag> getAllStartTags(String name) {
		// A null name means "any start tag", which the no-argument overload handles.
		if (name==null) return getAllStartTags();
		// Computed once and reused for every search in the loop.
		final boolean isXMLTagName=Tag.isXMLName(name);
		StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
		if (startTag==null) return Collections.emptyList();
		final List<StartTag> list=new ArrayList<StartTag>();
		do {
			list.add(startTag);
			// Restart one character past the beginning of the previous match so that it is not found again.
			startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param value  the value of the specified attribute to search for, must not be null.
	 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
	 * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
	 */
	public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) {
		// Find the first start tag carrying the attribute name/value pair at or after the beginning of this segment.
		StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
		if (startTag==null) return Collections.emptyList();
		final List<StartTag> list=new ArrayList<StartTag>();
		do {
			list.add(startTag);
			// Restart one character past the beginning of the previous match so that it is not found again.
			startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * Specifying a null argument to the valueRegexPattern parameter performs the search on the attribute name only,
	 * without regard to the attribute value.  This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param valueRegexPattern  the regular expression pattern that must match the attribute value, may be null.
	 * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
	 */
	public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern) {
		// Find the first start tag whose attribute matches at or after the beginning of this segment.
		StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
		if (startTag==null) return Collections.emptyList();
		final List<StartTag> list=new ArrayList<StartTag>();
		do {
			list.add(startTag);
			// Restart one character past the beginning of the previous match so that it is not found again.
			startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This matches start tags with a class attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
	 * class names separated by white space in the attribute value.
	 * 

	 * See the {@link Tag} class documentation for more details about the behaviour of this method.
	 *
	 * @param className  the class name (case sensitive) to search for, must not be null.
	 * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<StartTag> getAllStartTagsByClass(final String className) {
		// A class search is an attribute search on "class" using the pattern built for the class name.
		final Pattern classPattern=getClassPattern(className);
		return getAllStartTags("class",classPattern);
	}

	/**
	 * Returns a list of the immediate children of this segment in the document element hierarchy.
	 * 

	 * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
	 * 

	 * An element found at the start of this segment is included in the list.
	 * Note however that if this segment is an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
	 * which only returns the children of the element.
	 * 

	 * Calling getChildElements() on an Element is much more efficient than calling it on a Segment.
	 * 

	 * The objects in the list are all of type {@link Element}.
	 * 

	 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
	 * if this method is to be used on a large proportion of the source.
	 * It is called automatically if this method is called on the {@link Source} object itself.
	 * 

	 * See the {@link Source#getChildElements()} method for more details.
	 *
	 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not null.
	 * @see Element#getParentElement()
	 */
	public List<Element> getChildElements() {
		// A zero-length segment is reported as having no children.
		if (length()==0) return Collections.emptyList();
		final List<Element> childElements=new ArrayList<Element>();
		int pos=begin;
		while (true) {
			final StartTag childStartTag=source.getNextStartTag(pos);
			// Stop once there are no more start tags, or the next one begins at or beyond the end of this segment.
			if (childStartTag==null || childStartTag.begin>=end) break;
			if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
				// Server tags are left out of the hierarchy: resume scanning immediately after the tag.
				pos=childStartTag.end;
				continue;
			}
			final Element childElement=childStartTag.getElement();
			childElements.add(childElement);
			// The return value is discarded.
			// NOTE(review): presumably called for a side effect inside Element (e.g. building the child's own hierarchy) - confirm.
			childElement.getChildElements();
			// Resume after the whole element so that its descendants are not collected as children of this segment.
			pos=childElement.end;
		}
		return childElements;
	}

	/**
	 * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
	 * if this method is to be used on a large proportion of the source.
	 * It is called automatically if this method is called on the {@link Source} object itself.
	 * 

	 * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
	 * 

	 * If this segment is itself an {@link Element}, the result includes this element in the list.
	 *
	 * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<Element> getAllElements() {
		// Collect every start tag, then let the list-based overload turn them into elements.
		final List<StartTag> startTags=getAllStartTags();
		return getAllElements(startTags);
	}

	/**
	 * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method,
	 * except that elements which are not entirely enclosed by this segment are excluded.
	 * 

	 * Specifying a null argument to the name parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
	 * 

	 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
	 * 

	 * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
	 *
	 * @param name  the {@linkplain Element#getName() name} of the elements to get.
	 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<Element> getAllElements(String name) {
		// Collect the start tags with the given name, then let the list-based overload turn them into elements.
		final List<StartTag> startTags=getAllStartTags(name);
		return getAllElements(startTags);
	}

	/**
	 * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method,
	 * except that elements which are not entirely enclosed by this segment are excluded.
	 * 

	 * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
	 *
	 * @param startTagType  the {@linkplain StartTagType type} of start tags to get, must not be null.
	 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<Element> getAllElements(final StartTagType startTagType) {
		// Unlike getAllStartTags(StartTagType), a null type is rejected here rather than treated as "any".
		if (startTagType==null) throw new IllegalArgumentException("startTagType argument must not be null");
		final List<StartTag> startTags=getAllStartTags(startTagType);
		return getAllElements(startTags);
	}

	/**
	 * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method,
	 * except that elements which are not entirely enclosed by this segment are excluded.
	 * 

	 * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param value  the value of the specified attribute to search for, must not be null.
	 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
	 * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
	 */
	public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) {
		// Collect the start tags carrying the attribute name/value pair, then let the list-based overload turn them into elements.
		final List<StartTag> startTags=getAllStartTags(attributeName,value,valueCaseSensitive);
		return getAllElements(startTags);
	}

	/**
	 * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method,
	 * except that elements which are not entirely enclosed by this segment are excluded.
	 * 

	 * Specifying a null argument to the valueRegexPattern parameter performs the search on the attribute name only,
	 * without regard to the attribute value.  This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
	 * 

	 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param valueRegexPattern  the regular expression pattern that must match the attribute value, may be null.
	 * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
	 */
	public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern) {
		// Collect the start tags whose attribute matches, then let the list-based overload turn them into elements.
		final List<StartTag> startTags=getAllStartTags(attributeName,valueRegexPattern);
		return getAllElements(startTags);
	}

	/**
	 * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This matches elements with a class attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
	 * class names separated by white space in the attribute value.
	 * 

	 * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method,
	 * except that elements which are not entirely enclosed by this segment are excluded.
	 * 

	 * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
	 *
	 * @param className  the class name (case sensitive) to search for, must not be null.
	 * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<Element> getAllElementsByClass(final String className) {
		// Collect the start tags with the given class, then let the list-based overload turn them into elements.
		final List<StartTag> startTags=getAllStartTagsByClass(className);
		return getAllElements(startTags);
	}

	/**
	 * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<CharacterReference> getAllCharacterReferences() {
		// Find the first character reference starting from the beginning of this segment.
		CharacterReference characterReference=getNextCharacterReference(begin);
		// No list is allocated when there is nothing to return.
		if (characterReference==null) return Collections.emptyList();
		final List<CharacterReference> list=new ArrayList<CharacterReference>();
		do {
			list.add(characterReference);
			// Resume the search at the end of the reference just collected.
			characterReference=getNextCharacterReference(characterReference.end);
		} while (characterReference!=null);
		return list;
	}

	/**
	 * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This is functionally equivalent to {@link #getAllStartTags()}.iterator().next(),
	 * but does not search beyond the first start tag and returns null if no such start tag exists.
	 *
	 * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final StartTag getFirstStartTag() {
		// Search from the beginning of this segment, then pass the candidate through the enclosure check.
		final StartTag candidate=source.getNextStartTag(begin);
		return checkEnclosure(candidate);
	}
	
	/**
	 * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}.iterator().next(),
	 * but does not search beyond the first start tag and returns null if no such start tag exists.
	 *
	 * @param startTagType  the StartTagType to search for.
	 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final StartTag getFirstStartTag(StartTagType startTagType) {
		// Search by tag type from the beginning of this segment, then pass the candidate through the enclosure check.
		final StartTag candidate=source.getNextStartTag(begin,startTagType);
		return checkEnclosure(candidate);
	}
	
	/**
	 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}.iterator().next(),
	 * but does not search beyond the first start tag and returns null if no such start tag exists.
	 * 

	 * Specifying a null argument to the name parameter is equivalent to {@link #getFirstStartTag()}.
	 *
	 * @param name  the {@linkplain StartTag#getName() name} of the start tag to search for, may be null.
	 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final StartTag getFirstStartTag(String name) {
		// Search by name from the beginning of this segment, then pass the candidate through the enclosure check.
		final StartTag candidate=source.getNextStartTag(begin,name);
		return checkEnclosure(candidate);
	}
	
	/**
	 * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}.iterator().next(),
	 * but does not search beyond the first start tag and returns null if no such start tag exists.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param value  the value of the specified attribute to search for, must not be null.
	 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
	 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
	 */
	public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) {
		// Search by attribute name/value pair from the beginning of this segment, then pass the candidate through the enclosure check.
		final StartTag candidate=source.getNextStartTag(begin,attributeName,value,valueCaseSensitive);
		return checkEnclosure(candidate);
	}

	/**
	 * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}.iterator().next(),
	 * but does not search beyond the first start tag and returns null if no such start tag exists.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param valueRegexPattern  the regular expression pattern that must match the attribute value, may be null.
	 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
	 */
	public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) {
		// Search by attribute name and value pattern from the beginning of this segment, then pass the candidate through the enclosure check.
		final StartTag candidate=source.getNextStartTag(begin,attributeName,valueRegexPattern);
		return checkEnclosure(candidate);
	}

	/**
	 * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
	 * 

	 * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}.iterator().next(),
	 * but does not search beyond the first start tag and returns null if no such start tag exists.
	 *
	 * @param className  the class name (case sensitive) to search for, must not be null.
	 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final StartTag getFirstStartTagByClass(final String className) {
		// Search forward from the start of this segment; the match is discarded if it ends beyond the segment.
		final StartTag candidate=source.getNextStartTagByClass(begin,className);
		return checkEnclosure(candidate);
	}
	
	/**
	 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * This is functionally equivalent to <code>{@link #getAllElements()}.iterator().next()</code>,
	 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
	 * <p>
	 * If this segment is itself an {@link Element}, this element is returned, not the first child element.
	 *
	 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final Element getFirstElement() {
		// Walk the start tags that end inside this segment, in document order.
		for (StartTag tag=checkEnclosure(StartTag.getNext(source,begin)); tag!=null; tag=checkEnclosure(tag.getNextStartTag())) {
			final Element candidate=tag.getElement();
			// A start tag can lie inside the segment while its element extends past the end; skip those.
			if (candidate.end<=end) return candidate;
		}
		return null;
	}
	
	/**
	 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * This is functionally equivalent to <code>{@link #getAllElements(String) getAllElements(name)}.iterator().next()</code>,
	 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
	 * <p>
	 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
	 * <p>
	 * If this segment is itself an {@link Element} with the specified name, this element is returned.
	 *
	 * @param name  the {@linkplain Element#getName() name} of the element to search for.
	 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final Element getFirstElement(String name) {
		// A null name places no restriction on the element, so defer to the unrestricted overload.
		if (name==null) return getFirstElement();
		final boolean xmlName=Tag.isXMLName(name);
		for (
			StartTag tag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,xmlName));
			tag!=null;
			// Resume one character after the rejected tag's begin position.
			tag=checkEnclosure(StartTag.getNext(source,tag.begin+1,name,StartTagType.NORMAL,xmlName))
		) {
			final Element candidate=tag.getElement();
			// Only accept an element that ends within this segment.
			if (candidate.end<=end) return candidate;
		}
		return null;
	}
	
	/**
	 * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * This is functionally equivalent to <code>{@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}.iterator().next()</code>,
	 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
	 * <p>
	 * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param value  the value of the specified attribute to search for, must not be null.
	 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
	 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
	 */
	public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
		for (
			StartTag tag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
			tag!=null;
			// Resume one character after the rejected tag's begin position.
			tag=checkEnclosure(source.getNextStartTag(tag.begin+1,attributeName,value,valueCaseSensitive))
		) {
			final Element candidate=tag.getElement();
			// Only accept an element that ends within this segment.
			if (candidate.end<=end) return candidate;
		}
		return null;
	}

	/**
	 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * This is functionally equivalent to <code>{@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}.iterator().next()</code>,
	 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
	 * <p>
	 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param valueRegexPattern  the regular expression pattern that must match the attribute value, may be null.
	 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
	 */
	public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
		for (
			StartTag tag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
			tag!=null;
			// Resume one character after the rejected tag's begin position.
			tag=checkEnclosure(source.getNextStartTag(tag.begin+1,attributeName,valueRegexPattern))
		) {
			final Element candidate=tag.getElement();
			// Only accept an element that ends within this segment.
			if (candidate.end<=end) return candidate;
		}
		return null;
	}

	/**
	 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * This is functionally equivalent to <code>{@link #getAllElementsByClass(String) getAllElementsByClass(className)}.iterator().next()</code>,
	 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
	 * <p>
	 * If this segment is itself an {@link Element} with the specified class, this element is returned.
	 *
	 * @param className  the class name (case sensitive) to search for, must not be null.
	 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists.
	 */
	public final Element getFirstElementByClass(final String className) {
		for (
			StartTag tag=checkEnclosure(source.getNextStartTagByClass(begin,className));
			tag!=null;
			// Resume one character after the rejected tag's begin position.
			tag=checkEnclosure(source.getNextStartTagByClass(tag.begin+1,className))
		) {
			final Element candidate=tag.getElement();
			// Only accept an element that ends within this segment.
			if (candidate.end<=end) return candidate;
		}
		return null;
	}

	/**
	 * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 */
	public List<FormControl> getFormControls() {
		// The element type is declared explicitly so callers do not need unchecked casts;
		// the lookup itself is delegated to FormControl.
		return FormControl.getAll(this);
	}

	/**
	 * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * <p>
	 * This is equivalent to <code>{@link FormFields#FormFields(Collection) new FormFields}({@link #getFormControls()})</code>.
	 *
	 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
	 * @see #getFormControls()
	 */
	public FormFields getFormFields() {
		// Build the form fields from the form controls found in this segment.
		final FormFields formFields=new FormFields(getFormControls());
		return formFields;
	}

	/**
	 * Parses any {@link Attributes} within this segment.
	 * This method is only used in the unusual situation where attributes exist outside of a start tag.
	 * The {@link StartTag#getAttributes()} method should be used in normal situations.
	 * <p>
	 * This is equivalent to <code>source.{@link Source#parseAttributes(int,int) parseAttributes}({@link #getBegin()},{@link #getEnd()})</code>.
	 *
	 * @return the {@link Attributes} within this segment, or null if too many errors occur while parsing.
	 */
	public Attributes parseAttributes() {
		// Delegate to the source document, passing the span of this segment.
		final Attributes parsedAttributes=source.parseAttributes(begin,end);
		return parsedAttributes;
	}

	/**
	 * Causes this segment to be ignored when parsing.
	 * <p>
	 * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
	 * <p>
	 * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
	 * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
	 * (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
	 * as well as preventing non-server tags from being recognised inside server tags.
	 * <p>
	 * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags,
	 * as the attributes parser automatically ignores any server tags.
	 * <p>
	 * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
	 * as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
	 * <p>
	 * This leaves only very few scenarios where calling this method still provides a significant benefit.
	 * <p>
	 * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
	 * Here is an example using an XML-style JSP tag:
	 * <blockquote><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote>
	 * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
	 * as there is no way for the parser to recognise the <code>i18n:resource</code> element as a server tag.
	 * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
	 * but it is nevertheless valid JSP.  The only way to ensure that this library is able to parse the normal tag surrounding it is to
	 * find these server tags first and call the ignoreWhenParsing method to ignore them before parsing the rest of the document.
	 * <p>
	 * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
	 * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
	 * tools such as {@link TextExtractor} and {@link Renderer}.
	 * <p>
	 * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
	 * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
	 * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
	 * and perform the desired operations on this new source object.
	 * <p>
	 * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
	 * <p>
	 * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
	 * and so will continue to be found by the tag search methods.
	 * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
	 * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
	 * <p>
	 * For best performance, this method should be called on all segments that need to be ignored without calling
	 * any of the tag search methods in between.
	 *
	 * @see Source#ignoreWhenParsing(Collection segments)
	 */
	public void ignoreWhenParsing() {
		// Delegate to the source document, passing the span of this segment.
		this.source.ignoreWhenParsing(this.begin,this.end);
	}

	/**
	 * Compares this Segment object to another object.
	 * <p>
	 * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
	 * <p>
	 * A segment is considered to be before another segment if its begin position is earlier,
	 * or in the case that both segments begin at the same position, its end position is earlier.
	 * <p>
	 * Segments that begin and end at the same position are considered equal for
	 * the purposes of this comparison, even if they relate to different source documents.
	 * <p>
	 * Note: this class has a natural ordering that is inconsistent with equals.
	 * This means that this method may return zero in some cases where calling the
	 * {@link #equals(Object)} method with the same argument returns false.
	 *
	 * @param segment  the segment to be compared
	 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
	 * @throws ClassCastException if the argument is not a Segment
	 */
	public int compareTo(final Segment segment) {
		if (this==segment) return 0;
		// Primary ordering: the segment that begins earlier comes first.
		if (begin<segment.begin) return -1;
		if (begin>segment.begin) return 1;
		// Same begin position: the segment that ends earlier comes first.
		if (end<segment.end) return -1;
		if (end>segment.end) return 1;
		// Identical span; the source document is deliberately not compared.
		return 0;
	}

	/**
	 * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
	 * @return true if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise false.
	 */
	public final boolean isWhiteSpace() {
		for (int i=begin; iwhite space.
	 * 

	 * The HTML 4.01 specification section 9.1
	 * specifies the following white space characters:
	 * 

	 *  space (U+0020)
	 *  
tab (U+0009)
	 *  
form feed (U+000C)
	 *  
line feed (U+000A)
	 *  
carriage return (U+000D)
	 *  
zero-width space (U+200B)
	 * 
	 * 
	 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
	 * recognise them as white space and renders them as an unprintable character (empty square).
	 * Even zero-width spaces included using the numeric character reference &#x200B; are rendered this way.
	 *
	 * @param ch  the character to test.
	 * @return true if the specified character is white space, otherwise false.
	 */
	public static final boolean isWhiteSpace(final char ch) {
		for (char whiteSpaceChar : WHITESPACE) if (ch==whiteSpaceChar) return true;
		return false;
	}

	/**
	 * Returns a string representation of this object useful for debugging purposes.
	 * @return a string representation of this object useful for debugging purposes.
	 */
	public String getDebugInfo() {
		// Produces "(<begin position>-<end position>)", each position rendered by its row/column vector.
		final StringBuilder debugInfo=new StringBuilder(50).append('(');
		source.getRowColumnVector(begin).appendTo(debugInfo);
		debugInfo.append('-');
		source.getRowColumnVector(end).appendTo(debugInfo);
		return debugInfo.append(')').toString();
	}

	/**
	 * Returns the character at the specified index.
	 * <p>
	 * This is logically equivalent to <code>toString().charAt(index)</code>
	 * for valid argument values <code>0 &lt;= index &lt; length()</code>.
	 * <p>
	 * However because this implementation works directly on the underlying document source string,
	 * it should not be assumed that an IndexOutOfBoundsException is thrown
	 * for an invalid argument value.
	 *
	 * @param index  the index of the character.
	 * @return the character at the specified index.
	 */
	public char charAt(final int index) {
		// Translate the segment-relative index into a position in the source document; no bounds check is made here.
		final int sourcePos=begin+index;
		return source.charAt(sourcePos);
	}

	/**
	 * Returns a new character sequence that is a subsequence of this sequence.
	 * <p>
	 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
	 * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
	 * <p>
	 * However because this implementation works directly on the underlying document source text,
	 * it should not be assumed that an IndexOutOfBoundsException is thrown
	 * for invalid argument values as described in the String.subSequence(int,int) method.
	 *
	 * @param beginIndex  the begin index, inclusive.
	 * @param endIndex  the end index, exclusive.
	 * @return a new character sequence that is a subsequence of this sequence.
	 */
	public CharSequence subSequence(final int beginIndex, final int endIndex) {
		// Both indices are relative to the start of this segment; shift them into source document positions.
		final int sourceBegin=begin+beginIndex;
		final int sourceEnd=begin+endIndex;
		return source.subSequence(sourceBegin,sourceEnd);
	}

	/**
	 * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
	 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
	 */
	static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) {
		final int textLength=text.length();
		int i=0;
		boolean lastWasWhiteSpace=false;
		while (true) {
			if (i>=textLength) return sb;
			if (!isWhiteSpace(text.charAt(i))) break;
			i++;
		}
		do {
			final char ch=text.charAt(i++);
			if (isWhiteSpace(ch)) {
				lastWasWhiteSpace=true;
			} else {
				if (lastWasWhiteSpace) {
					sb.append(' ');
					lastWasWhiteSpace=false;
				}
				sb.append(ch);
			}
		} while (i getAllElements(final List startTags) {
		if (startTags.isEmpty()) return Collections.emptyList();
		final ArrayList elements=new ArrayList(startTags.size());
		for (StartTag startTag : startTags) {
			final Element element=startTag.getElement();
			if (element.end<=end) elements.add(element);
		}
		return elements;
	}

	/**
	 * Filters a start tag search result so that only a tag ending within this segment is accepted.
	 * @param startTag  the candidate start tag, may be null.
	 * @return the specified start tag if it is not null and its end position does not exceed the end of this segment, otherwise null.
	 */
	private StartTag checkEnclosure(final StartTag startTag) {
		final boolean endsWithinSegment=startTag!=null && startTag.end<=end;
		return endsWithinSegment ? startTag : null;
	}

	/**
	 * Filters a tag search result so that only a tag ending within this segment is accepted.
	 * @param tag  the candidate tag, may be null.
	 * @return the specified tag if it is not null and its end position does not exceed the end of this segment, otherwise null.
	 */
	private Tag checkTagEnclosure(final Tag tag) {
		if (tag!=null && tag.end<=end) return tag;
		return null;
	}

	/**
	 * Finds the next character reference in the source document from the specified position, restricted to this segment.
	 * @param pos  the position in the source document from which to start searching.
	 * @return the character reference returned by the source document if it is not null and its end position does not exceed the end of this segment, otherwise null.
	 */
	private CharacterReference getNextCharacterReference(final int pos) {
		final CharacterReference next=source.getNextCharacterReference(pos);
		if (next!=null && next.end<=end) return next;
		return null;
	}
}