src.au.id.jericho.lib.html.Segment Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jericho-html
Jericho HTML Parser is a simple but powerful java library allowing analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions.
There is a newer version: 2.3
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 1.5
// Copyright (C) 2004 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.util.*;

/**
 * Represents a segment of a {@link Source} document.
 * 
 * The "span" of a segment is defined by the combination of its begin and end character positions.
 */
public class Segment implements Comparable, CharSequence {
	protected int begin;
	protected int end;
	protected Source source;

	private static final String WHITESPACE=" \n\r\t\f\u200B"; // see comments in isWhiteSpace(char) method

	/**
	 * Constructs a new Segment with the specified Source and the specified begin and end character positions.
	 * @param source  the source document.
	 * @param begin  the character position in the source where this segment begins.
	 * @param end  the character position in the source where this segment ends.
	 */
	public Segment(Source source, int begin, int end) {
		this(begin,end);
		if (source==null) throw new IllegalArgumentException("source argument must not be null");
		this.source=source;
	}

	// Only called from Source constructor
	Segment(int begin, int end) {
		if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException();
		this.begin=begin;
		this.end=end;
	}

	Segment() {} // used when creating CACHED_NULL objects

	/**
	 * Returns the character position in the Source where this segment begins.
	 * @return the character position in the Source where this segment begins.
	 */
	public final int getBegin() {
		return begin;
	}

	/**
	 * Returns the character position in the Source where this segment ends.
	 * @return the character position in the Source where this segment ends.
	 */
	public final int getEnd() {
		return end;
	}

	/**
	 * Compares the specified object with this Segment for equality.
	 * 

	 * Returns true if and only if the specified object is also a Segment,
	 * and both segments have the same Source, and the same begin and end positions.
	 * @param object  the object to be compared for equality with this Segment.
	 * @return true if the specified object is equal to this Segment, otherwise false.
	 */
	public final boolean equals(Object object) {
		if (object==null || !(object instanceof Segment)) return false;
		Segment segment=(Segment)object;
		return segment.begin==begin && segment.end==end && segment.source==source;
	}

	/**
	 * Returns a hash code value for the segment.
	 * 

	 * The current implementation returns the sum of the begin and end positions, although this is not
	 * guaranteed in future versions.
	 *
	 * @return a hash code value for the segment.
	 */
	public int hashCode() {
		return begin+end;
	}

	/**
	 * Returns the length of the segment.
	 * This is defined as the number of characters between the begin and end positions.
	 * @return the length of the segment.
	 */
	public final int length() {
		return end-begin;
	}

	/**
	 * Indicates whether this Segment encloses the specified Segment.
	 * @param segment  the segment to be tested for being enclosed by this segment.
	 * @return true if this Segment encloses the specified Segment, otherwise false.
	 */
	public final boolean encloses(Segment segment) {
		return begin<=segment.begin && end>=segment.end;
	}

	/**
	 * Indicates whether this segment encloses the specified character position in the {@link Source} document.
	 * 
	 * This is the case if {@link #getBegin()} <= pos < {@link #getEnd()}.
	 *
	 * @param pos  the position in the source document to be tested.
	 * @return true if this segment encloses the specified position, otherwise false.
	 */
	public final boolean encloses(int pos) {
		return begin<=pos && posSegment represents an HTML comment.
	 * 

	 * An HTML comment is an area of the source document enclosed by the delimiters
	 * <!-- on the left and --> on the right.
	 * 

	 * The HTML 4.01 Specification section 3.2.4
	 * states that the end of comment delimiter may contain white space between the "--" and ">" characters,
	 * but this library does not recognise end of comment delimiters containing white space.
	 *
	 * @return true if this Segment represents an HTML comment, otherwise false.
	 */
	public boolean isComment() {
		return false; // overridden in StartTag
	}

	/**
	 * Returns the source text of this segment as a String.
	 * 

	 * The returned String is newly created with every call to this method, unless this
	 * segment is itself a {@link Source} object.
	 * 

	 * Note that before version 1.5 this returned a representation of this object useful for debugging purposes,
	 * which can now be obtained via the {@link #getDebugInfo()} method.
	 *
	 * @return the source text of this segment as a String.
	 */
	public String toString() {
		return source.text.subSequence(begin,end).toString();
	}

	/**
	 * Returns a list of all {@link StartTag} objects enclosed by this segment.
	 * @return a list of all {@link StartTag} objects enclosed by this segment.
	 */
	public List findAllStartTags() {
		return findAllStartTags(null);
	}

	/**
	 * Returns a list of all {@link StartTag} objects with the specified name enclosed by this segment.
	 * 

	 * If the name argument is null, all StartTags are returned.
	 *
	 * @param name  the {@linkplain StartTag#getName() name} of the StartTags to find.
	 * @return a list of all StartTag objects with the specified name enclosed by this segment.
	 */
	public List findAllStartTags(String name) {
		if (name!=null) name=name.toLowerCase();
		StartTag startTag=checkEnclosure(source.findNextStartTag(begin,name));
		if (startTag==null) return Collections.EMPTY_LIST;
		ArrayList list=new ArrayList();
		do {
			list.add(startTag);
			startTag=checkEnclosure(source.findNextStartTag(startTag.end,name));
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all StartTag objects with the specified attribute name/value pair beginning at or immediately following the specified position in the source document.
	 *
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param value  the value of the specified attribute to search for, must not be null.
	 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
	 * @return a list of all StartTag objects with the specified attribute name/value pair beginning at or immediately following the specified position in the source document.
	 */
	public List findAllStartTags(String attributeName, String value, boolean valueCaseSensitive) {
		StartTag startTag=checkEnclosure(source.findNextStartTag(begin,attributeName,value,valueCaseSensitive));
		if (startTag==null) return Collections.EMPTY_LIST;
		ArrayList list=new ArrayList();
		do {
			list.add(startTag);
			startTag=checkEnclosure(source.findNextStartTag(startTag.end,attributeName,value,valueCaseSensitive));
		} while (startTag!=null);
		return list;
	}

	/**
	 * Returns a list of all Segment objects enclosed by this segment that represent HTML {@linkplain #isComment() comments}.
	 * @return a list of all Segment objects enclosed by this segment that represent HTML {@linkplain #isComment() comments}.
	 */
	public List findAllComments() {
		return findAllStartTags(SpecialTag.COMMENT.getName());
	}

	/**
	 * Returns a list of all {@link Element} objects enclosed by this segment.
	 * @return a list of all {@link Element} objects enclosed by this segment.
	 */
	public List findAllElements() {
		return findAllElements(null);
	}

	/**
	 * Returns a list of all {@link Element} objects with the specified name enclosed by this segment.
	 * 

	 * If the name argument is null, all Elements are returned.
	 *
	 * @param name  the {@linkplain Element#getName() name} of the Elements to find.
	 * @return a list of all Element objects with the specified name enclosed by this segment.
	 */
	public List findAllElements(String name) {
		if (name!=null) name=name.toLowerCase();
		List startTags=findAllStartTags(name);
		if (startTags.isEmpty()) return Collections.EMPTY_LIST;
		ArrayList elements=new ArrayList(startTags.size());
		for (Iterator i=startTags.iterator(); i.hasNext();) {
			StartTag startTag=(StartTag)i.next();
			Element element=startTag.getElement();
			if (element.end>end) break;
			elements.add(element);
		}
		return elements;
	}

	/**
	 * Returns a list of all {@link CharacterReference} objects enclosed by this segment.
	 *
	 * @return a list of all CharacterReference objects enclosed by this segment.
	 */
	public List findAllCharacterReferences() {
		CharacterReference characterReference=findNextCharacterReference(begin);
		if (characterReference==null) return Collections.EMPTY_LIST;
		ArrayList list=new ArrayList();
		do {
			list.add(characterReference);
			characterReference=findNextCharacterReference(characterReference.end);
		} while (characterReference!=null);
		return list;
	}

	/**
	 * Returns a list of the {@link FormControl} objects enclosed by this segment.
	 * @return a list of the {@link FormControl} objects enclosed by this segment.
	 */
	public List findFormControls() {
		return FormControl.findAll(this);
	}

	/**
	 * Returns the {@link FormFields} object representing all form fields enclosed by this segment.
	 * 

	 * This is equivalent to FormFields.constructFrom(findFormControls())
	 *
	 * @return the {@link FormFields} object representing all form fields enclosed by this segment.
	 * @see #findFormControls()
	 */
	public FormFields findFormFields() {
		return new FormFields(findFormControls());
	}

	/**
	 * Parses any {@link Attributes} within this segment.
	 * This method is only used in the unusual situation where attributes exist outside of a start tag.
	 * The {@link StartTag#getAttributes()} method should be used in normal situations.
	 * 

	 * This is equivalent to {@link Source#parseAttributes(int,int) source.parseAttributes(this.getBegin(),this.getEnd())}
	 *
	 * @return the {@link Attributes} within this segment, or null if too many errors occur while parsing.
	 */
	public Attributes parseAttributes() {
		return source.parseAttributes(begin,end);
	}

	/**
	 * Causes the this segment to be ignored when parsing.
	 * 

	 * This is equivalent to {@link Source#ignoreWhenParsing(int,int) source.ignoreWhenParsing(segment.getBegin(),segment.getEnd())}
	 *
	 * @see Source#ignoreWhenParsing(int begin, int end)
	 * @see Source#ignoreWhenParsing(Collection segments)
	 */
	public void ignoreWhenParsing() {
		source.ignoreWhenParsing(begin,end);
	}

	/**
	 * Compares this Segment object to another object.
	 * 

	 * If the argument is not a Segment, a ClassCastException is thrown.
	 * 

	 * A segment is considered to be before another segment if its begin position is earlier,
	 * or in the case that both segments begin at the same position, its end position is earlier.
	 * 

	 * Segments that begin and end at the same position are considered equal for
	 * the purposes of this comparison, even if they relate to different source documents.
	 * 

	 * Note: this class has a natural ordering that is inconsistent with equals.
	 * This means that this method may return zero in some cases where calling the
	 * {@link #equals(Object)} method with the same argument returns false.
	 *
	 * @param o  the segment to be compared
	 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
	 * @throws ClassCastException  if the argument is not a Segment
	 */
	public int compareTo(Object o) {
		if (this==o) return 0;
		Segment segment=(Segment)o;
		if (beginsegment.begin) return 1;
		if (endsegment.end) return 1;
		return 0;
	}

	/**
	 * Indicates whether the specified character is white space.
	 * 

	 * The HTML 4.01 Specification section 9.1
	 * specifies the following white space characters:
	 * 

	 *  space (U+0020)
	 *  
tab (U+0009)
	 *  
form feed (U+000C)
	 *  
line feed (U+000A)
	 *  
carriage return (U+000D)
	 *  
zero-width space (U+200B)
	 * 
	 * 
	 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
	 * recognise them as whitespace and renders them as an unprintable character (empty square).
	 * Even zero-width spaces included using the numeric character reference  are rendered this way.
	 * 

	 * Note that in versions prior to 1.5, this method did not recognise form feeds or zero-width spaces as white space.
	 *
	 * @param ch  the character to test.
	 * @return true if the specified character is white space, otherwise false.
	 */
	public static final boolean isWhiteSpace(char ch) {
		return WHITESPACE.indexOf(ch)!=-1;
	}

	/**
	 * Returns a string representation of this object useful for debugging purposes.
	 * @return a string representation of this object useful for debugging purposes.
	 */
	public String getDebugInfo() {
		return "("+begin+','+end+')';
	}

	/**
	 * Returns the character at the specified index.
	 * 

	 * This is logically equivalent to toString().charAt(index)
	 * for a valid argument values 0 <= index < length().
	 * 

	 * However because this implementation works directly on the underlying document source string,
	 * it should not be assumed that an IndexOutOfBoundsException will be thrown
	 * for an invalid argument value.
	 *
	 * @param index  the index of the character.
	 * @return the character at the specified index.
	 */
	public char charAt(int index) {
		return source.toString().charAt(begin+index);
	}

	/**
	 * Returns a new character sequence that is a subsequence of this sequence.
	 * 

	 * This is logically equivalent to toString().subSequence(beginIndex,endIndex)
	 * for valid values of beginIndex and endIndex.
	 * 

	 * However because this implementation works directly on the underlying document source string,
	 * it should not be assumed that an IndexOutOfBoundsException will be thrown
	 * for invalid argument values as described in the String.subSequence(int,int) method.
	 *
	 * @param beginIndex  the begin index, inclusive.
	 * @param endIndex  the end index, exclusive.
	 * @return a new character sequence that is a subsequence of this sequence.
	 */
	public final CharSequence subSequence(int beginIndex, int endIndex) {
		return source.text.subSequence(begin+beginIndex,begin+endIndex);
	}

	/**
	 * Returns the source text of this segment.
	 * 

 	 * This method has been deprecated as of version 1.5 as it now duplicates the functionality of the {@link #toString()} method.
	 *
	 * @return the source text of this segment.
	 * @deprecated  Use the {@link #toString()} method instead
	 */
	public String getSourceText() {
		return toString();
	}

	/**
	 * Returns the source text of this segment without {@linkplain #isWhiteSpace(char) white space}.
	 * 

	 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
	 * 

 	 * This method has been deprecated as of version 1.5 as it is no longer used internally and
	 * was never very useful as a public method.
	 * It is similar to the new {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method, but
	 * does not {@linkplain CharacterReference#decode(CharSequence) decode} the text after collapsing the white space.
	 * 

	 * @return the source text of this segment without white space.
	 * @deprecated  Use the more useful {@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)} method instead.
	 */
	public final String getSourceTextNoWhitespace() {
		return appendCollapseWhiteSpace(new StringBuffer(length()),this).toString();
	}

	/**
	 * Returns a list of Segment objects representing every word in this segment separated by {@linkplain #isWhiteSpace(char) white space}.
	 * Note that any markup contained in this segment will be regarded as normal text for the purposes of this method.
	 * 

 	 * This method has been deprecated as of version 1.5 as it has no discernable use.
	 *
	 * @return a list of Segment objects representing every word in this segment separated by white space.
	 * @deprecated  no replacement
	 */
	public final List findWords() {
		ArrayList words=new ArrayList();
		int wordBegin=-1;
		for (int i=begin; iend) return null;
		return startTag;
	}

	private CharacterReference findNextCharacterReference(int pos) {
		CharacterReference characterReference=source.findNextCharacterReference(pos);
		if (characterReference==null || characterReference.end>end) return null;
		return characterReference;
	}
}