src.au.id.jericho.lib.html.Source Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jericho-html
Jericho HTML Parser is a simple but powerful java library allowing analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions.
There is a newer version: 2.3
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 1.5
// Copyright (C) 2004 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.util.*;
import java.io.*;

/**
 * Represents a source HTML document.
 * <p>
 * Note that many of the useful functions which can be performed on the source document are
 * defined in its superclass, {@link Segment}.
 * The Source object is itself a Segment which spans the entire document.
 * <p>
 * Most of the methods defined in this class are useful for determining the elements and tags
 * surrounding or neighbouring a particular character position in the document.
 * <p>
 * <b>IMPORTANT NOTE:</b> Because HTML allows '&lt;' characters within attribute values
 * (see section 5.3.2 of the HTML spec),
 * it is theoretically impossible to determine with certainty whether
 * any given '&lt;' character in a source document is the start of a tag
 * without having parsed from the beginning of the document (which Jericho HTML Parser doesn't do).
 * For this reason, the parser may reject a start tag completely if its attributes are not
 * properly formed, although it does try to provide some leniency.
 * In XHTML, such characters must be represented in attribute values as character entities
 * (see section 3.1 of the XML spec).
 *
 * @see Segment
 */
public class Source extends Segment {
	/** The source text; swapped for its String form the first time {@link #toString()} converts a non-String CharSequence. */
	protected CharSequence text;
	/** Cached lower case parse text, built on demand by getParseTextLowerCase(); null whenever it has to be (re)built. */
	private String parseTextLowerCase=null;
	/** Collects the ranges registered through ignoreWhenParsing(int,int) until the parse text is next requested; null when nothing is pending. */
	private OutputDocument parseTextLowerCaseOutputDocument=null;
	/** Destination for log messages; null (the default) suppresses logging. */
	private Writer logWriter=null;

	/**
	 * Creates a Source that spans the whole of the supplied text.
	 * <p>
	 * The segment bounds passed to the superclass are <code>0</code> and <code>text.length()</code>,
	 * and the <code>source</code> reference is pointed back at this object.
	 *
	 * @param text  the source text, must not be null.
	 */
	public Source(CharSequence text) {
		super(0,text.length());
		this.text=text;
		source=this;
	}

	/**
	 * Returns the source text as a String.
	 * <p>
	 * When the CharSequence supplied at construction time was not a String, the result of the first
	 * conversion replaces it, so the conversion is not repeated on later calls.
	 *
	 * @return the source text as a String.
	 */
	public String toString() {
		final CharSequence current=text;
		final String converted=current.toString();
		// Identity comparison on purpose: a different object means a real conversion took place, so keep it.
		if (converted!=current) text=converted;
		return converted;
	}

	/**
	 * Returns the {@link Element} with the specified <code>id</code> attribute value.
	 * <p>
	 * This simulates the script method <code>getElementById</code> defined in DOM HTML level 1.
	 * <p>
	 * This is equivalent to <code>findNextStartTag(0,"id",id,true).getElement()</code>,
	 * except that null is returned when no matching start tag exists.
	 * The search begins at position 0 so that the whole document is covered, including an
	 * element whose start tag sits at the very beginning of the source.
	 * <p>
	 * A well formed HTML document should have no more than one element with any given id attribute value.
	 * <p>
	 * Calls to this method are not cached.
	 *
	 * @param id  the id attribute value (case sensitive) to search for, must not be null.
	 * @return the {@link Element} with the specified id attribute value, or null if none exists.
	 */
	public Element getElementById(String id) {
		// Start at the first character; beginning any later would skip elements at the top of the document.
		StartTag startTag=findNextStartTag(0,Attribute.ID,id,true);
		return startTag==null ? null : startTag.getElement();
	}

	/**
	 * Finds the StartTag located at, immediately before, or enclosing the given position.
	 * <p>
	 * When the position lies inside an HTML {@linkplain Segment#isComment() comment}, the segment
	 * spanning that comment is returned.
	 * <p>
	 * Delegates to {@link #findPreviousStartTag(int, String)} with a null name.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the StartTag immediately preceding the specified position, or null if none exists.
	 */
	public StartTag findPreviousStartTag(int pos) {
		final String anyName=null;
		return findPreviousStartTag(pos,anyName);
	}

	/**
	 * Finds the StartTag with the given name located at, immediately before, or enclosing the given position.
	 * <p>
	 * Start tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored,
	 * but the comment segment itself is treated as a start tag.
	 * <p>
	 * A null name is equivalent to {@link #findPreviousStartTag(int) findPreviousStartTag(pos)}.
	 * A non-null name is converted to lower case before searching.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @param name  the {@linkplain StartTag#getName() name} of the StartTag to search for, or null for any name.
	 * @return the StartTag with the specified name immediately preceding the specified position, or null if none exists.
	 */
	public StartTag findPreviousStartTag(int pos, String name) {
		final String searchName=(name==null) ? null : name.toLowerCase();
		return StartTag.findPreviousOrNext(this,pos,searchName,true);
	}

	/**
	 * Finds the StartTag that begins at or immediately after the given position.
	 * <p>
	 * StartTags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored,
	 * but subsequent comment segments are treated as start tags.
	 * <p>
	 * Delegates to {@link #findNextStartTag(int, String)} with a null name.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the StartTag beginning at or immediately following the specified position, or null if none exists.
	 */
	public StartTag findNextStartTag(int pos) {
		final String anyName=null;
		return findNextStartTag(pos,anyName);
	}

	/**
	 * Finds the StartTag with the given name that begins at or immediately after the given position.
	 * <p>
	 * Start tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored.
	 * <p>
	 * A null name is equivalent to {@link #findNextStartTag(int) findNextStartTag(pos)}.
	 * A non-null name is converted to lower case before searching.
	 * <p>
	 * A name ending in a colon (<code>:</code>) searches for all start tags in the specified XML namespace.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @param name  the {@linkplain StartTag#getName() name} of the StartTag to search for, or null for any name.
	 * @return the StartTag with the specified name beginning at or immediately following the specified position, or null if none exists.
	 */
	public StartTag findNextStartTag(int pos, String name) {
		final String searchName=(name==null) ? null : name.toLowerCase();
		return StartTag.findPreviousOrNext(this,pos,searchName,false);
	}

	/**
	 * Finds the StartTag carrying the given attribute name/value pair that begins at or
	 * immediately after the given position.
	 * <p>
	 * Calls to this method are not cached.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @param attributeName  the attribute name (case insensitive) to search for, must not be null.
	 * @param value  the value of the specified attribute to search for, must not be null.
	 * @param valueCaseSensitive  specifies whether the attribute value matching is case sensitive.
	 * @return the matching StartTag beginning at or immediately following the specified position.
	 */
	public StartTag findNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive) {
		final StartTag match=StartTag.findNext(this,pos,attributeName,value,valueCaseSensitive);
		return match;
	}

	/**
	 * Finds the comment that begins at or immediately after the given position.
	 * <p>
	 * If the position is itself inside a comment, the comment following the enclosing one is returned.
	 * <p>
	 * Implemented as a start tag search using the name of {@link SpecialTag#COMMENT}.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the comment beginning at or immediately following the specified position, or null if none exists.
	 */
	public StartTag findNextComment(int pos) {
		final String commentName=SpecialTag.COMMENT.getName();
		return findNextStartTag(pos,commentName);
	}

	/**
	 * Finds the EndTag with the given name located at, immediately before, or enclosing the given position.
	 * <p>
	 * End tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored.
	 * The name is converted to lower case before searching.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @param name  the {@linkplain StartTag#getName() name} of the EndTag to search for, must not be null.
	 * @return the EndTag immediately preceding the specified position, or null if none exists.
	 * @throws IllegalArgumentException if name is null.
	 */
	public EndTag findPreviousEndTag(int pos, String name) {
		if (name==null) {
			throw new IllegalArgumentException();
		}
		final String searchName=name.toLowerCase();
		return EndTag.findPreviousOrNext(this,pos,searchName,true);
	}

	/**
	 * Finds the EndTag that begins at or immediately after the given position.
	 * <p>
	 * End tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the EndTag beginning at or immediately following the specified position, or null if none exists.
	 */
	public EndTag findNextEndTag(int pos) {
		final EndTag next=EndTag.findNext(this,pos);
		return next;
	}

	/**
	 * Finds the EndTag with the given name that begins at or immediately after the given position.
	 * <p>
	 * End tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored.
	 * The name is converted to lower case before searching.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @param name  the {@linkplain StartTag#getName() name} of the EndTag to search for, must not be null.
	 * @return the EndTag with the specified name beginning at or immediately following the specified position, or null if none exists.
	 * @throws IllegalArgumentException if name is null.
	 */
	public EndTag findNextEndTag(int pos, String name) {
		if (name==null) {
			throw new IllegalArgumentException();
		}
		final String searchName=name.toLowerCase();
		return EndTag.findPreviousOrNext(this,pos,searchName,false);
	}

	/**
	 * Creates an iterator over the {@link Tag} objects that begin at or after the given position.
	 * <p>
	 * Tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored,
	 * but the comment segments themselves are treated as start tags.
	 *
	 * @param pos  the position in the source document from which to start the iteration.
	 * @return an iterator of {@link Tag} objects beginning at or immediately following the specified position.
	 */
	public Iterator getNextTagIterator(int pos) {
		final Iterator tags=Tag.getNextTagIterator(this,pos);
		return tags;
	}

	/**
	 * Finds the tag (either a {@link StartTag} or an {@link EndTag}) that begins at or immediately
	 * after the given position.
	 * <p>
	 * IMPLEMENTATION NOTE: this method obtains a fresh iterator from {@link #getNextTagIterator(int pos)}
	 * and takes its first element. Sequential tags in a document should be retrieved from a single
	 * such iterator, as that is far more efficient than calling this method repeatedly.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the tag beginning at or immediately following the specified position, or null if none exists.
	 * @see #getNextTagIterator(int pos)
	 */
	public Tag findNextTag(int pos) {
		final Iterator tags=getNextTagIterator(pos);
		if (!tags.hasNext()) return null;
		return (Tag)tags.next();
	}

	/**
	 * Finds the StartTag that encloses the given position.
	 * <p>
	 * When the position lies inside an HTML {@linkplain Segment#isComment() comment}, the segment
	 * spanning that comment is returned.
	 * <p>
	 * A segment is considered to enclose a character position <code>x</code> if
	 * <code>segment.getBegin() &lt;= x &lt; segment.getEnd()</code>.
	 *
	 * @param pos  the position in the source document.
	 * @return the StartTag enclosing the specified position, or null if the position is not within a StartTag.
	 */
	public StartTag findEnclosingStartTag(int pos) {
		final String anyName=null;
		return findEnclosingStartTag(pos,anyName);
	}

	/**
	 * Finds the segment spanning the HTML {@linkplain Segment#isComment() comment} that encloses the given position.
	 * <p>
	 * A segment is considered to enclose a character position <code>x</code> if
	 * <code>segment.getBegin() &lt;= x &lt; segment.getEnd()</code>.
	 * <p>
	 * Implemented as an enclosing start tag search using the name of {@link SpecialTag#COMMENT}.
	 *
	 * @param pos  the position in the source document.
	 * @return a Segment spanning the comment that encloses the specified position, or null if the position is not within a comment.
	 */
	public Segment findEnclosingComment(int pos) {
		final String commentName=SpecialTag.COMMENT.getName();
		return findEnclosingStartTag(pos,commentName);
	}

	/**
	 * Finds the most nested Element that encloses the given position.
	 * <p>
	 * When the position lies inside an HTML {@linkplain Segment#isComment() comment}, the segment
	 * spanning that comment is returned.
	 * <p>
	 * A segment is considered to enclose a character position <code>x</code> if
	 * <code>segment.getBegin() &lt;= x &lt; segment.getEnd()</code>.
	 * <p>
	 * Delegates to {@link #findEnclosingElement(int, String)} with a null name.
	 *
	 * @param pos  the position in the source document.
	 * @return the most nested Element enclosing the specified position, or null if the position is not within an Element.
	 */
	public Element findEnclosingElement(int pos) {
		final String anyName=null;
		return findEnclosingElement(pos,anyName);
	}

	/**
	 * Finds the most nested Element with the given name that encloses the given position.
	 * <p>
	 * Elements positioned within an HTML {@linkplain Segment#isComment() comment} are ignored,
	 * but the comment segment itself is treated as an Element.
	 * <p>
	 * The search walks backwards through the matching start tags, beginning at <code>pos</code>,
	 * and returns the first one whose element ends after <code>pos</code>.
	 *
	 * @param pos  the position in the source document.
	 * @param name  the {@linkplain Element#getName() name} of the Element to search for, or null for any name.
	 * @return the most nested Element with the specified name enclosing the specified position, or null if none exists.
	 */
	public Element findEnclosingElement(int pos, String name) {
		final String searchName=(name==null) ? null : name.toLowerCase();
		int searchFrom=pos;
		for (;;) {
			final StartTag candidate=findPreviousStartTag(searchFrom,searchName);
			if (candidate==null) return null;
			final Element enclosing=candidate.getElement();
			if (enclosing.end>pos) return enclosing;
			// This element closes before pos, so continue with the start tag preceding it.
			searchFrom=candidate.begin-1;
		}
	}

	/**
	 * Finds the CharacterReference located at, immediately before, or enclosing the given position.
	 * <p>
	 * Character references positioned within an HTML {@linkplain Segment#isComment() comment} are NOT ignored.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the CharacterReference beginning at or immediately preceding the specified position, or null if none exists.
	 */
	public CharacterReference findPreviousCharacterReference(int pos) {
		final CharacterReference previous=CharacterReference.findPreviousOrNext(this,pos,true);
		return previous;
	}

	/**
	 * Finds the CharacterReference that begins at or immediately after the given position.
	 * <p>
	 * Character references positioned within an HTML {@linkplain Segment#isComment() comment} are NOT ignored.
	 *
	 * @param pos  the position in the source document from which to start the search.
	 * @return the CharacterReference beginning at or immediately following the specified position, or null if none exists.
	 */
	public CharacterReference findNextCharacterReference(int pos) {
		final CharacterReference next=CharacterReference.findPreviousOrNext(this,pos,false);
		return next;
	}

	/**
	 * Parses any {@link Attributes} starting at the specified position, allowing the default
	 * number of minor errors.
	 * This method is only used in the unusual situation where attributes exist outside of a start tag.
	 * The {@link StartTag#getAttributes()} method should be used in normal situations.
	 * <p>
	 * The returned Attributes segment will always begin at <code>pos</code>,
	 * and will end at the first occurrence of "/&gt;" or "&gt;" outside of a quoted attribute value,
	 * or at <code>maxEnd</code>, whichever comes first.
	 * <p>
	 * Only returns null if the segment contains a major syntactical error
	 * or more than the {@linkplain Attributes#setDefaultMaxErrorCount(int) default maximum} number of
	 * minor syntactical errors.
	 * <p>
	 * This is equivalent to
	 * {@link #parseAttributes(int,int,int) parseAttributes(pos,maxEnd,Attributes.getDefaultMaxErrorCount())}
	 *
	 * @param pos  the position in the source document at the beginning of the attribute list
	 * @param maxEnd  the maximum end position of the attribute list, or -1 if no maximum
	 * @return the {@link Attributes} starting at the specified position, or null if too many errors occur while parsing.
	 * @see StartTag#getAttributes()
	 * @see Segment#parseAttributes()
	 */
	public Attributes parseAttributes(int pos, int maxEnd) {
		final int defaultMaxErrorCount=Attributes.getDefaultMaxErrorCount();
		return parseAttributes(pos,maxEnd,defaultMaxErrorCount);
	}

	/**
	 * Parses any {@link Attributes} starting at the specified position, allowing at most the
	 * given number of minor errors.
	 * This method is only used in the unusual situation where attributes exist outside of a start tag.
	 * The {@link StartTag#getAttributes()} method should be used in normal situations.
	 * <p>
	 * Only returns null if the segment contains a major syntactical error
	 * or more than the specified number of minor syntactical errors.
	 * <p>
	 * The <code>maxErrorCount</code> argument overrides the default maximum number of minor errors allowed,
	 * which can be set using the {@link Attributes#setDefaultMaxErrorCount(int)} static method.
	 * <p>
	 * See {@link #parseAttributes(int pos, int maxEnd)} for more information.
	 *
	 * @param pos  the position in the source document at the beginning of the attribute list
	 * @param maxEnd  the maximum end position of the attribute list, or -1 if no maximum
	 * @param maxErrorCount  the maximum number of minor errors allowed while parsing
	 * @return the {@link Attributes} starting at the specified position, or null if too many errors occur while parsing.
	 * @see StartTag#getAttributes()
	 * @see #parseAttributes(int pos, int maxEnd)
	 */
	public Attributes parseAttributes(int pos, int maxEnd, int maxErrorCount) {
		final Attributes parsed=Attributes.construct(this,pos,maxEnd,maxErrorCount);
		return parsed;
	}

	/**
	 * Registers a range of the source text that is to be ignored when parsing.
	 * <p>
	 * This method is usually used to exclude server tags or other non-HTML segments from the source text
	 * so that they do not interfere with the parsing of the surrounding HTML.
	 * <p>
	 * This is necessary because many server tags are used as attribute values and in other places within
	 * HTML tags, and very often contain characters that prevent the parser from recognising the surrounding tag.
	 * <p>
	 * The first registration after the parse text was last built wraps the current parse text in an
	 * {@link OutputDocument} and discards the cached string; the string is rebuilt the next time it is requested.
	 * For efficiency reasons, all segments to be ignored should therefore be registered at once,
	 * without performing searches in between.
	 *
	 * @param begin  the beginning character position in the source text.
	 * @param end  the end character position in the source text.
	 * @see Segment#ignoreWhenParsing()
	 */
	public void ignoreWhenParsing(int begin, int end) {
		OutputDocument pending=parseTextLowerCaseOutputDocument;
		if (pending==null) {
			// Seed the document from the current parse text before dropping the cached copy.
			pending=new OutputDocument(getParseTextLowerCase());
			parseTextLowerCaseOutputDocument=pending;
			parseTextLowerCase=null;
		}
		pending.add(new BlankOutputSegment(begin,end));
	}

	/**
	 * Registers every segment in the given collection to be ignored when parsing.
	 * <p>
	 * This is equivalent to calling {@link Segment#ignoreWhenParsing()} on each segment in the collection,
	 * in iteration order.
	 *
	 * @param segments  a collection of {@link Segment} objects, must not be null.
	 */
	public void ignoreWhenParsing(Collection segments) {
		final Iterator remaining=segments.iterator();
		while (remaining.hasNext()) {
			final Segment segment=(Segment)remaining.next();
			segment.ignoreWhenParsing();
		}
	}

	/**
	 * Sets the destination for log messages.
	 * <p>
	 * By default, the log writer is set to null, which suppresses log messages.
	 *
	 * @param writer  the java.io.Writer where log messages will be sent, or null to suppress them
	 */
	public void setLogWriter(Writer writer) {
		this.logWriter=writer;
	}

	/**
	 * Gets a list of all the tags that have been parsed so far.
	 * <p>
	 * This information may be useful for debugging purposes.
	 * Execution of this method collects information from the internal cache and is relatively expensive.
	 * <p>
	 * The cache is obtained through {@link #getSearchCache()}, which creates it on first use,
	 * so this method is safe to call before any search has been performed.
	 *
	 * @return a list of all the tags that have been parsed so far.
	 */
	protected List getParsedTags() {
		// The searchCache field is created lazily; reading it directly would throw a
		// NullPointerException when no search has populated it yet.
		return getSearchCache().getTagList();
	}

	/**
	 * Returns the parse text in lower case, building and caching it if necessary.
	 * <p>
	 * The parse text is the text used when parsing, which is the same as the source text but with
	 * some segments replaced with spaces where the {@link #ignoreWhenParsing(int begin, int end)} method
	 * has been called.
	 * <p>
	 * When ignored ranges are pending, the text is produced from the pending output document, which is
	 * then released; otherwise it is the lower case form of the source text.
	 *
	 * @return the parse text in lower case.
	 */
	final String getParseTextLowerCase() {
		if (parseTextLowerCase!=null) return parseTextLowerCase;
		if (parseTextLowerCaseOutputDocument==null) {
			parseTextLowerCase=toString().toLowerCase();
		} else {
			parseTextLowerCase=parseTextLowerCaseOutputDocument.toString();
			parseTextLowerCaseOutputDocument=null;
		}
		return parseTextLowerCase;
	}

	/**
	 * Scans the source text for the end of an identifier.
	 * <p>
	 * When <code>fromStart</code> is true, the character at <code>pos</code> must satisfy
	 * <code>isIdentifierStart</code>, otherwise -1 is returned; scanning then continues with the
	 * following character. When it is false, scanning starts at <code>pos</code> itself.
	 * Scanning stops at the first character that does not satisfy <code>isIdentifierPart</code>.
	 * <p>
	 * No bounds checking is performed: if the scan runs past the end of the text, the exception
	 * raised by <code>CharSequence.charAt</code> propagates to the caller.
	 *
	 * @param pos  the position in the source text at which to start scanning.
	 * @param fromStart  whether <code>pos</code> is expected to be the first character of the identifier.
	 * @return the position of the first character that is not part of the identifier, or -1 if
	 *         <code>fromStart</code> is true and the character at <code>pos</code> cannot start an identifier.
	 */
	protected final int getIdentifierEnd(int pos, boolean fromStart) {
		int index=pos;
		if (fromStart) {
			if (!isIdentifierStart(text.charAt(index))) return -1;
			index++;
		}
		while (isIdentifierPart(text.charAt(index))) {
			index++;
		}
		return index;
	}

	/**
	 * Locates the end of a special tag by searching the lower case parse text for its end delimiter.
	 *
	 * @param pos  the position in the parse text from which to start searching.
	 * @param specialTag  the special tag whose end delimiter is searched for.
	 * @return the position immediately after the end delimiter, or -1 if the delimiter is not found.
	 */
	protected int findEnd(int pos, SpecialTag specialTag) {
		final String parseText=getParseTextLowerCase();
		final String endDelimiter=specialTag.getEndDelimiter();
		final int delimiterBegin=parseText.indexOf(endDelimiter,pos);
		if (delimiterBegin==-1) return -1;
		return delimiterBegin+endDelimiter.length();
	}

	/**
	 * Finds the StartTag with the given name that encloses the given position.
	 * <p>
	 * The previous start tag is looked up first and is accepted only if it ends after <code>pos</code>.
	 *
	 * @param pos  the position in the source document.
	 * @param name  the name of the StartTag to search for, or null for any name.
	 * @return the enclosing StartTag, or null if the position is not within a matching StartTag.
	 */
	private StartTag findEnclosingStartTag(int pos, String name) {
		final StartTag candidate=findPreviousStartTag(pos,name);
		final boolean encloses=candidate!=null && pos<candidate.end;
		return encloses ? candidate : null;
	}

	/**
	 * Writes a message followed by a newline to the log writer and flushes it.
	 * <p>
	 * The log writer is not checked for null here; the <code>log</code> methods in this class
	 * perform that check before calling this method.
	 *
	 * @param message  the text to write.
	 * @throws RuntimeException wrapping any IOException raised by the writer.
	 */
	private void logLine(String message) {
		final Writer out=logWriter;
		try {
			out.write(message);
			out.write('\n');
			out.flush();
		} catch (IOException ioException) {
			throw new RuntimeException(ioException);
		}
	}

	/**
	 * Logs a message prefixed with a source position, in the form <code>pos: message</code>.
	 * Does nothing when no log writer has been set.
	 *
	 * @param pos  the position in the source document the message relates to.
	 * @param message  the message text.
	 */
	protected void log(int pos, String message) {
		if (logWriter!=null) {
			logLine(pos+": "+message);
		}
	}

	/**
	 * Logs a message in the form
	 * <code>type [name] at begin message [at position pos]</code>.
	 * Does nothing when no log writer has been set.
	 *
	 * @param type  the leading text of the log line, must not be null.
	 * @param name  an optional name appended after the type, omitted when null.
	 * @param begin  the position reported after the first " at ".
	 * @param message  the message text.
	 * @param pos  an additional position appended as " at position pos", omitted when -1.
	 */
	void log(String type, String name, int begin, String message, int pos) {
		if (logWriter==null) return;
		final StringBuffer line=new StringBuffer(type);
		if (name!=null) {
			line.append(' ');
			line.append(name);
		}
		line.append(" at ");
		line.append(begin);
		line.append(' ');
		line.append(message);
		if (pos!=-1) {
			line.append(" at position ");
			line.append(pos);
		}
		logLine(line.toString());
	}

	/** The search cache for this source; null until {@link #getSearchCache()} is first called. */
	private SearchCache searchCache=null;

	/**
	 * Returns the search cache for this source, creating it on first use.
	 *
	 * @return the search cache, never null.
	 */
	final SearchCache getSearchCache() {
		SearchCache cache=searchCache;
		if (cache==null) {
			cache=new SearchCache();
			searchCache=cache;
		}
		return cache;
	}
}