All downloads are free. The search and download functions use the official Maven repository.

src.au.id.jericho.lib.html.Source Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a simple but powerful java library allowing analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions.

There is a newer version: 2.3
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 1.5
// Copyright (C) 2004 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.util.*;
import java.io.*;

/**
 * Represents a source HTML document.
 * 

* Note that many of the useful functions which can be performed on the source document are * defined in its superclass, {@link Segment}. * The Source object is itself a Segment which spans the entire document. *

* Most of the methods defined in this class are useful for determining the elements and tags * surrounding or neighbouring a particular character position in the document. *

* IMPORTANT NOTE: Because HTML allows '<' characters within attribute values * (see section 5.3.2 of the HTML spec), * it is theoretically impossible to determine with certainty whether * any given '<' character in a source document is the start of a tag * without having parsed from the beginning of the document (which Jericho HTML Parser doesn't do). * For this reason, the parser may reject a start tag completely if its attributes are not * properly formed, although it does try to provide some leniency. * In XHTML, such characters must be represented in attribute values as character entities. * (see section 3.1 of the XML spec) * * @see Segment */ public class Source extends Segment { protected CharSequence text; private String parseTextLowerCase=null; private OutputDocument parseTextLowerCaseOutputDocument=null; private Writer logWriter=null; /** * Constructs a new Source object with the specified text. * @param text the source text. */ public Source(CharSequence text) { super(0,text.length()); source=this; this.text=text; } /** * Returns the source text as a String. *

* If the original CharSequence supplied when this instance was constructed was not a String, * the first conversion of the text to a String is cached for subsequent calls. * * @return the source text as a String. */ public String toString() { String string=text.toString(); if (text!=string) text=string; // make sure CharSequence is only converted to String once return string; } /** * Returns the {@link Element} with the specified id attribute value. *

* This simulates the script method * getElementById * defined in DOM HTML level 1. *

* This is equivalent to findNextStartTag(0,"id",id,true).getElement(). *

* A well formed HTML document should have no more than one element with any given id attribute value. *

* Calls to this method are not cached. * * @param id the id attribute value (case sensitive) to search for, must not be null. * @return the {@link Element} with the specified id attribute value. */ public Element getElementById(String id) { StartTag startTag=findNextStartTag(5,Attribute.ID,id,true); return startTag==null ? null : startTag.getElement(); } /** * Returns the StartTag at or immediately preceding (or enclosing) the specified position in the source document. *

* If the specified position is within an HTML {@linkplain Segment#isComment() comment}, the segment * spanning the comment is returned. * * @param pos the position in the source document from which to start the search. * @return the StartTag immediately preceding the specified position in the source document, or null if none exists. */ public StartTag findPreviousStartTag(int pos) { return findPreviousStartTag(pos,null); } /** * Returns the StartTag with the specified name at or immediately preceding (or enclosing) the specified position in the source document. *

* Start tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored, but the comment segment itself is treated as a start tag. *

* Specifying a null name parameter is equivalent to {@link #findPreviousStartTag(int) findPreviousStartTag(pos)}. * * @param pos the position in the source document from which to start the search. * @param name the {@linkplain StartTag#getName() name} of the StartTag to search for. * @return the StartTag with the specified name immediately preceding the specified position in the source document, or null if none exists. */ public StartTag findPreviousStartTag(int pos, String name) { if (name!=null) name=name.toLowerCase(); return StartTag.findPreviousOrNext(this,pos,name,true); } /** * Returns the StartTag beginning at or immediately following the specified position in the source document. *

* StartTags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored, but subsequent comment segments are treated as start tags. * * @param pos the position in the source document from which to start the search. * @return the StartTag beginning at or immediately following the specified position in the source document, or null if none exists. */ public StartTag findNextStartTag(int pos) { return findNextStartTag(pos,null); } /** * Returns the StartTag with the specified name beginning at or immediately following the specified position in the source document. *

* Start tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored. *

* Specifying a null name parameter is equivalent to {@link #findNextStartTag(int) findNextStartTag(pos)}. *

* Specifying a name parameter ending in a colon (:) searches for all start tags in the specified XML namespace. * * @param pos the position in the source document from which to start the search. * @param name the {@linkplain StartTag#getName() name} of the StartTag to search for. * @return the StartTag with the specified name beginning at or immediately following the specified position in the source document, or null if none exists. */ public StartTag findNextStartTag(int pos, String name) { if (name!=null) name=name.toLowerCase(); return StartTag.findPreviousOrNext(this,pos,name,false); } /** * Returns the StartTag with the specified attribute name/value pair beginning at or immediately following the specified position in the source document. *

* Calls to this method are not cached. * * @param pos the position in the source document from which to start the search. * @param attributeName the attribute name (case insensitive) to search for, must not be null. * @param value the value of the specified attribute to search for, must not be null. * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive. * @return the StartTag with the specified attribute name/value pair beginning at or immediately following the specified position in the source document. */ public StartTag findNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive) { return StartTag.findNext(this,pos,attributeName,value,valueCaseSensitive); } /** * Returns the Comment beginning at or immediately following the specified position in the source document. *

* If the specified position is within a comment, the comment following the enclosing comment is returned. * * @param pos the position in the source document from which to start the search. * @return the Comment beginning at or immediately following the specified position in the source document, or null if none exists. */ public StartTag findNextComment(int pos) { return findNextStartTag(pos,SpecialTag.COMMENT.getName()); } /** * Returns the EndTag with the specified name at or immediately preceding (or enclosing) the specified position in the source document. *

* End tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored. * * @param pos the position in the source document from which to start the search. * @param name the {@linkplain StartTag#getName() name} of the EndTag to search for, must not be null. * @return the EndTag immediately preceding the specified position in the source document, or null if none exists. */ public EndTag findPreviousEndTag(int pos, String name) { if (name==null) throw new IllegalArgumentException(); return EndTag.findPreviousOrNext(this,pos,name.toLowerCase(),true); } /** * Returns the EndTag beginning at or immediately following the specified position in the source document. *

* End tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored. * * @param pos the position in the source document from which to start the search. * @return the EndTag beginning at or immediately following the specified position in the source document, or null if none exists. */ public EndTag findNextEndTag(int pos) { return EndTag.findNext(this,pos); } /** * Returns the EndTag with the specified name beginning at or immediately following the specified position in the source document. *

* End tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored. * * @param pos the position in the source document from which to start the search. * @param name the {@linkplain StartTag#getName() name} of the EndTag to search for, must not be null. * @return the EndTag with the specified name beginning at or immediately following the specified position in the source document, or null if none exists. */ public EndTag findNextEndTag(int pos, String name) { if (name==null) throw new IllegalArgumentException(); return EndTag.findPreviousOrNext(this,pos,name.toLowerCase(),false); } /** * Returns an iterator of {@link Tag} objects beginning at or immediately following the specified position in the source document. *

* Tags positioned within an HTML {@linkplain Segment#isComment() comment} are ignored, but the comment segments themselves are treated as start tags. * * @param pos the position in the source document from which to start the iteration. * @return an iterator of {@link Tag} objects beginning at or immediately following the specified position in the source document. */ public Iterator getNextTagIterator(int pos) { return Tag.getNextTagIterator(this,pos); } /** * Returns the tag (either a {@link StartTag} or {@link EndTag}) beginning at or immediately following the specified position in the source document. *

* IMPLEMENTATION NOTE: Sequential tags in a document should be retrieved using the iterator from * {@link #getNextTagIterator(int pos)} as it is far more efficient than using multiple calls to this method. * * @param pos the position in the source document from which to start the search. * @return the tag beginning at or immediately following the specified position in the source document, or null if none exists. * @see #getNextTagIterator(int pos) */ public Tag findNextTag(int pos) { Iterator i=getNextTagIterator(pos); return i.hasNext() ? (Tag)i.next() : null; } /** * Returns the StartTag enclosing the specified position in the source document. *

* If the specified position is within an HTML {@linkplain Segment#isComment() comment}, the segment * spanning the comment is returned. *

* A segment is considered to enclose a character position x if
segment.getBegin() <= x < segment.getEnd() * * @param pos the position in the source document. * @return the StartTag enclosing the specified position in the source document, or null if the position is not within a StartTag. */ public StartTag findEnclosingStartTag(int pos) { return findEnclosingStartTag(pos,null); } /** * Returns a Segment spanning the HTML {@linkplain Segment#isComment() comment} that encloses the specified position in the source document. *

* A segment is considered to enclose a character position x if
segment.getBegin() <= x < segment.getEnd() * * @param pos the position in the source document. * @return a Segment spanning the HTML {@linkplain Segment#isComment() comment} that encloses the specified position in the source document, or null if the position is not within a comment. */ public Segment findEnclosingComment(int pos) { return findEnclosingStartTag(pos,SpecialTag.COMMENT.getName()); } /** * Returns the most nested Element enclosing the specified position in the source document. *

* If the specified position is within an HTML {@linkplain Segment#isComment() comment}, the segment * spanning the comment is returned. *

* A segment is considered to enclose a character position x if
segment.getBegin() <= x < segment.getEnd() * * @param pos the position in the source document. * @return the most nested Element enclosing the specified position in the source document, or null if the position is not within an Element. */ public Element findEnclosingElement(int pos) { return findEnclosingElement(pos,null); } /** * Returns the most nested Element with the specified name enclosing the specified position in the source document. *

* Elements positioned within an HTML {@linkplain Segment#isComment() comment} are ignored, but the comment segment itself is treated as an Element. * * @param pos the position in the source document. * @param name the {@linkplain Element#getName() name} of the Element to search for. * @return the most nested Element with the specified name enclosing the specified position in the source document, or null if none exists. */ public Element findEnclosingElement(int pos, String name) { int startBefore=pos; if (name!=null) name=name.toLowerCase(); while (true) { StartTag startTag=findPreviousStartTag(startBefore,name); if (startTag==null) return null; Element element=startTag.getElement(); if (pos < element.end) return element; startBefore=startTag.begin-1; } } /** * Returns the CharacterReference at or immediately preceding (or enclosing) the specified position in the source document. *

* Character references positioned within an HTML {@linkplain Segment#isComment() comment} are NOT ignored. * * @param pos the position in the source document from which to start the search. * @return the CharacterReference beginning at or immediately preceding the specified position in the source document, or null if none exists. */ public CharacterReference findPreviousCharacterReference(int pos) { return CharacterReference.findPreviousOrNext(this,pos,true); } /** * Returns the CharacterReference beginning at or immediately following the specified position in the source document. *

* Character references positioned within an HTML {@linkplain Segment#isComment() comment} are NOT ignored. * * @param pos the position in the source document from which to start the search. * @return the CharacterReference beginning at or immediately following the specified position in the source document, or null if none exists. */ public CharacterReference findNextCharacterReference(int pos) { return CharacterReference.findPreviousOrNext(this,pos,false); } /** * Parses any {@link Attributes} starting at the specified position. * This method is only used in the unusual situation where attributes exist outside of a start tag. * The {@link StartTag#getAttributes()} method should be used in normal situations. *

* The returned Attributes segment will always begin at pos, * and will end at the first occurrence of "/>" or ">" outside of a quoted attribute value, * or at maxEnd, whichever comes first. *

* Only returns null if the segment contains a major syntactical error * or more than the {@linkplain Attributes#setDefaultMaxErrorCount(int) default maximum} number of * minor syntactical errors. *

* This is equivalent to * {@link #parseAttributes(int,int,int) parseAttributes(pos,maxEnd,Attributes.getDefaultMaxErrorCount())} * * @param pos the position in the source document at the beginning of the attribute list * @param maxEnd the maximum end position of the attribute list, or -1 if no maximum * @return the {@link Attributes} starting at the specified position, or null if too many errors occur while parsing. * @see StartTag#getAttributes() * @see Segment#parseAttributes() */ public Attributes parseAttributes(int pos, int maxEnd) { return parseAttributes(pos,maxEnd,Attributes.getDefaultMaxErrorCount()); } /** * Parses any {@link Attributes} starting at the specified position. * This method is only used in the unusual situation where attributes exist outside of a start tag. * The {@link StartTag#getAttributes()} method should be used in normal situations. *

* Only returns null if the segment contains a major syntactical error * or more than the specified number of minor syntactical errors. *

* The maxErrorCount argument overrides the default maximum number of minor errors allowed, * which can be set using the {@link Attributes#setDefaultMaxErrorCount(int)} static method. *

* See {@link #parseAttributes(int pos, int maxEnd)} for more information. * * @param pos the position in the source document at the beginning of the attribute list * @param maxEnd the maximum end position of the attribute list, or -1 if no maximum * @param maxErrorCount the maximum number of minor errors allowed while parsing * @return the {@link Attributes} starting at the specified position, or null if too many errors occur while parsing. * @see StartTag#getAttributes() * @see #parseAttributes(int pos, int MaxEnd) */ public Attributes parseAttributes(int pos, int maxEnd, int maxErrorCount) { return Attributes.construct(this,pos,maxEnd,maxErrorCount); } /** * Causes the specified range of the source text to be ignored when parsing. *

* This method is usually used to exclude server tags or other non-HTML segments from the source text * so that it does not interfere with the parsing of the surrounding HTML. *

* This is necessary because many server tags are used as attribute values and in other places within * HTML tags, and very often contain characters that prevent the parser from recognising the surrounding tag. *

* For efficiency reasons, all segments to be ignored should be registered at once, without performing * searches in between. * * @param begin the beginning character position in the source text. * @param end the end character position in the source text. * @see Segment#ignoreWhenParsing() */ public void ignoreWhenParsing(int begin, int end) { if (parseTextLowerCaseOutputDocument==null) { parseTextLowerCaseOutputDocument=new OutputDocument(getParseTextLowerCase()); parseTextLowerCase=null; } parseTextLowerCaseOutputDocument.add(new BlankOutputSegment(begin,end)); } /** * Causes all of the segments in the specified collection to be ignored when parsing. *

* This is equivalent to calling {@link Segment#ignoreWhenParsing()} on each segment in the collection. */ public void ignoreWhenParsing(Collection segments) { for (Iterator i=segments.iterator(); i.hasNext();) { ((Segment)i.next()).ignoreWhenParsing(); } } /** * Sets the destination for log messages. *

* By default, the log writer is set to null, which supresses log messages. * * @param writer the java.io.Writer where log messages will be sent */ public void setLogWriter(Writer writer) { logWriter=writer; } /** * Gets a list of all the tags that have been parsed so far. *

* This information may be useful for debugging purposes. * Execution of this method collects information from the internal cache and is relatively expensive. * * @return a list of all the tags that have been parsed so far. */ protected List getParsedTags() { return searchCache.getTagList(); } /** * Returns the parse text in lower case. *

* The parse text is the text used when parsing, which is the same as the source text but with * some segments replaced with spaces where the {@link #ignoreWhenParsing(int begin, int end)} method * has been called. * * @return the parse text in lower case. */ final String getParseTextLowerCase() { if (parseTextLowerCase==null) { if (parseTextLowerCaseOutputDocument!=null) { parseTextLowerCase=parseTextLowerCaseOutputDocument.toString(); parseTextLowerCaseOutputDocument=null; } else { parseTextLowerCase=toString().toLowerCase(); } } return parseTextLowerCase; } protected final int getIdentifierEnd(int pos, boolean fromStart) { if (fromStart && !isIdentifierStart(text.charAt(pos++))) return -1; while (true) { if (!isIdentifierPart(text.charAt(pos))) return pos; pos++; } } protected int findEnd(int pos, SpecialTag specialTag) { int delimiterBegin=getParseTextLowerCase().indexOf(specialTag.getEndDelimiter(),pos); return (delimiterBegin==-1 ? -1 : delimiterBegin+specialTag.getEndDelimiter().length()); } private StartTag findEnclosingStartTag(int pos, String name) { StartTag startTag=findPreviousStartTag(pos,name); if (startTag==null || startTag.end<=pos) return null; return startTag; } private void logLine(String message) { try { logWriter.write(message); logWriter.write('\n'); logWriter.flush(); } catch (IOException ex) { throw new RuntimeException(ex); } } protected void log(int pos, String message) { if (logWriter==null) return; logLine(pos+": "+message); } void log(String type, String name, int begin, String message, int pos) { if (logWriter==null) return; StringBuffer sb=new StringBuffer(type); if (name!=null) sb.append(' ').append(name); sb.append(" at ").append(begin).append(' ').append(message); if (pos!=-1) sb.append(" at position ").append(pos); logLine(sb.toString()); } final SearchCache getSearchCache() { if (searchCache==null) searchCache=new SearchCache(); return searchCache; } private SearchCache searchCache=null; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy