All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.au.id.jericho.lib.html.Attributes Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a simple but powerful java library allowing analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions.

There is a newer version: 2.3
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 1.5
// Copyright (C) 2004 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import au.id.jericho.lib.html.internal.*;
import java.util.*;
import java.io.*;

/**
 * Represents the list of {@link Attribute} objects present within a particular {@link StartTag}.
 * 

* The attributes in this list are a representation of those found in the source document and are not modifiable. * The {@link AttributesOutputSegment} class provides a means to add, delete or modify attributes and * their values for inclusion in an {@link OutputDocument}. *

* This segment starts at the end of the StartTag's {@linkplain StartTag#getName() name} * and ends at the end of the last attribute. *

* Note that before version 1.5 the segment ended just before the closing '/', '?' or '>' character of the StartTag * instead of at the end of the last attribute. *

* Created using the {@link StartTag#getAttributes()} method, or explicitly using the {@link Source#parseAttributes(int pos, int maxEnd)} method. *

* It is possible (and common) for instances of this class to contain no attributes. *

* See also the XML 1.0 specification for attributes. * * @see StartTag * @see Attribute */ public final class Attributes extends SequentialListSegment { private LinkedList attributeList; // never null // parsing states: private static final int AFTER_TAG_NAME=0; private static final int BETWEEN_ATTRIBUTES=1; private static final int IN_NAME=2; private static final int AFTER_NAME=3; // this only happens if an attribute name is followed by whitespace private static final int START_VALUE=4; private static final int IN_VALUE=5; private static final int AFTER_VALUE_FINAL_QUOTE=6; private static int defaultMaxErrorCount=1; // defines maximum number of minor errors that can be encountered in attributes before entire start tag is rejected. private Attributes(Source source, int begin, int end, LinkedList attributeList) { super(source,begin,end); this.attributeList=attributeList; } /** * called from Source.parseAttributes */ static Attributes construct(Source source, int begin, int maxEnd, int maxErrorCount) { return construct(source,"Attributes",BETWEEN_ATTRIBUTES,begin,-1,maxEnd,null,null,maxErrorCount); } /** * called from StartTag.parseAttributes */ static Attributes construct(Source source, int startTagBegin, int attributesBegin, int maxEnd, String startTagName, int maxErrorCount) { return construct(source,"Attributes for StartTag",BETWEEN_ATTRIBUTES,startTagBegin,attributesBegin,maxEnd,null,startTagName,maxErrorCount); } /** * called from StartTag.constructWithAttributes */ static Attributes construct(Source source, int startTagBegin, ByRefInt startTagEndDelimiterPos, String startTagName) { return construct(source,"StartTag",AFTER_TAG_NAME,startTagBegin,-1,-1,startTagEndDelimiterPos,startTagName,defaultMaxErrorCount); } /** * Any < character found within the start tag is treated as though it is part of the attribute * list, which is consistent with the way IE treats it. * A processing instruction will be terminated by > as well as ?>, which is also consistent with IE. * In some cases an invalid character will result in the entire start tag being rejected. * This may seem ruthless, but we have to be able to distinguish whether any * particular < found in the source is actually the start of a tag or not. * Being too lenient with attributes means more chance of false positives, which in turn * means surrounding tags may be ignored. * @param source the source document. * @param logBegin the position of the beginning of the object being searched (for logging) * @param attributesBegin the position of the beginning of the attribute list, or -1 if it should be calculated automatically from logBegin. * @param maxEnd the position at which the attributes must end if a terminating character is not found, or -1 if no maximum. * @param startTagName the name of the enclosing StartTag, or null if constucting attributes directly. */ private static Attributes construct(Source source, String logType, int state, int logBegin, int attributesBegin, int maxEnd, ByRefInt startTagEndDelimiterPos, String startTagName, int maxErrorCount) { char optionalTerminatingChar='/'; if (startTagName!=null) { // 'logBegin' parameter is the start of the associated start tag if (attributesBegin==-1) attributesBegin=logBegin+1+startTagName.length(); if (startTagName.charAt(0)=='?') optionalTerminatingChar='?'; // optionalTerminatingChar will normally be '/' but can also be '?' for xml processing instructions like } else { attributesBegin=logBegin; } int attributesEnd=attributesBegin; LinkedList attributeList=new LinkedList(); String lsource=source.getParseTextLowerCase(); int i=attributesBegin; char quote=' '; Segment nameSegment=null; String key=null; int currentBegin=-1; boolean isTerminatingCharacter=false; int errorCount=0; try { while (!isTerminatingCharacter) { char c=lsource.charAt(i); if (c=='>' || i==maxEnd || (c==optionalTerminatingChar && lsource.charAt(i+1)=='>')) isTerminatingCharacter=true; switch (state) { case IN_VALUE: if (isTerminatingCharacter || c==quote || (quote==' ' && isWhiteSpace(c))) { Segment valueSegment; Segment valueSegmentIncludingQuotes; if (quote==' ') { valueSegment=valueSegmentIncludingQuotes=new Segment(source,currentBegin,i); } else { if (isTerminatingCharacter) { if (i==maxEnd) { source.log(logType,startTagName,logBegin,"terminated in the middle of a quoted attribute value",i); if (reachedMaxErrorCount(++errorCount,source,logType,startTagName,logBegin,maxErrorCount)) return null; valueSegment=new Segment(source,currentBegin,i); valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i); // this is missing the end quote } else { // don't want to terminate, only encountered a terminating character in the middle of a quoted value isTerminatingCharacter=false; break; } } else { valueSegment=new Segment(source,currentBegin,i); valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i+1); } } attributeList.add(new Attribute(source, key, nameSegment, valueSegment, valueSegmentIncludingQuotes)); attributesEnd=valueSegmentIncludingQuotes.getEnd(); state=BETWEEN_ATTRIBUTES; } else if (c=='<' && quote==' ') { source.log(logType,startTagName,logBegin,"rejected because of '<' character in unquoted attribute value",i); return null; } break; case IN_NAME: if (isTerminatingCharacter || c=='=' || isWhiteSpace(c)) { nameSegment=new Segment(source,currentBegin,i); key=nameSegment.toString().toLowerCase(); if (isTerminatingCharacter) { attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value attributesEnd=i; } else { state=(c=='=' ? START_VALUE : AFTER_NAME); } } else if (!isIdentifierPart(c)) { // invalid character detected in attribute name. // only reject whole start tag if it is a < character or if the error count is exceeded. if (c=='<') { source.log(logType,startTagName,logBegin,"rejected because of '<' character in attribute name",i); return null; } source.log(logType,startTagName,logBegin,"contains attribute name with invalid character",i); if (reachedMaxErrorCount(++errorCount,source,logType,startTagName,logBegin,maxErrorCount)) return null; } break; case AFTER_NAME: if (isTerminatingCharacter || !(c=='=' || isWhiteSpace(c))) { attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value attributesEnd=nameSegment.getEnd(); if (isTerminatingCharacter) break; // The current character is the first character of an attribute name state=BETWEEN_ATTRIBUTES; i--; // want to reparse the same character again, so decrement i. Note we could instead just fall into the next case statement without a break, but such code is always discouraged. } else if (c=='=') { state=START_VALUE; } break; case BETWEEN_ATTRIBUTES: if (!isTerminatingCharacter) { // the quote variable is used here to make sure whitespace has come after the last quoted attribute value if (isWhiteSpace(c)) { quote=' '; } else { if (quote!=' ') { source.log(logType,startTagName,logBegin,"has missing whitespace after quoted attribute value",i); // log this as an error but don't count it } if (!isIdentifierStart(c)) { // invalid character detected as first character of attribute name. // only reject whole start tag if it is a < character or if the error count is exceeded. if (c=='<') { source.log(logType,startTagName,logBegin,"rejected because of '<' character",i); return null; } source.log(logType,startTagName,logBegin,"contains attribute name with invalid first character",i); if (reachedMaxErrorCount(++errorCount,source,logType,startTagName,logBegin,maxErrorCount)) return null; } state=IN_NAME; currentBegin=i; } } break; case START_VALUE: currentBegin=i; if (isTerminatingCharacter) { source.log(logType,startTagName,logBegin,"has missing attribute value after '=' sign",i); // log this as an error but don't count it Segment valueSegment=new Segment(source,i,i); attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegment)); attributesEnd=i; state=BETWEEN_ATTRIBUTES; break; } if (isWhiteSpace(c)) break; // just ignore whitespace after the '=' sign as nearly all browsers do. if (c=='<') { source.log(logType,startTagName,logBegin,"rejected because of '<' character at start of attribuite value",i); return null; } else if (c=='\'' || c=='"') { quote=c; currentBegin++; } else { quote=' '; } state=IN_VALUE; break; case AFTER_TAG_NAME: if (!isTerminatingCharacter) { if (!isWhiteSpace(c)) { source.log(logType,startTagName,logBegin,"rejected because name contains invalid character",i); return null; } state=BETWEEN_ATTRIBUTES; } break; } i++; } if (startTagEndDelimiterPos!=null) startTagEndDelimiterPos.value=i-1; return new Attributes(source,attributesBegin,attributesEnd,attributeList); // used to end at i-1 } catch (IndexOutOfBoundsException ex) { source.log(logType,startTagName,logBegin,"rejected because it has no closing '>' character",-1); return null; } } private static boolean reachedMaxErrorCount(int errorCount, Source source, String logType, String startTagName, int logBegin, int maxErrorCount) { if (errorCount<=maxErrorCount) return false; source.log(logType,startTagName,logBegin,"rejected because it contains too many errors",-1); return true; } /** * Returns the {@link Attribute} with the specified name (case insensitive). *

* If more than one attribute exists with the specified name (which is technically illegal HTML), * the first is returned. * * @param name the name of the attribute to get. * @return the attribute with the specified name, or null if no attribute with the specified name exists. * @see #getValue(String name) */ public Attribute get(String name) { if (size()==0) return null; for (int i=0; i * Returns null if no attribute with the specified name exists or no value has been assigned to * the attribute. *

* This is equivalent to get(name).getValue(), although it will return null * if no attribute with the specified name exists instead of throwing a * NullPointerException. *

* Note that before version 1.5 this method returned the raw value of the attribute, without * {@linkplain CharacterReference#decode(CharSequence) decoding}. * * @param name the name of the attribute to get. * @return the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name, or null if the attribute has no value. * @see #get(String name) */ public String getValue(String name) { Attribute attribute=get(name); return attribute==null ? null : attribute.getValue(); } /** * Returns the number of attributes. *

* This is equivalent to calling the size() method specified in the List interface. * * @return the number of attributes. */ public int getCount() { return attributeList.size(); } /** * Returns an iterator over the {@link Attribute} objects in this list in proper sequence. * @return an iterator over the {@link Attribute} objects in this list in proper sequence. */ public Iterator iterator() { return listIterator(); } /** * Returns a list iterator of the {@link Attribute} objects in this list (in proper sequence), * starting at the specified position in the list. *

* The specified index indicates the first item that would be returned by an initial call to the next() method. * An initial call to the previous() method would return the item with the specified index minus one. * * @param index the index of the first item to be returned from the list iterator (by a call to the next() method). * @return a list iterator of the items in this list (in proper sequence), starting at the specified position in the list. * @throws IndexOutOfBoundsException if the specified index is out of range (index < 0 || index > size()). */ public ListIterator listIterator(final int index) { return new ListIteratorImpl(index); } protected ListIterator internalListIterator(int index) { return attributeList.listIterator(index); } /** * Populates the specified Map with the name/value pairs from these attributes. *

* Both names and values are stored as String objects. *

* The entries are added in order of apprearance in the source document. *

* An attribute with no value is represented by a map entry with a null value. *

* Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} * before storage in the map. * * @param attributesMap the map to populate, must not be null. * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map. * @return the same map specified as the attributesMap argument, populated with the name/value pairs from these attributes. * @see #generateHTML(Map attributesMap) */ public Map populateMap(Map attributesMap, boolean convertNamesToLowerCase) { for (Iterator i=internalListIterator(0); i.hasNext();) { Attribute attribute=(Attribute)i.next(); attributesMap.put(convertNamesToLowerCase ? attribute.getKey() : attribute.getName(),attribute.getValue()); } return attributesMap; } public String getDebugInfo() { StringBuffer sb=new StringBuffer(); sb.append("Attributes ").append(super.getDebugInfo()).append(": "); if (isEmpty()) { sb.append("EMPTY"); } else { sb.append('\n'); for (Iterator i=internalListIterator(0); i.hasNext();) { Attribute attribute=(Attribute)i.next(); sb.append(" ").append(attribute.getDebugInfo()); } } return sb.toString(); } /** * Returns the default maximum error count allowed when parsing attributes. *

* The system default value is 1. * * @return the default maximum error count allowed when parsing attributes. * @see #setDefaultMaxErrorCount(int value) * @see Source#parseAttributes(int pos, int maxEnd, int maxErrorCount) */ public static int getDefaultMaxErrorCount() { return defaultMaxErrorCount; } /** * Sets the default maximum error count allowed when parsing attributes. *

* When searching for start tags, the parser can find the end of the start tag only by parsing * the the attributes, as it is valid HTML for attribute values to contain '>' characters * (see section 5.3.2 of the HTML spec). *

* If the source text being parsed does not follow the syntax of an attribute list at all, the parser assumes * that the text which was originally identified as the beginning of of a start tag is in fact some other text, * such as an invalid '<' character in the middle of some text, or part of a script element. * In this case the entire start tag is rejected. *

* On the other hand, it is quite common for attributes to contain minor syntactical errors, * such as an invalid character in an attribute name, or a couple of special characters in * {@linkplain StartTag#isServerTag() server tags} that otherwise contain only attributes. * For this reason the parser allows a certain number of minor errors to occur while parsing an * attribute list before the entire start tag or attribute list is rejected. * This method sets the number of minor errors allowed. *

* Major syntactical errors will cause the start tag or attribute list to be rejected immediately, regardless * of the maximum error count setting. *

* Some errors are considered too minor to count at all (ignorable), such as missing whitespace between the end * of a quoted attribute value and the start of the next attribute name. *

* The classification of particular syntax errors in attribute lists into major, minor, and ignorable is * not part of the specification and may change in future versions. *

* To track errors as they occur, use the {@link Source#setLogWriter(Writer writer)} method to set the * destination of the error log. * * @param value the default maximum error count allowed when parsing attributes. * @see #getDefaultMaxErrorCount() * @see Source#parseAttributes(int pos, int maxEnd, int maxErrorCount) * @see Source#setLogWriter(Writer writer) */ public static void setDefaultMaxErrorCount(int value) { defaultMaxErrorCount=value; } /** * Returns the raw (not {@linkplain CharacterReference#decode(CharSequence) decoded}) value of the attribute, or null if the attribute has no value. * @return the raw (not {@linkplain CharacterReference#decode(CharSequence) decoded}) value of the attribute, or null if the attribute has no value. */ protected String getRawValue(String name) { Attribute attribute=get(name); return attribute==null || !attribute.hasValue() ? null : attribute.getValueSegment().toString(); } /** * Returns the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs. *

* Each attribute (including the first) is preceded by a single space, and all values are * {@linkplain CharacterReference#encode(CharSequence) encoded} and enclosed in double quotes. *

* The map keys must be of type String and values must be objects that implement the CharSequence interface. *

* A null value represents an attribute with no value. * * @param attributesMap a map containing attribute name/value pairs. * @return the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs. * @see StartTag#generateHTML(String tagName, Map attributesMap, boolean emptyElementTag) */ public static String generateHTML(Map attributesMap) { StringWriter stringWriter=new StringWriter(); try {appendHTML(stringWriter,attributesMap);} catch (IOException ex) {} // IOException never occurs in StringWriter return stringWriter.toString(); } /** * Returns this instance. *

* This method has been deprecated as of version 1.5 as the Attributes class now implements * the List interface, so the instance itself can be used instead. * * @return this instance. * @deprecated use this instance instead. */ public List getList() { return this; } /** * Outputs the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs to the specified Writer. *

* Each attribute is preceded by a single space, and all values are * {@linkplain CharacterReference#encode(CharSequence) encoded} and enclosed in double quotes. * * @param out the Writer to which the output is to be sent. * @param attributesMap a map containing attribute name/value pairs. * @throws IOException if an I/O exception occurs. * @see #populateMap(Map attributesMap, boolean convertNamesToLowerCase) */ protected static void appendHTML(Writer writer, Map attributesMap) throws IOException { for (Iterator i=attributesMap.entrySet().iterator(); i.hasNext();) { Map.Entry entry=(Map.Entry)i.next(); Attribute.appendHTML(writer,(String)entry.getKey(),(CharSequence)entry.getValue()); } } protected StringBuffer appendRegeneratedHTML(StringBuffer sb) { for (Iterator i=internalListIterator(0); i.hasNext();) ((Attribute)i.next()).appendRegeneratedHTML(sb); return sb; } protected Map getMap(boolean convertNamesToLowerCase) { return populateMap(new LinkedHashMap(getCount()*2,1.0F),convertNamesToLowerCase); } private class ListIteratorImpl implements ListIterator { private ListIterator listIterator; public ListIteratorImpl(int index) { listIterator=attributeList.listIterator(); } public boolean hasNext() { return listIterator.hasNext(); } public Object next() { return listIterator.next(); } public boolean hasPrevious() { return listIterator.hasPrevious(); } public Object previous() { return listIterator.previous(); } public int nextIndex() { return listIterator.nextIndex(); } public int previousIndex() { return listIterator.previousIndex(); } public void remove() { throw new UnsupportedOperationException(); } public void set(Object o) { throw new UnsupportedOperationException(); } public void add(Object o) { throw new UnsupportedOperationException(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy