All downloads are free. The search and download functionality uses the official Maven repository.

net.htmlparser.jericho.Attributes Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a Java library that allows analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.

There is a newer version: 3.4
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.3
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import net.htmlparser.jericho.nodoc.*;
import java.util.*;
import java.io.*;

/**
 * Represents the list of {@link Attribute} objects present within a particular {@link StartTag}.
 * 

* This segment starts at the end of the start tag's {@linkplain StartTag#getName() name} * and ends at the end of the last attribute. *

* The attributes in this list are a representation of those found in the source document and are not modifiable. * The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods * provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}. *

* Any {@linkplain TagType#isServerTag() server tags} encountered inside the attributes area of a non-server tag * do not interfere with the parsing of the attributes. *

* If too many syntax errors are encountered while parsing a start tag's attributes, the parser rejects the entire start tag * and generates a {@linkplain Source#getLogger() log} entry. * The threshold for the number of errors allowed can be set using the {@link #setDefaultMaxErrorCount(int)} static method. *

* Obtained using the {@link StartTag#getAttributes()} method, or explicitly using the {@link Source#parseAttributes(int pos, int maxEnd)} method. *

* It is common for instances of this class to contain no attributes. *

* See also the XML 1.0 specification for attributes. * * @see StartTag * @see Attribute */ public final class Attributes extends SequentialListSegment { private final LinkedList attributeList; // never null final boolean containsServerTagOutsideOfAttributeValue; private enum ParsingState { AFTER_TAG_NAME, BETWEEN_ATTRIBUTES, IN_NAME, AFTER_NAME, // this only happens if an attribute name is followed by whitespace START_VALUE, IN_VALUE, AFTER_VALUE_FINAL_QUOTE } private static int defaultMaxErrorCount=2; // defines maximum number of minor errors that can be encountered in attributes before entire start tag is rejected. private Attributes(final Source source, final int begin, final int end, final LinkedList attributeList, final boolean containsServerTagOutsideOfAttributeValue) { super(source,begin,end); this.attributeList=attributeList; this.containsServerTagOutsideOfAttributeValue=containsServerTagOutsideOfAttributeValue; } /** called from StartTagType.parseAttributes(Source, int startTagBegin, String tagName) */ static Attributes construct(final Source source, final int startTagBegin, final StartTagType startTagType, final String tagName) { return construct(source,"StartTag",ParsingState.AFTER_TAG_NAME,startTagBegin,-1,-1,startTagType,tagName,defaultMaxErrorCount); } /** called from StartTag.parseAttributes(int maxErrorCount) */ static Attributes construct(final Source source, final int startTagBegin, final int attributesBegin, final int maxEnd, final StartTagType startTagType, final String tagName, final int maxErrorCount) { return construct(source,"Attributes for StartTag",ParsingState.BETWEEN_ATTRIBUTES,startTagBegin,attributesBegin,maxEnd,startTagType,tagName,maxErrorCount); } /** called from Source.parseAttributes(int pos, int maxEnd, int maxErrorCount) */ static Attributes construct(final Source source, final int begin, final int maxEnd, final int maxErrorCount) { return 
construct(source,"Attributes",ParsingState.BETWEEN_ATTRIBUTES,begin,-1,maxEnd,StartTagType.NORMAL,null,maxErrorCount); } /** * Any < character found within the start tag is treated as though it is part of the attribute * list, which is consistent with the way IE treats it. * @param logBegin the position of the beginning of the object being searched (for logging) * @param attributesBegin the position of the beginning of the attribute list, or -1 if it should be calculated automatically from logBegin. * @param maxEnd the position at which the attributes must end if a terminating character is not found, or -1 if no maximum. * @param tagName the name of the enclosing StartTag, or null if constucting attributes directly. */ private static Attributes construct(final Source source, final String logType, ParsingState parsingState, final int logBegin, int attributesBegin, final int maxEnd, final StartTagType startTagType, final String tagName, final int maxErrorCount) { boolean isClosingSlashIgnored=false; if (tagName!=null) { // 'logBegin' parameter is the start of the associated start tag if (attributesBegin==-1) attributesBegin=logBegin+1+tagName.length(); if (startTagType==StartTagType.NORMAL && HTMLElements.isClosingSlashIgnored(tagName)) isClosingSlashIgnored=true; } else { attributesBegin=logBegin; } int attributesEnd=attributesBegin; final LinkedList attributeList=new LinkedList(); boolean containsServerTagOutsideOfAttributeValue=false; final ParseText parseText=source.getParseText(); int i=attributesBegin; char quote=' '; Segment nameSegment=null; String key=null; int currentBegin=-1; boolean isTerminatingCharacter=false; int errorCount=0; try { while (!isTerminatingCharacter) { if (i==maxEnd || startTagType.atEndOfAttributes(source,i,isClosingSlashIgnored)) isTerminatingCharacter=true; final char ch=parseText.charAt(i); // First check if there is a server tag in this position: if (ch=='<') { final Tag interlopingTag=Tag.getTagAt(source,i,true); // search for 
server tags only if (interlopingTag!=null) { // There is a server tag in this position. Skip over it: if (parsingState==ParsingState.START_VALUE) { currentBegin=i; quote=' '; parsingState=ParsingState.IN_VALUE; } i=attributesEnd=interlopingTag.end; if (parsingState!=ParsingState.IN_VALUE) containsServerTagOutsideOfAttributeValue=true; continue; } } // There is no server tag in this position. Now we can parse the attributes: switch (parsingState) { case IN_VALUE: if (isTerminatingCharacter || ch==quote || (quote==' ' && isWhiteSpace(ch))) { Segment valueSegment; Segment valueSegmentIncludingQuotes; if (quote==' ') { valueSegment=valueSegmentIncludingQuotes=new Segment(source,currentBegin,i); } else { if (isTerminatingCharacter) { if (i==maxEnd) { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"terminated in the middle of a quoted attribute value",i); if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null; valueSegment=new Segment(source,currentBegin,i); valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i); // this is missing the end quote } else { // don't want to terminate, only encountered a terminating character in the middle of a quoted value isTerminatingCharacter=false; break; } } else { valueSegment=new Segment(source,currentBegin,i); valueSegmentIncludingQuotes=new Segment(source,currentBegin-1,i+1); } } attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegmentIncludingQuotes)); attributesEnd=valueSegmentIncludingQuotes.getEnd(); parsingState=ParsingState.BETWEEN_ATTRIBUTES; } else if (ch=='<' && quote==' ') { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character in unquoted attribute value",i); return null; } break; case IN_NAME: if (isTerminatingCharacter || ch=='=' || isWhiteSpace(ch)) { nameSegment=new Segment(source,currentBegin,i); key=nameSegment.toString().toLowerCase(); if 
(isTerminatingCharacter) { attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value attributesEnd=i; } else { parsingState=(ch=='=' ? ParsingState.START_VALUE : ParsingState.AFTER_NAME); } } else if (!Tag.isXMLNameChar(ch)) { // invalid character detected in attribute name. if (ch=='<') { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character in attribute name",i); return null; } if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break; if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"contains attribute name with invalid character",i); if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null; } break; case AFTER_NAME: // attribute name has been followed by whitespace, but may still be followed by an '=' character. if (isTerminatingCharacter || !(ch=='=' || isWhiteSpace(ch))) { attributeList.add(new Attribute(source,key,nameSegment)); // attribute with no value attributesEnd=nameSegment.getEnd(); if (isTerminatingCharacter) break; // The current character is the first character of an attribute name parsingState=ParsingState.BETWEEN_ATTRIBUTES; i--; // want to reparse the same character again, so decrement i. Note we could instead just fall into the next case statement without a break, but such code is always discouraged. 
} else if (ch=='=') { parsingState=ParsingState.START_VALUE; } else if (ch=='<') { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character after attribute name",i); return null; } break; case BETWEEN_ATTRIBUTES: if (!isTerminatingCharacter) { // the quote variable is used here to make sure whitespace has come after the last quoted attribute value if (isWhiteSpace(ch)) { quote=' '; } else { if (quote!=' ') { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"has missing whitespace after quoted attribute value",i); // only count this as an error if there have already been other errors, otherwise allow unlimited errors of this type. if (errorCount>0 && reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null; } if (!Tag.isXMLNameStartChar(ch)) { // invalid character detected as first character of attribute name. if (ch=='<') { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character",i); return null; } if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break; if (startTagType==StartTagType.NORMAL && startTagType.atEndOfAttributes(source,i,false)) { // This checks whether we've found the characters "/>" but it wasn't recognised as the closing delimiter because isClosingSlashIgnored is true. 
if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"contains a '/' character before the closing '>', which is ignored because tags of this name cannot be empty-element tags"); break; } if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"contains attribute name with invalid first character",i); if (reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null; } parsingState=ParsingState.IN_NAME; currentBegin=i; } } break; case START_VALUE: currentBegin=i; if (isTerminatingCharacter) { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"has missing attribute value after '=' sign",i); // only count this as an error if there have already been other errors, otherwise allow unlimited errors of this type. if (errorCount>0 && reachedMaxErrorCount(++errorCount,source,logType,tagName,logBegin,maxErrorCount)) return null; final Segment valueSegment=new Segment(source,i,i); attributeList.add(new Attribute(source,key,nameSegment,valueSegment,valueSegment)); attributesEnd=i; parsingState=ParsingState.BETWEEN_ATTRIBUTES; break; } if (ch=='\'' || ch=='"') { quote=ch; currentBegin++; } else if (isWhiteSpace(ch)) { break; // just ignore whitespace after the '=' sign as nearly all browsers do. 
} else if (ch=='<') { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because of '<' character at the start of an attribute value",i); return null; } else { quote=' '; } parsingState=ParsingState.IN_VALUE; break; case AFTER_TAG_NAME: if (!isTerminatingCharacter) { if (!isWhiteSpace(ch)) { if (isInvalidEmptyElementTag(startTagType,source,i,logType,tagName,logBegin)) break; if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because the name contains an invalid character",i); return null; } parsingState=ParsingState.BETWEEN_ATTRIBUTES; } break; } i++; } return new Attributes(source,attributesBegin,attributesEnd,attributeList,containsServerTagOutsideOfAttributeValue); } catch (IndexOutOfBoundsException ex) { if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because it has no closing '>' character"); return null; } } private static boolean reachedMaxErrorCount(final int errorCount, final Source source, final String logType, final String tagName, final int logBegin, final int maxErrorCount) { if (errorCount<=maxErrorCount) return false; if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"rejected because it contains too many errors"); return true; } private static boolean isInvalidEmptyElementTag(final StartTagType startTagType, final Source source, final int i, final String logType, final String tagName, final int logBegin) { // This checks whether we've found the characters "/>" but it wasn't recognised as the closing delimiter because isClosingSlashIgnored is true. 
if (startTagType!=StartTagType.NORMAL || !startTagType.atEndOfAttributes(source,i,false)) return false; if (source.logger.isErrorEnabled()) log(source,logType,tagName,logBegin,"contains a '/' character before the closing '>', which is ignored because tags of this name cannot be empty-element tags"); return true; } /** * Returns the {@link Attribute} with the specified name (case insensitive). *

* If more than one attribute exists with the specified name (which is illegal HTML), * the first is returned. * * @param name the name of the attribute to get. * @return the attribute with the specified name, or null if no attribute with the specified name exists. * @see #getValue(String name) */ public Attribute get(final String name) { if (size()==0) return null; for (int i=0; i * Returns null if no attribute with the specified name exists or * the attribute {@linkplain Attribute#hasValue() has no value}. *

* This is equivalent to {@link #get(String) get(name)}.{@link Attribute#getValue() getValue()}, * except that it returns null if no attribute with the specified name exists instead of throwing a * NullPointerException. * * @param name the name of the attribute to get. * @return the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name, or null if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}. * @see Attribute#getValue() */ public String getValue(final String name) { final Attribute attribute=get(name); return attribute==null ? null : attribute.getValue(); } /** * Returns the raw (not {@linkplain CharacterReference#decode(CharSequence) decoded}) value of the attribute, or null if the attribute {@linkplain Attribute#hasValue() has no value}. *

* This is an internal convenience method. * * @return the raw (not {@linkplain CharacterReference#decode(CharSequence) decoded}) value of the attribute, or null if the attribute {@linkplain Attribute#hasValue() has no value}. */ String getRawValue(final String name) { final Attribute attribute=get(name); return attribute==null || !attribute.hasValue() ? null : attribute.getValueSegment().toString(); } /** * Returns the number of attributes. *

* This is equivalent to calling the size() method specified in the List interface. * * @return the number of attributes. */ public int getCount() { return attributeList.size(); } /** * Returns an iterator over the {@link Attribute} objects in this list in order of appearance. * @return an iterator over the {@link Attribute} objects in this list in order of appearance. */ public Iterator iterator() { return listIterator(); } /** * Returns a list iterator of the {@link Attribute} objects in this list in order of appearance, * starting at the specified position in the list. *

* The specified index indicates the first item that would be returned by an initial call to the next() method. * An initial call to the previous() method would return the item with the specified index minus one. *

* IMPLEMENTATION NOTE: For efficiency reasons this method does not return an immutable list iterator. * Calling any of the add(Object), remove() or set(Object) methods on the returned * ListIterator does not throw an exception but could result in unexpected behaviour. * * @param index the index of the first item to be returned from the list iterator (by a call to the next() method). * @return a list iterator of the items in this list (in proper sequence), starting at the specified position in the list. * @throws IndexOutOfBoundsException if the specified index is out of range (index < 0 || index > size()). */ public ListIterator listIterator(final int index) { return attributeList.listIterator(index); } /** * Populates the specified Map with the name/value pairs from these attributes. *

* Both names and values are stored as String objects. *

* The entries are added in order of apprearance in the source document. *

* An attribute with {@linkplain Attribute#hasValue() no value} is represented by a map entry with a null value. *

* Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} * before storage in the map. * * @param attributesMap the map to populate, must not be null. * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map. * @return the same map specified as the argument to the attributesMap parameter, populated with the name/value pairs from these attributes. * @see #generateHTML(Map attributesMap) */ public Map populateMap(final Map attributesMap, final boolean convertNamesToLowerCase) { for (Attribute attribute : this) { attributesMap.put(convertNamesToLowerCase ? attribute.getKey() : attribute.getName(),attribute.getValue()); } return attributesMap; } /** * Returns a string representation of this object useful for debugging purposes. * @return a string representation of this object useful for debugging purposes. */ public String getDebugInfo() { final StringBuilder sb=new StringBuilder(); sb.append("Attributes ").append(super.getDebugInfo()).append(": "); if (isEmpty()) { sb.append("EMPTY"); } else { sb.append(Config.NewLine); for (Attribute attribute : this) { sb.append(" ").append(attribute.getDebugInfo()); } } return sb.toString(); } /** * Returns the default maximum error count allowed when parsing attributes. *

* The system default value is 2. *

* When searching for start tags, the parser can find the end of the start tag only by * {@linkplain StartTagType#parseAttributes(Source,int,String) parsing} * the attributes, as it is valid HTML for attribute values to contain '>' characters * (see the HTML 4.01 specification section 5.3.2). *

* If the source text being parsed does not follow the syntax of an attribute list at all, the parser assumes * that the text which was originally identified as the beginning of of a start tag is in fact some other text, * such as an invalid '<' character in the middle of some text, or part of a script element. * In this case the entire start tag is rejected. *

* On the other hand, it is quite common for attributes to contain minor syntactical errors, * such as an invalid character in an attribute name. * For this reason the parser allows a certain number of minor errors to occur while parsing an * attribute list before the entire start tag or attribute list is rejected. * This property indicates the number of minor errors allowed. *

* Major syntactical errors cause the start tag or attribute list to be rejected immediately, regardless * of the maximum error count setting. *

* Some errors are considered too minor to count at all (ignorable), such as missing white space between the end * of a quoted attribute value and the start of the next attribute name. *

* The classification of particular syntax errors in attribute lists into major, minor, and ignorable is * not part of the specification and may change in future versions. *

* Errors are {@linkplain Source#getLogger() logged} as they occur. *

* The value of this property is set using the {@link #setDefaultMaxErrorCount(int)} method. * * @return the default maximum error count allowed when parsing attributes. * @see Source#parseAttributes(int pos, int maxEnd, int maxErrorCount) */ public static int getDefaultMaxErrorCount() { return defaultMaxErrorCount; } /** * Sets the default maximum error count allowed when parsing attributes. *

* See the {@link #getDefaultMaxErrorCount()} method for a full description of this property. * * @param value the default maximum error count allowed when parsing attributes. */ public static void setDefaultMaxErrorCount(final int value) { defaultMaxErrorCount=value; } /** * Returns the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs. *

* Each attribute (including the first) is preceded by a single space, and all values are * {@linkplain CharacterReference#encode(CharSequence) encoded} and enclosed in double quotes. *

* The map keys must be of type String and values must be objects that implement the CharSequence interface. *

* A null value represents an attribute with no value. * * @param attributesMap a map containing attribute name/value pairs. * @return the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs. * @see StartTag#generateHTML(String tagName, Map attributesMap, boolean emptyElementTag) */ public static String generateHTML(final Map attributesMap) { final StringBuilder sb=new StringBuilder(); try {appendHTML(sb,attributesMap);} catch (IOException ex) {} // IOException never occurs in StringWriter return sb.toString(); } /** * Outputs the contents of the specified {@linkplain #populateMap(Map,boolean) attributes map} as HTML attribute name/value pairs to the specified Appendable object. *

* Each attribute is preceded by a single space, and all values are * {@linkplain CharacterReference#encode(CharSequence) encoded} and enclosed in double quotes. * * @param appendable the Appendable object to which the output is to be sent. * @param attributesMap a map containing attribute name/value pairs. * @throws IOException if an I/O exception occurs. * @see #populateMap(Map attributesMap, boolean convertNamesToLowerCase) */ static void appendHTML(final Appendable appendable, final Map attributesMap) throws IOException { for (Map.Entry entry : attributesMap.entrySet()) { Attribute.appendHTML(appendable,entry.getKey(),entry.getValue()); } } Appendable appendTidy(final Appendable appendable, Tag nextTag) throws IOException { for (Attribute attribute : this) nextTag=attribute.appendTidy(appendable,nextTag); return appendable; } Map getMap(final boolean convertNamesToLowerCase) { return populateMap(new LinkedHashMap(getCount()*2,1.0F),convertNamesToLowerCase); } void setStartTag(final StartTag startTag) { // this just preloads the startTag cache in each Attribute so we don't have to go looking for it if it is requested. for (Attribute attribute : attributeList) attribute.startTag=startTag; } private static void log(final Source source, final String part1, final CharSequence part2, final int begin, final String part3, final int pos) { source.logger.error(source.getRowColumnVector(pos).appendTo(source.getRowColumnVector(begin).appendTo(new StringBuilder(200).append(part1).append(' ').append(part2).append(" at ")).append(' ').append(part3).append(" at position ")).toString()); } private static void log(final Source source, final String part1, final CharSequence part2, final int begin, final String part3) { source.logger.error(source.getRowColumnVector(begin).appendTo(new StringBuilder(200).append(part1).append(' ').append(part2).append(" at ")).append(' ').append(part3).toString()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy