// au.id.jericho.lib.html.Attribute Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of jericho-html
// Jericho HTML Parser is a Java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.
// There is a newer version: 3.4
// Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.3
// Copyright (C) 2006 Martin Jericho
// http://sourceforge.net/projects/jerichohtml/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.io.*;

/**
 * Represents a single attribute name/value segment within a {@link StartTag}.
 * <p>
 * An instance of this class is a representation of a single attribute in the source document and is not modifiable.
 * The {@link AttributesOutputSegment} class provides the means to add, delete or modify attributes and
 * their values for inclusion in an {@link OutputDocument}.
 * <p>
 * Obtained using the {@link Attributes#get(String key)} method.
 * <p>
 * See also the XML 1.0 specification for attributes.
 *
 * @see Attributes
 */
public final class Attribute extends Segment {
	/** The name of this attribute in lower case. */
	private final String key;
	/** The segment spanning the attribute name; its text is the name in original case. */
	private final Segment nameSegment;
	/** The segment spanning the value, or null if this attribute has no value. */
	private final Segment valueSegment;
	/** The segment spanning the value including quotation marks if any, or null if this attribute has no value. */
	private final Segment valueSegmentIncludingQuotes;

	// Package-private constants holding lower-case attribute names.
	// NOTE(review): presumably used as lookup keys by other classes in this package; no usages are visible here - confirm.
	static final String CHECKED="checked";
	static final String CLASS="class";
	static final String DISABLED="disabled";
	static final String ID="id";
	static final String MULTIPLE="multiple";
	static final String NAME="name";
	static final String SELECTED="selected";
	static final String STYLE="style";
	static final String TYPE="type";
	static final String VALUE="value";

	/**
	 * Constructs a new Attribute with no value part, called from Attributes class.
	 * <p>
	 * Delegates to the five-argument constructor, passing <code>null</code> for both value segments.
	 * Note that the resulting Attribute segment has the same span as the supplied nameSegment.
	 *
	 * @param source  the {@link Source} document.
	 * @param key  the name of this attribute in lower case.
	 * @param nameSegment  the segment representing the name.
	 */
	Attribute(final Source source, final String key, final Segment nameSegment) {
		this(source,key,nameSegment,null,null);
	}

	/**
	 * Constructs a new Attribute from its name segment and optional value segments; called from the Attributes class.
	 * <p>
	 * The span of the new Attribute starts where the nameSegment begins.  It ends where the
	 * valueSegmentIncludingQuotes ends, or where the nameSegment ends when the attribute has no value.
	 * <p>
	 * An attribute without a value is expressed by passing <code>null</code> for both the valueSegment and the
	 * valueSegmentIncludingQuotes.  The valueSegmentIncludingQuotes parameter must not be <code>null</code>
	 * if the valueSegment is not <code>null</code>, and vice versa.
	 *
	 * @param source  the {@link Source} document.
	 * @param key  the name of this attribute in lower case.
	 * @param nameSegment  the segment spanning the name.
	 * @param valueSegment  the segment spanning the value.
	 * @param valueSegmentIncludingQuotes  the segment spanning the value, including quotation marks if any.
	 */
	Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes) {
		// The superclass constructor call has to come first, so the end position is computed by a static helper.
		super(source,nameSegment.getBegin(),attributeEndPosition(nameSegment,valueSegmentIncludingQuotes));
		this.valueSegmentIncludingQuotes=valueSegmentIncludingQuotes;
		this.valueSegment=valueSegment;
		this.nameSegment=nameSegment;
		this.key=key;
	}

	/**
	 * Works out where an attribute ends in the source document.
	 *
	 * @param nameSegment  the segment spanning the name.
	 * @param valueSegmentIncludingQuotes  the segment spanning the value including quotation marks if any, or <code>null</code> if there is no value.
	 * @return the end of the valueSegmentIncludingQuotes if it is not <code>null</code>, otherwise the end of the nameSegment.
	 */
	private static int attributeEndPosition(final Segment nameSegment, final Segment valueSegmentIncludingQuotes) {
		if (valueSegmentIncludingQuotes!=null) return valueSegmentIncludingQuotes.getEnd();
		return nameSegment.getEnd();
	}

	/**
	 * Returns the lower case form of this attribute's name.
	 * <p>
	 * Attribute names are treated as case insensitive throughout this package.
	 * That is consistent with HTML, but not with XHTML.
	 *
	 * @return the name of this attribute in lower case.
	 * @see #getName()
	 */
	public String getKey() {
		return this.key;
	}

	/**
	 * Returns this attribute's name with its original case preserved.
	 * <p>
	 * The result is exactly what {@link #getNameSegment()}<code>.toString()</code> returns.
	 *
	 * @return the name of this attribute in original case.
	 * @see #getKey()
	 */
	public String getName() {
		return this.nameSegment.toString();
	}

	/**
	 * Returns the segment that spans the {@linkplain #getName() name} of this attribute.
	 *
	 * @return the segment spanning the {@linkplain #getName() name} of this attribute.
	 * @see #getName()
	 */
	public Segment getNameSegment() {
		return this.nameSegment;
	}

	/**
	 * Indicates whether a value is present on this attribute.
	 * <p>
	 * A value of zero length still counts as a value, so <code>true</code> is returned in that case as well.
	 * <p>
	 * The result is <code>false</code> only when this attribute appears in minimized form.
	 *
	 * @return <code>true</code> if this attribute has a value, otherwise <code>false</code>.
	 */
	public boolean hasValue() {
		return this.valueSegment!=null;
	}

	/**
	 * Returns the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute,
	 * or <code>null</code> if it {@linkplain #hasValue() has no value}.
	 * <p>
	 * For an attribute that has a value, this is equivalent to
	 * {@link CharacterReference}<code>.</code>{@link CharacterReference#decode(CharSequence,boolean) decode}<code>(</code>{@link #getValueSegment()}<code>,true)</code>.
	 * <p>
	 * Note that before version 1.4.1 this method returned the raw value of the attribute as it appears in the source document,
	 * without {@linkplain CharacterReference#decode(CharSequence,boolean) decoding}.
	 * <p>
	 * To obtain the raw value without decoding, use {@link #getValueSegment()}<code>.toString()</code>.
	 * <p>
	 * Special attention should be given to attributes that contain URLs, such as the
	 * <code>href</code> attribute.
	 * When such an attribute contains a URL with parameters (as described in the
	 * form-urlencoded media type),
	 * the ampersand (<code>&amp;</code>) characters used to separate the parameters should be
	 * {@linkplain CharacterReference#encode(CharSequence) encoded} to prevent the parameter names from being
	 * unintentionally interpreted as {@linkplain CharacterEntityReference character entity references}.
	 * This requirement is explicitly stated in the
	 * HTML 4.01 specification section 5.3.2.
	 * <p>
	 * For example, take the following element in the source document:
	 * <pre>&lt;a href="Report.jsp?chapt=2&amp;sect=3"&gt;next&lt;/a&gt;</pre>
	 * By default, calling
	 * {@link Element#getAttributes() getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue}<code>("href")</code>
	 * on this element returns the string
	 * "<code>Report.jsp?chapt=2&sect;=3</code>", since the text "<code>&amp;sect</code>" is interpreted as the rarely used
	 * character entity reference {@link CharacterEntityReference#_sect &amp;sect;} (U+00A7), despite the fact that it is
	 * missing the {@linkplain CharacterReference#isTerminated() terminating semicolon} (<code>;</code>).
	 * <p>
	 * Most browsers recognise unterminated character entity references
	 * in attribute values representing a codepoint of U+00FF or below, but ignore those representing codepoints above this value.
	 * One relatively popular browser only recognises those representing a codepoint of U+003E or below, meaning it would
	 * have interpreted the URL in the above example differently to most other browsers.
	 * Most browsers also use different rules depending on whether the unterminated character reference is inside or outside
	 * of an attribute value, with both of these possibilities further split into different rules for
	 * {@linkplain CharacterEntityReference character entity references},
	 * decimal character references, and
	 * hexadecimal character references.
	 * <p>
	 * The behaviour of this library is determined by the current {@linkplain Config.CompatibilityMode compatibility mode} setting,
	 * which is determined by the {@link Config#CurrentCompatibilityMode} property.
	 *
	 * @return the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
	 */
	public String getValue() {
		// Answer the "no value" case here so that the documented null result does not
		// depend on how CharacterReference.decode treats a null argument.
		if (valueSegment==null) return null;
		return CharacterReference.decode(valueSegment,true);
	}

	/**
	 * Returns the segment that spans the {@linkplain #getValue() value} of this attribute.
	 * <p>
	 * The result is <code>null</code> if this attribute {@linkplain #hasValue() has no value}.
	 *
	 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, or <code>null</code> if it {@linkplain #hasValue() has no value}.
	 * @see #getValue()
	 */
	public Segment getValueSegment() {
		return this.valueSegment;
	}

	/**
	 * Returns the segment that spans the {@linkplain #getValue() value} of this attribute together with its
	 * quotation marks, if it has any.
	 * <p>
	 * The result is <code>null</code> if this attribute {@linkplain #hasValue() has no value}.
	 * <p>
	 * If the value is not enclosed by quotation marks, this is the same as the {@linkplain #getValueSegment() value segment}.
	 *
	 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any, or <code>null</code> if it {@linkplain #hasValue() has no value}.
	 */
	public Segment getValueSegmentIncludingQuotes() {
		return this.valueSegmentIncludingQuotes;
	}

	/**
	 * Returns the character used to quote the value.
	 * <p>
	 * The return value is either a double-quote (<code>"</code>), a single-quote (<code>'</code>), or a space.
	 * <p>
	 * Whether the value is quoted is decided by comparing the begin positions of the value segment and the
	 * value segment including quotes, so the result does not depend on the two being the same object.
	 *
	 * @return the character used to quote the value, or a space if the value is not quoted or this attribute has no value.
	 */
	public char getQuoteChar() {
		if (valueSegment==null || valueSegmentIncludingQuotes==null) return ' '; // no value
		final int quotedBegin=valueSegmentIncludingQuotes.getBegin();
		// An unquoted value begins at the same position as its "including quotes" span.
		if (quotedBegin==valueSegment.getBegin()) return ' '; // no quotes
		return source.charAt(quotedBegin);
	}

	/**
	 * Builds a description of this attribute intended for debugging.
	 * <p>
	 * The text consists of the key, the debug info of the superclass, the debug info of the name segment, and then either
	 * the debug info and quoted text of the value segment or a "NO VALUE" marker.  It always ends with a newline.
	 *
	 * @return a string representation of this object useful for debugging purposes.
	 */
	public String getDebugInfo() {
		final StringBuffer info=new StringBuffer();
		info.append(key);
		info.append(super.getDebugInfo());
		info.append(",name=");
		info.append(nameSegment.getDebugInfo());
		if (!hasValue()) {
			info.append(",NO VALUE\n");
			return info.toString();
		}
		info.append(",value=");
		info.append(valueSegment.getDebugInfo());
		info.append('"');
		info.append(valueSegment);
		info.append("\"\n");
		return info.toString();
	}

	Tag appendTidy(final StringBuffer sb, Tag nextTag) {
		sb.append(' ');
		Util.appendTo(sb,nameSegment);
		if (valueSegment!=null) {
			sb.append("=\"");
			while (nextTag!=null && nextTag.begin=valueSegment.end) {
				appendTidyValue(sb,valueSegment);
			} else {
				int i=valueSegment.begin;
				while (nextTag!=null && nextTag.beginvalueSegment.end) {
						sb.append(new Segment(source,nextTag.begin,i=valueSegment.end));
						break;
					}
					sb.append(nextTag);
					i=nextTag.end;
					nextTag=nextTag.findNextTag();
				}
				if (i