// src.java.net.htmlparser.jericho.Attribute Maven / Gradle / Ivy
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.1
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.io.*;
/**
* Represents a single attribute
* name/value segment within a {@link StartTag}.
*
* An instance of this class is a representation of a single attribute in the source document and is not modifiable.
* The {@link OutputDocument#replace(Attributes, Map)} and {@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)} methods
* provide the means to add, delete or modify attributes and their values in an {@link OutputDocument}.
*
* Obtained using the {@link Attributes#get(String key)} method.
*
* See also the XML 1.0 specification for attributes.
*
* @see Attributes
*/
public final class Attribute extends Segment {
// Name of this attribute in lower case, as supplied to the constructor and returned by getKey().
private final String key;
// Segment spanning the attribute name in the source document.
private final Segment nameSegment;
// Segment spanning the attribute value; null when the attribute has no value (see hasValue()).
private final Segment valueSegment;
// Segment spanning the attribute value including quotation marks if any; null when the attribute has no value.
private final Segment valueSegmentIncludingQuotes;
// Lower-case attribute name constants (package-private).
static final String CHECKED="checked";
static final String CLASS="class";
static final String DISABLED="disabled";
static final String ID="id";
static final String MULTIPLE="multiple";
static final String NAME="name";
static final String SELECTED="selected";
static final String STYLE="style";
static final String TYPE="type";
static final String VALUE="value";
/**
 * Creates an attribute that consists of a name only (no value part).
 * <p>
 * Delegates to the five-argument constructor with both value segments set to <code>null</code>,
 * so the new attribute spans exactly the same range as <code>nameSegment</code>.
 *
 * @param source  the {@link Source} document.
 * @param key  the name of this attribute in lower case.
 * @param nameSegment  the segment representing the name.
 */
Attribute(final Source source, final String key, final Segment nameSegment) {
	// No value segment and no quoted value segment.
	this(source, key, nameSegment, null, null);
}
/**
 * Creates an attribute from its name segment and optional value segments.
 * <p>
 * The attribute begins at the start of <code>nameSegment</code>. It ends at the end of
 * <code>valueSegmentIncludingQuotes</code> when that argument is not <code>null</code>,
 * and otherwise at the end of <code>nameSegment</code>.
 * <p>
 * If this attribute has no value, both <code>valueSegment</code> and
 * <code>valueSegmentIncludingQuotes</code> must be <code>null</code>.
 * The <code>valueSegmentIncludingQuotes</code> parameter must not be <code>null</code>
 * if <code>valueSegment</code> is not <code>null</code>, and vice versa.
 * This constructor does not check that precondition.
 *
 * @param source  the {@link Source} document.
 * @param key  the name of this attribute in lower case.
 * @param nameSegment  the segment spanning the name.
 * @param valueSegment  the segment spanning the value.
 * @param valueSegmentIncludingQuotes  the segment spanning the value, including quotation marks if any.
 */
Attribute(final Source source, final String key, final Segment nameSegment, final Segment valueSegment, final Segment valueSegmentIncludingQuotes) {
	super(
		source,
		nameSegment.getBegin(),
		valueSegmentIncludingQuotes!=null ? valueSegmentIncludingQuotes.getEnd() : nameSegment.getEnd()
	);
	this.nameSegment=nameSegment;
	this.key=key;
	this.valueSegmentIncludingQuotes=valueSegmentIncludingQuotes;
	this.valueSegment=valueSegment;
}
/**
 * Returns the lower-case name of this attribute.
 * <p>
 * This package treats all attribute names as case insensitive, which is consistent with
 * HTML but not consistent with XHTML.
 *
 * @return the name of this attribute in lower case.
 * @see #getName()
 */
public String getKey() {
	return this.key;
}
/**
 * Returns the name of this attribute exactly as it is written in the source document (original case).
 * <p>
 * This is exactly equivalent to {@link #getNameSegment()}<code>.toString()</code>.
 *
 * @return the name of this attribute in original case.
 * @see #getKey()
 */
public String getName() {
	final Segment name=getNameSegment();
	return name.toString();
}
/**
 * Returns the segment that spans the {@linkplain #getName() name} of this attribute.
 *
 * @return the segment spanning the {@linkplain #getName() name} of this attribute.
 * @see #getName()
 */
public Segment getNameSegment() {
	return this.nameSegment;
}
/**
 * Indicates whether this attribute has a value.
 * <p>
 * This method also returns <code>true</code> if this attribute has been assigned a zero-length value.
 * <p>
 * It only returns <code>false</code> if this attribute appears in minimized form.
 *
 * @return <code>true</code> if this attribute has a value, otherwise <code>false</code>.
 */
public boolean hasValue() {
	// A minimized attribute is stored with a null value segment.
	final boolean minimized=(valueSegment==null);
	return !minimized;
}
/**
 * Returns the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute,
 * or <code>null</code> if it {@linkplain #hasValue() has no value}.
 * <p>
 * This is equivalent to
 * {@link CharacterReference}<code>.</code>{@link CharacterReference#decode(CharSequence,boolean) decode}<code>(</code>{@link #getValueSegment()}<code>,true)</code>.
 * <p>
 * Note that before version 1.4.1 this method returned the raw value of the attribute as it appears in the source document,
 * without {@linkplain CharacterReference#decode(CharSequence,boolean) decoding}.
 * <p>
 * To obtain the raw value without decoding, use {@link #getValueSegment()}<code>.toString()</code>.
 * <p>
 * Special attention should be given to attributes that contain URLs, such as the
 * <code>href</code> attribute.
 * When such an attribute contains a URL with parameters (as described in the
 * form-urlencoded media type),
 * the ampersand (<code>&amp;</code>) characters used to separate the parameters should be
 * {@linkplain CharacterReference#encode(CharSequence) encoded} to prevent the parameter names from being
 * unintentionally interpreted as {@linkplain CharacterEntityReference character entity references}.
 * This requirement is explicitly stated in the
 * HTML 4.01 specification section 5.3.2.
 * <p>
 * For example, take the following element in the source document:
 * <blockquote><code>&lt;a href="Report.jsp?chapt=2&amp;sect=3"&gt;next&lt;/a&gt;</code></blockquote>
 * By default, calling
 * {@link Element#getAttributes() getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue}<code>("href")</code>
 * on this element returns the string
 * "<code>Report.jsp?chapt=2&sect;=3</code>", since the text "<code>&amp;sect</code>" is interpreted as the rarely used
 * character entity reference {@link CharacterEntityReference#_sect &amp;sect;} (U+00A7), despite the fact that it is
 * missing the {@linkplain CharacterReference#isTerminated() terminating semicolon} (<code>;</code>).
 * <p>
 * Most browsers recognise unterminated character entity references
 * in attribute values representing a codepoint of U+00FF or below, but ignore those representing codepoints above this value.
 * One relatively popular browser only recognises those representing a codepoint of U+003E or below, meaning it would
 * have interpreted the URL in the above example differently to most other browsers.
 * Most browsers also use different rules depending on whether the unterminated character reference is inside or outside
 * of an attribute value, with both of these possibilities further split into different rules for
 * {@linkplain CharacterEntityReference character entity references},
 * decimal character references, and
 * hexadecimal character references.
 * <p>
 * The behaviour of this library is determined by the current {@linkplain Config.CompatibilityMode compatibility mode} setting,
 * which is determined by the static {@link Config#CurrentCompatibilityMode} property.
 *
 * @return the {@linkplain CharacterReference#decode(CharSequence,boolean) decoded} value of this attribute,
 *         or <code>null</code> if it {@linkplain #hasValue() has no value}.
 */
public String getValue() {
	final Segment rawValue=this.valueSegment;
	// NOTE(review): the second argument presumably marks the text as attribute-value content - confirm against CharacterReference.decode.
	return CharacterReference.decode(rawValue, true);
}
/**
 * Returns the segment that spans the {@linkplain #getValue() value} of this attribute,
 * or <code>null</code> if it {@linkplain #hasValue() has no value}.
 *
 * @return the segment spanning the {@linkplain #getValue() value} of this attribute,
 *         or <code>null</code> if it {@linkplain #hasValue() has no value}.
 * @see #getValue()
 */
public Segment getValueSegment() {
	return this.valueSegment;
}
/**
 * Returns the segment that spans the {@linkplain #getValue() value} of this attribute together with
 * its quotation marks if any, or <code>null</code> if it {@linkplain #hasValue() has no value}.
 * <p>
 * If the value is not enclosed by quotation marks, this is the same as the
 * {@linkplain #getValueSegment() value segment}.
 *
 * @return the segment spanning the {@linkplain #getValue() value} of this attribute, including quotation marks if any,
 *         or <code>null</code> if it {@linkplain #hasValue() has no value}.
 */
public Segment getValueSegmentIncludingQuotes() {
	return this.valueSegmentIncludingQuotes;
}
/**
 * Returns the character used to quote the value.
 * <p>
 * The return value is either a double-quote (<code>"</code>), a single-quote (<code>'</code>), or a space.
 *
 * @return the character used to quote the value, or a space if the value is not quoted or this attribute has no value.
 */
public char getQuoteChar() {
	// The value counts as quoted only when the two segment references differ;
	// they are the same reference (possibly both null) for unquoted or absent values.
	final boolean quoted=(valueSegment!=valueSegmentIncludingQuotes);
	return quoted ? source.charAt(valueSegmentIncludingQuotes.getBegin()) : ' ';
}
/**
 * Builds a description of this attribute intended for debugging.
 * <p>
 * The text consists of the key, the superclass debug info, the name segment's debug info, and then either
 * the value segment's debug info followed by the quoted value text or the marker <code>NO VALUE</code>,
 * terminated by {@link Config#NewLine}.
 *
 * @return a string representation of this object useful for debugging purposes.
 */
public String getDebugInfo() {
	final StringBuilder info=new StringBuilder();
	info.append(key);
	info.append(super.getDebugInfo());
	info.append(",name=");
	info.append(nameSegment.getDebugInfo());
	if (!hasValue()) {
		info.append(",NO VALUE");
	} else {
		info.append(",value=");
		info.append(valueSegment.getDebugInfo());
		info.append('"').append(valueSegment).append('"');
	}
	// Both variants end with the configured line separator.
	info.append(Config.NewLine);
	return info.toString();
}
Tag appendTidy(final Appendable appendable, Tag nextTag) throws IOException {
appendable.append(' ').append(nameSegment);
if (valueSegment!=null) {
appendable.append("=\"");
while (nextTag!=null && nextTag.begin=valueSegment.end) {
appendTidyValue(appendable,valueSegment);
} else {
int i=valueSegment.begin;
while (nextTag!=null && nextTag.beginvalueSegment.end) {
appendable.append(new Segment(source,nextTag.begin,i=valueSegment.end));
break;
}
appendable.append(nextTag);
i=nextTag.end;
nextTag=nextTag.getNextTag();
}
if (i