net.htmlparser.jericho.CharacterReference Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jericho-html Show documentation
Jericho HTML Parser is a Java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.
The newest version!
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.4
// Copyright (C) 2004-2013 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// 3. The Apache License version 2.0,
// included in this distribution in the file licence-apache-2.0.html
// or available at http://www.apache.org/licenses/LICENSE-2.0.html
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;

/**
 * Represents an HTML Character Reference,
 * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
 * <p>
 * This class, together with its subclasses, contains static methods to perform most required operations
 * without having to instantiate an object.
 * <p>
 * Instances of this class are useful when the positions of character references in a source document are required,
 * or to replace the found character references with customised text.
 * <p>
 * <code>CharacterReference</code> instances are obtained using one of the following methods:
 * <ul>
 *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
 *  <li>{@link Source#getNextCharacterReference(int pos)}
 *  <li>{@link Source#getPreviousCharacterReference(int pos)}
 *  <li>{@link Segment#getAllCharacterReferences()}
 * </ul>
 */
public abstract class CharacterReference extends Segment {
	/** The unicode code point represented by this character reference, initialised by the constructor. */
	int codePoint;

	/**
	 * Represents an invalid unicode code point.
	 * <p>
	 * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF, or any other invalid character reference.
	 */
	public static final int INVALID_CODE_POINT=-1;

	// Not a compile-time constant despite its name: the value is assigned from outside this class.
	// Within this class it is only used as the initial capacity of StringBuilder buffers.
	static int MAX_ENTITY_REFERENCE_LENGTH; // set in CharacterEntityReference static class initialisation

	/** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
	private static final int TAB_LENGTH=4;

	/**
	 * Creates a character reference segment and records the code point it represents.
	 *
	 * @param source  passed unchanged to the {@link Segment} superclass constructor.
	 * @param begin  passed unchanged to the {@link Segment} superclass constructor.
	 * @param end  passed unchanged to the {@link Segment} superclass constructor.
	 * @param codePoint  the unicode code point stored for later retrieval via {@link #getCodePoint()}.
	 */
	CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
		super(source,begin,end);
		this.codePoint=codePoint;
	}

	/**
	 * Returns the unicode code point that this character reference stands for.
	 *
	 * @return the unicode code point represented by this character reference.
	 * @see #appendCharTo(Appendable)
	 */
	public int getCodePoint() {
		return this.codePoint;
	}

	/**
	 * Returns the character represented by this character reference.
	 * <p>
	 * The code point is narrowed to a <code>char</code>, so if this character reference represents a unicode
	 * supplementary code point, every bit above the least significant 16 bits of the code point is discarded,
	 * yielding an incorrect result.
	 * <p>
	 * To ensure that the character is correctly appended to an <code>Appendable</code> object such as a <code>Writer</code>, use the code:
	 * <pre>characterReference.{@link #appendCharTo(Appendable) appendCharTo}(appendable)</pre>
	 * instead of:
	 * <pre>appendable.append(characterReference.getChar())</pre>
	 *
	 * @return the character represented by this character reference.
	 * @see #appendCharTo(Appendable)
	 * @see #getCodePoint()
	 */
	public char getChar() {
		final int value=this.codePoint;
		return (char)value; // narrowing cast keeps only the low 16 bits
	}

	/**
	 * Appends the character represented by this character reference to the specified appendable object.
	 * <p>
	 * If this character is a unicode supplementary character,
	 * then both the UTF-16 high/low surrogate <code>char</code> values of the character are appended, as described in the
	 * <i>Unicode character representations</i> section of the
	 * <code>java.lang.Character</code> class.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then calling this method on a non-breaking space character reference ({@link CharacterEntityReference#_nbsp &amp;nbsp;})
	 * results in a normal space being appended.
	 *
	 * @param appendable  the object to append this character reference to.
	 * @throws IOException if an I/O error occurs while appending.
	 */
	public final void appendCharTo(Appendable appendable) throws IOException {
		final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
		appendCharTo(appendable,convertNonBreakingSpaces);
	}

	/**
	 * Appends the character represented by this character reference to the specified appendable object,
	 * with explicit control over the substitution of non-breaking spaces.
	 *
	 * @param appendable  the object to append this character reference to.
	 * @param convertNonBreakingSpaces  if <code>true</code>, a non-breaking space is appended as a normal space.
	 * @throws IOException if an I/O error occurs while appending.
	 */
	private void appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces) throws IOException {
		if (!Character.isSupplementaryCodePoint(codePoint)) {
			// the code point fits in a single char
			final char ch=getChar();
			final boolean substituteSpace=convertNonBreakingSpaces && ch==CharacterEntityReference._nbsp;
			appendable.append(substituteSpace ? ' ' : ch);
			return;
		}
		// supplementary code points are written as a surrogate pair, high surrogate first
		appendable.append(getHighSurrogate(codePoint));
		appendable.append(getLowSurrogate(codePoint));
	}

	/**
	 * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
	 * <p>
	 * Conversely, this library defines an <i>unterminated</i> character reference as one which does
	 * not end with a semicolon.
	 * <p>
	 * The SGML specification allows unterminated character references in some circumstances, and because the
	 * HTML 4.01 specification states simply that
	 * "<i>authors may use SGML character references</i>",
	 * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
	 * <p>
	 * Unterminated character references are not allowed in XHTML documents.
	 *
	 * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
	 * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
	 */
	public boolean isTerminated() {
		// the final character of this segment sits just before the end position
		final char lastChar=source.charAt(end-1);
		return lastChar==';';
	}

	/**
	 * Encodes the specified text, escaping certain characters into character references.
	 * <p>
	 * This is equivalent to {@link #encode(CharSequence,boolean) encode(unencodedText,true)}.
	 *
	 * @param unencodedText  the text to encode.
	 * @return the encoded string.
	 */
	public static String encode(final CharSequence unencodedText) {
		final boolean insideAttributeValue=true;
		return encode(unencodedText,insideAttributeValue);
	}

	/**
	 * Encodes the specified text, escaping certain characters into character references.
	 * <p>
	 * The {@link Config#CurrentCharacterReferenceEncodingBehaviour} setting determines which characters are encoded.
	 * <p>
	 * For characters that are to be encoded, the {@link CharacterEntityReference} is used if possible, otherwise a {@link NumericCharacterReference} is used.
	 * <p>
	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
	 * which is encoded as the numeric character reference {@code "&#39;"} rather than its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
	 * as this entity is not defined for use in HTML. See the comments in the {@link CharacterEntityReference} class for more information.
	 * <p>
	 * Specifying a value of <code>true</code> as an argument to the <code>insideAttributeValue</code> parameter ensures that
	 * double quote characters (") are encoded. The default behaviour is that they are not encoded if a value of <code>false</code> is specified.
	 * <p>
	 * To encode text using only numeric character references, use the
	 * {@link NumericCharacterReference#encode(CharSequence)} method instead.
	 *
	 * @param unencodedText  the text to encode, may be <code>null</code>.
	 * @param insideAttributeValue  specifies whether the output must be valid inside a quoted attribute value.
	 * @return the encoded string, or <code>null</code> if <code>unencodedText</code> is <code>null</code>.
	 * @see #decode(CharSequence)
	 */
	public static String encode(final CharSequence unencodedText, final boolean insideAttributeValue) {
		if (unencodedText!=null) {
			try {
				// start with twice the input length as the initial buffer capacity
				final StringBuilder buffer=new StringBuilder(unencodedText.length()*2);
				return appendEncode(buffer,unencodedText,insideAttributeValue).toString();
			} catch (IOException ex) {
				throw new RuntimeException(ex); // never happens
			}
		}
		return null;
	}

	/**
	 * Encodes the specified character into a character reference if {@linkplain Config#CurrentCharacterReferenceEncodingBehaviour required}.
	 * <p>
	 * The encoding of the character follows the same rules as for each character in the {@link #encode(CharSequence unencodedText, boolean insideAttributeValue)} method,
	 * with <code>insideAttributeValue</code> set to <code>true</code>.
	 *
	 * @param ch  the character to encode.
	 * @return a character reference if appropriate, otherwise a string containing the original character.
	 */
	public static String encode(final char ch) {
		final StringBuilder buffer=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);
		try {
			return appendEncode(buffer,ch,true).toString();
		} catch (IOException ex) {
			throw new RuntimeException(ex); // never happens
		}
	}

	/**
	 * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
	 * <p>
	 * This performs the same encoding as {@link #encode(CharSequence,boolean) encode(CharSequence,false)}, but also performs the following conversions:
	 * <ul>
	 *  <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
	 *   are converted to {@code "<br />"}.  CR/LF pairs are treated as a single line break.
	 *  <li>Multiple consecutive spaces are converted so that every second space is converted to {@code "&nbsp;"}
	 *   while ensuring the last is always a normal space.
	 *  <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
	 * </ul>
	 * <p>
	 * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
	 * spaces to be rendered, but also allows the line to wrap in the middle of it.
	 * <p>
	 * Note that zero-width spaces (U+200B) are converted to the numeric character reference
	 * {@code "&#x200B;"} through the normal encoding process, but IE6 does not render them properly
	 * either encoded or unencoded.
	 * <p>
	 * There is no method provided to reverse this encoding.
	 *
	 * @param unencodedText  the text to encode.
	 * @return the encoded string with white space formatting converted to markup.
	 * @see #encode(CharSequence)
	 */
	public static String encodeWithWhiteSpaceFormatting(CharSequence unencodedText) {
		if (unencodedText==null) return null;
		try {
			Appendable appendable=new StringBuilder(unencodedText.length()*2);
			int beginPos=0;
			int endPos=unencodedText.length();
			if (unencodedText instanceof Segment) {
				// this might improve performance slightly
				final Segment segment=(Segment)unencodedText;
				final int segmentOffset=segment.getBegin();
				beginPos=segmentOffset;
				endPos+=segmentOffset;
				unencodedText=segment.source;
			}
			for (int i=beginPos; i"); // add line break
						continue;
					} else {
						spaceCount=TAB_LENGTH;
					}
				} else {
					spaceCount=1;
				}
				while (nexti=2) {
					appendable.append("  "); // use alternating   and spaces to keep original number of spaces
					spaceCount-=2;
				}
				// note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
				i=nexti-1; // minus 1 because top level for loop will add it again
			}
			return appendable.toString();
		} catch (IOException ex) {throw new RuntimeException(ex);} // never happens
	}

	/**
	 * Decodes the specified HTML encoded text into normal text.
	 * <p>
	 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
	 * are converted to their respective characters.
	 * <p>
	 * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
	 * <p>
	 * Unterminated character references are dealt with according to the rules for
	 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
	 * <p>
	 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
	 * some browsers also recognise them in a case-insensitive way.
	 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
	 *
	 * @param encodedText  the text to decode.
	 * @return the decoded string.
	 * @see #encode(CharSequence)
	 */
	public static String decode(final CharSequence encodedText) {
		final boolean insideAttributeValue=false;
		return decode(encodedText,insideAttributeValue,Config.ConvertNonBreakingSpaces);
	}

	/**
	 * Decodes the specified HTML encoded text into normal text.
	 * <p>
	 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
	 * are converted to their respective characters.
	 * <p>
	 * Unterminated character references are dealt with according to the
	 * value of the <code>insideAttributeValue</code> parameter and the
	 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
	 * <p>
	 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
	 * some browsers also recognise them in a case-insensitive way.
	 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
	 *
	 * @param encodedText  the text to decode.
	 * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
	 * @return the decoded string.
	 * @see #decode(CharSequence)
	 * @see #encode(CharSequence)
	 */
	public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) {
		final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
		return decode(encodedText,insideAttributeValue,convertNonBreakingSpaces);
	}

	/**
	 * Decodes the specified HTML encoded text into normal text, with explicit control over the conversion of non-breaking spaces.
	 * <p>
	 * Text containing no ampersand is returned as a string without invoking the decoder.
	 *
	 * @param encodedText  the text to decode, may be <code>null</code>.
	 * @param insideAttributeValue  specifies whether the encoded text is inside an attribute value.
	 * @param convertNonBreakingSpaces  passed through to the decoder in place of the {@link Config#ConvertNonBreakingSpaces} setting.
	 * @return the decoded string, or <code>null</code> if <code>encodedText</code> is <code>null</code>.
	 */
	static final String decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) {
		if (encodedText==null) return null;
		// converting to string first is faster than searching the CharSequence directly.
		final String text=encodedText.toString();
		final int firstAmpersandPos=text.indexOf('&');
		if (firstAmpersandPos<0) return text; // no character references present
		final StringBuilder buffer=new StringBuilder(encodedText.length());
		try {
			return appendDecode(buffer,encodedText,firstAmpersandPos,insideAttributeValue,convertNonBreakingSpaces).toString();
		} catch (IOException ex) {
			throw new RuntimeException(ex); // never happens
		}
	}

	/**
	 * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}.
	 * <p>
	 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
	 * <p>
	 * The result is how the text would normally be rendered by a
	 * user agent,
	 * assuming it does not contain any tags.
	 * <p>
	 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
	 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
	 * For consistency with the rendered output of most user agents these converted spaces are not treated as white space,
	 * so they are not collapsed and not trimmed.
	 * <p>
	 * Unterminated character references are dealt with according to the rules for
	 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * See the discussion of the <code>insideAttributeValue</code> parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
	 * method for a more detailed explanation of this topic.
	 *
	 * @param text  the source text
	 * @return the decoded text with collapsed white space.
	 * @see FormControl#getPredefinedValues()
	 */
	public static String decodeCollapseWhiteSpace(final CharSequence text) {
		final boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
		return decodeCollapseWhiteSpace(text,convertNonBreakingSpaces);
	}

	/**
	 * Collapses the white space of the specified text and then decodes it as text outside of an attribute value,
	 * with explicit control over the conversion of non-breaking spaces.
	 * <p>
	 * A <code>null</code> argument yields <code>null</code> rather than a <code>NullPointerException</code>,
	 * matching the behaviour of the <code>encode</code> and <code>decode</code> methods of this class.
	 *
	 * @param text  the source text, may be <code>null</code>.
	 * @param convertNonBreakingSpaces  passed through to the decoder in place of the {@link Config#ConvertNonBreakingSpaces} setting.
	 * @return the decoded text with collapsed white space, or <code>null</code> if <code>text</code> is <code>null</code>.
	 */
	static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
		if (text==null) return null; // guard before text.length() is dereferenced below
		// white space is collapsed on the still-encoded text, then the result is decoded
		return decode(appendCollapseWhiteSpace(new StringBuilder(text.length()),text),false,convertNonBreakingSpaces);
	}

	/**
	 * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again.
	 * <p>
	 * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
	 * <p>
	 * IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence,boolean) decode} method followed by the
	 * {@link #encode(CharSequence,boolean) encode} method, both with <code>insideAttributeValue</code> set to <code>true</code>.
	 *
	 * @param encodedText  the text to re-encode.
	 * @return the re-encoded string.
	 */
	public static String reencode(final CharSequence encodedText) {
		final String decodedText=decode(encodedText,true);
		return encode(decodedText,true);
	}

	/**
	 * Returns the encoded form of this character reference.
	 * <p>
	 * The exact behaviour of this method depends on the class of this object.
	 * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
	 * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
	 * <p>
	 * <dl>
	 *  <dt>Examples:</dt>
	 *   <dd>{@code CharacterReference.parse("&GT;").getCharacterReferenceString()} returns {@code "&gt;"}</dd>
	 *   <dd>{@code CharacterReference.parse("&#x3E;").getCharacterReferenceString()} returns {@code "&#x3e;"}</dd>
	 * </dl>
	 *
	 * @return the encoded form of this character reference.
	 * @see #getCharacterReferenceString(int codePoint)
	 * @see #getDecimalCharacterReferenceString()
	 */
	public abstract String getCharacterReferenceString();

	/**
	 * Returns the encoded form of the specified unicode code point.
	 * <p>
	 * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
	 * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal character reference} encoded form.
	 * <p>
	 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
	 * which is encoded as the numeric character reference {@code "&#39;"} instead of its character entity reference
	 * {@code "&apos;"}.
	 * <p>
	 * <dl>
	 *  <dt>Examples:</dt>
	 *   <dd>{@code CharacterReference.getCharacterReferenceString(62)} returns {@code "&gt;"}</dd>
	 *   <dd>{@code CharacterReference.getCharacterReferenceString('>')} returns {@code "&gt;"}</dd>
	 *   <dd>{@code CharacterReference.getCharacterReferenceString('☺')} returns {@code "&#9786;"}</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the encoded form of the specified unicode code point.
	 * @see #getHexadecimalCharacterReferenceString(int codePoint)
	 */
	public static String getCharacterReferenceString(final int codePoint) {
		// the apostrophe never uses its entity name, so the entity lookup is skipped for it
		if (codePoint!=CharacterEntityReference._apos) {
			final String entityForm=CharacterEntityReference.getCharacterReferenceString(codePoint);
			if (entityForm!=null) return entityForm;
		}
		return NumericCharacterReference.getCharacterReferenceString(codePoint);
	}

	/**
	 * Returns the decimal encoded form of this character reference.
	 * <p>
	 * This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}({@link #getCodePoint()}).
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.parse("&gt;").getDecimalCharacterReferenceString()} returns {@code "&#62;"}</dd>
	 * </dl>
	 *
	 * @return the decimal encoded form of this character reference.
	 * @see #getCharacterReferenceString()
	 * @see #getHexadecimalCharacterReferenceString()
	 */
	public String getDecimalCharacterReferenceString() {
		return getDecimalCharacterReferenceString(this.codePoint);
	}

	/**
	 * Returns the decimal encoded form of the specified unicode code point.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.getDecimalCharacterReferenceString('>')} returns {@code "&#62;"}</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the decimal encoded form of the specified unicode code point.
	 * @see #getCharacterReferenceString(int codePoint)
	 * @see #getHexadecimalCharacterReferenceString(int codePoint)
	 */
	public static String getDecimalCharacterReferenceString(final int codePoint) {
		final StringBuilder buffer=new StringBuilder();
		try {
			return appendDecimalCharacterReferenceString(buffer,codePoint).toString();
		} catch (IOException ex) {
			throw new RuntimeException(ex); // never happens
		}
	}

	/**
	 * Returns the hexadecimal encoded form of this character reference.
	 * <p>
	 * This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}({@link #getCodePoint()}).
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.parse("&gt;").getHexadecimalCharacterReferenceString()} returns {@code "&#x3e;"}</dd>
	 * </dl>
	 *
	 * @return the hexadecimal encoded form of this character reference.
	 * @see #getCharacterReferenceString()
	 * @see #getDecimalCharacterReferenceString()
	 */
	public String getHexadecimalCharacterReferenceString() {
		return getHexadecimalCharacterReferenceString(this.codePoint);
	}

	/**
	 * Returns the hexadecimal encoded form of the specified unicode code point.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.getHexadecimalCharacterReferenceString('>')} returns {@code "&#x3e;"}</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point to encode.
	 * @return the hexadecimal encoded form of the specified unicode code point.
	 * @see #getCharacterReferenceString(int codePoint)
	 * @see #getDecimalCharacterReferenceString(int codePoint)
	 */
	public static String getHexadecimalCharacterReferenceString(final int codePoint) {
		final StringBuilder buffer=new StringBuilder();
		try {
			return appendHexadecimalCharacterReferenceString(buffer,codePoint).toString();
		} catch (IOException ex) {
			throw new RuntimeException(ex); // never happens
		}
	}

	/**
	 * Returns the unicode code point of this character reference in <code>U+</code> notation.
	 * <p>
	 * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.parse("&gt;").getUnicodeText()} returns {@code "U+003E"}</dd>
	 * </dl>
	 *
	 * @return the unicode code point of this character reference in <code>U+</code> notation.
	 * @see #getUnicodeText(int codePoint)
	 */
	public String getUnicodeText() {
		return getUnicodeText(this.codePoint);
	}

	/**
	 * Returns the specified unicode code point in <code>U+</code> notation.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.getUnicodeText('>')} returns {@code "U+003E"}</dd>
	 * </dl>
	 *
	 * @param codePoint  the unicode code point.
	 * @return the specified unicode code point in <code>U+</code> notation.
	 */
	public static String getUnicodeText(final int codePoint) {
		final StringBuilder buffer=new StringBuilder();
		try {
			return appendUnicodeText(buffer,codePoint).toString();
		} catch (IOException ex) {
			throw new RuntimeException(ex); // never happens
		}
	}

	/**
	 * Appends the specified unicode code point in <code>U+</code> notation to the specified appendable object.
	 * <p>
	 * The code point is written as upper case hexadecimal digits, left-padded with zeros to a minimum of four digits.
	 *
	 * @param appendable  the object to append to.
	 * @param codePoint  the unicode code point.
	 * @return the <code>appendable</code> argument, to allow call chaining.
	 * @throws IOException if an I/O error occurs while appending.
	 */
	static final Appendable appendUnicodeText(final Appendable appendable, final int codePoint) throws IOException {
		final String hexDigits=Integer.toString(codePoint,16).toUpperCase();
		appendable.append("U+");
		// left-pad with zeros so that at least four digits follow the prefix
		for (int width=hexDigits.length(); width<4; width++) appendable.append('0');
		appendable.append(hexDigits);
		return appendable;
	}

	/**
	 * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
	 * <p>
	 * The character reference must be at the start of the given text, but may contain other characters at the end.
	 * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
	 * <p>
	 * If the text does not represent a valid character reference, this method returns <code>null</code>.
	 * <p>
	 * Unterminated character references are always accepted, regardless of the settings in the
	 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
	 * <p>
	 * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.parse("&gt;").getChar()} returns {@code '>'}</dd>
	 * </dl>
	 *
	 * @param characterReferenceText  the text containing a single encoded character reference.
	 * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
	 * @see #decode(CharSequence)
	 */
	public static CharacterReference parse(final CharSequence characterReferenceText) {
		final Source source=new Source(characterReferenceText,true);
		// the reference is looked for at position 0, accepting it whether or not it is terminated
		return construct(source,0,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
	}

	/**
	 * Parses a single encoded character reference text into a unicode code point.
	 * <p>
	 * The character reference must be at the start of the given text, but may contain other characters at the end.
	 * <p>
	 * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
	 * <p>
	 * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}.{@link #getCodePoint()},
	 * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
	 * <code>NullPointerException</code>.
	 * <p>
	 * <dl>
	 *  <dt>Example:</dt>
	 *  <dd>{@code CharacterReference.getCodePointFromCharacterReferenceString("&gt;")} returns {@code 62}</dd>
	 * </dl>
	 *
	 * @param characterReferenceText  the text containing a single encoded character reference.
	 * @return the unicode code point representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
	 */
	public static int getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText) {
		final CharacterReference characterReference=parse(characterReferenceText);
		if (characterReference==null) return INVALID_CODE_POINT; // parse found no valid character reference
		return characterReference.getCodePoint();
	}

	/**
	 * Indicates whether the specified character would need to be encoded in an attribute value.
	 *
	 * @param ch  the character to test.
	 * @return the result of asking the {@link Config#CurrentCharacterReferenceEncodingBehaviour} whether the character is encoded.
	 * @deprecated  use {@link Config#CurrentCharacterReferenceEncodingBehaviour} instead.
	 */
	@Deprecated
	public static final boolean requiresEncoding(final char ch) {
		final boolean inAttributeValue=true;
		return Config.CurrentCharacterReferenceEncodingBehaviour.isEncoded(ch,inAttributeValue);
	}

	/**
	 * Returns a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
	 *
	 * @param writer  the destination for the encoded text
	 * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
	 * @see #encode(CharSequence unencodedText)
	 */
	public static Writer getEncodingFilterWriter(final Writer writer) {
		final Writer encodingWriter=new EncodingFilterWriter(writer);
		return encodingWriter;
	}

	private static final class EncodingFilterWriter extends FilterWriter {
		StringBuilder sb=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);
		public EncodingFilterWriter(final Writer writer) {
			super(writer);
		}
		public void write(final char ch) throws IOException {
			sb.setLength(0);
			appendEncode(sb,ch,true);
			if (sb.length()==1)
				out.write(sb.charAt(0));
			else
				out.append(sb);
		}
		public void write(final int chInt) throws IOException {
			write((char)chInt);
		}
		public void write(final char[] cbuf, final int off, final int len) throws IOException {
			final int end=off+len;
			for (int i=off; i> 10)) + (codePoint >> 10));
	}
	/**
	 * Returns the UTF-16 low (trailing) surrogate for the specified code point.
	 * <p>
	 * The result is 0xDC00 combined with the least significant ten bits of the code point.
	 * No check is made that the argument is actually a supplementary code point.
	 *
	 * @param codePoint  the unicode code point.
	 * @return the low surrogate <code>char</code> value.
	 */
	private static char getLowSurrogate(int codePoint) {
		final int trailingTenBits=codePoint & 0x3FF;
		// the low ten bits of 0xDC00 are all zero, so OR-ing gives the same value as adding
		return (char)(0xDC00 | trailingTenBits);
	}
}