// src.java.net.htmlparser.jericho.CharacterReference
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.1
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
/**
 * Represents an HTML Character Reference,
 * implemented by the subclasses {@link CharacterEntityReference} and {@link NumericCharacterReference}.
 * <p>
 * This class, together with its subclasses, contains static methods to perform most required operations
 * without having to instantiate an object.
 * <p>
 * Instances of this class are useful when the positions of character references in a source document are required,
 * or to replace the found character references with customised text.
 * <p>
 * <code>CharacterReference</code> instances are obtained using one of the following methods:
 * <ul>
 *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}
 *  <li>{@link Source#getNextCharacterReference(int pos)}
 *  <li>{@link Source#getPreviousCharacterReference(int pos)}
 *  <li>{@link Segment#getAllCharacterReferences()}
 * </ul>
 */
public abstract class CharacterReference extends Segment {
/** The unicode code point represented by this character reference; assigned once in the constructor. */
int codePoint;
/**
 * Represents an invalid unicode code point.
 * <p>
 * This can be the result of parsing a numeric character reference outside of the valid unicode range of 0x000000-0x10FFFF,
 * or any other invalid character reference.
 */
public static final int INVALID_CODE_POINT=-1;
// Used in this class as the initial StringBuilder capacity when encoding a single character.
// NOTE(review): presumably the length of the longest character entity reference - confirm in CharacterEntityReference.
static int MAX_ENTITY_REFERENCE_LENGTH; // set in CharacterEntityReference static class initialisation
/** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */
private static final int TAB_LENGTH=4;
/**
 * Constructs a character reference covering the given span of the source document.
 *
 * @param source the source document, passed through to the {@link Segment} constructor.
 * @param begin the begin position, passed through to the {@link Segment} constructor.
 * @param end the end position, passed through to the {@link Segment} constructor.
 * @param codePoint the unicode code point represented by this character reference.
 */
CharacterReference(final Source source, final int begin, final int end, final int codePoint) {
	super(source, begin, end);
	this.codePoint = codePoint;
}
/**
 * Returns the unicode code point represented by this character reference.
 *
 * @return the unicode code point represented by this character reference.
 * @see #appendCharTo(Appendable)
 */
public int getCodePoint() {
	return this.codePoint;
}
/**
 * Returns the character represented by this character reference.
 * <p>
 * If this character reference represents a unicode supplementary code point,
 * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result.
 * <p>
 * To ensure that the character is correctly appended to an <code>Appendable</code> object such as a <code>Writer</code>,
 * use the code:
 * <br />{@code characterReference.appendCharTo(appendable)}
 * <br />instead of:
 * <br />{@code appendable.append(characterReference.getChar())}
 *
 * @return the character represented by this character reference.
 * @see #appendCharTo(Appendable)
 * @see #getCodePoint()
 */
public char getChar() {
	// Narrowing cast: only the low 16 bits of the code point survive.
	final char truncated = (char) this.codePoint;
	return truncated;
}
/**
 * Appends the character represented by this character reference to the specified appendable object.
 * <p>
 * If this character is a unicode supplementary character,
 * then both the UTF-16 high/low surrogate <code>char</code> values of the character are appended, as described in the
 * Unicode character representations section of the <code>java.lang.Character</code> class.
 * <p>
 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
 * then calling this method on a non-breaking space character reference ({@link CharacterEntityReference#_nbsp &amp;nbsp;})
 * results in a normal space being appended.
 *
 * @param appendable the object to append this character reference to.
 * @throws IOException if the appendable object throws it.
 */
public final void appendCharTo(Appendable appendable) throws IOException {
	final boolean convertNonBreakingSpaces = Config.ConvertNonBreakingSpaces;
	appendCharTo(appendable, convertNonBreakingSpaces);
}

/**
 * Appends the character of this reference, with explicit control over non-breaking space conversion.
 *
 * @param appendable the object to append this character reference to.
 * @param convertNonBreakingSpaces whether a non-breaking space is appended as a normal space.
 * @throws IOException if the appendable object throws it.
 */
private void appendCharTo(Appendable appendable, final boolean convertNonBreakingSpaces) throws IOException {
	if (Character.isSupplementaryCodePoint(codePoint)) {
		// Supplementary characters need two UTF-16 code units.
		appendable.append(getHighSurrogate(codePoint));
		appendable.append(getLowSurrogate(codePoint));
		return;
	}
	final char decoded = getChar();
	final boolean replaceWithSpace = decoded == CharacterEntityReference._nbsp && convertNonBreakingSpaces;
	appendable.append(replaceWithSpace ? ' ' : decoded);
}
/**
 * Indicates whether this character reference is terminated by a semicolon (<code>;</code>).
 * <p>
 * Conversely, this library defines an <i>unterminated</i> character reference as one which does
 * not end with a semicolon.
 * <p>
 * The SGML specification allows unterminated character references in some circumstances, and because the
 * HTML 4.01 specification states simply that "authors may use SGML character references",
 * it follows that they are also valid in HTML documents, although their use is strongly discouraged.
 * <p>
 * Unterminated character references are not allowed in XHTML documents.
 *
 * @return <code>true</code> if this character reference is terminated by a semicolon, otherwise <code>false</code>.
 * @see #decode(CharSequence encodedText, boolean insideAttributeValue)
 */
public boolean isTerminated() {
	// The last character of the segment sits at index end-1 of the source.
	final char lastChar = source.charAt(end - 1);
	return lastChar == ';';
}
/**
 * Encodes the specified text, escaping special characters into character references.
 * <p>
 * Each character is encoded only if the {@link #requiresEncoding(char)} method would return <code>true</code> for that character,
 * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if its unicode
 * code point is greater than U+007F.
 * <p>
 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
 * which depending on the current setting of the static {@link Config#IsApostropheEncoded} property,
 * is either left unencoded (default setting), or encoded as the numeric character reference "{@code &#39;}".
 * <p>
 * This method never encodes an apostrophe into its character entity reference {@link CharacterEntityReference#_apos &amp;apos;}
 * as this entity is not defined for use in HTML. See the comments in the {@link CharacterEntityReference} class for more information.
 * <p>
 * To encode text using only numeric character references, use the
 * {@link NumericCharacterReference#encode(CharSequence)} method instead.
 *
 * @param unencodedText the text to encode, may be <code>null</code>.
 * @return the encoded string, or <code>null</code> if the argument is <code>null</code>.
 * @see #decode(CharSequence)
 */
public static String encode(final CharSequence unencodedText) {
	if (unencodedText == null) {
		return null;
	}
	// Encoded text is usually longer than the input, so start with double the capacity.
	final StringBuilder buffer = new StringBuilder(unencodedText.length() * 2);
	try {
		final Appendable encoded = appendEncode(buffer, unencodedText, false);
		return encoded.toString();
	} catch (IOException ex) {
		throw new RuntimeException(ex); // never happens
	}
}
/**
 * Encodes the specified character into a character reference if {@linkplain #requiresEncoding(char) required}.
 * <p>
 * The encoding of the character follows the same rules as for each character in the
 * {@link #encode(CharSequence unencodedText)} method.
 *
 * @param ch the character to encode.
 * @return a character reference if appropriate, otherwise a string containing the original character.
 */
public static String encode(final char ch) {
	final StringBuilder buffer = new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);
	try {
		final Appendable encoded = appendEncode(buffer, ch);
		return encoded.toString();
	} catch (IOException ex) {
		throw new RuntimeException(ex); // never happens
	}
}
/**
 * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup.
 * <p>
 * This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions:
 * <ul>
 *  <li>Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C)
 *   are converted to "{@code <br />}". CR/LF pairs are treated as a single line break.
 *  <li>Multiple consecutive spaces are converted so that every second space is converted to "{@code &nbsp;}"
 *   while ensuring the last is always a normal space.
 *  <li>Tab characters (U+0009) are converted as if they were four consecutive spaces.
 * </ul>
 * <p>
 * The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of
 * spaces to be rendered, but also allows the line to wrap in the middle of it.
 * <p>
 * Note that zero-width spaces (U+200B) are converted to the numeric character reference
 * "{@code &#8203;}" through the normal encoding process, but IE6 does not render them properly
 * either encoded or unencoded.
 * <p>
 * There is no method provided to reverse this encoding.
 *
 * @param unencodedText the text to encode, may be <code>null</code>.
 * @return the encoded string with white space formatting converted to markup, or <code>null</code> if the argument is <code>null</code>.
 * @see #encode(CharSequence)
 */
public static String encodeWithWhiteSpaceFormatting(final CharSequence unencodedText) {
	if (unencodedText == null) {
		return null;
	}
	final StringBuilder buffer = new StringBuilder(unencodedText.length() * 2);
	try {
		final Appendable encoded = appendEncode(buffer, unencodedText, true);
		return encoded.toString();
	} catch (IOException ex) {
		throw new RuntimeException(ex); // never happens
	}
}
/**
 * Decodes the specified HTML encoded text into normal text.
 * <p>
 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
 * are converted to their respective characters.
 * <p>
 * This is equivalent to {@link #decode(CharSequence,boolean) decode(encodedText,false)}.
 * <p>
 * Unterminated character references are dealt with according to the rules for
 * text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
 * <p>
 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
 * <p>
 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
 * some browsers also recognise them in a case-insensitive way.
 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
 *
 * @param encodedText the text to decode.
 * @return the decoded string.
 * @see #encode(CharSequence)
 */
public static String decode(final CharSequence encodedText) {
	final boolean insideAttributeValue = false;
	return decode(encodedText, insideAttributeValue, Config.ConvertNonBreakingSpaces);
}
/**
 * Decodes the specified HTML encoded text into normal text.
 * <p>
 * All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references}
 * are converted to their respective characters.
 * <p>
 * Unterminated character references are dealt with according to the
 * value of the <code>insideAttributeValue</code> parameter and the
 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
 * <p>
 * If the static {@link Config#ConvertNonBreakingSpaces} property is set to <code>true</code> (the default),
 * then all non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to normal spaces.
 * <p>
 * Although character entity reference names are case sensitive, and in some cases differ from other entity references only by their case,
 * some browsers also recognise them in a case-insensitive way.
 * For this reason, all decoding methods in this library recognise character entity reference names even if they are in the wrong case.
 *
 * @param encodedText the text to decode.
 * @param insideAttributeValue specifies whether the encoded text is inside an attribute value.
 * @return the decoded string.
 * @see #decode(CharSequence)
 * @see #encode(CharSequence)
 */
public static String decode(final CharSequence encodedText, final boolean insideAttributeValue) {
	final boolean convertNonBreakingSpaces = Config.ConvertNonBreakingSpaces;
	return decode(encodedText, insideAttributeValue, convertNonBreakingSpaces);
}
static String decode(final CharSequence encodedText, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) {
if (encodedText==null) return null;
for (int i=0; i
* All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
*
* The result is how the text would normally be rendered by a
* user agent,
* assuming it does not contain any tags.
*
* If the static {@link Config#ConvertNonBreakingSpaces} property is set to true
(the default),
* then all non-breaking space ({@link CharacterEntityReference#_nbsp }) character entity references are converted to normal spaces.
*
* Unterminated character references are dealt with according to the rules for
* text outside of attribute values in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
* See the discussion of the insideAttributeValue
parameter of the {@link #decode(CharSequence, boolean insideAttributeValue)}
* method for a more detailed explanation of this topic.
*
* @param text the source text
* @return the decoded text with collapsed white space.
* @see FormControl#getPredefinedValues()
*/
public static String decodeCollapseWhiteSpace(final CharSequence text) {
return decodeCollapseWhiteSpace(text,Config.ConvertNonBreakingSpaces);
}
/**
 * Collapses the white space of the specified text and then decodes it as text outside of an attribute value.
 *
 * @param text the source text, may be <code>null</code>.
 * @param convertNonBreakingSpaces whether non-breaking spaces are decoded to normal spaces.
 * @return the decoded text with collapsed white space, or <code>null</code> if <code>text</code> is <code>null</code>.
 */
static String decodeCollapseWhiteSpace(final CharSequence text, final boolean convertNonBreakingSpaces) {
	// Match the null handling of the decode/encode methods rather than failing on text.length().
	if (text==null) return null;
	return decode(appendCollapseWhiteSpace(new StringBuilder(text.length()),text),false,convertNonBreakingSpaces);
}
/**
 * Re-encodes the specified text, equivalent to decoding and then {@linkplain #encode(CharSequence) encoding} again.
 * <p>
 * This process ensures that the specified encoded text does not contain any remaining unencoded characters.
 * <p>
 * IMPLEMENTATION NOTE: At present this method simply calls
 * {@link #decode(CharSequence,boolean) decode(encodedText,true)}
 * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation
 * may be used in future.
 *
 * @param encodedText the text to re-encode.
 * @return the re-encoded string.
 */
public static String reencode(final CharSequence encodedText) {
	// Decoding uses the rules for text inside an attribute value.
	final String decodedText = decode(encodedText, true);
	return encode(decodedText);
}
/**
 * Returns the encoded form of this character reference.
 * <p>
 * The exact behaviour of this method depends on the class of this object.
 * See the {@link CharacterEntityReference#getCharacterReferenceString()} and
 * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details.
 * <p>
 * <dl>
 *  <dt>Examples:</dt>
 *   <dd>{@code CharacterReference.parse("&gt;").getCharacterReferenceString()} returns "{@code &gt;}"</dd>
 *   <dd>{@code CharacterReference.parse("&#x3E;").getCharacterReferenceString()} returns "{@code &#x3e;}"</dd>
 * </dl>
 *
 * @return the encoded form of this character reference.
 * @see #getCharacterReferenceString(int codePoint)
 * @see #getDecimalCharacterReferenceString()
 */
public abstract String getCharacterReferenceString();
/**
 * Returns the encoded form of the specified unicode code point.
 * <p>
 * This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the unicode code point
 * if one exists, otherwise it returns the form produced by {@link NumericCharacterReference#getCharacterReferenceString(int)}.
 * <p>
 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
 * which is encoded as a numeric character reference instead of its character entity reference
 * "{@code &apos;}".
 * <p>
 * <dl>
 *  <dt>Examples:</dt>
 *   <dd>{@code CharacterReference.getCharacterReferenceString(62)} returns "{@code &gt;}"</dd>
 *   <dd>{@code CharacterReference.getCharacterReferenceString('>')} returns "{@code &gt;}"</dd>
 *   <dd>{@code CharacterReference.getCharacterReferenceString('\u263A')} returns "{@code &#9786;}"</dd>
 * </dl>
 *
 * @param codePoint the unicode code point to encode.
 * @return the encoded form of the specified unicode code point.
 * @see #getHexadecimalCharacterReferenceString(int codePoint)
 */
public static String getCharacterReferenceString(final int codePoint) {
	// The apostrophe is never looked up as an entity; it goes straight to the numeric form.
	if (codePoint != CharacterEntityReference._apos) {
		final String entityForm = CharacterEntityReference.getCharacterReferenceString(codePoint);
		if (entityForm != null) {
			return entityForm;
		}
	}
	return NumericCharacterReference.getCharacterReferenceString(codePoint);
}
/**
 * Returns the decimal encoded form of this character reference.
 * <p>
 * This is equivalent to
 * {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.parse("&gt;").getDecimalCharacterReferenceString()} returns "{@code &#62;}"</dd>
 * </dl>
 *
 * @return the decimal encoded form of this character reference.
 * @see #getCharacterReferenceString()
 * @see #getHexadecimalCharacterReferenceString()
 */
public String getDecimalCharacterReferenceString() {
	return getDecimalCharacterReferenceString(this.codePoint);
}
/**
 * Returns the decimal encoded form of the specified unicode code point.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.getDecimalCharacterReferenceString('>')} returns "{@code &#62;}"</dd>
 * </dl>
 *
 * @param codePoint the unicode code point to encode.
 * @return the decimal encoded form of the specified unicode code point.
 * @see #getCharacterReferenceString(int codePoint)
 * @see #getHexadecimalCharacterReferenceString(int codePoint)
 */
public static String getDecimalCharacterReferenceString(final int codePoint) {
	final StringBuilder buffer = new StringBuilder();
	try {
		final Appendable encoded = appendDecimalCharacterReferenceString(buffer, codePoint);
		return encoded.toString();
	} catch (IOException ex) {
		throw new RuntimeException(ex); // never happens
	}
}
/**
 * Returns the hexadecimal encoded form of this character reference.
 * <p>
 * This is equivalent to
 * {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString}<code>(</code>{@link #getCodePoint()}<code>)</code>.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.parse("&gt;").getHexadecimalCharacterReferenceString()} returns "{@code &#x3e;}"</dd>
 * </dl>
 *
 * @return the hexadecimal encoded form of this character reference.
 * @see #getCharacterReferenceString()
 * @see #getDecimalCharacterReferenceString()
 */
public String getHexadecimalCharacterReferenceString() {
	return getHexadecimalCharacterReferenceString(this.codePoint);
}
/**
 * Returns the hexadecimal encoded form of the specified unicode code point.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.getHexadecimalCharacterReferenceString('>')} returns "{@code &#x3e;}"</dd>
 * </dl>
 *
 * @param codePoint the unicode code point to encode.
 * @return the hexadecimal encoded form of the specified unicode code point.
 * @see #getCharacterReferenceString(int codePoint)
 * @see #getDecimalCharacterReferenceString(int codePoint)
 */
public static String getHexadecimalCharacterReferenceString(final int codePoint) {
	final StringBuilder buffer = new StringBuilder();
	try {
		final Appendable encoded = appendHexadecimalCharacterReferenceString(buffer, codePoint);
		return encoded.toString();
	} catch (IOException ex) {
		throw new RuntimeException(ex); // never happens
	}
}
/**
 * Returns the unicode code point of this character reference in <code>U+</code> notation.
 * <p>
 * This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.parse("&gt;").getUnicodeText()} returns "{@code U+003E}"</dd>
 * </dl>
 *
 * @return the unicode code point of this character reference in U+ notation.
 * @see #getUnicodeText(int codePoint)
 */
public String getUnicodeText() {
	return getUnicodeText(this.codePoint);
}
/**
 * Returns the specified unicode code point in <code>U+</code> notation.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.getUnicodeText('>')} returns "{@code U+003E}"</dd>
 * </dl>
 *
 * @param codePoint the unicode code point.
 * @return the specified unicode code point in U+ notation.
 */
public static String getUnicodeText(final int codePoint) {
	final StringBuilder buffer = new StringBuilder();
	try {
		final Appendable text = appendUnicodeText(buffer, codePoint);
		return text.toString();
	} catch (IOException ex) {
		throw new RuntimeException(ex); // never happens
	}
}
/**
 * Appends the specified unicode code point in <code>U+</code> notation to the specified appendable object.
 * <p>
 * The code point is written as upper case hexadecimal digits, left-padded with zeros to at least four digits.
 *
 * @param appendable the object to append to.
 * @param codePoint the unicode code point.
 * @return the <code>appendable</code> argument.
 * @throws IOException if the appendable object throws it.
 */
static final Appendable appendUnicodeText(final Appendable appendable, final int codePoint) throws IOException {
	final String hexDigits = Integer.toString(codePoint, 16).toUpperCase();
	appendable.append("U+");
	int padding = 4 - hexDigits.length();
	while (padding > 0) {
		appendable.append('0');
		padding--;
	}
	appendable.append(hexDigits);
	return appendable;
}
/**
 * Parses a single encoded character reference text into a <code>CharacterReference</code> object.
 * <p>
 * The character reference must be at the start of the given text, but may contain other characters at the end.
 * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended.
 * <p>
 * If the text does not represent a valid character reference, this method returns <code>null</code>.
 * <p>
 * Unterminated character references are always accepted, regardless of the settings in the
 * {@linkplain Config#CurrentCompatibilityMode current compatibility mode}.
 * <p>
 * To decode <i>all</i> character references in a given text, use the {@link #decode(CharSequence)} method instead.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.parse("&gt;").getChar()} returns '{@code >}'</dd>
 * </dl>
 *
 * @param characterReferenceText the text containing a single encoded character reference.
 * @return a <code>CharacterReference</code> object representing the specified text, or <code>null</code> if the text does not represent a valid character reference.
 * @see #decode(CharSequence)
 */
public static CharacterReference parse(final CharSequence characterReferenceText) {
	// Wrap the text in its own Source and attempt construction at position 0.
	final Source standaloneSource = new Source(characterReferenceText, true);
	return construct(standaloneSource, 0, Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
}
/**
 * Parses a single encoded character reference text into a unicode code point.
 * <p>
 * The character reference must be at the start of the given text, but may contain other characters at the end.
 * <p>
 * If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}.
 * <p>
 * This is equivalent to {@link #parse(CharSequence) parse(characterReferenceText)}<code>.</code>{@link #getCodePoint()},
 * except that it returns {@link #INVALID_CODE_POINT} if an invalid character reference is specified instead of throwing a
 * <code>NullPointerException</code>.
 * <p>
 * <dl>
 *  <dt>Example:</dt>
 *   <dd>{@code CharacterReference.getCodePointFromCharacterReferenceString("&gt;")} returns <code>62</code></dd>
 * </dl>
 *
 * @param characterReferenceText the text containing a single encoded character reference.
 * @return the unicode code point representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference.
 */
public static int getCodePointFromCharacterReferenceString(final CharSequence characterReferenceText) {
	final CharacterReference parsed = parse(characterReferenceText);
	if (parsed == null) {
		return INVALID_CODE_POINT;
	}
	return parsed.getCodePoint();
}
/**
 * Indicates whether the specified character would need to be encoded in HTML text.
 * <p>
 * This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character,
 * or the unicode code point is greater than U+007F.
 * <p>
 * The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027),
 * which only returns <code>true</code> if the static {@link Config#IsApostropheEncoded} property
 * is currently set to <code>true</code>.
 *
 * @param ch the character to test.
 * @return <code>true</code> if the specified character would need to be encoded in HTML text, otherwise <code>false</code>.
 */
public static final boolean requiresEncoding(final char ch) {
	if (ch > 127) {
		return true;
	}
	if (CharacterEntityReference.getName(ch) == null) {
		return false;
	}
	// An entity name exists; everything but the apostrophe is always encoded.
	return ch != '\'' || Config.IsApostropheEncoded;
}
/**
 * Returns a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
 *
 * @param writer the destination for the encoded text
 * @return a filter <code>Writer</code> that {@linkplain #encode(CharSequence) encodes} all text before passing it through to the specified <code>Writer</code>.
 * @see #encode(CharSequence unencodedText)
 */
public static Writer getEncodingFilterWriter(final Writer writer) {
	final Writer encodingWriter = new EncodingFilterWriter(writer);
	return encodingWriter;
}
private static final class EncodingFilterWriter extends FilterWriter {
StringBuilder sb=new StringBuilder(MAX_ENTITY_REFERENCE_LENGTH);
public EncodingFilterWriter(final Writer writer) {
super(writer);
}
public void write(final char ch) throws IOException {
sb.setLength(0);
appendEncode(sb,ch);
if (sb.length()==1)
out.write(sb.charAt(0));
else
out.append(sb);
}
public void write(final int chInt) throws IOException {
write((char)chInt);
}
public void write(final char[] cbuf, final int off, final int len) throws IOException {
final int end=off+len;
for (int i=off; i "); // add line break
continue;
} else {
spaceCount=TAB_LENGTH;
}
} else {
spaceCount=1;
}
while (nexti=2) {
appendable.append(" "); // use alternating and spaces to keep original number of spaces
spaceCount-=2;
}
// note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line
i=nexti-1; // minus 1 because top level for loop will add it again
}
return appendable;
}
/**
 * Appends the encoded form of a single character, unless it is a white space character that
 * requires white space formatting.
 * <p>
 * A character with a character entity reference name is appended as that entity reference,
 * except for the apostrophe, which is appended as the numeric character reference "{@code &#39;}"
 * if {@link Config#IsApostropheEncoded} is set, and otherwise unencoded.
 * Any other character greater than U+007F is appended as a decimal character reference.
 * All remaining characters are appended unchanged, unless <code>whiteSpaceFormatting</code> is
 * <code>true</code> and the character is white space, in which case nothing is appended.
 *
 * @param appendable the object to append to.
 * @param ch the character to encode.
 * @param whiteSpaceFormatting whether white space characters are left for the caller to convert to markup.
 * @return <code>true</code> if the character was appended, <code>false</code> if it is a white space character that the caller must format.
 * @throws IOException if the appendable object throws it.
 */
private static final boolean appendEncodeCheckForWhiteSpaceFormatting(final Appendable appendable, char ch, final boolean whiteSpaceFormatting) throws IOException {
	final String characterEntityReferenceName=CharacterEntityReference.getName(ch);
	if (characterEntityReferenceName!=null) {
		if (ch=='\'') {
			// &apos; is not defined for use in HTML, so the apostrophe is only ever encoded numerically.
			if (Config.IsApostropheEncoded)
				appendable.append("&#39;");
			else
				appendable.append(ch);
		} else {
			CharacterEntityReference.appendCharacterReferenceString(appendable,characterEntityReferenceName);
		}
	} else if (ch>127) {
		appendDecimalCharacterReferenceString(appendable,ch);
	} else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) {
		appendable.append(ch);
	} else {
		// white space that needs formatting is left to the caller
		return false;
	}
	return true;
}
/**
 * Returns the character reference found by searching backwards from the specified position,
 * accepting all unterminated character references.
 *
 * @param source the source document to search.
 * @param pos the position at which the backward search starts.
 * @return the character reference found, or <code>null</code> if there is none.
 */
static CharacterReference getPrevious(final Source source, final int pos) {
	final Config.UnterminatedCharacterReferenceSettings acceptAll = Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL;
	return getPrevious(source, pos, acceptAll);
}
/**
 * Returns the character reference found by searching forwards from the specified position,
 * accepting all unterminated character references.
 *
 * @param source the source document to search.
 * @param pos the position at which the forward search starts.
 * @return the character reference found, or <code>null</code> if there is none.
 */
static CharacterReference getNext(final Source source, final int pos) {
	final Config.UnterminatedCharacterReferenceSettings acceptAll = Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL;
	return getNext(source, pos, acceptAll);
}
/**
 * Searches backwards through the parse text for ampersands, returning the first position
 * at which a character reference can be constructed.
 *
 * @param source the source document to search.
 * @param pos the position at which the backward search starts.
 * @param unterminatedCharacterReferenceSettings the settings passed to {@link #construct}.
 * @return the character reference found, or <code>null</code> if there is none.
 */
private static CharacterReference getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
	final ParseText parseText = source.getParseText();
	for (int ampPos = parseText.lastIndexOf('&', pos); ampPos != -1; ampPos = parseText.lastIndexOf('&', ampPos - 1)) {
		final CharacterReference found = construct(source, ampPos, unterminatedCharacterReferenceSettings);
		if (found != null) {
			return found;
		}
	}
	return null;
}
/**
 * Searches forwards through the parse text for ampersands, returning the first position
 * at which a character reference can be constructed.
 *
 * @param source the source document to search.
 * @param pos the position at which the forward search starts.
 * @param unterminatedCharacterReferenceSettings the settings passed to {@link #construct}.
 * @return the character reference found, or <code>null</code> if there is none.
 */
private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
	final ParseText parseText = source.getParseText();
	for (int ampPos = parseText.indexOf('&', pos); ampPos != -1; ampPos = parseText.indexOf('&', ampPos + 1)) {
		final CharacterReference found = construct(source, ampPos, unterminatedCharacterReferenceSettings);
		if (found != null) {
			return found;
		}
	}
	return null;
}
/**
 * Appends the hexadecimal numeric character reference for the specified code point,
 * in the form "{@code &#x}<i>hex digits</i>{@code ;}" with lower case digits.
 *
 * @param appendable the object to append to.
 * @param codePoint the unicode code point to encode.
 * @return the value returned by the final <code>append</code> call on the appendable object.
 * @throws IOException if the appendable object throws it.
 */
static final Appendable appendHexadecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException {
	// The "&#x" prefix is what makes the output a hexadecimal character reference.
	return appendable.append("&#x").append(Integer.toString(codePoint,16)).append(';');
}
/**
 * Appends the decimal numeric character reference for the specified code point,
 * in the form "{@code &#}<i>decimal digits</i>{@code ;}".
 *
 * @param appendable the object to append to.
 * @param codePoint the unicode code point to encode.
 * @return the value returned by the final <code>append</code> call on the appendable object.
 * @throws IOException if the appendable object throws it.
 */
static final Appendable appendDecimalCharacterReferenceString(final Appendable appendable, final int codePoint) throws IOException {
	// The "&#" prefix is what makes the output a numeric character reference.
	return appendable.append("&#").append(Integer.toString(codePoint)).append(';');
}
/**
 * Attempts to construct a character reference beginning at the specified position of the source.
 * <p>
 * The character at <code>begin</code> must be an ampersand. If the following character is
 * <code>#</code>, construction is delegated to {@link NumericCharacterReference}, otherwise to
 * {@link CharacterEntityReference}.
 * An <code>IndexOutOfBoundsException</code> raised during construction, such as when reading
 * past the end of the text, is treated as "no character reference here".
 *
 * @param source the source document.
 * @param begin the position of the candidate character reference.
 * @param unterminatedCharacterReferenceSettings the settings governing unterminated character references.
 * @return the character reference at the specified position, or <code>null</code> if there is none.
 */
static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
	try {
		final ParseText parseText = source.getParseText();
		if (parseText.charAt(begin) != '&') {
			return null;
		}
		if (parseText.charAt(begin + 1) == '#') {
			return NumericCharacterReference.construct(source, begin, unterminatedCharacterReferenceSettings);
		}
		return CharacterEntityReference.construct(source, begin, unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
	} catch (IndexOutOfBoundsException ex) {
		return null;
	}
}
/**
 * Appends the decoded form of the specified encoded text to the specified appendable object.
 * <p>
 * The text is iterated as a {@link StreamedSource} with tag handling switched off and the search
 * beginning at <code>pos</code>. Each {@link CharacterReference} segment is appended as its decoded
 * character, and every other segment is appended verbatim.
 *
 * @param appendable the object to append to.
 * @param encodedText the text to decode.
 * @param pos the position passed to {@link StreamedSource#setSearchBegin(int)}.
 * @param insideAttributeValue specifies whether the encoded text is inside an attribute value.
 * @param convertNonBreakingSpaces whether non-breaking spaces are decoded to normal spaces.
 * @return the <code>appendable</code> argument.
 * @throws IOException if the appendable object throws it.
 */
private static Appendable appendDecode(final Appendable appendable, final CharSequence encodedText, int pos, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
	// NOTE(review): these settings are looked up but never handed to the StreamedSource below, so
	// insideAttributeValue currently has no visible effect here - confirm whether they should be applied.
	final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
	final StreamedSource streamedSource=new StreamedSource(encodedText).setHandleTags(false).setSearchBegin(pos);
	for (Segment segment : streamedSource) {
		if (segment instanceof CharacterReference) {
			((CharacterReference)segment).appendCharTo(appendable,convertNonBreakingSpaces);
		} else {
			// benchmark tests reveal (surprisingly) that converting to a string before appending is faster
			// than appending the specified section of the encodedText or the segment directly.
			appendable.append(segment.toString());
		}
	}
	return appendable;
}
// pinched from http://svn.apache.org/repos/asf/abdera/java/trunk/dependencies/i18n/src/main/java/org/apache/abdera/i18n/text/CharUtils.java
/**
 * Returns the UTF-16 high (leading) surrogate of the specified supplementary code point.
 *
 * @param codePoint a unicode code point, expected to be a supplementary code point.
 * @return the high surrogate <code>char</code> value.
 */
private static char getHighSurrogate(int codePoint) {
	// 0xD800 minus (0x10000 >> 10) folds the subtraction of the supplementary offset into the base.
	final int highBase = 0xD800 - (0x10000 >> 10);
	return (char) (highBase + (codePoint >> 10));
}

/**
 * Returns the UTF-16 low (trailing) surrogate of the specified supplementary code point.
 *
 * @param codePoint a unicode code point, expected to be a supplementary code point.
 * @return the low surrogate <code>char</code> value.
 */
private static char getLowSurrogate(int codePoint) {
	// The low surrogate carries the least significant 10 bits of the code point.
	final int lowBits = codePoint & 0x3FF;
	return (char) (0xDC00 + lowBits);
}
}