// net.htmlparser.jericho.NumericCharacterReference (jericho-html artifact; Maven / Gradle / Ivy)
// Repository page links: "Show all versions of jericho-html", "Show documentation"
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.3
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
package net.htmlparser.jericho;
import java.util.*;
import java.io.*;
/**
 * Represents an HTML Numeric Character Reference.
 * <p>
 * A numeric character reference can be one of two types:
 * <dl>
 *  <dt>Decimal Character Reference</dt>
 *   <dd>A numeric character reference specifying the unicode code point in decimal notation.
 *    This is signified by the absence of an '{@code x}' character after the '{@code #}', (eg "{@code &#62;}").</dd>
 *  <dt>Hexadecimal Character Reference</dt>
 *   <dd>A numeric character reference specifying the unicode code point in hexadecimal notation.
 *    This is signified by the presence of an '{@code x}' character after the '{@code #}', (eg "{@code &#x3e;}").</dd>
 * </dl>
 * <p>
 * Static methods to {@linkplain #encode(CharSequence) encode} and {@linkplain #decode(CharSequence) decode} strings
 * and single characters can be found in the {@link CharacterReference} superclass.
 * <p>
 * {@code NumericCharacterReference} instances are obtained using one of the following methods:
 * <ul>
 *  <li>{@link CharacterReference#parse(CharSequence characterReferenceText)}</li>
 *  <li>{@link Source#getNextCharacterReference(int pos)}</li>
 *  <li>{@link Source#getPreviousCharacterReference(int pos)}</li>
 *  <li>{@link Segment#getAllCharacterReferences()}</li>
 * </ul>
 *
 * @see CharacterReference
 * @see CharacterEntityReference
 */
public class NumericCharacterReference extends CharacterReference {
/** {@code true} if this reference uses hexadecimal notation, {@code false} if it uses decimal notation. */
private final boolean hex;

/**
 * Creates a numeric character reference.
 * <p>
 * The constructor is private; within this class it is invoked only by
 * {@code construct(Source, int, Config.UnterminatedCharacterReferenceSettings)}.
 *
 * @param source  passed through to the {@link CharacterReference} constructor.
 * @param begin  passed through to the {@link CharacterReference} constructor.
 * @param end  passed through to the {@link CharacterReference} constructor.
 * @param codePoint  passed through to the {@link CharacterReference} constructor.
 * @param hex  {@code true} if the reference is in hexadecimal notation, {@code false} if decimal.
 */
private NumericCharacterReference(final Source source, final int begin, final int end, final int codePoint, final boolean hex) {
    super(source,begin,end,codePoint);
    this.hex=hex;
}
/**
 * Indicates whether this numeric character reference specifies the unicode code point in decimal format.
 * <p>
 * A numeric character reference in decimal format is referred to in this library as a
 * decimal character reference.
 *
 * @return {@code true} if this numeric character reference specifies the unicode code point in decimal format, otherwise {@code false}.
 * @see #isHexadecimal()
 */
public boolean isDecimal() {
    // Only two notations exist, so "decimal" is simply "not hexadecimal".
    return !this.hex;
}
/**
 * Indicates whether this numeric character reference specifies the unicode code point in hexadecimal format.
 * <p>
 * A numeric character reference in hexadecimal format is referred to in this library as a
 * hexadecimal character reference.
 *
 * @return {@code true} if this numeric character reference specifies the unicode code point in hexadecimal format, otherwise {@code false}.
 * @see #isDecimal()
 */
public boolean isHexadecimal() {
    return this.hex;
}
/**
* Encodes the specified text, escaping special characters into numeric character references.
*
* Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return true
for that character.
*
* This method encodes all character references in decimal format, and is exactly the same as calling
* {@link #encodeDecimal(CharSequence)}.
*
* To encode text using both character entity references and numeric character references, use the
* {@link CharacterReference#encode(CharSequence)} method instead.
*
* To encode text using hexadecimal character references only,
* use the {@link #encodeHexadecimal(CharSequence)} method instead.
*
* @param unencodedText the text to encode.
* @return the encoded string.
* @see #decode(CharSequence)
*/
public static String encode(final CharSequence unencodedText) {
if (unencodedText==null) return null;
final StringBuilder sb=new StringBuilder(unencodedText.length()*2);
for (int i=0; idecimal character references.
*
* Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return true
for that character.
*
* To encode text using both character entity references and numeric character references, use the
* {@link CharacterReference#encode(CharSequence)} method instead.
*
* To encode text using hexadecimal character references only,
* use the {@link #encodeHexadecimal(CharSequence)} method instead.
*
* @param unencodedText the text to encode.
* @return the encoded string.
* @see #decode(CharSequence)
*/
public static String encodeDecimal(final CharSequence unencodedText) {
return encode(unencodedText);
}
/**
* Encodes the specified text, escaping special characters into hexadecimal character references.
*
* Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return true
for that character.
*
* To encode text using both character entity references and numeric character references, use the
* {@link CharacterReference#encode(CharSequence)} method instead.
*
* To encode text using decimal character references only,
* use the {@link #encodeDecimal(CharSequence)} method instead.
*
* @param unencodedText the text to encode.
* @return the encoded string.
* @see #decode(CharSequence)
*/
public static String encodeHexadecimal(final CharSequence unencodedText) {
if (unencodedText==null) return null;
final StringBuilder sb=new StringBuilder(unencodedText.length()*2);
for (int i=0; i
* The returned string uses the same radix as the original character reference in the source document,
* i.e. decimal format if {@link #isDecimal()} is true
, and hexadecimal format if {@link #isHexadecimal()} is true
.
*
* Note that the returned string is not necessarily the same as the original source text used to create this object.
* This library recognises certain invalid forms of character references,
* as detailed in the {@link #decode(CharSequence) decode(CharSequence)} method.
*
* To retrieve the original source text, use the {@link #toString() toString()} method instead.
*
*
* - Example:
* CharacterReference.parse(">").getCharacterReferenceString()
returns ">
"
*
*
* @return the correct encoded form of this numeric character reference.
* @see CharacterReference#getCharacterReferenceString(int codePoint)
*/
public String getCharacterReferenceString() {
return hex ? getHexadecimalCharacterReferenceString(codePoint) : getDecimalCharacterReferenceString(codePoint);
}
/**
 * Returns the numeric character reference encoded form of the specified unicode code point.
 * <p>
 * This method returns the character reference in decimal format, and is exactly the same as calling
 * {@link #getDecimalCharacterReferenceString(int codePoint)}.
 * <p>
 * To get either the character entity reference or numeric character reference, use the
 * {@link CharacterReference#getCharacterReferenceString(int codePoint)} method instead.
 * <p>
 * To get the character reference in hexadecimal format, use the {@link #getHexadecimalCharacterReferenceString(int codePoint)} method instead.
 * <dl>
 *  <dt>Examples:</dt>
 *   <dd>{@code NumericCharacterReference.getCharacterReferenceString(62)} returns "{@code &#62;}"</dd>
 *   <dd>{@code NumericCharacterReference.getCharacterReferenceString('>')} returns "{@code &#62;}"</dd>
 * </dl>
 *
 * @param codePoint  the unicode code point to encode.
 * @return the numeric character reference encoded form of the specified unicode code point.
 * @see CharacterReference#getCharacterReferenceString(int codePoint)
 */
public static String getCharacterReferenceString(final int codePoint) {
    // Decimal notation is the default numeric form; delegate straight to the decimal helper.
    final String decimalForm=getDecimalCharacterReferenceString(codePoint);
    return decimalForm;
}
/**
 * Attempts to build a numeric character reference from the source text starting at {@code begin}.
 * <p>
 * The digits following the prefix (and the optional {@code x} marking hexadecimal notation) are scanned
 * up to a terminating semicolon. If a non-digit character or the last position of the source is reached first,
 * the reference is treated as unterminated and is accepted or rejected according to
 * {@code unterminatedCharacterReferenceSettings}.
 *
 * @param source  the source document containing the candidate reference.
 * @param begin  the position in the source at which the candidate reference begins.
 * @param unterminatedCharacterReferenceSettings  supplies the maximum code point accepted for unterminated
 *  decimal and hexadecimal references; a maximum equal to {@code INVALID_CODE_POINT} rejects all unterminated references.
 * @return the parsed reference, or {@code null} if there are no digits or an unterminated reference is rejected.
 *  A terminated reference whose value is larger than {@link Character#MAX_CODE_POINT}, or too large to parse,
 *  is returned with a code point of {@code INVALID_CODE_POINT}.
 */
static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
    // Only called from CharacterReference.construct(), so the two characters at begin are assumed to be the "&#" prefix.
    final ParseText parseText=source.getParseText();
    final boolean hex=parseText.charAt(begin+2)=='x';
    final int digitsBegin=hex ? begin+3 : begin+2; // skip the 'x' marker when present
    final int unterminatedMaxCodePoint=hex
        ? unterminatedCharacterReferenceSettings.hexadecimalCharacterReferenceMaxCodePoint
        : unterminatedCharacterReferenceSettings.decimalCharacterReferenceMaxCodePoint;
    final int lastSourcePos=source.end-1;

    // Scan forward until the terminating semicolon, a non-digit character, or the end of the source.
    // On exit, pos is the exclusive end of the digit string.
    int pos=digitsBegin;
    boolean unterminated;
    while (true) {
        final char ch=parseText.charAt(pos);
        if (ch==';') {
            unterminated=false;
            break;
        }
        final boolean decimalDigit=ch>='0' && ch<='9';
        final boolean hexLetter=(ch>='a' && ch<='f') || (ch>='A' && ch<='F');
        if (!decimalDigit && !(hex && hexLetter)) {
            // Not a valid digit for this radix, meaning the character reference is unterminated.
            unterminated=true;
            break;
        }
        if (pos==lastSourcePos) {
            // Last position in the source text without a terminating semicolon: include this digit and stop.
            pos++;
            unterminated=true;
            break;
        }
        pos++;
    }

    // Different browsers react differently to unterminated numeric character references,
    // so whether they are accepted at all is decided by the supplied settings.
    if (unterminated && unterminatedMaxCodePoint==INVALID_CODE_POINT) return null;

    final int end=unterminated ? pos : pos+1; // a terminated reference also covers the semicolon
    final String digits=source.substring(digitsBegin,pos);
    if (digits.length()==0) return null;

    int parsedCodePoint;
    try {
        parsedCodePoint=Integer.parseInt(digits,hex ? 16 : 10);
    } catch (NumberFormatException ex) {
        // This should only happen if the number is larger than Integer.MAX_VALUE.
        if (unterminated) return null;
        // A terminated reference is still recognised, but carries an invalid code point.
        return new NumericCharacterReference(source,begin,end,INVALID_CODE_POINT,hex);
    }
    if (unterminated && parsedCodePoint>unterminatedMaxCodePoint) return null;
    final int codePoint=parsedCodePoint>Character.MAX_CODE_POINT ? INVALID_CODE_POINT : parsedCodePoint;
    return new NumericCharacterReference(source,begin,end,codePoint,hex);
}
/**
 * Returns a string describing this numeric character reference, intended for debugging.
 * <p>
 * The result consists of the encoded form of the reference in double quotes (hexadecimal or decimal,
 * matching the notation of this reference), followed by the text appended by {@code appendUnicodeText}
 * for the code point, followed by the debug information of the superclass.
 *
 * @return a string containing debugging information about this numeric character reference.
 */
@Override
public String getDebugInfo() {
    final StringBuilder sb=new StringBuilder();
    sb.append('"');
    try {
        // Use the same radix as the reference itself so the output mirrors its notation.
        if (hex)
            appendHexadecimalCharacterReferenceString(sb,codePoint);
        else
            appendDecimalCharacterReferenceString(sb,codePoint);
        sb.append("\" ");
        appendUnicodeText(sb,codePoint);
    } catch (IOException ex) {throw new RuntimeException(ex);} // never happens
    sb.append(' ').append(super.getDebugInfo());
    return sb.toString();
}
}