All downloads are free. The search and download functionality uses the official Maven repository.

src.java.net.htmlparser.jericho.NumericCharacterReference Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a Java library allowing analysis and manipulation of parts of an HTML document, including server-side tags, while reproducing verbatim any unrecognised or invalid HTML.

There is a newer version: 3.4
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.0
// Copyright (C) 2007 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;

/**
 * Represents an HTML Numeric Character Reference.
 * 

* A numeric character reference can be one of two types: *

*
Decimal Character Reference *
A numeric character reference specifying the unicode code point in decimal notation.
* This is signified by the absence of an 'x' character after the '#', (eg ">"). *
Hexadecimal Character Reference *
A numeric character reference specifying the unicode code point in hexadecimal notation.
* This is signified by the presence of an 'x' character after the '#', (eg ">"). *
*

* Static methods to {@linkplain #encode(CharSequence) encode} and {@linkplain #decode(CharSequence) decode} strings * and single characters can be found in the {@link CharacterReference} superclass. *

* NumericCharacterReference instances are obtained using one of the following methods: *

    *
  • {@link CharacterReference#parse(CharSequence characterReferenceText)} *
  • {@link Source#getNextCharacterReference(int pos)} *
  • {@link Source#getPreviousCharacterReference(int pos)} *
  • {@link Segment#getAllCharacterReferences()} *
* * @see CharacterReference * @see CharacterEntityReference */ public class NumericCharacterReference extends CharacterReference { private boolean hex; private NumericCharacterReference(final Source source, final int begin, final int end, final int codePoint, final boolean hex) { super(source,begin,end,codePoint); this.hex=hex; } /** * Indicates whether this numeric character reference specifies the unicode code point in decimal format. *

* A numeric character reference in decimal format is referred to in this library as a * decimal character reference. * * @return true if this numeric character reference specifies the unicode code point in decimal format, otherwise false. * @see #isHexadecimal() */ public boolean isDecimal() { return !hex; } /** * Indicates whether this numeric character reference specifies the unicode code point in hexadecimal format. *

* A numeric character reference in hexadecimal format is referred to in this library as a * hexadecimal character reference. * * @return true if this numeric character reference specifies the unicode code point in hexadecimal format, otherwise false. * @see #isDecimal() */ public boolean isHexadecimal() { return hex; } /** * Encodes the specified text, escaping special characters into numeric character references. *

* Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return true for that character. *

* This method encodes all character references in decimal format, and is exactly the same as calling * {@link #encodeDecimal(CharSequence)}. *

* To encode text using both character entity references and numeric character references, use the
* {@link CharacterReference#encode(CharSequence)} method instead. *

* To encode text using hexadecimal character references only, * use the {@link #encodeHexadecimal(CharSequence)} method instead. * * @param unencodedText the text to encode. * @return the encoded string. * @see #decode(CharSequence) */ public static String encode(final CharSequence unencodedText) { if (unencodedText==null) return null; final StringBuilder sb=new StringBuilder(unencodedText.length()*2); for (int i=0; idecimal character references. *

* Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return true for that character. *

* To encode text using both character entity references and numeric character references, use the
* {@link CharacterReference#encode(CharSequence)} method instead. *

* To encode text using hexadecimal character references only, * use the {@link #encodeHexadecimal(CharSequence)} method instead. * * @param unencodedText the text to encode. * @return the encoded string. * @see #decode(CharSequence) */ public static String encodeDecimal(final CharSequence unencodedText) { return encode(unencodedText); } /** * Encodes the specified text, escaping special characters into hexadecimal character references. *

* Each character is encoded only if the {@link #requiresEncoding(char) requiresEncoding(char)} method would return true for that character. *

* To encode text using both character entity references and numeric character references, use the
* {@link CharacterReference#encode(CharSequence)} method instead. *

* To encode text using decimal character references only, * use the {@link #encodeDecimal(CharSequence)} method instead. * * @param unencodedText the text to encode. * @return the encoded string. * @see #decode(CharSequence) */ public static String encodeHexadecimal(final CharSequence unencodedText) { if (unencodedText==null) return null; final StringBuilder sb=new StringBuilder(unencodedText.length()*2); for (int i=0; i * The returned string uses the same radix as the original character reference in the source document, * i.e. decimal format if {@link #isDecimal()} is true, and hexadecimal format if {@link #isHexadecimal()} is true. *

* Note that the returned string is not necessarily the same as the original source text used to create this object. * This library recognises certain invalid forms of character references, * as detailed in the {@link #decode(CharSequence) decode(CharSequence)} method. *

* To retrieve the original source text, use the {@link #toString() toString()} method instead. *

*

*
Example:
*
CharacterReference.parse("&#62").getCharacterReferenceString() returns ">"
*
* * @return the correct encoded form of this numeric character reference. * @see CharacterReference#getCharacterReferenceString(int codePoint) */ public String getCharacterReferenceString() { return hex ? getHexadecimalCharacterReferenceString(codePoint) : getDecimalCharacterReferenceString(codePoint); } /** * Returns the numeric character reference encoded form of the specified unicode code point. *

* This method returns the character reference in decimal format, and is exactly the same as calling * {@link #getDecimalCharacterReferenceString(int codePoint)}. *

* To get either the character entity reference or numeric character reference, use the
* {@link CharacterReference#getCharacterReferenceString(int codePoint)} method instead. *

* To get the character reference in hexadecimal format, use the {@link #getHexadecimalCharacterReferenceString(int codePoint)} method instead. *

*

*
Examples:
*
NumericCharacterReference.getCharacterReferenceString(62) returns ">"
*
NumericCharacterReference.getCharacterReferenceString('>') returns ">"
*
* * @return the numeric character reference encoded form of the specified unicode code point. * @see CharacterReference#getCharacterReferenceString(int codePoint) */ public static String getCharacterReferenceString(final int codePoint) { return getDecimalCharacterReferenceString(codePoint); } static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) { // only called from CharacterReference.construct(), so we can assume that first characters are "&#" final ParseText parseText=source.getParseText(); int codePointStringBegin=begin+2; boolean hex; if (hex=(parseText.charAt(codePointStringBegin)=='x')) codePointStringBegin++; final int unterminatedMaxCodePoint=hex ? unterminatedCharacterReferenceSettings.hexadecimalCharacterReferenceMaxCodePoint : unterminatedCharacterReferenceSettings.decimalCharacterReferenceMaxCodePoint; final int maxSourcePos=parseText.length()-1; String codePointString; int end; int x=codePointStringBegin; boolean unterminated=false; while (true) { final char ch=parseText.charAt(x); if (ch==';') { end=x+1; codePointString=parseText.substring(codePointStringBegin,x); break; } if ((ch<'0' || ch>'9') && (!hex || ch<'a' || ch>'f')) { // At this point we were either expecting a decimal digit (if hex is false), or a hexadecimal digit (if hex is true), // but have found something else, meaning the character reference is unterminated. unterminated=true; } else if (x==maxSourcePos) { // At this point, we have a valid digit but are at the last position in the source text without the terminating semicolon. unterminated=true; x++; // include this digit } if (unterminated) { // Different browsers react differently to unterminated numeric character references. // The behaviour of this method is determined by the settings in the unterminatedCharacterReferenceSettings parameter. 
if (unterminatedMaxCodePoint==INVALID_CODE_POINT) { // reject: return null; } else { // accept: end=x; codePointString=parseText.substring(codePointStringBegin,x); break; } } x++; } if (codePointString.length()==0) return null; int codePoint=INVALID_CODE_POINT; try { codePoint=Integer.parseInt(codePointString,hex?16:10); if (unterminated && codePoint>unterminatedMaxCodePoint) return null; if (codePoint>Character.MAX_CODE_POINT) codePoint=INVALID_CODE_POINT; } catch (NumberFormatException ex) { // This should only happen if number is larger than Integer.MAX_VALUE. if (unterminated) return null; // If it is a terminated reference just ignore the exception as codePoint will remain with its value of INVALID_CODE_POINT. } return new NumericCharacterReference(source,begin,end,codePoint,hex); } public String getDebugInfo() { final StringBuilder sb=new StringBuilder(); sb.append('"'); try { if (hex) appendHexadecimalCharacterReferenceString(sb,codePoint); else appendDecimalCharacterReferenceString(sb,codePoint); sb.append("\" "); appendUnicodeText(sb,codePoint); } catch (IOException ex) {throw new RuntimeException(ex);} // never happens sb.append(' ').append(super.getDebugInfo()); return sb.toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy