All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.au.id.jericho.lib.html.CharacterReference Maven / Gradle / Ivy

Go to download

Jericho HTML Parser is a simple but powerful java library allowing analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions.

There is a newer version: 2.3
Show newest version
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 1.5
// Copyright (C) 2004 Martin Jericho
// http://jerichohtml.sourceforge.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.util.*;

/**
 * Represents either a {@link CharacterEntityReference} or {@link NumericCharacterReference}.
 * 

* This class, together with its subclasses, contains static methods to perform most required operations without ever having to instantiate an object. *

* Objects of this class are useful when the positions of character references in a source document are required, * or to replace the found character references with customised text. *

* Objects are created using one of the following methods: *

    *
  • {@link CharacterReference#parse(CharSequence characterReferenceText)} *
  • {@link Source#findNextCharacterReference(int pos)} *
  • {@link Source#findPreviousCharacterReference(int pos)} *
  • {@link Segment#findAllCharacterReferences()} *
*/ public abstract class CharacterReference extends Segment { int codePoint; /** * Represents an invalid Unicode code point. *

* This can be the result of parsing a numeric character reference outside of the valid Unicode range of 0x000000-0x10FFFF, or any other invalid character reference. */ public static final int INVALID_CODE_POINT=-1; /** * The maximum codepoint allowed by unicode, 0x10FFFF (decimal 1114111). * This can be replaced by Character.MAX_CODE_POINT in java 1.5 */ protected static final int MAX_CODE_POINT=0x10FFFF; /** The number of spaces used to simulate a tab when {@linkplain #encodeWithWhiteSpaceFormatting encoding with white space formatting}. */ private static final int TAB_LENGTH=4; CharacterReference(Source source, int begin, int end, int codePoint) { super(source,begin,end); this.codePoint=codePoint; } /** * Returns the Unicode code point represented by this character reference. * @return the Unicode code point represented by this character reference. */ public int getCodePoint() { return codePoint; } /** * Returns the character represented by this character reference. *

* If this character reference represents a Unicode * supplimentary code point, * any bits outside of the least significant 16 bits of the code point are truncated, yielding an incorrect result. * * @return the character represented by this character reference. */ public char getChar() { return (char)codePoint; } /** * Encodes the specified text, escaping special characters into character references. *

* Each character is encoded only if the {@link #requiresEncoding(char)} method would return true for that character, * using its {@link CharacterEntityReference} if available, or a decimal {@link NumericCharacterReference} if their Unicode * code point value is greater than U+007F. *

* The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027), * which depending on the current setting of the static {@link #ApostropheEncoded} property, * is either encoded as the numeric character reference "'" (default setting), or left unencoded. *

* This method will never encode an apostrophe into its character entity reference "'" as this * entity is not defined for use in HTML. See the comments in the {@link CharacterEntityReference} class for more information. *

* To encode text using only numeric character references, use the
* {@link NumericCharacterReference#encode(CharSequence unencodedText)} method instead. * * @param unencodedText the text to encode. * @return the encoded string. * @see #decode(CharSequence encodedText) */ public static String encode(CharSequence unencodedText) { if (unencodedText==null) return null; return appendEncode(new StringBuffer(unencodedText.length()*2),unencodedText,false).toString(); } /** * Determines whether apostrophes are encoded when calling the {@link #encode(CharSequence)} method. *

* This is a global setting which affects all threads. *

* Specifying a value of false means {@linkplain CharacterEntityReference#_apos apostrophe} * (U+0027) characters will not be encoded. * The only time apostrophes need to be encoded is within an attribute value delimited by * single quotes (apostrophes), so in most cases ignoring apostrophes is perfectly safe and * enhances readability of the source document. *

* The recommended setting is false, although the default value is true so that * the behaviour of the {@link #encode(CharSequence)} method is consistent with previous versions. */ public static boolean ApostropheEncoded=true; /** * {@linkplain #encode(CharSequence) Encodes} the specified text, preserving line breaks, tabs and spaces for rendering by converting them to markup. *

* This performs the same encoding as the {@link #encode(CharSequence)} method, but also performs the following conversions: *

    *
  • Line breaks, being Carriage Return (U+000D) or Line Feed (U+000A) characters, and Form Feed characters (U+000C) * are converted to "<br />". CR/LF pairs are treated as a single line break. *
  • Multiple consecutive spaces are converted so that every second space is converted to "&nbsp;" * while ensuring the last is always a normal space. *
  • Tab characters (U+0009) are converted as if they were four consecutive spaces. *
*

* The conversion of multiple consecutive spaces to alternating space/non-breaking-space allows the correct number of * spaces to be rendered, but also allows the line to wrap in the middle of it. *

* Note that zero-width spaces (U+200B) are converted to the numeric character reference * &#x200B; through the normal encoding process, but IE6 does not render them properly * either encoded or unencoded. *

* There is no method provided to reverse this encoding. * * @param unencodedText the text to encode. * @return the encoded string with whitespace formatting converted to markup. * @see #encode(CharSequence unencodedText) */ public static String encodeWithWhiteSpaceFormatting(CharSequence unencodedText) { if (unencodedText==null) return null; return appendEncode(new StringBuffer(unencodedText.length()*2),unencodedText,true).toString(); } /** * Decodes the specified HTML encoded text into normal text. *

* All {@linkplain CharacterEntityReference character entity references} and {@linkplain NumericCharacterReference numeric character references} are converted to their respective characters. *

* The SGML specification allows character references without a terminating semicolon (;) in some circumstances. * Although not permitted in HTML or XHTML, some browsers do accept them.
* The behaviour of this library is as follows: *

    *
  • {@linkplain CharacterEntityReference Character entity references} terminated by any non-alphabetic character are accepted *
  • {@linkplain NumericCharacterReference#encodeDecimal(CharSequence) Decimal numeric character references} terminated by any non-digit character are accepted *
  • {@linkplain NumericCharacterReference#encodeHexadecimal(CharSequence) Hexadecimal numeric character references} must be terminated correctly by a semicolon. *
*

* Although character entity references are case sensitive, and in some cases differ from other entity references only by their case, * some browsers will also recognise them in a case-insensitive way. * For this reason, all decoding methods in this library will recognise character entity references even if they are in the wrong case. * * @param encodedText the text to decode. * @return the decoded string. * @see #encode(CharSequence unencodedText) */ public static String decode(CharSequence encodedText) { if (encodedText==null) return null; String encodedString=encodedText.toString(); int pos=encodedString.indexOf('&'); if (pos==-1) return encodedString; return appendDecode(new StringBuffer(encodedString.length()),encodedString,pos).toString(); } /** * {@linkplain #decode(CharSequence) Decodes} the specified text after collapsing its {@linkplain #isWhiteSpace(char) white space}. *

* All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space. *

* The resultant text is what would normally be rendered by a user agent. * * @param text the source text * @return the decoded text with collapsed white space. * @see FormControl#getPredefinedValues() */ public static String decodeCollapseWhiteSpace(CharSequence text) { return decode(appendCollapseWhiteSpace(new StringBuffer(text.length()),text)); } /** * Re-encodes the specified text, equivalent to {@linkplain #decode(CharSequence) decoding} and then {@linkplain #encode(CharSequence) encoding} again. *

* This process ensures that the specified encoded text does not contain any remaining unencoded characters. *

* IMPLEMENTATION NOTE: At present this method simply calls the {@link #decode(CharSequence) decode} method * followed by the {@link #encode(CharSequence) encode} method, but a more efficient implementation * may be used in future. * * @param encodedText the text to re-encode. * @return the re-encoded string. */ public static String reencode(CharSequence encodedText) { return encode(decode(encodedText)); } /** * Returns the encoded form of this character reference. *

* The exact behaviour of this method depends on the class of this object. * See the {@link CharacterEntityReference#getCharacterReferenceString()} and * {@link NumericCharacterReference#getCharacterReferenceString()} methods for more details. *

*

*
Examples:
*
CharacterReference.parse("&GT;").getCharacterReferenceString() returns "&gt;"
*
CharacterReference.parse("&#x3E;").getCharacterReferenceString() returns "&#3e;"
*
* * @return the encoded form of this character reference. * @see #getCharacterReferenceString(int codePoint) * @see #getDecimalCharacterReferenceString() */ public abstract String getCharacterReferenceString(); /** * Returns the encoded form of the specified Unicode code point. *

* This method returns the {@linkplain CharacterEntityReference#getCharacterReferenceString(int) character entity reference} encoded form of the code point * if one exists, otherwise it returns the {@linkplain #getDecimalCharacterReferenceString(int) decimal numeric character reference} encoded form. *

* The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027), * which is encoded as the numeric character reference "&#39;" instead of its character entity reference * "&apos;". *

*

*
Examples:
*
CharacterReference.getCharacterReferenceString(62) returns "&gt;"
*
CharacterReference.getCharacterReferenceString('>') returns "&gt;"
*
CharacterReference.getCharacterReferenceString('☺') returns "&#9786;"
*
* * @param codePoint the Unicode code point to encode. * @return the encoded form of the specified Unicode code point. * @see #getHexadecimalCharacterReferenceString(int codePoint) */ public static String getCharacterReferenceString(int codePoint) { String characterReferenceString=null; if (codePoint!=CharacterEntityReference._apos) characterReferenceString=CharacterEntityReference.getCharacterReferenceString(codePoint); if (characterReferenceString==null) characterReferenceString=NumericCharacterReference.getCharacterReferenceString(codePoint); return characterReferenceString; } /** * Returns the decimal encoded form of this character reference. *

* This is equivalent to {@link #getDecimalCharacterReferenceString(int) getDecimalCharacterReferenceString(getCodePoint())}. *

*

*
Example:
*
CharacterReference.parse("&gt;").getDecimalCharacterReferenceString() returns "&#62;"
*
* * @return the decimal encoded form of this character reference. * @see #getCharacterReferenceString() * @see #getHexadecimalCharacterReferenceString() */ public String getDecimalCharacterReferenceString() { return getDecimalCharacterReferenceString(codePoint); } /** * Returns the decimal encoded form of the specified Unicode code point. *

*

*
Example:
*
CharacterReference.getDecimalCharacterReferenceString('>') returns "&#62;"
*
* * @param codePoint the Unicode code point to encode. * @return the decimal encoded form of the specified Unicode code point. * @see #getCharacterReferenceString(int codePoint) * @see #getHexadecimalCharacterReferenceString(int codePoint) */ public static String getDecimalCharacterReferenceString(int codePoint) { return appendDecimalCharacterReferenceString(new StringBuffer(),codePoint).toString(); } /** * Returns the hexadecimal encoded form of this character reference. *

* This is equivalent to {@link #getHexadecimalCharacterReferenceString(int) getHexadecimalCharacterReferenceString(getCodePoint())}. *

*

*
Example:
*
CharacterReference.parse("&gt;").getHexadecimalCharacterReferenceString() returns "&#x3e;"
*
* * @return the hexadecimal encoded form of this character reference. * @see #getCharacterReferenceString() * @see #getDecimalCharacterReferenceString() */ public String getHexadecimalCharacterReferenceString() { return getHexadecimalCharacterReferenceString(codePoint); } /** * Returns the hexadecimal encoded form of the specified Unicode code point. *

*

*
Example:
*
CharacterReference.getHexadecimalCharacterReferenceString('>') returns "&#x3e;"
*
* * @param codePoint the Unicode code point to encode. * @return the hexadecimal encoded form of the specified Unicode code point. * @see #getCharacterReferenceString(int codePoint) * @see #getDecimalCharacterReferenceString(int codePoint) */ public static String getHexadecimalCharacterReferenceString(int codePoint) { return appendHexadecimalCharacterReferenceString(new StringBuffer(),codePoint).toString(); } /** * Returns the Unicode code point of this character reference in U+ notation. *

* This is equivalent to {@link #getUnicodeText(int) getUnicodeText(getCodePoint())}. *

*

*
Example:
*
CharacterReference.parse("&gt;").getUnicodeText() returns "U+003E"
*
* * @return the Unicode code point of this character reference in U+ notation. * @see #getUnicodeText(int codePoint) */ public String getUnicodeText() { return getUnicodeText(codePoint); } /** * Returns the specified Unicode code point in U+ notation. *

*

*
Example:
*
CharacterReference.getUnicodeText('>') returns "U+003E"
*
* * @param codePoint the Unicode code point. * @return the specified Unicode code point in U+ notation. */ public static String getUnicodeText(int codePoint) { return appendUnicodeText(new StringBuffer(),codePoint).toString(); } protected static final StringBuffer appendUnicodeText(StringBuffer sb, int codePoint) { sb.append("U+"); String hex=Integer.toString(codePoint,16).toUpperCase(); for (int i=4-hex.length(); i>0; i--) sb.append('0'); sb.append(hex); return sb; } /** * Parses a single encoded character reference text into a CharacterReference object. *

* The character reference must be at the start of the given text, but may contain other characters at the end. * The {@link #getEnd() getEnd()} method can be used on the resulting object to determine at which character position the character reference ended. *

* If the text does not represent a valid character reference, this method returns null. *

* To decode all character references in a given text, use the {@link #decode(CharSequence encodedText)} method instead. *

*

*
Example:
*
CharacterReference.parse("&gt;").getChar() returns '>'
*
* * @param characterReferenceText the text containing a single encoded character reference. * @return a CharacterReference object representing the specified text, or null if the text does not represent a valid character reference. * @see #decode(CharSequence encodedText) */ public static CharacterReference parse(CharSequence characterReferenceText) { return construct(new Source(characterReferenceText.toString()),0); } /** * Parses a single encoded character reference text into a Unicode code point. *

* The character reference must be at the start of the given text, but may contain other characters at the end. *

* If the text does not represent a valid character reference, this method returns {@link #INVALID_CODE_POINT}. *

*

*
Example:
*
CharacterReference.getCodePointFromCharacterReferenceString("&gt;") returns 38
*
* * @param characterReferenceText the text containing a single encoded character reference. * @return the Unicode code point representing representing the specified text, or {@link #INVALID_CODE_POINT} if the text does not represent a valid character reference. */ public static int getCodePointFromCharacterReferenceString(CharSequence characterReferenceText) { CharacterReference characterReference=parse(characterReferenceText); return (characterReference!=null) ? characterReference.getCodePoint() : INVALID_CODE_POINT; } /** * Indicates whether the specified character would need to be encoded in HTML text. *

* This is the case if a {@linkplain CharacterEntityReference character entity reference} exists for the character, or the Unicode code point value is greater than U+007F. *

* The only exception to this is an {@linkplain CharacterEntityReference#_apos apostrophe} (U+0027), * which only returns true if the static {@link #ApostropheEncoded} property is currently set to true. * * @param ch the character to be tested. * @return true if the specified character would need to be encoded in HTML text, otherwise false. */ public static final boolean requiresEncoding(char ch) { return ch>127 || (CharacterEntityReference.getName(ch)!=null && (ch!='\'' || ApostropheEncoded)); } protected static StringBuffer appendEncode(StringBuffer sb, CharSequence unencodedText, boolean whiteSpaceFormatting) { if (unencodedText==null) return sb; int beginPos=0; int endPos=unencodedText.length(); if (unencodedText instanceof Segment) { // this might improve performance slightly Segment segment=(Segment)unencodedText; int segmentOffset=segment.getBegin(); beginPos=segmentOffset; endPos+=segmentOffset; unencodedText=segment.source.toString(); } for (int i=beginPos; i127) { appendDecimalCharacterReferenceString(sb,ch); } else if (!(whiteSpaceFormatting && isWhiteSpace(ch))) { sb.append(ch); } else { // whiteSpaceFormatting tries to simulate the formatting characters by converting them to markup int spaceCount; int nexti=i+1; if (ch!=' ') { if (ch!='\t') { // must be line feed, carriage return or form feed, since zero-width space should have been processed as a character reference string if (ch=='\r' && nexti"); // add line break continue; } else { spaceCount=TAB_LENGTH; } } else { spaceCount=1; } while (nexti=2) { sb.append("  "); // use alternating   and spaces to keep original number of spaces spaceCount-=2; } // note that the last character is never a nbsp, so that word wrapping won't result in a nbsp before the first character in a line i=nexti-1; // minus 1 because top level for loop will add it again } } return sb; } static CharacterReference findPreviousOrNext(Source source, int pos, boolean previous) { String lsource=source.getParseTextLowerCase(); pos=previous ? lsource.lastIndexOf('&',pos) : lsource.indexOf('&',pos); while (pos!=-1) { CharacterReference characterReference=construct(source,pos); if (characterReference!=null) return characterReference; pos=previous ? lsource.lastIndexOf('&',pos-1) : lsource.indexOf('&',pos+1); } return null; } protected static final StringBuffer appendHexadecimalCharacterReferenceString(StringBuffer sb, int codePoint) { return sb.append("&#x").append(Integer.toString(codePoint,16)).append(';'); } protected static final StringBuffer appendDecimalCharacterReferenceString(StringBuffer sb, int codePoint) { return sb.append("&#").append(codePoint).append(';'); } static CharacterReference construct(Source source, int begin) { try { if (source.getParseTextLowerCase().charAt(begin)!='&') return null; return (source.getParseTextLowerCase().charAt(begin+1)=='#') ? NumericCharacterReference.construct(source,begin) : CharacterEntityReference.construct(source,begin); } catch (IndexOutOfBoundsException ex) { return null; } } private static StringBuffer appendDecode(StringBuffer sb, String encodedString) { if (encodedString==null) return sb; int pos=encodedString.indexOf('&'); if (pos==-1) return sb.append(encodedString); return appendDecode(sb,encodedString,pos); } private static StringBuffer appendDecode(StringBuffer sb, String encodedString, int pos) { int lastEnd=0; Source source=new Source(encodedString); while (true) { CharacterReference characterReference=findPreviousOrNext(source,pos,false); if (characterReference==null) break; if (lastEnd!=characterReference.getBegin()) Util.appendTo(sb,encodedString,lastEnd,characterReference.getBegin()); sb.append((char)characterReference.codePoint); pos=lastEnd=characterReference.getEnd(); } if (lastEnd!=encodedString.length()) Util.appendTo(sb,encodedString,lastEnd,encodedString.length()); return sb; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy