// org.htmlparser.util.Translate (Maven / Gradle / Ivy artifact listing)
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of bboss-htmlparser. Show documentation.
// bboss is a J2EE framework that includes AOP/IoC, MVC, persistence, taglib, RPC, events, bean-XML serialization and so on. http://www.bbossgroups.com
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Translate.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:33 $
// $Revision: 1.46 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import org.htmlparser.util.sort.Sort;
/**
 * Character reference whose kernel is a window onto a larger string.
 * The window is delimited by a start and an end index, so a candidate
 * kernel can be compared against known references without first copying
 * it out of the surrounding text. Intended for lookup purposes only.
 */
class CharacterReferenceEx extends CharacterReference
{
    /**
     * Index of the first kernel character within the backing string.
     */
    protected int mStart;

    /**
     * Index one past the last character that may belong to the kernel.
     */
    protected int mEnd;

    /**
     * Zero args constructor.
     * Passes an empty kernel and a zero character to the superclass; the
     * object is only meaningful once the kernel, start and end are set.
     */
    public CharacterReferenceEx ()
    {
        super ("", 0);
    }

    /**
     * Set the index at which the kernel begins.
     * @param start The index of the first kernel character.
     */
    public void setStart (int start)
    {
        this.mStart = start;
    }

    /**
     * Set the index at which the kernel is assumed to end.
     * This is only an upper bound on the kernel length.
     * @param end The index one past the last possible kernel character.
     */
    public void setEnd (int end)
    {
        this.mEnd = end;
    }

    /**
     * Get the portion of the backing string between start and end.
     * (mKernel is not declared in this class; presumably it is inherited
     * from CharacterReference.)
     * @return The kernel window as a new string.
     */
    public String getKernel ()
    {
        final String window = mKernel.substring (mStart, mEnd);
        return (window);
    }

    //
    // Ordered interface
    //

    /**
     * Compare the kernel window to another reference's kernel.
     * Characters are compared pairwise from the start of the window.
     * The result is the difference of the first pair that differs, or 1 if
     * the other kernel runs out first. If the whole window is consumed
     * without a difference the result is 0, even when the other kernel has
     * characters left over.
     * @param that The <code>CharacterReference</code> to compare against.
     * @return Negative, zero or positive as described above.
     * @see org.htmlparser.util.sort.Ordered
     */
    public int compare (Object that)
    {
        final String other = ((CharacterReference)that).getKernel ();
        final int limit = other.length ();
        int position = 0;
        for (int cursor = mStart; cursor < mEnd; cursor++)
        {
            // the other kernel is exhausted while the window still has characters
            if (position >= limit)
                return (1);
            final int difference = mKernel.charAt (cursor) - other.charAt (position);
            if (0 != difference)
                return (difference);
            position++;
        }
        return (0);
    }
}
/**
* Translate numeric character references and character entity references to unicode characters.
* Based on tables found at
* http://www.w3.org/TR/REC-html40/sgml/entities.html
* Typical usage:
*
* String s = Translate.decode (getTextFromHtmlPage ());
*
* or
*
* String s = "<HTML>" + Translate.encode (getArbitraryText ()) + "</HTML>";
*
*/
public class Translate
{
/**
 * If this member is set <code>true</code>, decoding of streams is
 * done line by line in order to reduce the maximum memory required.
 * Defaults to <code>false</code>.
 */
static public boolean DECODE_LINE_BY_LINE = false;
/**
 * If this member is set <code>true</code>, encoding of numeric character
 * references uses hexadecimal digits, i.e. &amp;#x25CB;, instead of decimal
 * digits.
 * Defaults to <code>false</code>.
 */
static public boolean ENCODE_HEXADECIMAL = false;
/**
 * Table mapping entity reference kernel to character.
 * The entries are written here in the order of the HTML 4 entity sets
 * (Latin-1, symbols and Greek, special characters); the array is re-sorted
 * into kernel order when the class is loaded.
 */
protected static final CharacterReference[] mCharacterReferences =
{
// Portions © International Organization for Standardization 1986
// Permission to copy in any form is granted for use with
// conforming SGML systems and applications as defined in
// ISO 8879, provided this notice is included in all copies.
// Character entity set. Typical invocation:
//
// %HTMLlat1;
new CharacterReference ("nbsp", '\u00a0'), // no-break space = non-breaking space, U+00A0 ISOnum
new CharacterReference ("iexcl", '\u00a1'), // inverted exclamation mark, U+00A1 ISOnum
new CharacterReference ("cent", '\u00a2'), // cent sign, U+00A2 ISOnum
new CharacterReference ("pound", '\u00a3'), // pound sign, U+00A3 ISOnum
new CharacterReference ("curren", '\u00a4'), // currency sign, U+00A4 ISOnum
new CharacterReference ("yen", '\u00a5'), // yen sign = yuan sign, U+00A5 ISOnum
new CharacterReference ("brvbar", '\u00a6'), // broken bar = broken vertical bar, U+00A6 ISOnum
new CharacterReference ("sect", '\u00a7'), // section sign, U+00A7 ISOnum
new CharacterReference ("uml", '\u00a8'), // diaeresis = spacing diaeresis, U+00A8 ISOdia
new CharacterReference ("copy", '\u00a9'), // copyright sign, U+00A9 ISOnum
new CharacterReference ("ordf", '\u00aa'), // feminine ordinal indicator, U+00AA ISOnum
new CharacterReference ("laquo", '\u00ab'), // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
new CharacterReference ("not", '\u00ac'), // not sign, U+00AC ISOnum
new CharacterReference ("shy", '\u00ad'), // soft hyphen = discretionary hyphen, U+00AD ISOnum
new CharacterReference ("reg", '\u00ae'), // registered sign = registered trade mark sign, U+00AE ISOnum
new CharacterReference ("macr", '\u00af'), // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
new CharacterReference ("deg", '\u00b0'), // degree sign, U+00B0 ISOnum
new CharacterReference ("plusmn", '\u00b1'), // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
new CharacterReference ("sup2", '\u00b2'), // superscript two = superscript digit two = squared, U+00B2 ISOnum
new CharacterReference ("sup3", '\u00b3'), // superscript three = superscript digit three = cubed, U+00B3 ISOnum
new CharacterReference ("acute", '\u00b4'), // acute accent = spacing acute, U+00B4 ISOdia
new CharacterReference ("micro", '\u00b5'), // micro sign, U+00B5 ISOnum
new CharacterReference ("para", '\u00b6'), // pilcrow sign = paragraph sign, U+00B6 ISOnum
new CharacterReference ("middot", '\u00b7'), // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
new CharacterReference ("cedil", '\u00b8'), // cedilla = spacing cedilla, U+00B8 ISOdia
new CharacterReference ("sup1", '\u00b9'), // superscript one = superscript digit one, U+00B9 ISOnum
new CharacterReference ("ordm", '\u00ba'), // masculine ordinal indicator, U+00BA ISOnum
new CharacterReference ("raquo", '\u00bb'), // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
new CharacterReference ("frac14", '\u00bc'), // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
new CharacterReference ("frac12", '\u00bd'), // vulgar fraction one half = fraction one half, U+00BD ISOnum
new CharacterReference ("frac34", '\u00be'), // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
new CharacterReference ("iquest", '\u00bf'), // inverted question mark = turned question mark, U+00BF ISOnum
new CharacterReference ("Agrave", '\u00c0'), // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
new CharacterReference ("Aacute", '\u00c1'), // latin capital letter A with acute, U+00C1 ISOlat1
new CharacterReference ("Acirc", '\u00c2'), // latin capital letter A with circumflex, U+00C2 ISOlat1
new CharacterReference ("Atilde", '\u00c3'), // latin capital letter A with tilde, U+00C3 ISOlat1
new CharacterReference ("Auml", '\u00c4'), // latin capital letter A with diaeresis, U+00C4 ISOlat1
new CharacterReference ("Aring", '\u00c5'), // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
new CharacterReference ("AElig", '\u00c6'), // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
new CharacterReference ("Ccedil", '\u00c7'), // latin capital letter C with cedilla, U+00C7 ISOlat1
new CharacterReference ("Egrave", '\u00c8'), // latin capital letter E with grave, U+00C8 ISOlat1
new CharacterReference ("Eacute", '\u00c9'), // latin capital letter E with acute, U+00C9 ISOlat1
new CharacterReference ("Ecirc", '\u00ca'), // latin capital letter E with circumflex, U+00CA ISOlat1
new CharacterReference ("Euml", '\u00cb'), // latin capital letter E with diaeresis, U+00CB ISOlat1
new CharacterReference ("Igrave", '\u00cc'), // latin capital letter I with grave, U+00CC ISOlat1
new CharacterReference ("Iacute", '\u00cd'), // latin capital letter I with acute, U+00CD ISOlat1
new CharacterReference ("Icirc", '\u00ce'), // latin capital letter I with circumflex, U+00CE ISOlat1
new CharacterReference ("Iuml", '\u00cf'), // latin capital letter I with diaeresis, U+00CF ISOlat1
new CharacterReference ("ETH", '\u00d0'), // latin capital letter ETH, U+00D0 ISOlat1
new CharacterReference ("Ntilde", '\u00d1'), // latin capital letter N with tilde, U+00D1 ISOlat1
new CharacterReference ("Ograve", '\u00d2'), // latin capital letter O with grave, U+00D2 ISOlat1
new CharacterReference ("Oacute", '\u00d3'), // latin capital letter O with acute, U+00D3 ISOlat1
new CharacterReference ("Ocirc", '\u00d4'), // latin capital letter O with circumflex, U+00D4 ISOlat1
new CharacterReference ("Otilde", '\u00d5'), // latin capital letter O with tilde, U+00D5 ISOlat1
new CharacterReference ("Ouml", '\u00d6'), // latin capital letter O with diaeresis, U+00D6 ISOlat1
new CharacterReference ("times", '\u00d7'), // multiplication sign, U+00D7 ISOnum
new CharacterReference ("Oslash", '\u00d8'), // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
new CharacterReference ("Ugrave", '\u00d9'), // latin capital letter U with grave, U+00D9 ISOlat1
new CharacterReference ("Uacute", '\u00da'), // latin capital letter U with acute, U+00DA ISOlat1
new CharacterReference ("Ucirc", '\u00db'), // latin capital letter U with circumflex, U+00DB ISOlat1
new CharacterReference ("Uuml", '\u00dc'), // latin capital letter U with diaeresis, U+00DC ISOlat1
new CharacterReference ("Yacute", '\u00dd'), // latin capital letter Y with acute, U+00DD ISOlat1
new CharacterReference ("THORN", '\u00de'), // latin capital letter THORN, U+00DE ISOlat1
new CharacterReference ("szlig", '\u00df'), // latin small letter sharp s = ess-zed, U+00DF ISOlat1
new CharacterReference ("agrave", '\u00e0'), // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
new CharacterReference ("aacute", '\u00e1'), // latin small letter a with acute, U+00E1 ISOlat1
new CharacterReference ("acirc", '\u00e2'), // latin small letter a with circumflex, U+00E2 ISOlat1
new CharacterReference ("atilde", '\u00e3'), // latin small letter a with tilde, U+00E3 ISOlat1
new CharacterReference ("auml", '\u00e4'), // latin small letter a with diaeresis, U+00E4 ISOlat1
new CharacterReference ("aring", '\u00e5'), // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
new CharacterReference ("aelig", '\u00e6'), // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
new CharacterReference ("ccedil", '\u00e7'), // latin small letter c with cedilla, U+00E7 ISOlat1
new CharacterReference ("egrave", '\u00e8'), // latin small letter e with grave, U+00E8 ISOlat1
new CharacterReference ("eacute", '\u00e9'), // latin small letter e with acute, U+00E9 ISOlat1
new CharacterReference ("ecirc", '\u00ea'), // latin small letter e with circumflex, U+00EA ISOlat1
new CharacterReference ("euml", '\u00eb'), // latin small letter e with diaeresis, U+00EB ISOlat1
new CharacterReference ("igrave", '\u00ec'), // latin small letter i with grave, U+00EC ISOlat1
new CharacterReference ("iacute", '\u00ed'), // latin small letter i with acute, U+00ED ISOlat1
new CharacterReference ("icirc", '\u00ee'), // latin small letter i with circumflex, U+00EE ISOlat1
new CharacterReference ("iuml", '\u00ef'), // latin small letter i with diaeresis, U+00EF ISOlat1
new CharacterReference ("eth", '\u00f0'), // latin small letter eth, U+00F0 ISOlat1
new CharacterReference ("ntilde", '\u00f1'), // latin small letter n with tilde, U+00F1 ISOlat1
new CharacterReference ("ograve", '\u00f2'), // latin small letter o with grave, U+00F2 ISOlat1
new CharacterReference ("oacute", '\u00f3'), // latin small letter o with acute, U+00F3 ISOlat1
new CharacterReference ("ocirc", '\u00f4'), // latin small letter o with circumflex, U+00F4 ISOlat1
new CharacterReference ("otilde", '\u00f5'), // latin small letter o with tilde, U+00F5 ISOlat1
new CharacterReference ("ouml", '\u00f6'), // latin small letter o with diaeresis, U+00F6 ISOlat1
new CharacterReference ("divide", '\u00f7'), // division sign, U+00F7 ISOnum
new CharacterReference ("oslash", '\u00f8'), // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
new CharacterReference ("ugrave", '\u00f9'), // latin small letter u with grave, U+00F9 ISOlat1
new CharacterReference ("uacute", '\u00fa'), // latin small letter u with acute, U+00FA ISOlat1
new CharacterReference ("ucirc", '\u00fb'), // latin small letter u with circumflex, U+00FB ISOlat1
new CharacterReference ("uuml", '\u00fc'), // latin small letter u with diaeresis, U+00FC ISOlat1
new CharacterReference ("yacute", '\u00fd'), // latin small letter y with acute, U+00FD ISOlat1
new CharacterReference ("thorn", '\u00fe'), // latin small letter thorn, U+00FE ISOlat1
new CharacterReference ("yuml", '\u00ff'), // latin small letter y with diaeresis, U+00FF ISOlat1
// Mathematical, Greek and Symbolic characters for HTML
// Character entity set. Typical invocation:
//
// %HTMLsymbol;
// Portions © International Organization for Standardization 1986:
// Permission to copy in any form is granted for use with
// conforming SGML systems and applications as defined in
// ISO 8879, provided this notice is included in all copies.
// Relevant ISO entity set is given unless names are newly introduced.
// New names (i.e., not in ISO 8879 list) do not clash with any
// existing ISO 8879 entity names. ISO 10646 character numbers
// are given for each character, in hex. CDATA values are decimal
// conversions of the ISO 10646 values and refer to the document
// character set. Names are ISO 10646 names.
// Latin Extended-B
new CharacterReference ("fnof", '\u0192'), // latin small f with hook = function = florin, U+0192 ISOtech
// Greek
new CharacterReference ("Alpha", '\u0391'), // greek capital letter alpha, U+0391
new CharacterReference ("Beta", '\u0392'), // greek capital letter beta, U+0392
new CharacterReference ("Gamma", '\u0393'), // greek capital letter gamma, U+0393 ISOgrk3
new CharacterReference ("Delta", '\u0394'), // greek capital letter delta, U+0394 ISOgrk3
new CharacterReference ("Epsilon", '\u0395'), // greek capital letter epsilon, U+0395
new CharacterReference ("Zeta", '\u0396'), // greek capital letter zeta, U+0396
new CharacterReference ("Eta", '\u0397'), // greek capital letter eta, U+0397
new CharacterReference ("Theta", '\u0398'), // greek capital letter theta, U+0398 ISOgrk3
new CharacterReference ("Iota", '\u0399'), // greek capital letter iota, U+0399
new CharacterReference ("Kappa", '\u039a'), // greek capital letter kappa, U+039A
new CharacterReference ("Lambda", '\u039b'), // greek capital letter lambda, U+039B ISOgrk3
new CharacterReference ("Mu", '\u039c'), // greek capital letter mu, U+039C
new CharacterReference ("Nu", '\u039d'), // greek capital letter nu, U+039D
new CharacterReference ("Xi", '\u039e'), // greek capital letter xi, U+039E ISOgrk3
new CharacterReference ("Omicron", '\u039f'), // greek capital letter omicron, U+039F
new CharacterReference ("Pi", '\u03a0'), // greek capital letter pi, U+03A0 ISOgrk3
new CharacterReference ("Rho", '\u03a1'), // greek capital letter rho, U+03A1
// there is no Sigmaf, and no U+03A2 character either
new CharacterReference ("Sigma", '\u03a3'), // greek capital letter sigma, U+03A3 ISOgrk3
new CharacterReference ("Tau", '\u03a4'), // greek capital letter tau, U+03A4
new CharacterReference ("Upsilon", '\u03a5'), // greek capital letter upsilon, U+03A5 ISOgrk3
new CharacterReference ("Phi", '\u03a6'), // greek capital letter phi, U+03A6 ISOgrk3
new CharacterReference ("Chi", '\u03a7'), // greek capital letter chi, U+03A7
new CharacterReference ("Psi", '\u03a8'), // greek capital letter psi, U+03A8 ISOgrk3
new CharacterReference ("Omega", '\u03a9'), // greek capital letter omega, U+03A9 ISOgrk3
new CharacterReference ("alpha", '\u03b1'), // greek small letter alpha, U+03B1 ISOgrk3
new CharacterReference ("beta", '\u03b2'), // greek small letter beta, U+03B2 ISOgrk3
new CharacterReference ("gamma", '\u03b3'), // greek small letter gamma, U+03B3 ISOgrk3
new CharacterReference ("delta", '\u03b4'), // greek small letter delta, U+03B4 ISOgrk3
new CharacterReference ("epsilon", '\u03b5'), // greek small letter epsilon, U+03B5 ISOgrk3
new CharacterReference ("zeta", '\u03b6'), // greek small letter zeta, U+03B6 ISOgrk3
new CharacterReference ("eta", '\u03b7'), // greek small letter eta, U+03B7 ISOgrk3
new CharacterReference ("theta", '\u03b8'), // greek small letter theta, U+03B8 ISOgrk3
new CharacterReference ("iota", '\u03b9'), // greek small letter iota, U+03B9 ISOgrk3
new CharacterReference ("kappa", '\u03ba'), // greek small letter kappa, U+03BA ISOgrk3
new CharacterReference ("lambda", '\u03bb'), // greek small letter lambda, U+03BB ISOgrk3
new CharacterReference ("mu", '\u03bc'), // greek small letter mu, U+03BC ISOgrk3
new CharacterReference ("nu", '\u03bd'), // greek small letter nu, U+03BD ISOgrk3
new CharacterReference ("xi", '\u03be'), // greek small letter xi, U+03BE ISOgrk3
new CharacterReference ("omicron", '\u03bf'), // greek small letter omicron, U+03BF NEW
new CharacterReference ("pi", '\u03c0'), // greek small letter pi, U+03C0 ISOgrk3
new CharacterReference ("rho", '\u03c1'), // greek small letter rho, U+03C1 ISOgrk3
new CharacterReference ("sigmaf", '\u03c2'), // greek small letter final sigma, U+03C2 ISOgrk3
new CharacterReference ("sigma", '\u03c3'), // greek small letter sigma, U+03C3 ISOgrk3
new CharacterReference ("tau", '\u03c4'), // greek small letter tau, U+03C4 ISOgrk3
new CharacterReference ("upsilon", '\u03c5'), // greek small letter upsilon, U+03C5 ISOgrk3
new CharacterReference ("phi", '\u03c6'), // greek small letter phi, U+03C6 ISOgrk3
new CharacterReference ("chi", '\u03c7'), // greek small letter chi, U+03C7 ISOgrk3
new CharacterReference ("psi", '\u03c8'), // greek small letter psi, U+03C8 ISOgrk3
new CharacterReference ("omega", '\u03c9'), // greek small letter omega, U+03C9 ISOgrk3
new CharacterReference ("thetasym", '\u03d1'), // greek small letter theta symbol, U+03D1 NEW
new CharacterReference ("upsih", '\u03d2'), // greek upsilon with hook symbol, U+03D2 NEW
new CharacterReference ("piv", '\u03d6'), // greek pi symbol, U+03D6 ISOgrk3
// General Punctuation
new CharacterReference ("bull", '\u2022'), // bullet = black small circle, U+2022 ISOpub
// bullet is NOT the same as bullet operator, U+2219
new CharacterReference ("hellip", '\u2026'), // horizontal ellipsis = three dot leader, U+2026 ISOpub
new CharacterReference ("prime", '\u2032'), // prime = minutes = feet, U+2032 ISOtech
new CharacterReference ("Prime", '\u2033'), // double prime = seconds = inches, U+2033 ISOtech
new CharacterReference ("oline", '\u203e'), // overline = spacing overscore, U+203E NEW
new CharacterReference ("frasl", '\u2044'), // fraction slash, U+2044 NEW
// Letterlike Symbols
new CharacterReference ("weierp", '\u2118'), // script capital P = power set = Weierstrass p, U+2118 ISOamso
new CharacterReference ("image", '\u2111'), // blackletter capital I = imaginary part, U+2111 ISOamso
new CharacterReference ("real", '\u211c'), // blackletter capital R = real part symbol, U+211C ISOamso
new CharacterReference ("trade", '\u2122'), // trade mark sign, U+2122 ISOnum
new CharacterReference ("alefsym", '\u2135'), // alef symbol = first transfinite cardinal, U+2135 NEW
// alef symbol is NOT the same as hebrew letter alef,
// U+05D0 although the same glyph could be used to depict both characters
// Arrows
new CharacterReference ("larr", '\u2190'), // leftwards arrow, U+2190 ISOnum
new CharacterReference ("uarr", '\u2191'), // upwards arrow, U+2191 ISOnum
new CharacterReference ("rarr", '\u2192'), // rightwards arrow, U+2192 ISOnum
new CharacterReference ("darr", '\u2193'), // downwards arrow, U+2193 ISOnum
new CharacterReference ("harr", '\u2194'), // left right arrow, U+2194 ISOamsa
new CharacterReference ("crarr", '\u21b5'), // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
new CharacterReference ("lArr", '\u21d0'), // leftwards double arrow, U+21D0 ISOtech
// ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
// but also does not have any other character for that function. So ? lArr can
// be used for 'is implied by' as ISOtech suggests
new CharacterReference ("uArr", '\u21d1'), // upwards double arrow, U+21D1 ISOamsa
new CharacterReference ("rArr", '\u21d2'), // rightwards double arrow, U+21D2 ISOtech
// ISO 10646 does not say this is the 'implies' character but does not have
// another character with this function so ?
// rArr can be used for 'implies' as ISOtech suggests
new CharacterReference ("dArr", '\u21d3'), // downwards double arrow, U+21D3 ISOamsa
new CharacterReference ("hArr", '\u21d4'), // left right double arrow, U+21D4 ISOamsa
// Mathematical Operators
new CharacterReference ("forall", '\u2200'), // for all, U+2200 ISOtech
new CharacterReference ("part", '\u2202'), // partial differential, U+2202 ISOtech
new CharacterReference ("exist", '\u2203'), // there exists, U+2203 ISOtech
new CharacterReference ("empty", '\u2205'), // empty set = null set = diameter, U+2205 ISOamso
new CharacterReference ("nabla", '\u2207'), // nabla = backward difference, U+2207 ISOtech
new CharacterReference ("isin", '\u2208'), // element of, U+2208 ISOtech
new CharacterReference ("notin", '\u2209'), // not an element of, U+2209 ISOtech
new CharacterReference ("ni", '\u220b'), // contains as member, U+220B ISOtech
// should there be a more memorable name than 'ni'?
new CharacterReference ("prod", '\u220f'), // n-ary product = product sign, U+220F ISOamsb
// prod is NOT the same character as U+03A0 'greek capital letter pi' though
// the same glyph might be used for both
new CharacterReference ("sum", '\u2211'), // n-ary summation, U+2211 ISOamsb
// sum is NOT the same character as U+03A3 'greek capital letter sigma'
// though the same glyph might be used for both
new CharacterReference ("minus", '\u2212'), // minus sign, U+2212 ISOtech
new CharacterReference ("lowast", '\u2217'), // asterisk operator, U+2217 ISOtech
new CharacterReference ("radic", '\u221a'), // square root = radical sign, U+221A ISOtech
new CharacterReference ("prop", '\u221d'), // proportional to, U+221D ISOtech
new CharacterReference ("infin", '\u221e'), // infinity, U+221E ISOtech
new CharacterReference ("ang", '\u2220'), // angle, U+2220 ISOamso
new CharacterReference ("and", '\u2227'), // logical and = wedge, U+2227 ISOtech
new CharacterReference ("or", '\u2228'), // logical or = vee, U+2228 ISOtech
new CharacterReference ("cap", '\u2229'), // intersection = cap, U+2229 ISOtech
new CharacterReference ("cup", '\u222a'), // union = cup, U+222A ISOtech
new CharacterReference ("int", '\u222b'), // integral, U+222B ISOtech
new CharacterReference ("there4", '\u2234'), // therefore, U+2234 ISOtech
new CharacterReference ("sim", '\u223c'), // tilde operator = varies with = similar to, U+223C ISOtech
// tilde operator is NOT the same character as the tilde, U+007E,
// although the same glyph might be used to represent both
new CharacterReference ("cong", '\u2245'), // approximately equal to, U+2245 ISOtech
new CharacterReference ("asymp", '\u2248'), // almost equal to = asymptotic to, U+2248 ISOamsr
new CharacterReference ("ne", '\u2260'), // not equal to, U+2260 ISOtech
new CharacterReference ("equiv", '\u2261'), // identical to, U+2261 ISOtech
new CharacterReference ("le", '\u2264'), // less-than or equal to, U+2264 ISOtech
new CharacterReference ("ge", '\u2265'), // greater-than or equal to, U+2265 ISOtech
new CharacterReference ("sub", '\u2282'), // subset of, U+2282 ISOtech
new CharacterReference ("sup", '\u2283'), // superset of, U+2283 ISOtech
// note that nsup, 'not a superset of, U+2285' is not covered by the Symbol
// font encoding and is not included. Should it be, for symmetry?
// It is in ISOamsn
new CharacterReference ("nsub", '\u2284'), // not a subset of, U+2284 ISOamsn
new CharacterReference ("sube", '\u2286'), // subset of or equal to, U+2286 ISOtech
new CharacterReference ("supe", '\u2287'), // superset of or equal to, U+2287 ISOtech
new CharacterReference ("oplus", '\u2295'), // circled plus = direct sum, U+2295 ISOamsb
new CharacterReference ("otimes", '\u2297'), // circled times = vector product, U+2297 ISOamsb
new CharacterReference ("perp", '\u22a5'), // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
new CharacterReference ("sdot", '\u22c5'), // dot operator, U+22C5 ISOamsb
// dot operator is NOT the same character as U+00B7 middle dot
// Miscellaneous Technical
new CharacterReference ("lceil", '\u2308'), // left ceiling = apl upstile, U+2308 ISOamsc
new CharacterReference ("rceil", '\u2309'), // right ceiling, U+2309 ISOamsc
new CharacterReference ("lfloor", '\u230a'), // left floor = apl downstile, U+230A ISOamsc
new CharacterReference ("rfloor", '\u230b'), // right floor, U+230B ISOamsc
new CharacterReference ("lang", '\u2329'), // left-pointing angle bracket = bra, U+2329 ISOtech
// lang is NOT the same character as U+003C 'less than'
// or U+2039 'single left-pointing angle quotation mark'
new CharacterReference ("rang", '\u232a'), // right-pointing angle bracket = ket, U+232A ISOtech
// rang is NOT the same character as U+003E 'greater than'
// or U+203A 'single right-pointing angle quotation mark'
// Geometric Shapes
new CharacterReference ("loz", '\u25ca'), // lozenge, U+25CA ISOpub
// Miscellaneous Symbols
new CharacterReference ("spades", '\u2660'), // black spade suit, U+2660 ISOpub
// black here seems to mean filled as opposed to hollow
new CharacterReference ("clubs", '\u2663'), // black club suit = shamrock, U+2663 ISOpub
new CharacterReference ("hearts", '\u2665'), // black heart suit = valentine, U+2665 ISOpub
new CharacterReference ("diams", '\u2666'), // black diamond suit, U+2666 ISOpub
// Special characters for HTML
// Character entity set. Typical invocation:
//
// %HTMLspecial;
// Portions © International Organization for Standardization 1986:
// Permission to copy in any form is granted for use with
// conforming SGML systems and applications as defined in
// ISO 8879, provided this notice is included in all copies.
// Relevant ISO entity set is given unless names are newly introduced.
// New names (i.e., not in ISO 8879 list) do not clash with any
// existing ISO 8879 entity names. ISO 10646 character numbers
// are given for each character, in hex. CDATA values are decimal
// conversions of the ISO 10646 values and refer to the document
// character set. Names are ISO 10646 names.
// C0 Controls and Basic Latin
new CharacterReference ("quot", '\u0022'), // quotation mark = APL quote, U+0022 ISOnum
new CharacterReference ("amp", '\u0026'), // ampersand, U+0026 ISOnum
new CharacterReference ("lt", '\u003c'), // less-than sign, U+003C ISOnum
new CharacterReference ("gt", '\u003e'), // greater-than sign, U+003E ISOnum
// Latin Extended-A
new CharacterReference ("OElig", '\u0152'), // latin capital ligature OE, U+0152 ISOlat2
new CharacterReference ("oelig", '\u0153'), // latin small ligature oe, U+0153 ISOlat2
// ligature is a misnomer, this is a separate character in some languages
new CharacterReference ("Scaron", '\u0160'), // latin capital letter S with caron, U+0160 ISOlat2
new CharacterReference ("scaron", '\u0161'), // latin small letter s with caron, U+0161 ISOlat2
new CharacterReference ("Yuml", '\u0178'), // latin capital letter Y with diaeresis, U+0178 ISOlat2
// Spacing Modifier Letters
new CharacterReference ("circ", '\u02c6'), // modifier letter circumflex accent, U+02C6 ISOpub
new CharacterReference ("tilde", '\u02dc'), // small tilde, U+02DC ISOdia
// General Punctuation
new CharacterReference ("ensp", '\u2002'), // en space, U+2002 ISOpub
new CharacterReference ("emsp", '\u2003'), // em space, U+2003 ISOpub
new CharacterReference ("thinsp", '\u2009'), // thin space, U+2009 ISOpub
new CharacterReference ("zwnj", '\u200c'), // zero width non-joiner, U+200C NEW RFC 2070
new CharacterReference ("zwj", '\u200d'), // zero width joiner, U+200D NEW RFC 2070
new CharacterReference ("lrm", '\u200e'), // left-to-right mark, U+200E NEW RFC 2070
new CharacterReference ("rlm", '\u200f'), // right-to-left mark, U+200F NEW RFC 2070
new CharacterReference ("ndash", '\u2013'), // en dash, U+2013 ISOpub
new CharacterReference ("mdash", '\u2014'), // em dash, U+2014 ISOpub
new CharacterReference ("lsquo", '\u2018'), // left single quotation mark, U+2018 ISOnum
new CharacterReference ("rsquo", '\u2019'), // right single quotation mark, U+2019 ISOnum
new CharacterReference ("sbquo", '\u201a'), // single low-9 quotation mark, U+201A NEW
new CharacterReference ("ldquo", '\u201c'), // left double quotation mark, U+201C ISOnum
new CharacterReference ("rdquo", '\u201d'), // right double quotation mark, U+201D ISOnum
new CharacterReference ("bdquo", '\u201e'), // double low-9 quotation mark, U+201E NEW
new CharacterReference ("dagger", '\u2020'), // dagger, U+2020 ISOpub
new CharacterReference ("Dagger", '\u2021'), // double dagger, U+2021 ISOpub
new CharacterReference ("permil", '\u2030'), // per mille sign, U+2030 ISOtech
new CharacterReference ("lsaquo", '\u2039'), // single left-pointing angle quotation mark, U+2039 ISO proposed
// lsaquo is proposed but not yet ISO standardized
new CharacterReference ("rsaquo", '\u203a'), // single right-pointing angle quotation mark, U+203A ISO proposed
// rsaquo is proposed but not yet ISO standardized
new CharacterReference ("euro", '\u20ac'), // euro sign, U+20AC NEW
};
/**
 * The dividing point between a simple table lookup and a binary search.
 * Characters below the break point are stored in a sparse array allowing
 * direct index lookup. With the value 0x100 (256) that covers every
 * character from U+0000 through U+00FF.
 */
protected static final int BREAKPOINT = 0x100;
/**
 * List of references sorted by character.
 * The first part of this array, up to <code>BREAKPOINT</code>, is stored
 * in a direct translational table, indexing into the table with a character
 * yields the reference. The second part is dense and sorted by character,
 * suitable for binary lookup.
 */
protected static final CharacterReference[] mCharacterList;
// Build the by-character list from the reference table, then put the
// reference table itself into kernel order.
static
{
    final int total = mCharacterReferences.length;

    // tally the references that land in the directly indexed part
    int direct = 0;
    for (int i = 0; i < total; i++)
        if (mCharacterReferences[i].getCharacter () < BREAKPOINT)
            direct++;

    // one slot for every character below the break point,
    // plus one slot for each of the remaining references
    mCharacterList = new CharacterReference[BREAKPOINT + total - direct];

    // first unused slot of the dense, sorted upper part
    int used = BREAKPOINT;
    for (int i = 0; i < total; i++)
    {
        final CharacterReference entry = mCharacterReferences[i];
        final int code = entry.getCharacter ();
        if (code < BREAKPOINT)
        {
            mCharacterList[code] = entry;
            continue;
        }
        // insertion sort with a linear scan, done only once:
        // skip everything that does not sort after the new entry...
        int slot = BREAKPOINT;
        while ((slot < used) && (mCharacterList[slot].getCharacter () <= code))
            slot++;
        // ...open a gap by moving the tail up one place...
        System.arraycopy (mCharacterList, slot, mCharacterList, slot + 1, used - slot);
        // ...and drop the entry into it
        mCharacterList[slot] = entry;
        used++;
    }

    // reorder the original array into kernel order
    Sort.QuickSort (mCharacterReferences);
}
/**
 * Private constructor.
 * Prevents instantiation from outside the class; every member visible in
 * this part of the file is static.
 * The original author documents the class as fully static and thread safe.
 */
private Translate ()
{
}
/**
 * Binary search for a reference.
 * The portion of the array between <code>lo</code> and <code>hi</code>
 * (both inclusive) is expected to be ordered by ascending character.
 * @param array The array of <code>CharacterReference</code> objects.
 * @param ref The character to search for.
 * @param lo The lower index within which to look.
 * @param hi The upper index within which to look.
 * @return The index at which reference was found or is to be inserted.
 */
protected static int lookup (CharacterReference[] array, char ref, int lo, int hi)
{
    while (lo <= hi)
    {
        // probe the middle element, the lower of the two for an even span
        final int mid = lo + ((hi - lo) / 2);
        final int delta = ref - array[mid].getCharacter ();
        if (0 == delta)
            return (mid);
        if (0 > delta)
            hi = mid - 1; // target sorts before the probe
        else
            lo = mid + 1; // target sorts after the probe
    }

    // not present: lo is now the insertion point
    return (lo);
}
/**
 * Look up a reference by character.
 * Characters below <code>BREAKPOINT</code> are resolved by indexing straight
 * into the character list; all others are located with a binary search of
 * the sorted remainder of that list.
 * @param character The character to be looked up.
 * @return The entity reference for that character or <code>null</code>.
 */
public static CharacterReference lookup (char character)
{
    // direct table: the slot holds the reference or null
    if (character < BREAKPOINT)
        return (mCharacterList[character]);

    final int position = lookup (mCharacterList, character, BREAKPOINT, mCharacterList.length - 1);
    // an insertion point past the end means the character is not present
    if (position >= mCharacterList.length)
        return (null);

    // the search also yields an insertion point when nothing matched,
    // so confirm that the entry really is for this character
    final CharacterReference candidate = mCharacterList[position];
    return ((character == candidate.getCharacter ()) ? candidate : null);
}
/**
 * Look up a reference by kernel.
 * Use a binary search on the ordered list of known references.
 * Since the binary search returns the position at which a new item should
 * be inserted, we check the references earlier in the list if there is
 * a failure.
 * @param key A character reference with the kernel set to the string
 * to be found. It need not be truncated at the exact end of the reference.
 * @return The known reference whose kernel is a prefix of the key's kernel,
 * or <code>null</code> if there is none or the key's kernel is empty.
 */
protected static CharacterReference lookup (CharacterReference key)
{
    String string;
    int index;
    String kernel;
    char character;
    CharacterReference test;
    CharacterReference ret;

    string = key.getKernel ();
    // An empty kernel cannot name any reference; without this guard the
    // charAt (0) in the fallback scan below would throw.
    if (0 == string.length ())
        return (null);

    // Care should be taken here because some entity references are
    // prefixes of others, i.e.:
    // notin / not
    // ordm  / or
    // piv   / pi
    // sup3  / sup
    ret = null;
    index = Sort.bsearch (mCharacterReferences, key);
    if (index < mCharacterReferences.length)
    {
        ret = mCharacterReferences[index];
        kernel = ret.getKernel ();
        // accept the entry only if its whole kernel starts the key text
        if (!string.regionMatches (
            0,
            kernel,
            0,
            kernel.length ()))
        {   // not exact, check references starting with same character
            // to see if a subset matches
            ret = null;
        }
    }
    if (null == ret)
    {
        character = string.charAt (0);
        // walk backwards through the entries that share the first letter
        while (--index >= 0)
        {
            test = mCharacterReferences[index];
            kernel = test.getKernel ();
            if (character == kernel.charAt (0))
            {
                if (string.regionMatches (
                    0,
                    kernel,
                    0,
                    kernel.length ()))
                {
                    ret = test;
                    break;
                }
            }
            else
                break; // left the run of kernels with this first letter
        }
    }

    return (ret);
}
/**
 * Look up a reference by kernel.
 * Wraps the given text range in a temporary key object and delegates to
 * the key based lookup. Because a new key is allocated on every call this
 * is not very efficient, use
 * {@link org.htmlparser.util.Translate#lookup(org.htmlparser.util.CharacterReference) lookup(CharacterReference)}
 * instead.
 * @param kernel The string to lookup, i.e. "amp".
 * @param start The starting point in the string of the kernel.
 * @param end The ending point in the string of the kernel.
 * This should be the index of the semicolon if it exists, or failing that,
 * at least an index past the last character of the kernel.
 * @return The reference that matches the given string, or <code>null</code>
 * if it wasn't found.
 */
public static CharacterReference lookup (String kernel, int start, int end)
{
    final CharacterReferenceEx key = new CharacterReferenceEx ();

    key.setKernel (kernel);
    key.setStart (start);
    key.setEnd (end);
    final CharacterReference found = lookup (key);

    return (found);
}
/**
 * Convert a reference to a unicode character.
 * Decodes the text between <code>start</code> and <code>end</code> and
 * returns the first character of the result.
 * @param string The string to convert. Of the form &amp;xxxx; or &amp;#xxxx; with
 * or without the trailing semi-colon.
 * @param start The starting point in the string to look for a character reference.
 * @param end The ending point in the string to stop looking for a character reference.
 * @return The first character of the decoded text. When the text is not a
 * reference that decoding recognises this is simply its first character.
 * An empty range results in a <code>StringIndexOutOfBoundsException</code>.
 * @deprecated Use {@link #decode(String) decode}.
 */
public static char convertToChar (String string, int start, int end)
{
    final String fragment = string.substring (start, end);
    final String decoded = decode (fragment);

    return (decoded.charAt (0));
}
/**
 * Convert a reference to a unicode character.
 * Decodes the whole string and returns the first character of the result.
 * @param string The string to convert. Of the form &amp;xxxx; or &amp;#xxxx; with
 * or without the trailing semi-colon.
 * @return The first character of the decoded text. When the text is not a
 * reference that decoding recognises this is simply its first character.
 * An empty string results in a <code>StringIndexOutOfBoundsException</code>.
 * @deprecated Use {@link #decode(String) decode}.
 */
public static char convertToChar (String string)
{
    final String decoded = decode (string);

    return (decoded.charAt (0));
}
/**
 * Decode a string containing references.
 * Change all numeric character references and character entity references
 * to unicode characters. Text that does not form a recognised reference,
 * including a lone ampersand, is copied to the result unchanged.
 * <p>Numeric references are limited to the Unicode range: a value above
 * U+FFFF is emitted as a UTF-16 surrogate pair, while a value of zero or
 * one above U+10FFFF is treated as invalid and left undecoded.
 * <p>For entity references the trailing semicolon is optional, and a known
 * entity name that is only a prefix of the letters following the ampersand
 * is still decoded.
 * @param string The string to translate.
 * @return The decoded string. The argument itself is returned when it
 * contains no ampersand.
 */
public static String decode (String string)
{
    // highest code point defined by Unicode
    final int limit = 0x10FFFF;
    CharacterReferenceEx key;
    int amp;
    int index;
    int length;
    StringBuffer buffer;
    char character;
    int number;
    int radix;
    int i;
    int semi;
    boolean done;
    CharacterReference item;
    String ret;

    if (-1 == (amp = string.indexOf ('&')))
        ret = string;
    else
    {
        key = null;
        index = 0;
        length = string.length ();
        buffer = new StringBuffer (length);
        do
        {
            // equivalent to buffer.append (string.substring (index, amp));
            // but without the allocation of a new String
            while (index < amp)
                buffer.append (string.charAt (index++));

            // step over the ampersand; amp still marks it so that it can
            // be copied verbatim if no reference is recognised
            index++;
            if (index < length)
            {
                character = string.charAt (index);
                if ('#' == character)
                {
                    // numeric character reference
                    index++;
                    number = 0;
                    radix = 0; // unknown until the first digit or 'x' is seen
                    i = index;
                    done = false;
                    while ((i < length) && !done)
                    {
                        character = string.charAt (i);
                        switch (character)
                        {
                            case '0':
                            case '1':
                            case '2':
                            case '3':
                            case '4':
                            case '5':
                            case '6':
                            case '7':
                            case '8':
                            case '9':
                                if (0 == radix)
                                    radix = 10;
                                // stop accumulating once out of range so
                                // that long digit runs cannot overflow
                                if (number <= limit)
                                    number = number * radix + (character - '0');
                                break;
                            case 'A':
                            case 'B':
                            case 'C':
                            case 'D':
                            case 'E':
                            case 'F':
                                if (16 == radix)
                                {
                                    if (number <= limit)
                                        number = number * radix + (character - 'A' + 10);
                                }
                                else
                                    done = true;
                                break;
                            case 'a':
                            case 'b':
                            case 'c':
                            case 'd':
                            case 'e':
                            case 'f':
                                if (16 == radix)
                                {
                                    if (number <= limit)
                                        number = number * radix + (character - 'a' + 10);
                                }
                                else
                                    done = true;
                                break;
                            case 'x':
                            case 'X':
                                // only valid as the very first character
                                if (0 == radix)
                                    radix = 16;
                                else
                                    done = true;
                                break;
                            case ';':
                                // the terminator is part of the reference
                                done = true;
                                i++;
                                break;
                            default:
                                done = true;
                                break;
                        }
                        if (!done)
                            i++;
                    }
                    if ((0 != number) && (number <= limit))
                    {
                        if (number < 0x10000)
                            buffer.append ((char)number);
                        else
                        {
                            // outside the basic plane: UTF-16 surrogate pair
                            number -= 0x10000;
                            buffer.append ((char)(0xD800 + (number >> 10)));
                            buffer.append ((char)(0xDC00 + (number & 0x3FF)));
                        }
                        index = i;
                        amp = index;
                    }
                }
                else if (Character.isLetter (character)) // really can't start with a digit eh...
                {
                    // character entity reference
                    i = index + 1;
                    done = false;
                    semi = length;
                    while ((i < length) && !done)
                    {
                        character = string.charAt (i);
                        if (';' == character)
                        {
                            done = true;
                            semi = i;
                            i++;
                        }
                        else if (Character.isLetterOrDigit (character))
                            i++;
                        else
                        {
                            done = true;
                            semi = i;
                        }
                    }
                    // new CharacterReference (string.substring (index, semi), 0);
                    // one key object is reused for every lookup in this call
                    if (null == key)
                        key = new CharacterReferenceEx ();
                    key.setKernel (string);
                    key.setStart (index);
                    key.setEnd (semi);
                    item = lookup (key);
                    if (null != item)
                    {
                        buffer.append ((char)item.getCharacter ());
                        // consume only the matched name, which may be
                        // shorter than the run of letters scanned above
                        index += item.getKernel ().length ();
                        if ((index < length) && (';' == string.charAt (index)))
                            index++;
                        amp = index;
                    }
                }
                else
                {
                    // need do nothing here, the ampersand will be consumed below
                }
            }
            // gather up unconsumed characters
            while (amp < index)
                buffer.append (string.charAt (amp++));
        }
        while ((index < length) && (-1 != (amp = string.indexOf ('&', index))));
        // equivalent to buffer.append (string.substring (index));
        // but without the allocation of a new String
        while (index < length)
            buffer.append (string.charAt (index++));
        ret = buffer.toString ();
    }

    return (ret);
}
/**
 * Decode the characters in a string buffer containing references.
 * The buffer's current contents are converted to a string and decoded;
 * the buffer itself is not modified.
 * @param buffer The <code>StringBuffer</code> containing references.
 * @return The decoded string.
 */
public static String decode (StringBuffer buffer)
{
    final String text = buffer.toString ();

    return (decode (text));
}
/**
 * Decode a stream containing references.
 * Change all numeric character reference and character entity references
 * to unicode characters. If <code>DECODE_LINE_BY_LINE</code> is true,
 * the input stream is broken up into lines, terminated by either
 * carriage return or newline, in order to reduce the latency and maximum
 * buffering memory size required. Runs of line terminators are passed
 * through without decoding.
 * An <code>IOException</code> while reading is not propagated; its message
 * is printed to the output stream instead. The output stream is always
 * flushed before returning.
 * @param in The stream to translate. It is assumed that the input
 * stream is encoded with ISO-8859-1 since the table of character
 * entity references in this class applies only to ISO-8859-1.
 * @param out The stream to write the decoded stream to.
 */
public static void decode (InputStream in, PrintStream out)
{
    Reader source;

    try
    {
        try
        {
            source = new BufferedReader (new InputStreamReader (in, "ISO-8859-1"));
        }
        catch (UnsupportedEncodingException use)
        {
            // yeah, like this will happen; OK, assume the default is ISO-8859-1
            source = new BufferedReader (new InputStreamReader (in));
        }

        final StringBuffer pending = new StringBuffer (1024);
        // true while 'pending' holds a run of line terminators
        boolean breaking = false;
        if (DECODE_LINE_BY_LINE)
        {
            for (int ch = source.read (); -1 != ch; ch = source.read ())
            {
                final boolean terminator = ('\r' == ch) || ('\n' == ch);
                if (terminator != breaking)
                {
                    // switching between text and terminators: emit what was
                    // collected, decoding only the text
                    out.print (breaking ? pending.toString () : decode (pending.toString ()));
                    pending.setLength (0);
                    breaking = terminator;
                }
                pending.append ((char)ch);
            }
        }
        else
        {
            // collect the whole stream and decode it in one go below
            for (int ch = source.read (); -1 != ch; ch = source.read ())
                pending.append ((char)ch);
        }

        // emit whatever is left over
        if (0 != pending.length ())
            out.print (breaking ? pending.toString () : decode (pending.toString ()));
    }
    catch (IOException ioe)
    {
        out.println ();
        out.println (ioe.getMessage ());
    }
    finally
    {
        out.flush ();
    }
}
/**
 * Convert a character to a numeric character reference.
 * Simply forwards to {@link #encode(int) encode}.
 * @param character The character to convert.
 * @return The converted character.
 * @deprecated Use {@link #encode(int) encode}.
 */
public static String convertToString (int character)
{
    final String reference = encode (character);

    return (reference);
}
/**
 * Convert a character to a numeric character reference.
 * Convert a unicode character to a numeric character reference of
 * the form &amp;#xxxx;. The number is written in hexadecimal, preceded by
 * an 'x', when <code>ENCODE_HEXADECIMAL</code> is true and in decimal
 * otherwise.
 * @param character The character to convert.
 * @return The converted character.
 */
public static String encode (int character)
{
    StringBuffer ret;

    ret = new StringBuffer (13); /* &#2147483647; */
    // every numeric character reference starts with ampersand and hash
    ret.append ("&#");
    if (ENCODE_HEXADECIMAL)
    {
        ret.append ("x");
        ret.append (Integer.toHexString (character));
    }
    else
        ret.append (character);
    ret.append (';');

    return (ret.toString ());
}
/**
 * Encode a string to use references.
 * A character with a known entity is written as its character entity
 * reference (&amp;name;). Any other character at or above 0x7F is written
 * as a numeric character reference (&amp;#xxxx;), in hexadecimal when
 * <code>ENCODE_HEXADECIMAL</code> is true and in decimal otherwise.
 * All remaining characters are copied unchanged.
 * @param string The string to translate.
 * @return The encoded string.
 */
public static String encode (String string)
{
    int length;
    char c;
    CharacterReference candidate;
    StringBuffer ret;

    // sized generously so that typical references fit without growing
    ret = new StringBuffer (string.length () * 6);
    length = string.length ();
    for (int i = 0; i < length; i++)
    {
        c = string.charAt (i);
        candidate = lookup (c);
        if (null != candidate)
        {
            ret.append ('&');
            ret.append (candidate.getKernel ());
            ret.append (';');
        }
        else if (!(c < 0x007F))
        {
            // every numeric character reference starts with ampersand and hash
            ret.append ("&#");
            if (ENCODE_HEXADECIMAL)
            {
                ret.append ("x");
                ret.append (Integer.toHexString (c));
            }
            else
                ret.append ((int)c);
            ret.append (';');
        }
        else
            ret.append (c);
    }

    return (ret.toString ());
}
/**
 * Encode a stream to use references.
 * A character with a known entity is written as its character entity
 * reference (&amp;name;). Any other character at or above 0x7F is written
 * as a numeric character reference (&amp;#xxxx;), in hexadecimal when
 * <code>ENCODE_HEXADECIMAL</code> is true and in decimal otherwise.
 * All remaining characters are copied unchanged.
 * An <code>IOException</code> while reading is not propagated; its message
 * is printed to the output instead. The output is always flushed before
 * returning.
 * @param in The stream to translate. It is assumed that the input
 * stream is encoded with ISO-8859-1 since the table of character
 * entity references in this class applies only to ISO-8859-1.
 * @param out The stream to write the encoded stream to.
 */
public static void encode (InputStream in, PrintStream out)
{
    Reader reader;
    char c;
    int index;
    CharacterReference candidate;
    PrintWriter output;

    try
    {
        reader = new BufferedReader (new InputStreamReader (in, "ISO-8859-1"));
        output = new PrintWriter (new BufferedWriter (new OutputStreamWriter (out, "ISO-8859-1")));
    }
    catch (UnsupportedEncodingException use)
    {
        // yeah, like this will happen; OK, assume default is ISO-8859-1
        reader = new BufferedReader (new InputStreamReader (in));
        output = new PrintWriter (new BufferedWriter (new OutputStreamWriter (out)));
    }

    try
    {
        while (-1 != (index = reader.read ()))
        {
            c = (char)index;
            candidate = lookup (c);
            if (null != candidate)
            {
                output.print ('&');
                output.print (candidate.getKernel ());
                output.print (';');
            }
            else if (!(c < 0x007F))
            {
                // every numeric character reference starts with ampersand and hash
                output.print ("&#");
                if (ENCODE_HEXADECIMAL)
                {
                    output.print ("x");
                    output.print (Integer.toHexString (c));
                }
                else
                    output.print ((int)c);
                output.print (';');
            }
            else
                output.print (c);
        }
    }
    catch (IOException ioe)
    {
        output.println ();
        output.println (ioe.getMessage ());
    }
    finally
    {
        output.flush ();
    }
}
/**
 * Numeric character reference and character entity reference to unicode codec.
 * Translate the <code>System.in</code> input into an encoded or decoded
 * stream and send the results to <code>System.out</code>.
 * @param args If arg[0] is <code>-encode</code> (compared without regard
 * to case) perform an encoding on <code>System.in</code>, otherwise
 * perform a decoding.
 */
public static void main (String[] args)
{
    final boolean encoding = (args.length > 0) && args[0].equalsIgnoreCase ("-encode");

    if (encoding)
        encode (System.in, System.out);
    else
        decode (System.in, System.out);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy