com.yahoo.text.HTML Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vespajlib Show documentation
Show all versions of vespajlib Show documentation
Library for use in Java components of Vespa. Shared code that does
not fit anywhere else.
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;
import java.util.Map;
import java.util.HashMap;
/**
 * Static helpers for escaping text so it can be embedded in HTML.
 *
 * <p>Characters listed in the entity table are written as named entities
 * (for example {@code &lt;} for the less-than sign). Any other character with
 * a code point above 128 is written as a decimal numeric character reference
 * (for example {@code &#256;}). Everything else is copied through unchanged.
 *
 * <p>The apostrophe has no entry in the table (its row is commented out), so
 * it is emitted unescaped.
 *
 * @author Bjorn Borud
 */
public class HTML {

    /** Entity table: each row is {entity name (String), code point (Integer)}. */
    static Object[][] entities = {
        // {"#39", Integer.valueOf(39)}, // ' - apostrophe
        {"quot", 34}, // " - double-quote
        {"amp", 38}, // & - ampersand
        {"lt", 60}, // < - less-than
        {"gt", 62}, // > - greater-than
        {"nbsp", 160}, // non-breaking space
        {"copy", 169}, // \u00A9 - copyright
        {"reg", 174}, // \u00AE - registered trademark
        {"Agrave", 192}, // \u00C0 - uppercase A, grave accent
        {"Aacute", 193}, // \u00C1 - uppercase A, acute accent
        {"Acirc", 194}, // \u00C2 - uppercase A, circumflex accent
        {"Atilde", 195}, // \u00C3 - uppercase A, tilde
        {"Auml", 196}, // \u00C4 - uppercase A, umlaut
        {"Aring", 197}, // \u00C5 - uppercase A, ring
        {"AElig", 198}, // \u00C6 - uppercase AE
        {"Ccedil", 199}, // \u00C7 - uppercase C, cedilla
        {"Egrave", 200}, // \u00C8 - uppercase E, grave accent
        {"Eacute", 201}, // \u00C9 - uppercase E, acute accent
        {"Ecirc", 202}, // \u00CA - uppercase E, circumflex accent
        {"Euml", 203}, // \u00CB - uppercase E, umlaut
        {"Igrave", 204}, // \u00CC - uppercase I, grave accent
        {"Iacute", 205}, // \u00CD - uppercase I, acute accent
        {"Icirc", 206}, // \u00CE - uppercase I, circumflex accent
        {"Iuml", 207}, // \u00CF - uppercase I, umlaut
        {"ETH", 208}, // \u00D0 - uppercase Eth, Icelandic
        {"Ntilde", 209}, // \u00D1 - uppercase N, tilde
        {"Ograve", 210}, // \u00D2 - uppercase O, grave accent
        {"Oacute", 211}, // \u00D3 - uppercase O, acute accent
        {"Ocirc", 212}, // \u00D4 - uppercase O, circumflex accent
        {"Otilde", 213}, // \u00D5 - uppercase O, tilde
        {"Ouml", 214}, // \u00D6 - uppercase O, umlaut
        {"Oslash", 216}, // \u00D8 - uppercase O, slash
        {"Ugrave", 217}, // \u00D9 - uppercase U, grave accent
        {"Uacute", 218}, // \u00DA - uppercase U, acute accent
        {"Ucirc", 219}, // \u00DB - uppercase U, circumflex accent
        {"Uuml", 220}, // \u00DC - uppercase U, umlaut
        {"Yacute", 221}, // \u00DD - uppercase Y, acute accent
        {"THORN", 222}, // \u00DE - uppercase THORN, Icelandic
        {"szlig", 223}, // \u00DF - lowercase sharps, German
        {"agrave", 224}, // \u00E0 - lowercase a, grave accent
        {"aacute", 225}, // \u00E1 - lowercase a, acute accent
        {"acirc", 226}, // \u00E2 - lowercase a, circumflex accent
        {"atilde", 227}, // \u00E3 - lowercase a, tilde
        {"auml", 228}, // \u00E4 - lowercase a, umlaut
        {"aring", 229}, // \u00E5 - lowercase a, ring
        {"aelig", 230}, // \u00E6 - lowercase ae
        {"ccedil", 231}, // \u00E7 - lowercase c, cedilla
        {"egrave", 232}, // \u00E8 - lowercase e, grave accent
        {"eacute", 233}, // \u00E9 - lowercase e, acute accent
        {"ecirc", 234}, // \u00EA - lowercase e, circumflex accent
        {"euml", 235}, // \u00EB - lowercase e, umlaut
        {"igrave", 236}, // \u00EC - lowercase i, grave accent
        {"iacute", 237}, // \u00ED - lowercase i, acute accent
        {"icirc", 238}, // \u00EE - lowercase i, circumflex accent
        {"iuml", 239}, // \u00EF - lowercase i, umlaut
        {"eth", 240}, // \u00F0 - lowercase eth, Icelandic
        {"ntilde", 241}, // \u00F1 - lowercase n, tilde
        {"ograve", 242}, // \u00F2 - lowercase o, grave accent
        {"oacute", 243}, // \u00F3 - lowercase o, acute accent
        {"ocirc", 244}, // \u00F4 - lowercase o, circumflex accent
        {"otilde", 245}, // \u00F5 - lowercase o, tilde
        {"ouml", 246}, // \u00F6 - lowercase o, umlaut
        {"oslash", 248}, // \u00F8 - lowercase o, slash
        {"ugrave", 249}, // \u00F9 - lowercase u, grave accent
        {"uacute", 250}, // \u00FA - lowercase u, acute accent
        {"ucirc", 251}, // \u00FB - lowercase u, circumflex accent
        {"uuml", 252}, // \u00FC - lowercase u, umlaut
        {"yacute", 253}, // \u00FD - lowercase y, acute accent
        {"thorn", 254}, // \u00FE - lowercase thorn, Icelandic
        {"yuml", 255}, // \u00FF - lowercase y, umlaut
        {"euro", 8364}, // Euro symbol
    };

    /** Entity name to code point, built from {@link #entities}. */
    static Map<String, Integer> e2i = new HashMap<>();

    /** Code point to entity name, built from {@link #entities}. */
    static Map<Integer, String> i2e = new HashMap<>();

    static {
        for (Object[] entity : entities) {
            String name = (String) entity[0];
            Integer codePoint = (Integer) entity[1];
            e2i.put(name, codePoint);
            i2e.put(codePoint, name);
        }
    }

    /**
     * Escapes a string for inclusion in HTML.
     *
     * <p>Code points present in the entity table become {@code &name;}; other
     * code points above 128 become a decimal reference {@code &#N;}; all
     * remaining characters are appended as-is.
     *
     * @param s1 the text to escape, may be {@code null}
     * @return the escaped text, or the empty string if {@code s1} is {@code null}
     */
    public static String htmlescape(String s1) {
        if (s1 == null) {
            return "";
        }
        int len = s1.length();
        // Pre-size with roughly 20% headroom for the expansion caused by escaping.
        StringBuilder buf = new StringBuilder((int) (len * 1.2));

        // Walk by code point rather than by char so that a surrogate pair is
        // emitted as one numeric reference instead of two invalid ones.
        int i = 0;
        while (i < len) {
            int codePoint = s1.codePointAt(i);
            i += Character.charCount(codePoint);

            String entity = i2e.get(codePoint);
            if (entity != null) {
                buf.append('&').append(entity).append(';');
            } else if (codePoint > 128) {
                // Threshold kept as "> 128": code point 128 itself is passed through.
                buf.append("&#").append(codePoint).append(';');
            } else {
                buf.append((char) codePoint);
            }
        }
        return buf.toString();
    }
}