All downloads are free. The search and download features use the official Maven repository.

com.yahoo.text.HTML Maven / Gradle / Ivy

Go to download

Library for use in Java components of Vespa. Shared code which does not fit anywhere else.

There is a newer version: 8.441.21
Show newest version
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;

import java.util.Map;
import java.util.HashMap;

/**
 * Static helpers for escaping text for inclusion in HTML.
 *
 * <p>The class keeps a table of named character entities and two lookup maps
 * derived from it: entity name to code point and code point to entity name.
 * The maps are filled once in the static initializer; this class only reads
 * them afterwards.</p>
 *
 * @author Bjorn Borud
 */
public class HTML {

    /**
     * Table of {@code {entityName, codePoint}} pairs. The name is a {@link String}
     * without the surrounding {@code &} and {@code ;}, the code point is an
     * {@link Integer}.
     *
     * <p>The apostrophe (') is not part of the table, so it is passed through
     * unescaped by {@link #htmlescape(String)}.</p>
     */
    static Object[][] entities = {
        {"quot", 34},    // " - double-quote
        {"amp", 38},     // & - ampersand
        {"lt", 60},      // < - less-than
        {"gt", 62},      // > - greater-than
        {"nbsp", 160},   // non-breaking space
        {"copy", 169},   // \u00A9 - copyright
        {"reg", 174},    // \u00AE - registered trademark
        {"Agrave", 192}, // \u00C0 - uppercase A, grave accent
        {"Aacute", 193}, // \u00C1 - uppercase A, acute accent
        {"Acirc", 194},  // \u00C2 - uppercase A, circumflex accent
        {"Atilde", 195}, // \u00C3 - uppercase A, tilde
        {"Auml", 196},   // \u00C4 - uppercase A, umlaut
        {"Aring", 197},  // \u00C5 - uppercase A, ring
        {"AElig", 198},  // \u00C6 - uppercase AE
        {"Ccedil", 199}, // \u00C7 - uppercase C, cedilla
        {"Egrave", 200}, // \u00C8 - uppercase E, grave accent
        {"Eacute", 201}, // \u00C9 - uppercase E, acute accent
        {"Ecirc", 202},  // \u00CA - uppercase E, circumflex accent
        {"Euml", 203},   // \u00CB - uppercase E, umlaut
        {"Igrave", 204}, // \u00CC - uppercase I, grave accent
        {"Iacute", 205}, // \u00CD - uppercase I, acute accent
        {"Icirc", 206},  // \u00CE - uppercase I, circumflex accent
        {"Iuml", 207},   // \u00CF - uppercase I, umlaut
        {"ETH", 208},    // \u00D0 - uppercase Eth, Icelandic
        {"Ntilde", 209}, // \u00D1 - uppercase N, tilde
        {"Ograve", 210}, // \u00D2 - uppercase O, grave accent
        {"Oacute", 211}, // \u00D3 - uppercase O, acute accent
        {"Ocirc", 212},  // \u00D4 - uppercase O, circumflex accent
        {"Otilde", 213}, // \u00D5 - uppercase O, tilde
        {"Ouml", 214},   // \u00D6 - uppercase O, umlaut
        {"Oslash", 216}, // \u00D8 - uppercase O, slash
        {"Ugrave", 217}, // \u00D9 - uppercase U, grave accent
        {"Uacute", 218}, // \u00DA - uppercase U, acute accent
        {"Ucirc", 219},  // \u00DB - uppercase U, circumflex accent
        {"Uuml", 220},   // \u00DC - uppercase U, umlaut
        {"Yacute", 221}, // \u00DD - uppercase Y, acute accent
        {"THORN", 222},  // \u00DE - uppercase THORN, Icelandic
        {"szlig", 223},  // \u00DF - lowercase sharp s, German
        {"agrave", 224}, // \u00E0 - lowercase a, grave accent
        {"aacute", 225}, // \u00E1 - lowercase a, acute accent
        {"acirc", 226},  // \u00E2 - lowercase a, circumflex accent
        {"atilde", 227}, // \u00E3 - lowercase a, tilde
        {"auml", 228},   // \u00E4 - lowercase a, umlaut
        {"aring", 229},  // \u00E5 - lowercase a, ring
        {"aelig", 230},  // \u00E6 - lowercase ae
        {"ccedil", 231}, // \u00E7 - lowercase c, cedilla
        {"egrave", 232}, // \u00E8 - lowercase e, grave accent
        {"eacute", 233}, // \u00E9 - lowercase e, acute accent
        {"ecirc", 234},  // \u00EA - lowercase e, circumflex accent
        {"euml", 235},   // \u00EB - lowercase e, umlaut
        {"igrave", 236}, // \u00EC - lowercase i, grave accent
        {"iacute", 237}, // \u00ED - lowercase i, acute accent
        {"icirc", 238},  // \u00EE - lowercase i, circumflex accent
        {"iuml", 239},   // \u00EF - lowercase i, umlaut
        {"eth", 240},    // \u00F0 - lowercase eth, Icelandic
        {"ntilde", 241}, // \u00F1 - lowercase n, tilde
        {"ograve", 242}, // \u00F2 - lowercase o, grave accent
        {"oacute", 243}, // \u00F3 - lowercase o, acute accent
        {"ocirc", 244},  // \u00F4 - lowercase o, circumflex accent
        {"otilde", 245}, // \u00F5 - lowercase o, tilde
        {"ouml", 246},   // \u00F6 - lowercase o, umlaut
        {"oslash", 248}, // \u00F8 - lowercase o, slash
        {"ugrave", 249}, // \u00F9 - lowercase u, grave accent
        {"uacute", 250}, // \u00FA - lowercase u, acute accent
        {"ucirc", 251},  // \u00FB - lowercase u, circumflex accent
        {"uuml", 252},   // \u00FC - lowercase u, umlaut
        {"yacute", 253}, // \u00FD - lowercase y, acute accent
        {"thorn", 254},  // \u00FE - lowercase thorn, Icelandic
        {"yuml", 255},   // \u00FF - lowercase y, umlaut
        {"euro", 8364},  // Euro symbol
    };

    /** Entity name (without {@code &} and {@code ;}) to code point. */
    static final Map<String, Integer> e2i = new HashMap<>();

    /** Code point to entity name (without {@code &} and {@code ;}). */
    static final Map<Integer, String> i2e = new HashMap<>();

    static {
        for (Object[] entity : entities) {
            String name = (String) entity[0];
            Integer codePoint = (Integer) entity[1];
            e2i.put(name, codePoint);
            i2e.put(codePoint, name);
        }
    }

    /**
     * Escapes a string for inclusion in HTML.
     *
     * <p>Each code point that has an entry in the entity table is replaced by its
     * named entity (for example {@code <} becomes {@code &lt;}). Any other code
     * point greater than 128 is replaced by a decimal numeric character reference
     * (for example {@code &#256;}). Everything else, including the apostrophe, is
     * copied unchanged.</p>
     *
     * <p>The input is processed by code point rather than by UTF-16 code unit, so a
     * character outside the Basic Multilingual Plane yields a single numeric
     * reference instead of one reference per surrogate.</p>
     *
     * @param s1 the string to escape, may be {@code null}
     * @return the escaped string, or the empty string if {@code s1} is {@code null}
     */
    public static String htmlescape(String s1) {
        if (s1 == null) return "";

        int len = s1.length();
        // about 20% guess
        StringBuilder buf = new StringBuilder((int) (len * 1.2));

        int i = 0;
        while (i < len) {
            int codePoint = s1.codePointAt(i);
            // Advance by one or two chars depending on whether this was a surrogate pair.
            i += Character.charCount(codePoint);

            String entity = i2e.get(codePoint);
            if (entity != null) {
                buf.append("&").append(entity).append(";");
            } else if (codePoint > 128) {
                buf.append("&#").append(codePoint).append(";");
            } else {
                // Only values <= 128 reach this branch, so the narrowing cast is lossless.
                buf.append((char) codePoint);
            }
        }
        return buf.toString();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy