All Downloads are FREE. Search and download functionalities are using the official Maven repository.

xmlparser.utils.Escaping Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
package xmlparser.utils;

import java.util.Map;

import static xmlparser.utils.Constants.*;

public enum Escaping {;

    /**
     * Single-method contract that maps one {@code String} to another.
     *
     * <p>The static methods {@code unescapeXml} and {@code unescapeHtml} of the enclosing enum
     * have the same {@code String -> String} shape.
     * NOTE(review): presumably they are handed around as implementations of this interface
     * (e.g. via method reference) - confirm against the callers.
     */
    public interface UnEscape {
        /**
         * Converts {@code input} to its unescaped form.
         *
         * @param input the text to process; judging by the name, text that still contains escapes
         * @return the processed text
         */
        String unescape(String input);
    }

    /**
     * Replaces XML references in {@code text} with the characters they stand for.
     *
     * <p>Recognised are the named references matched by the {@code ENCODED_*} constants of
     * {@code Constants} (ampersand, single quote, double quote, less-than, greater-than) and
     * numeric character references, both decimal ({@code &#233;}) and hexadecimal
     * ({@code &#xE9;}). Numeric references above U+FFFF are emitted as a surrogate pair.
     *
     * <p>An ampersand that does not start a recognised, well-formed reference (unknown name,
     * missing {@code ;}, non-numeric or out-of-range number) is copied to the output unchanged,
     * so no input is lost and malformed input never throws.
     *
     * @param text the text to decode; must not be {@code null}
     * @return the decoded text
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String unescapeXml(final String text) {
        final int n = text.length();
        final StringBuilder result = new StringBuilder(n);
        int i = 0;
        while (i < n) {
            final char charAt = text.charAt(i);
            if (charAt != CHAR_AMPERSAND) {
                result.append(charAt);
                i++;
            } else if (text.startsWith(ENCODED_AMPERSAND, i)) {
                result.append(CHAR_AMPERSAND);
                i += ENCODED_AMPERSAND.length();
            } else if (text.startsWith(ENCODED_SINGLE_QUOTE, i)) {
                result.append(CHAR_SINGLE_QUOTE);
                i += ENCODED_SINGLE_QUOTE.length();
            } else if (text.startsWith(ENCODED_DOUBLE_QUOTE, i)) {
                result.append(CHAR_DOUBLE_QUOTE);
                i += ENCODED_DOUBLE_QUOTE.length();
            } else if (text.startsWith(ENCODED_LESS_THAN, i)) {
                result.append(CHAR_LESS_THAN);
                i += ENCODED_LESS_THAN.length();
            } else if (text.startsWith(ENCODED_GREATER_THAN, i)) {
                result.append(CHAR_GREATER_THAN);
                i += ENCODED_GREATER_THAN.length();
            } else {
                // Either a numeric reference or something we do not understand.
                final int end = text.startsWith(ENCODED_UTF8, i) ? text.indexOf(';', i) : -1;
                final int codePoint = end == -1
                        ? -1
                        : xmlCodePoint(text.substring(i + ENCODED_UTF8.length(), end));
                if (codePoint == -1) {
                    // Not a valid reference: keep the ampersand and carry on with the next char.
                    result.append(charAt);
                    i++;
                } else {
                    // appendCodePoint writes a surrogate pair when the value is above U+FFFF.
                    result.appendCodePoint(codePoint);
                    i = end + 1;
                }
            }
        }
        return result.toString();
    }

    /**
     * Parses the part of a numeric character reference between the prefix and the {@code ;}.
     *
     * @param reference decimal digits, or {@code x} followed by hexadecimal digits
     * @return the referenced code point, or {@code -1} if {@code reference} is not a valid number
     *         or does not denote a Unicode code point
     */
    private static int xmlCodePoint(final String reference) {
        final boolean hex = reference.startsWith("x");
        final int radix = hex ? 16 : 10;
        final String digits = hex ? reference.substring(1) : reference;
        // Integer.parseInt would accept a leading sign; a character reference must not have one.
        if (digits.isEmpty() || Character.digit(digits.charAt(0), radix) == -1) {
            return -1;
        }
        try {
            final int codePoint = Integer.parseInt(digits, radix);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (final NumberFormatException ignored) {
            // Non-digit characters or overflow: the caller treats the text as literal.
            return -1;
        }
    }

    // https://www.freeformatter.com/html-entities.html
    //
    // Lookup table from a complete named HTML entity (leading '&' and trailing ';' included,
    // which is the form unescapeHtml looks up) to the text it decodes to. Invisible or
    // whitespace characters are written as unicode escapes so they survive editing.
    //
    // NOTE(review): the explicit <String, String> type witness assumes Builder.newHashMap() is
    // generic in key and value type; without it the chained put() calls give the compiler no
    // target type to infer from. Confirm against Builder.
    private static final Map<String, String> NAMED_HTML_ENTITIES = Builder.<String, String>newHashMap()
        .put("&amp;", "&").put("&lt;", "<").put("&gt;", ">").put("&Agrave;", "À")
        .put("&Aacute;", "Á").put("&Acirc;", "Â").put("&Atilde;", "Ã").put("&Auml;", "Ä")
        .put("&Aring;", "Å").put("&AElig;", "Æ").put("&Ccedil;", "Ç").put("&Egrave;", "È")
        .put("&Eacute;", "É").put("&Ecirc;", "Ê").put("&Euml;", "Ë").put("&Igrave;", "Ì")
        .put("&Iacute;", "Í").put("&Icirc;", "Î").put("&Iuml;", "Ï").put("&ETH;", "Ð")
        .put("&Ntilde;", "Ñ").put("&Ograve;", "Ò").put("&Oacute;", "Ó").put("&Ocirc;", "Ô")
        .put("&Otilde;", "Õ").put("&Ouml;", "Ö").put("&Oslash;", "Ø").put("&Ugrave;", "Ù")
        .put("&Uacute;", "Ú").put("&Ucirc;", "Û").put("&Uuml;", "Ü").put("&Yacute;", "Ý")
        .put("&THORN;", "Þ").put("&szlig;", "ß").put("&agrave;", "à").put("&aacute;", "á")
        .put("&acirc;", "â").put("&atilde;", "ã").put("&auml;", "ä").put("&aring;", "å")
        .put("&aelig;", "æ").put("&ccedil;", "ç").put("&egrave;", "è").put("&eacute;", "é")
        .put("&ecirc;", "ê").put("&euml;", "ë").put("&igrave;", "ì").put("&iacute;", "í")
        .put("&icirc;", "î").put("&iuml;", "ï").put("&eth;", "ð").put("&ntilde;", "ñ")
        .put("&ograve;", "ò").put("&oacute;", "ó").put("&ocirc;", "ô").put("&otilde;", "õ")
        .put("&ouml;", "ö").put("&oslash;", "ø").put("&ugrave;", "ù").put("&uacute;", "ú")
        .put("&ucirc;", "û").put("&uuml;", "ü").put("&yacute;", "ý").put("&thorn;", "þ")
        .put("&yuml;", "ÿ").put("&nbsp;", "\u00a0").put("&iexcl;", "¡").put("&cent;", "¢")
        .put("&pound;", "£").put("&curren;", "¤").put("&yen;", "¥").put("&brvbar;", "¦")
        .put("&sect;", "§").put("&uml;", "¨").put("&copy;", "©").put("&ordf;", "ª")
        .put("&laquo;", "«").put("&not;", "¬").put("&shy;", "\u00ad").put("&reg;", "®")
        .put("&macr;", "¯").put("&deg;", "°").put("&plusmn;", "±").put("&sup2;", "²")
        .put("&sup3;", "³").put("&acute;", "´").put("&micro;", "µ").put("&para;", "¶")
        .put("&cedil;", "¸").put("&sup1;", "¹").put("&ordm;", "º").put("&raquo;", "»")
        .put("&frac14;", "¼").put("&frac12;", "½").put("&frac34;", "¾").put("&iquest;", "¿")
        .put("&times;", "×").put("&divide;", "÷").put("&forall;", "∀").put("&part;", "∂")
        .put("&exist;", "∃").put("&empty;", "∅").put("&nabla;", "∇").put("&isin;", "∈")
        .put("&notin;", "∉").put("&ni;", "∋").put("&prod;", "∏").put("&sum;", "∑")
        .put("&minus;", "−").put("&lowast;", "∗").put("&radic;", "√").put("&prop;", "∝")
        .put("&infin;", "∞").put("&ang;", "∠").put("&and;", "∧").put("&or;", "∨")
        .put("&cap;", "∩").put("&cup;", "∪").put("&int;", "∫").put("&there4;", "∴")
        .put("&sim;", "∼").put("&cong;", "≅").put("&asymp;", "≈").put("&ne;", "≠")
        .put("&equiv;", "≡").put("&le;", "≤").put("&ge;", "≥").put("&sub;", "⊂")
        .put("&sup;", "⊃").put("&nsub;", "⊄").put("&sube;", "⊆").put("&supe;", "⊇")
        .put("&oplus;", "⊕").put("&otimes;", "⊗").put("&perp;", "⊥").put("&sdot;", "⋅")
        .put("&Alpha;", "Α").put("&Beta;", "Β").put("&Gamma;", "Γ").put("&Delta;", "Δ")
        .put("&Epsilon;", "Ε").put("&Zeta;", "Ζ").put("&Eta;", "Η").put("&Theta;", "Θ")
        .put("&Iota;", "Ι").put("&Kappa;", "Κ").put("&Lambda;", "Λ").put("&Mu;", "Μ")
        .put("&Nu;", "Ν").put("&Xi;", "Ξ").put("&Omicron;", "Ο").put("&Pi;", "Π")
        .put("&Rho;", "Ρ").put("&Sigma;", "Σ").put("&Tau;", "Τ").put("&Upsilon;", "Υ")
        .put("&Phi;", "Φ").put("&Chi;", "Χ").put("&Psi;", "Ψ").put("&Omega;", "Ω")
        .put("&alpha;", "α").put("&beta;", "β").put("&gamma;", "γ").put("&delta;", "δ")
        .put("&epsilon;", "ε").put("&zeta;", "ζ").put("&eta;", "η").put("&theta;", "θ")
        .put("&iota;", "ι").put("&kappa;", "κ").put("&lambda;", "λ").put("&mu;", "μ")
        .put("&nu;", "ν").put("&xi;", "ξ").put("&omicron;", "ο").put("&pi;", "π")
        .put("&rho;", "ρ").put("&sigmaf;", "ς").put("&sigma;", "σ").put("&tau;", "τ")
        .put("&upsilon;", "υ").put("&phi;", "φ").put("&chi;", "χ").put("&psi;", "ψ")
        .put("&omega;", "ω").put("&thetasym;", "ϑ").put("&upsih;", "ϒ").put("&piv;", "ϖ")
        .put("&OElig;", "Œ").put("&oelig;", "œ").put("&Scaron;", "Š").put("&scaron;", "š")
        .put("&Yuml;", "Ÿ").put("&fnof;", "ƒ").put("&circ;", "ˆ").put("&tilde;", "˜")
        .put("&ensp;", "\u2002").put("&emsp;", "\u2003").put("&thinsp;", "\u2009").put("&zwnj;", "\u200c")
        .put("&zwj;", "\u200d").put("&lrm;", "\u200e").put("&rlm;", "\u200f").put("&ndash;", "–")
        .put("&mdash;", "—").put("&lsquo;", "‘").put("&rsquo;", "’").put("&sbquo;", "‚")
        .put("&ldquo;", "“").put("&rdquo;", "”").put("&bdquo;", "„").put("&dagger;", "†")
        .put("&Dagger;", "‡").put("&bull;", "•").put("&hellip;", "…").put("&permil;", "‰")
        .put("&prime;", "′").put("&Prime;", "″").put("&lsaquo;", "‹").put("&rsaquo;", "›")
        .put("&oline;", "‾").put("&euro;", "€").put("&trade;", "™").put("&larr;", "←")
        .put("&uarr;", "↑").put("&rarr;", "→").put("&darr;", "↓").put("&harr;", "↔")
        .put("&crarr;", "↵").put("&lceil;", "⌈").put("&rceil;", "⌉").put("&lfloor;", "⌊")
        .put("&rfloor;", "⌋").put("&loz;", "◊").put("&spades;", "♠").put("&clubs;", "♣")
        .put("&hearts;", "♥").put("&diams;", "♦").build();

    /**
     * Replaces HTML character references in {@code text} with the characters they denote.
     *
     * <p>For every ampersand the text up to and including the next {@code ;} is looked up in
     * {@code NAMED_HTML_ENTITIES}. If that fails and the ampersand is followed by {@code #}, the
     * reference is decoded as a number: decimal ({@code &#233;}) or, with an {@code x} / {@code X}
     * prefix, hexadecimal ({@code &#xE9;}). Values above U+FFFF are emitted as a surrogate pair.
     *
     * <p>An ampersand that does not start a recognised, well-formed reference is copied to the
     * output unchanged; malformed numeric references never throw.
     *
     * @param text the text to decode; must not be {@code null}
     * @return the decoded text
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String unescapeHtml(final String text) {
        final StringBuilder result = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            final char charAt = text.charAt(i);
            if (charAt != CHAR_AMPERSAND) {
                result.append(charAt);
                i++;
                continue;
            }

            final int index = text.indexOf(';', i);
            if (index == -1) {
                // No terminator left, so nothing further can be a reference: copy the tail once
                // instead of rescanning for ';' at every remaining ampersand.
                result.append(text, i, text.length());
                break;
            }

            final String entity = text.substring(i, index+1);
            final String decode = NAMED_HTML_ENTITIES.get(entity);
            if (decode != null) {
                result.append(decode);
                i = index+1;
                continue;
            }

            // index > i, so i+1 is always a valid position here.
            if (text.charAt(i+1) == '#') {
                final int codePoint = htmlCodePoint(text.substring(i+2, index));
                if (codePoint != -1) {
                    // appendCodePoint writes a surrogate pair when the value is above U+FFFF.
                    result.appendCodePoint(codePoint);
                    i = index+1;
                    continue;
                }
            }

            // Not a reference we understand: keep the ampersand and rescan from the next char.
            result.append("&");
            i++;
        }
        return result.toString();
    }

    /**
     * Parses the part of a numeric HTML character reference between {@code &#} and {@code ;}.
     *
     * @param reference decimal digits, or {@code x}/{@code X} followed by hexadecimal digits
     * @return the referenced code point, or {@code -1} if {@code reference} is not a valid number
     *         or does not denote a Unicode code point
     */
    private static int htmlCodePoint(final String reference) {
        final boolean hex = reference.startsWith("x") || reference.startsWith("X");
        final int radix = hex ? 16 : 10;
        final String digits = hex ? reference.substring(1) : reference;
        // Integer.parseInt would accept a leading sign; a character reference must not have one.
        if (digits.isEmpty() || Character.digit(digits.charAt(0), radix) == -1) {
            return -1;
        }
        try {
            final int codePoint = Integer.parseInt(digits, radix);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (final NumberFormatException ignored) {
            // Non-digit characters or overflow: the caller treats the text as literal.
            return -1;
        }
    }

    /**
     * Reads {@code substring} as a base-10 integer and narrows the value to a {@code char}.
     *
     * <p>The narrowing cast keeps only the low 16 bits, so values above {@code 0xFFFF} wrap.
     *
     * @param substring the decimal digits to parse
     * @return the UTF-16 code unit with that numeric value
     * @throws NumberFormatException if {@code substring} is not a parsable integer
     */
    private static char charFromDecimal(final String substring) {
        final int parsed = Integer.parseInt(substring, 10);
        return (char) parsed;
    }
    /**
     * Reads {@code substring} as a base-16 integer and narrows the value to a {@code char}.
     *
     * <p>The narrowing cast keeps only the low 16 bits, so values above {@code 0xFFFF} wrap.
     *
     * @param substring the hexadecimal digits to parse, without any {@code x} prefix
     * @return the UTF-16 code unit with that numeric value
     * @throws NumberFormatException if {@code substring} is not a parsable hexadecimal integer
     */
    private static char charFromHex(final String substring) {
        final int radix = 16;
        final int parsed = Integer.parseInt(substring, radix);
        return (char) parsed;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy