xmlparser.utils.Escaping Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of simplexml Show documentation
Show all versions of simplexml Show documentation
A clean and simple XML parser, serializer, and deserializer.
package xmlparser.utils;
import java.util.Map;
import static xmlparser.utils.Constants.*;
public enum Escaping {;
/**
 * A function that takes a string and returns its unescaped form.
 *
 * <p>{@code unescapeXml} and {@code unescapeHtml} in the enclosing type both have a
 * matching {@code String -> String} signature and can be supplied as method references.
 */
@FunctionalInterface
public interface UnEscape {
    /**
     * Produces the unescaped form of {@code input}.
     *
     * @param input the text to unescape
     * @return the unescaped text
     */
    String unescape(String input);
}
/**
 * Replaces XML entity and character references in {@code text} with the characters
 * they stand for.
 *
 * <p>The five predefined entities ({@code ENCODED_AMPERSAND}, {@code ENCODED_SINGLE_QUOTE},
 * {@code ENCODED_DOUBLE_QUOTE}, {@code ENCODED_LESS_THAN}, {@code ENCODED_GREATER_THAN})
 * are decoded, as are numeric character references that start with {@code ENCODED_UTF8}
 * and end with {@code ';'}, in decimal or {@code x}-prefixed hexadecimal form. Code points
 * above U+FFFF are emitted as a surrogate pair.
 *
 * <p>An ampersand that does not begin a recognised, well-formed reference is copied to the
 * output unchanged instead of being dropped or causing an exception.
 *
 * @param text the text to unescape; must not be {@code null}
 * @return {@code text} with every recognised reference decoded
 * @throws NullPointerException if {@code text} is {@code null}
 */
public static String unescapeXml(final String text) {
    final int n = text.length();
    final StringBuilder result = new StringBuilder(n);
    int i = 0;
    while (i < n) {
        final char charAt = text.charAt(i);
        if (charAt != CHAR_AMPERSAND) {
            result.append(charAt);
            i++;
        } else if (text.startsWith(ENCODED_AMPERSAND, i)) {
            result.append(CHAR_AMPERSAND);
            // Skip exactly the matched entity rather than a hard-coded character count.
            i += ENCODED_AMPERSAND.length();
        } else if (text.startsWith(ENCODED_SINGLE_QUOTE, i)) {
            result.append(CHAR_SINGLE_QUOTE);
            i += ENCODED_SINGLE_QUOTE.length();
        } else if (text.startsWith(ENCODED_DOUBLE_QUOTE, i)) {
            result.append(CHAR_DOUBLE_QUOTE);
            i += ENCODED_DOUBLE_QUOTE.length();
        } else if (text.startsWith(ENCODED_LESS_THAN, i)) {
            result.append(CHAR_LESS_THAN);
            i += ENCODED_LESS_THAN.length();
        } else if (text.startsWith(ENCODED_GREATER_THAN, i)) {
            result.append(CHAR_GREATER_THAN);
            i += ENCODED_GREATER_THAN.length();
        } else {
            // NOTE(review): assumes ENCODED_UTF8 is the two-character "&#" prefix (the
            // previous code skipped 2 characters here) - confirm against Constants.
            final int digitsStart = i + ENCODED_UTF8.length();
            final int end = text.startsWith(ENCODED_UTF8, i) ? text.indexOf(';', digitsStart) : -1;
            final int codePoint = end == -1
                    ? -1
                    : parseXmlCharacterReference(text.substring(digitsStart, end));
            if (codePoint == -1) {
                // Not a reference we can decode: keep the ampersand literally.
                result.append(charAt);
                i++;
            } else {
                result.appendCodePoint(codePoint);
                i = end + 1;
            }
        }
    }
    return result.toString();
}

/**
 * Parses the part of a numeric character reference between its prefix and the closing
 * {@code ';'}: plain decimal digits, or {@code x} followed by hexadecimal digits.
 *
 * @param digits the reference body, without prefix and terminator
 * @return the decoded Unicode code point, or {@code -1} if {@code digits} is not a number
 *         or is outside the valid code point range
 */
private static int parseXmlCharacterReference(final String digits) {
    try {
        final int codePoint = digits.startsWith("x")
                ? Integer.parseInt(digits.substring(1), 16)
                : Integer.parseInt(digits);
        return Character.isValidCodePoint(codePoint) ? codePoint : -1;
    } catch (NumberFormatException notANumber) {
        return -1;
    }
}
// Lookup table from a named HTML entity to the text it decodes to.
// Entity list taken from https://www.freeformatter.com/html-entities.html
private static final Map NAMED_HTML_ENTITIES = Builder.newHashMap()
.put("&", "&").put("<", "<").put(">", ">").put("À", "À")
.put("Á", "Á").put("Â", "Â").put("Ã", "Ã").put("Ä", "Ä")
.put("Å", "Å").put("Æ", "Æ").put("Ç", "Ç").put("È", "È")
.put("É", "É").put("Ê", "Ê").put("Ë", "Ë").put("Ì", "Ì")
.put("Í", "Í").put("Î", "Î").put("Ï", "Ï").put("Ð", "Ð")
.put("Ñ", "Ñ").put("Ò", "Ò").put("Ó", "Ó").put("Ô", "Ô")
.put("Õ", "Õ").put("Ö", "Ö").put("Ø", "Ø").put("Ù", "Ù")
.put("Ú", "Ú").put("Û", "Û").put("Ü", "Ü").put("Ý", "Ý")
.put("Þ", "Þ").put("ß", "ß").put("à", "à").put("á", "á")
.put("â", "â").put("ã", "ã").put("ä", "ä").put("å", "å")
.put("æ", "æ").put("ç", "ç").put("è", "è").put("é", "é")
.put("ê", "ê").put("ë", "ë").put("ì", "ì").put("í", "í")
.put("î", "î").put("ï", "ï").put("ð", "ð").put("ñ", "ñ")
.put("ò", "ò").put("ó", "ó").put("ô", "ô").put("õ", "õ")
.put("ö", "ö").put("ø", "ø").put("ù", "ù").put("ú", "ú")
.put("û", "û").put("ü", "ü").put("ý", "ý").put("þ", "þ")
.put("ÿ", "ÿ").put(" ", " ").put("¡", "¡").put("¢", "¢")
.put("£", "£").put("¤", "¤").put("¥", "¥").put("¦", "¦")
.put("§", "§").put("¨", "¨").put("©", "©").put("ª", "ª")
.put("«", "«").put("¬", "¬").put("", "\u00ad").put("®", "®")
.put("¯", "¯").put("°", "°").put("±", "±").put("²", "²")
.put("³", "³").put("´", "´").put("µ", "µ").put("¶", "¶")
.put("¸", "¸").put("¹", "¹").put("º", "º").put("»", "»")
.put("¼", "¼").put("½", "½").put("¾", "¾").put("¿", "¿")
.put("×", "×").put("÷", "÷").put("∀", "∀").put("∂", "∂")
.put("∃", "∃").put("∅", "∅").put("∇", "∇").put("∈", "∈")
.put("∉", "∉").put("∋", "∋").put("∏", "∏").put("∑", "∑")
.put("−", "−").put("∗", "∗").put("√", "√").put("∝", "∝")
.put("∞", "∞").put("∠", "∠").put("∧", "∧").put("∨", "∨")
.put("∩", "∩").put("∪", "∪").put("∫", "∫").put("∴", "∴")
.put("∼", "∼").put("≅", "≅").put("≈", "≈").put("≠", "≠")
.put("≡", "≡").put("≤", "≤").put("≥", "≥").put("⊂", "⊂")
.put("⊃", "⊃").put("⊄", "⊄").put("⊆", "⊆").put("⊇", "⊇")
.put("⊕", "⊕").put("⊗", "⊗").put("⊥", "⊥").put("⋅", "⋅")
.put("Α", "Α").put("Β", "Β").put("Γ", "Γ").put("Δ", "Δ")
.put("Ε", "Ε").put("Ζ", "Ζ").put("Η", "Η").put("Θ", "Θ")
.put("Ι", "Ι").put("Κ", "Κ").put("Λ", "Λ").put("Μ", "Μ")
.put("Ν", "Ν").put("Ξ", "Ξ").put("Ο", "Ο").put("Π", "Π")
.put("Ρ", "Ρ").put("Σ", "Σ").put("Τ", "Τ").put("Υ", "Υ")
.put("Φ", "Φ").put("Χ", "Χ").put("Ψ", "Ψ").put("Ω", "Ω")
.put("α", "α").put("β", "β").put("γ", "γ").put("δ", "δ")
.put("ε", "ε").put("ζ", "ζ").put("η", "η").put("θ", "θ")
.put("ι", "ι").put("κ", "κ").put("λ", "λ").put("μ", "μ")
.put("ν", "ν").put("ξ", "ξ").put("ο", "ο").put("π", "π")
.put("ρ", "ρ").put("ς", "ς").put("σ", "σ").put("τ", "τ")
.put("υ", "υ").put("φ", "φ").put("χ", "χ").put("ψ", "ψ")
.put("ω", "ω").put("ϑ", "ϑ").put("ϒ", "ϒ").put("ϖ", "ϖ")
.put("Œ", "Œ").put("œ", "œ").put("Š", "Š").put("š", "š")
.put("Ÿ", "Ÿ").put("ƒ", "ƒ").put("ˆ", "ˆ").put("˜", "˜")
.put(" ", "\u2002").put(" ", "\u2003").put(" ", "\u2009").put("", "\u200c")
.put("", "\u200d").put("", "\u200e").put("", "\u200f").put("–", "–")
.put("—", "—").put("‘", "‘").put("’", "’").put("‚", "‚")
.put("“", "“").put("”", "”").put("„", "„").put("†", "†")
.put("‡", "‡").put("•", "•").put("…", "…").put("‰", "‰")
.put("′", "′").put("″", "″").put("‹", "‹").put("›", "›")
.put("‾", "‾").put("€", "€").put("™", "™").put("←", "←")
.put("↑", "↑").put("→", "→").put("↓", "↓").put("↔", "↔")
.put("↵", "↵").put("⌈", "⌈").put("⌉", "⌉").put("⌊", "⌊")
.put("⌋", "⌋").put("◊", "◊").put("♠", "♠").put("♣", "♣")
.put("♥", "♥").put("♦", "♦").build();
/**
 * Replaces HTML entities in {@code text} with the text they stand for.
 *
 * <p>For every ampersand, the span up to and including the next {@code ';'} is first looked
 * up in {@code NAMED_HTML_ENTITIES}. If it is not a named entity it is tried as a numeric
 * character reference: {@code &#} followed by decimal digits, or {@code &#x} / {@code &#X}
 * followed by hexadecimal digits. Code points above U+FFFF are emitted as a surrogate pair.
 *
 * <p>An ampersand that starts neither a known named entity nor a well-formed numeric
 * reference is copied to the output unchanged; malformed references never throw.
 *
 * @param text the text to unescape; must not be {@code null}
 * @return {@code text} with every recognised entity decoded
 * @throws NullPointerException if {@code text} is {@code null}
 */
public static String unescapeHtml(final String text) {
    final StringBuilder result = new StringBuilder(text.length());
    int i = 0;
    while (i < text.length()) {
        final char charAt = text.charAt(i);
        if (charAt != CHAR_AMPERSAND) {
            result.append(charAt);
            i++;
            continue;
        }
        final int index = text.indexOf(';', i);
        if (index == -1) {
            // No terminator remains, so nothing after this point can be an entity.
            // Copy the tail once instead of rescanning it for every later ampersand.
            result.append(text, i, text.length());
            break;
        }
        final String decode = NAMED_HTML_ENTITIES.get(text.substring(i, index + 1));
        if (decode != null) {
            result.append(decode);
            i = index + 1;
            continue;
        }
        final int codePoint = parseHtmlCharacterReference(text, i + 1, index);
        if (codePoint != -1) {
            result.appendCodePoint(codePoint);
            i = index + 1;
            continue;
        }
        // Unknown or malformed entity: keep the ampersand and carry on after it.
        result.append("&");
        i++;
    }
    return result.toString();
}

/**
 * Decodes the body of a numeric character reference located at {@code text[start, end)},
 * where {@code start} is the index just after the ampersand and {@code end} is the index
 * of the closing {@code ';'}.
 *
 * @param text  the text being unescaped
 * @param start index of the expected {@code '#'}
 * @param end   index of the terminating {@code ';'} (exclusive bound of the body)
 * @return the decoded Unicode code point, or {@code -1} if the span is not a well-formed
 *         numeric reference or names a value outside the valid code point range
 */
private static int parseHtmlCharacterReference(final String text, final int start, final int end) {
    // Need '#' plus at least one more character before the ';'.
    if (end - start < 2 || text.charAt(start) != '#') {
        return -1;
    }
    final char marker = text.charAt(start + 1);
    final boolean hex = marker == 'x' || marker == 'X';
    try {
        final int codePoint = hex
                ? Integer.parseInt(text.substring(start + 2, end), 16)
                : Integer.parseInt(text.substring(start + 1, end));
        return Character.isValidCodePoint(codePoint) ? codePoint : -1;
    } catch (NumberFormatException notANumber) {
        return -1;
    }
}

/**
 * Parses {@code substring} as a decimal integer and narrows the result to a {@code char}.
 * Values that do not fit in 16 bits lose their high bits in the cast.
 *
 * @param substring the decimal digits to parse
 * @return the character with that numeric value
 * @throws NumberFormatException if {@code substring} is not a parsable decimal integer
 */
private static char charFromDecimal(final String substring) {
    return (char) Integer.parseInt(substring);
}

/**
 * Parses {@code substring} as a hexadecimal integer and narrows the result to a {@code char}.
 * Values that do not fit in 16 bits lose their high bits in the cast.
 *
 * @param substring the hexadecimal digits to parse
 * @return the character with that numeric value
 * @throws NumberFormatException if {@code substring} is not a parsable hexadecimal integer
 */
private static char charFromHex(final String substring) {
    return (char) Integer.parseInt(substring, 16);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy