com.moviejukebox.tools.HTMLTools Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of yamj Show documentation
Show all versions of yamj Show documentation
Static analysis of MovieJukebox project
/*
* Copyright (c) 2004-2012 YAMJ Members
* http://code.google.com/p/moviejukebox/people/list
*
* Web: http://code.google.com/p/moviejukebox/
*
* This software is licensed under a Creative Commons License
* See this page: http://code.google.com/p/moviejukebox/wiki/License
*
* For any reuse or distribution, you must make clear to others the
* license terms of this work.
*/
package com.moviejukebox.tools;
import com.moviejukebox.model.Movie;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
public class HTMLTools {
private static final Map AGGRESSIVE_HTML_ENCODE_MAP = new HashMap();
private static final Map DEFENSIVE_HTML_ENCODE_MAP = new HashMap();
private static final Map HTML_DECODE_MAP = new HashMap();
private static final Logger logger = Logger.getLogger(HTMLTools.class);
static {
/*
* Html encoding mapping according to the HTML 4.0 spec
* http://www.w3.org/TR/REC-html40/sgml/entities.html
*/
// Special characters for HTML
AGGRESSIVE_HTML_ENCODE_MAP.put('\u0026', "&");
AGGRESSIVE_HTML_ENCODE_MAP.put('\u003C', "<");
AGGRESSIVE_HTML_ENCODE_MAP.put('\u003E', ">");
AGGRESSIVE_HTML_ENCODE_MAP.put('\u0022', """);
DEFENSIVE_HTML_ENCODE_MAP.put('\u0152', "Œ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0153', "œ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0160', "Š");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0161', "š");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0178', "Ÿ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u02C6', "ˆ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u02DC', "˜");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2002', " ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2003', " ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2009', " ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u200C', "");
DEFENSIVE_HTML_ENCODE_MAP.put('\u200D', "");
DEFENSIVE_HTML_ENCODE_MAP.put('\u200E', "");
DEFENSIVE_HTML_ENCODE_MAP.put('\u200F', "");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2013', "–");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2014', "—");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2018', "‘");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2019', "’");
DEFENSIVE_HTML_ENCODE_MAP.put('\u201A', "‚");
DEFENSIVE_HTML_ENCODE_MAP.put('\u201C', "“");
DEFENSIVE_HTML_ENCODE_MAP.put('\u201D', "”");
DEFENSIVE_HTML_ENCODE_MAP.put('\u201E', "„");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2020', "†");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2021', "‡");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2030', "‰");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2039', "‹");
DEFENSIVE_HTML_ENCODE_MAP.put('\u203A', "›");
DEFENSIVE_HTML_ENCODE_MAP.put('\u20AC', "€");
// Character entity references for ISO 8859-1 characters
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A0', " ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A1', "¡");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A2', "¢");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A3', "£");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A4', "¤");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A5', "¥");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A6', "¦");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A7', "§");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A8', "¨");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00A9', "©");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00AA', "ª");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00AB', "«");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00AC', "¬");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00AD', "");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00AE', "®");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00AF', "¯");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B0', "°");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B1', "±");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B2', "²");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B3', "³");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B4', "´");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B5', "µ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B6', "¶");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B7', "·");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B8', "¸");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00B9', "¹");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00BA', "º");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00BB', "»");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00BC', "¼");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00BD', "½");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00BE', "¾");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00BF', "¿");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C0', "À");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C1', "Á");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C2', "Â");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C3', "Ã");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C4', "Ä");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C5', "Å");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C6', "Æ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C7', "Ç");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C8', "È");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00C9', "É");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00CA', "Ê");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00CB', "Ë");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00CC', "Ì");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00CD', "Í");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00CE', "Î");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00CF', "Ï");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D0', "Ð");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D1', "Ñ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D2', "Ò");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D3', "Ó");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D4', "Ô");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D5', "Õ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D6', "Ö");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D7', "×");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D8', "Ø");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00D9', "Ù");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DA', "Ú");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DB', "Û");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DC', "Ü");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DD', "Ý");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DE', "Þ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00DF', "ß");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E0', "à");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E1', "á");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E2', "â");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E3', "ã");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E4', "ä");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E5', "å");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E6', "æ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E7', "ç");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E8', "è");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00E9', "é");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00EA', "ê");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00EB', "ë");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00EC', "ì");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00ED', "í");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00EE', "î");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00EF', "ï");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F0', "ð");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F1', "ñ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F2', "ò");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F3', "ó");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F4', "ô");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F5', "õ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F6', "ö");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F7', "÷");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F8', "ø");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00F9', "ù");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00FA', "ú");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00FB', "û");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00FC', "ü");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00FD', "ý");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00FE', "þ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u00FF', "ÿ");
// Mathematical, Greek and Symbolic characters for HTML
DEFENSIVE_HTML_ENCODE_MAP.put('\u0192', "ƒ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0391', "Α");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0392', "Β");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0393', "Γ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0394', "Δ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0395', "Ε");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0396', "Ζ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0397', "Η");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0398', "Θ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u0399', "Ι");
DEFENSIVE_HTML_ENCODE_MAP.put('\u039A', "Κ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u039B', "Λ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u039C', "Μ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u039D', "Ν");
DEFENSIVE_HTML_ENCODE_MAP.put('\u039E', "Ξ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u039F', "Ο");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A0', "Π");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A1', "Ρ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A3', "Σ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A4', "Τ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A5', "Υ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A6', "Φ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A7', "Χ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A8', "Ψ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03A9', "Ω");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B1', "α");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B2', "β");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B3', "γ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B4', "δ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B5', "ε");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B6', "ζ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B7', "η");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B8', "θ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03B9', "ι");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03BA', "κ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03BB', "λ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03BC', "μ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03BD', "ν");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03BE', "ξ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03BF', "ο");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C0', "π");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C1', "ρ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C2', "ς");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C3', "σ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C4', "τ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C5', "υ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C6', "φ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C7', "χ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C8', "ψ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03C9', "ω");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03D1', "ϑ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03D2', "ϒ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u03D6', "ϖ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2022', "•");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2026', "…");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2032', "′");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2033', "″");
DEFENSIVE_HTML_ENCODE_MAP.put('\u203E', "‾");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2044', "⁄");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2118', "℘");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2111', "ℑ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u211C', "ℜ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2122', "™");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2135', "ℵ");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2190', "←");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2191', "↑");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2192', "→");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2193', "↓");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2194', "↔");
DEFENSIVE_HTML_ENCODE_MAP.put('\u21B5', "↵");
DEFENSIVE_HTML_ENCODE_MAP.put('\u21D0', "⇐");
DEFENSIVE_HTML_ENCODE_MAP.put('\u21D1', "⇑");
DEFENSIVE_HTML_ENCODE_MAP.put('\u21D2', "⇒");
DEFENSIVE_HTML_ENCODE_MAP.put('\u21D3', "⇓");
DEFENSIVE_HTML_ENCODE_MAP.put('\u21D4', "⇔");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2200', "∀");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2202', "∂");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2203', "∃");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2205', "∅");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2207', "∇");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2208', "∈");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2209', "∉");
DEFENSIVE_HTML_ENCODE_MAP.put('\u220B', "∋");
DEFENSIVE_HTML_ENCODE_MAP.put('\u220F', "∏");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2211', "∑");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2212', "−");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2217', "∗");
DEFENSIVE_HTML_ENCODE_MAP.put('\u221A', "√");
DEFENSIVE_HTML_ENCODE_MAP.put('\u221D', "∝");
DEFENSIVE_HTML_ENCODE_MAP.put('\u221E', "∞");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2220', "∠");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2227', "∧");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2228', "∨");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2229', "∩");
DEFENSIVE_HTML_ENCODE_MAP.put('\u222A', "∪");
DEFENSIVE_HTML_ENCODE_MAP.put('\u222B', "∫");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2234', "∴");
DEFENSIVE_HTML_ENCODE_MAP.put('\u223C', "∼");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2245', "≅");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2248', "≈");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2260', "≠");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2261', "≡");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2264', "≤");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2265', "≥");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2282', "⊂");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2283', "⊃");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2284', "⊄");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2286', "⊆");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2287', "⊇");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2295', "⊕");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2297', "⊗");
DEFENSIVE_HTML_ENCODE_MAP.put('\u22A5', "⊥");
DEFENSIVE_HTML_ENCODE_MAP.put('\u22C5', "⋅");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2308', "⌈");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2309', "⌉");
DEFENSIVE_HTML_ENCODE_MAP.put('\u230A', "⌊");
DEFENSIVE_HTML_ENCODE_MAP.put('\u230B', "⌋");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2329', "〈");
DEFENSIVE_HTML_ENCODE_MAP.put('\u232A', "〉");
DEFENSIVE_HTML_ENCODE_MAP.put('\u25CA', "◊");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2660', "♠");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2663', "♣");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2665', "♥");
DEFENSIVE_HTML_ENCODE_MAP.put('\u2666', "♦");
Set> aggresiveEntries = AGGRESSIVE_HTML_ENCODE_MAP.entrySet();
for (Map.Entry entry : aggresiveEntries) {
HTML_DECODE_MAP.put(entry.getValue(), entry.getKey());
}
Set> defensiveEntries = DEFENSIVE_HTML_ENCODE_MAP.entrySet();
for (Map.Entry entry : defensiveEntries) {
HTML_DECODE_MAP.put(entry.getValue(), entry.getKey());
}
}
/**
 * Decodes HTML character references in a string.
 *
 * Three forms are recognised: decimal numeric references (<code>&amp;#233;</code>),
 * hexadecimal numeric references (<code>&amp;#xE9;</code> / <code>&amp;#XE9;</code>) and named
 * references that are present in <code>HTML_DECODE_MAP</code>. Anything that looks like a
 * reference but cannot be decoded is copied to the output unchanged.
 *
 * @param source the text to decode; may be <code>null</code> or empty
 * @return the decoded text; <code>source</code> itself when it is <code>null</code>, empty or
 *         contains no <code>&amp;...;</code> sequence
 */
public static String decodeHtml(String source) {
    if (null == source || 0 == source.length()) {
        return source;
    }

    StringBuilder result = null;
    int currentIndex = 0;

    while (currentIndex < source.length()) {
        int delimiterStartIndex = source.indexOf('&', currentIndex);
        if (delimiterStartIndex == -1) {
            break;
        }
        int delimiterEndIndex = source.indexOf(';', delimiterStartIndex + 1);
        if (delimiterEndIndex == -1) {
            break;
        }

        // Use the '&' closest to the ';' as the start of the entity. An earlier
        // '&' is a bare ampersand ("A & B &amp; C") and must not swallow the
        // real entity that follows it.
        int entityStartIndex = source.lastIndexOf('&', delimiterEndIndex);

        // ensure that the string builder is setup correctly
        if (null == result) {
            result = new StringBuilder(source.length());
        }

        // add the text that leads up to this match
        result.append(source, currentIndex, entityStartIndex);

        String entity = source.substring(entityStartIndex, delimiterEndIndex + 1);
        currentIndex = delimiterEndIndex + 1;

        if (entity.charAt(1) == '#') {
            // numeric entity; entity is at least "&#;" here so index 2 exists
            int start = 2;
            int radix = 10;
            char marker = entity.charAt(2);
            if (marker == 'X' || marker == 'x') {
                start = 3;
                radix = 16;
            }
            try {
                int codePoint = Integer.parseInt(entity.substring(start, entity.length() - 1), radix);
                if (Character.isValidCodePoint(codePoint)) {
                    // appendCodePoint emits a surrogate pair for values above U+FFFF
                    // instead of truncating them to a single char
                    result.appendCodePoint(codePoint);
                } else {
                    // negative or beyond U+10FFFF: not a character, keep as-is
                    result.append(entity);
                }
            } catch (NumberFormatException error) {
                // when the number of the entity can't be parsed, add the entity as-is
                result.append(entity);
            }
        } else {
            // try to decode the entity as a literal
            Character decoded = HTML_DECODE_MAP.get(entity);
            if (decoded != null) {
                result.append(decoded.charValue());
            } else {
                // if there was no match, add the entity as-is
                result.append(entity);
            }
        }
    }

    if (null == result) {
        return source;
    }
    if (currentIndex < source.length()) {
        // copy whatever follows the last decoded entity
        result.append(source, currentIndex, source.length());
    }
    return result.toString();
}
/**
 * Decodes a URL-encoded (application/x-www-form-urlencoded) string as UTF-8.
 *
 * This is a best-effort operation: if the string cannot be decoded, a message is
 * logged and the input is returned unchanged.
 *
 * @param url the encoded string; may be <code>null</code> or empty
 * @return the decoded string, or <code>url</code> itself when it is <code>null</code>,
 *         empty or cannot be decoded
 */
public static String decodeUrl(String url) {
    if (url != null && url.length() != 0) {
        try {
            return URLDecoder.decode(url, "UTF-8");
        } catch (UnsupportedEncodingException ignored) {
            logger.info("Could not decode URL string: " + url + ", will proceed with undecoded string.");
        } catch (IllegalArgumentException ignored) {
            // URLDecoder rejects malformed percent-escapes (e.g. a trailing '%');
            // fall back to the raw string like any other decoding failure
            logger.info("Could not decode URL string: " + url + ", will proceed with undecoded string.");
        }
    }
    return url;
}
/**
 * URL-encodes a string as UTF-8, representing spaces as <code>%20</code>.
 *
 * <code>URLEncoder</code> produces form encoding, where a space becomes '+';
 * every '+' in its output is therefore rewritten to <code>%20</code>. A literal '+'
 * in the input is already escaped as <code>%2B</code> by the encoder and is not affected.
 *
 * @param url the string to encode; may be <code>null</code> or empty
 * @return the encoded string, or <code>url</code> itself when it is <code>null</code>,
 *         empty or cannot be encoded
 */
public static String encodeUrl(String url) {
    String returnUrl = url;
    if (url != null && url.length() != 0) {
        try {
            returnUrl = URLEncoder.encode(url, "UTF-8").replace("+", "%20");
        } catch (UnsupportedEncodingException ignored) {
            logger.info("Could not encode URL string: " + returnUrl + ", will proceed with unencoded string.");
        }
    }
    return returnUrl;
}
/**
 * URL-encodes every segment of a '/'-separated path while leaving the
 * separators themselves untouched.
 *
 * @param url the path to encode; may be <code>null</code> or empty
 * @return the path with each segment passed through <code>encodeUrl</code>, or
 *         <code>url</code> itself when it is <code>null</code> or empty
 */
public static String encodeUrlPath(String url) {
    if (url == null || url.length() == 0) {
        return url;
    }

    StringBuilder encodedPath = new StringBuilder();
    int segmentStart = 0;
    int separator = url.indexOf('/', segmentStart);

    // encode each segment that is followed by a slash, re-adding the slash
    while (separator != -1) {
        encodedPath.append(encodeUrl(url.substring(segmentStart, separator)));
        encodedPath.append('/');
        segmentStart = separator + 1;
        separator = url.indexOf('/', segmentStart);
    }

    // final segment (possibly empty when the path ends in a slash)
    encodedPath.append(encodeUrl(url.substring(segmentStart)));
    return encodedPath.toString();
}
/**
 * Collects the fragments found inside one section of a document.
 *
 * The section is the text between the first occurrence of <code>sectionStart</code>
 * and the next occurrence of <code>sectionEnd</code>. Within it, each fragment runs
 * from <code>startTag</code> up to and including the following <code>endTag</code>.
 * When <code>startTag</code> is <code>null</code>, fragments are taken back to back:
 * each one begins where the previous one ended.
 *
 * If no (non-empty) <code>endTag</code> follows a fragment start, the remainder of the
 * section is returned as the last fragment (when it is not empty) and scanning stops.
 *
 * @param src          the text to search
 * @param sectionStart marker that opens the section (not included in the section)
 * @param sectionEnd   marker that closes the section (not included in the section)
 * @param startTag     marker that opens a fragment, or <code>null</code>
 * @param endTag       marker that closes a fragment
 * @return the fragments in document order; empty when the section is not found
 */
public static List<String> extractHtmlTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag) {
    List<String> tags = new ArrayList<String>();

    int sectionBegin = src.indexOf(sectionStart);
    if (sectionBegin == -1) {
        return tags;
    }
    sectionBegin += sectionStart.length();

    int sectionFinish = src.indexOf(sectionEnd, sectionBegin);
    if (sectionFinish == -1) {
        return tags;
    }

    String sectionText = src.substring(sectionBegin, sectionFinish);
    int endLen = endTag.length();
    int index = (startTag == null) ? 0 : sectionText.indexOf(startTag);

    while (index != -1) {
        // An empty end tag can never advance the scan, so treat it as "not found".
        int tagEnd = (endLen == 0) ? -1 : sectionText.indexOf(endTag, index);
        if (tagEnd == -1) {
            // Unterminated fragment: keep what is left of the section and stop.
            if (index < sectionText.length()) {
                tags.add(sectionText.substring(index));
            }
            break;
        }

        tagEnd += endLen;
        tags.add(sectionText.substring(index, tagEnd));

        index = (startTag == null) ? tagEnd : sectionText.indexOf(startTag, tagEnd);
    }
    return tags;
}
/**
 * Convenience overload of <code>extractTag(src, findStr, skip)</code> that skips no tokens.
 *
 * @param src     the text to search
 * @param findStr the marker after which the value is read
 * @return whatever the three-argument overload returns for a skip count of 0
 */
public static String extractTag(String src, String findStr) {
    final int tokensToSkip = 0;
    return extractTag(src, findStr, tokensToSkip);
}
/**
 * Convenience overload of <code>extractTag(src, findStr, skip, separator)</code> that
 * uses <code>"&gt;&lt;"</code> as the separator string.
 *
 * @param src     the text to search
 * @param findStr the marker after which the value is read
 * @param skip    number of leading tokens to discard
 * @return whatever the four-argument overload returns
 */
public static String extractTag(String src, String findStr, int skip) {
    final String defaultSeparator = "><";
    return extractTag(src, findStr, skip, defaultSeparator);
}
/**
 * Convenience overload of
 * <code>extractTag(src, findStr, skip, separator, checkDirty)</code> with the dirty
 * check switched on.
 *
 * @param src       the text to search
 * @param findStr   the marker after which the value is read
 * @param skip      number of leading tokens to discard
 * @param separator the separator string handed to the five-argument overload
 * @return whatever the five-argument overload returns
 */
public static String extractTag(String src, String findStr, int skip, String separator) {
    final boolean rejectDirtyValues = true;
    return extractTag(src, findStr, skip, separator, rejectDirtyValues);
}
/**
 * Extracts a token that follows a marker string and HTML-decodes it.
 *
 * The text after the first occurrence of <code>findStr</code> is split with a
 * <code>StringTokenizer</code>, so every character of <code>separator</code> acts as a
 * delimiter on its own. The first <code>skip</code> tokens are discarded and the next
 * one is trimmed, decoded and returned.
 *
 * @param src        the text to search
 * @param findStr    the marker after which tokens are read
 * @param skip       number of leading tokens to discard
 * @param separator  delimiter characters for the tokenizer
 * @param checkDirty when <code>true</code>, a value that still contains markup
 *                   remnants is rejected
 * @return the decoded token, or <code>Movie.UNKNOWN</code> when the marker is not
 *         found, there are not enough tokens, or the value is rejected as dirty
 */
public static String extractTag(String src, String findStr, int skip, String separator, boolean checkDirty) {
    int beginIndex = src.indexOf(findStr);
    if (beginIndex < 0) {
        return Movie.UNKNOWN;
    }

    StringTokenizer st = new StringTokenizer(src.substring(beginIndex + findStr.length()), separator);
    for (int i = 0; i < skip && st.hasMoreTokens(); i++) {
        st.nextToken();
    }
    // Fewer tokens than requested: report "unknown" instead of letting
    // nextToken() throw NoSuchElementException.
    if (!st.hasMoreTokens()) {
        return Movie.UNKNOWN;
    }

    String value = HTMLTools.decodeHtml(st.nextToken().trim());

    // The whole group of tests is guarded by checkDirty; without the
    // parentheses only the first test would be.
    if (checkDirty
            && (value.indexOf("uiv=\"content-ty") != -1
            || value.indexOf("cast") != -1
            || value.indexOf("title") != -1
            || value.indexOf('<') != -1)) {
        return Movie.UNKNOWN;
    }
    return value;
}
/**
 * Returns the trimmed, HTML-decoded text found between two markers.
 *
 * The value starts right after the first occurrence of <code>startStr</code> and
 * ends just before the next occurrence of <code>endStr</code>.
 *
 * @param src      the text to search
 * @param startStr marker that precedes the value
 * @param endStr   marker that follows the value
 * @return the decoded value, or <code>Movie.UNKNOWN</code> when either marker is
 *         missing or an exception occurs while extracting the value
 */
public static String extractTag(String src, String startStr, String endStr) {
    final int markerPos = src.indexOf(startStr);
    if (markerPos >= 0) {
        try {
            final int valueStart = markerPos + startStr.length();
            final int valueEnd = src.indexOf(endStr, valueStart);
            if (valueEnd >= 0) {
                final String rawValue = src.substring(valueStart, valueEnd);
                return HTMLTools.decodeHtml(rawValue.trim());
            }
        } catch (Exception error) {
            // any failure while extracting is reported as "unknown" below
        }
    }
    return Movie.UNKNOWN;
}
public static List extractTags(String src, String sectionStart) {
return extractTags(src, sectionStart, "