All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.moviejukebox.tools.HTMLTools Maven / Gradle / Ivy

There is a newer version: 2.9
Show newest version
/*
 *      Copyright (c) 2004-2012 YAMJ Members
 *      http://code.google.com/p/moviejukebox/people/list
 *
 *      Web: http://code.google.com/p/moviejukebox/
 *
 *      This software is licensed under a Creative Commons License
 *      See this page: http://code.google.com/p/moviejukebox/wiki/License
 *
 *      For any reuse or distribution, you must make clear to others the
 *      license terms of this work.
 */
package com.moviejukebox.tools;

import com.moviejukebox.model.Movie;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;

public class HTMLTools {

    /**
     * Markup-significant characters (ampersand, angle brackets, double quote)
     * mapped to their HTML entity references.
     */
    private static final Map<Character, String> AGGRESSIVE_HTML_ENCODE_MAP = new HashMap<Character, String>();
    /**
     * Non-ASCII characters that have a named entity in the HTML 4.0 spec,
     * mapped to that entity reference.
     */
    private static final Map<Character, String> DEFENSIVE_HTML_ENCODE_MAP = new HashMap<Character, String>();
    /**
     * Reverse lookup (entity reference to character) built from the two encode
     * maps in the static initializer below; consulted by decodeHtml.
     */
    private static final Map<String, Character> HTML_DECODE_MAP = new HashMap<String, Character>();
    private static final Logger logger = Logger.getLogger(HTMLTools.class);

    static {
        /*
         * Html encoding mapping according to the HTML 4.0 spec
         * http://www.w3.org/TR/REC-html40/sgml/entities.html
         *
         * Every value must be the textual entity reference ("&name;"), never the
         * character itself: the decode map is keyed on these strings.
         */

        // Special characters for HTML
        AGGRESSIVE_HTML_ENCODE_MAP.put('\u0026', "&amp;");
        AGGRESSIVE_HTML_ENCODE_MAP.put('\u003C', "&lt;");
        AGGRESSIVE_HTML_ENCODE_MAP.put('\u003E', "&gt;");
        AGGRESSIVE_HTML_ENCODE_MAP.put('\u0022', "&quot;");

        DEFENSIVE_HTML_ENCODE_MAP.put('\u0152', "&OElig;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0153', "&oelig;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0160', "&Scaron;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0161', "&scaron;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0178', "&Yuml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u02C6', "&circ;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u02DC', "&tilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2002', "&ensp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2003', "&emsp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2009', "&thinsp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u200C', "&zwnj;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u200D', "&zwj;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u200E', "&lrm;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u200F', "&rlm;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2013', "&ndash;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2014', "&mdash;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2018', "&lsquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2019', "&rsquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u201A', "&sbquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u201C', "&ldquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u201D', "&rdquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u201E', "&bdquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2020', "&dagger;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2021', "&Dagger;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2030', "&permil;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2039', "&lsaquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u203A', "&rsaquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u20AC', "&euro;");

        // Character entity references for ISO 8859-1 characters
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A0', "&nbsp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A1', "&iexcl;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A2', "&cent;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A3', "&pound;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A4', "&curren;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A5', "&yen;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A6', "&brvbar;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A7', "&sect;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A8', "&uml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00A9', "&copy;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00AA', "&ordf;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00AB', "&laquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00AC', "&not;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00AD', "&shy;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00AE', "&reg;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00AF', "&macr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B0', "&deg;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B1', "&plusmn;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B2', "&sup2;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B3', "&sup3;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B4', "&acute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B5', "&micro;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B6', "&para;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B7', "&middot;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B8', "&cedil;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00B9', "&sup1;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00BA', "&ordm;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00BB', "&raquo;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00BC', "&frac14;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00BD', "&frac12;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00BE', "&frac34;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00BF', "&iquest;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C0', "&Agrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C1', "&Aacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C2', "&Acirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C3', "&Atilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C4', "&Auml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C5', "&Aring;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C6', "&AElig;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C7', "&Ccedil;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C8', "&Egrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00C9', "&Eacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00CA', "&Ecirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00CB', "&Euml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00CC', "&Igrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00CD', "&Iacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00CE', "&Icirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00CF', "&Iuml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D0', "&ETH;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D1', "&Ntilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D2', "&Ograve;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D3', "&Oacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D4', "&Ocirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D5', "&Otilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D6', "&Ouml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D7', "&times;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D8', "&Oslash;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00D9', "&Ugrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00DA', "&Uacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00DB', "&Ucirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00DC', "&Uuml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00DD', "&Yacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00DE', "&THORN;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00DF', "&szlig;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E0', "&agrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E1', "&aacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E2', "&acirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E3', "&atilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E4', "&auml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E5', "&aring;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E6', "&aelig;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E7', "&ccedil;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E8', "&egrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00E9', "&eacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00EA', "&ecirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00EB', "&euml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00EC', "&igrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00ED', "&iacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00EE', "&icirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00EF', "&iuml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F0', "&eth;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F1', "&ntilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F2', "&ograve;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F3', "&oacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F4', "&ocirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F5', "&otilde;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F6', "&ouml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F7', "&divide;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F8', "&oslash;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00F9', "&ugrave;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00FA', "&uacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00FB', "&ucirc;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00FC', "&uuml;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00FD', "&yacute;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00FE', "&thorn;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u00FF', "&yuml;");

        // Mathematical, Greek and Symbolic characters for HTML
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0192', "&fnof;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0391', "&Alpha;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0392', "&Beta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0393', "&Gamma;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0394', "&Delta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0395', "&Epsilon;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0396', "&Zeta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0397', "&Eta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0398', "&Theta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u0399', "&Iota;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u039A', "&Kappa;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u039B', "&Lambda;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u039C', "&Mu;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u039D', "&Nu;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u039E', "&Xi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u039F', "&Omicron;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A0', "&Pi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A1', "&Rho;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A3', "&Sigma;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A4', "&Tau;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A5', "&Upsilon;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A6', "&Phi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A7', "&Chi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A8', "&Psi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03A9', "&Omega;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B1', "&alpha;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B2', "&beta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B3', "&gamma;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B4', "&delta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B5', "&epsilon;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B6', "&zeta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B7', "&eta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B8', "&theta;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03B9', "&iota;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03BA', "&kappa;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03BB', "&lambda;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03BC', "&mu;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03BD', "&nu;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03BE', "&xi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03BF', "&omicron;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C0', "&pi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C1', "&rho;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C2', "&sigmaf;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C3', "&sigma;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C4', "&tau;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C5', "&upsilon;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C6', "&phi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C7', "&chi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C8', "&psi;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03C9', "&omega;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03D1', "&thetasym;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03D2', "&upsih;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u03D6', "&piv;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2022', "&bull;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2026', "&hellip;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2032', "&prime;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2033', "&Prime;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u203E', "&oline;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2044', "&frasl;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2118', "&weierp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2111', "&image;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u211C', "&real;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2122', "&trade;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2135', "&alefsym;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2190', "&larr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2191', "&uarr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2192', "&rarr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2193', "&darr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2194', "&harr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u21B5', "&crarr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u21D0', "&lArr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u21D1', "&uArr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u21D2', "&rArr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u21D3', "&dArr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u21D4', "&hArr;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2200', "&forall;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2202', "&part;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2203', "&exist;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2205', "&empty;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2207', "&nabla;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2208', "&isin;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2209', "&notin;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u220B', "&ni;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u220F', "&prod;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2211', "&sum;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2212', "&minus;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2217', "&lowast;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u221A', "&radic;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u221D', "&prop;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u221E', "&infin;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2220', "&ang;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2227', "&and;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2228', "&or;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2229', "&cap;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u222A', "&cup;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u222B', "&int;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2234', "&there4;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u223C', "&sim;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2245', "&cong;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2248', "&asymp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2260', "&ne;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2261', "&equiv;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2264', "&le;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2265', "&ge;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2282', "&sub;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2283', "&sup;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2284', "&nsub;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2286', "&sube;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2287', "&supe;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2295', "&oplus;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2297', "&otimes;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u22A5', "&perp;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u22C5', "&sdot;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2308', "&lceil;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2309', "&rceil;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u230A', "&lfloor;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u230B', "&rfloor;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2329', "&lang;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u232A', "&rang;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u25CA', "&loz;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2660', "&spades;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2663', "&clubs;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2665', "&hearts;");
        DEFENSIVE_HTML_ENCODE_MAP.put('\u2666', "&diams;");

        // Invert both encode tables so an entity reference can be looked up directly.
        for (Map.Entry<Character, String> entry : AGGRESSIVE_HTML_ENCODE_MAP.entrySet()) {
            HTML_DECODE_MAP.put(entry.getValue(), entry.getKey());
        }

        for (Map.Entry<Character, String> entry : DEFENSIVE_HTML_ENCODE_MAP.entrySet()) {
            HTML_DECODE_MAP.put(entry.getValue(), entry.getKey());
        }
    }

    /**
     * Replaces HTML character references in {@code source} with the characters
     * they denote.
     *
     * Numeric references ({@code &#NNN;} and {@code &#xHHH;}) are parsed directly;
     * any other {@code &...;} span is looked up in {@code HTML_DECODE_MAP}.
     * References that cannot be parsed or are not in the map are copied through
     * unchanged.
     *
     * @param source text that may contain entity references; may be null or empty
     * @return the decoded text, or {@code source} itself when it is null, empty,
     *         or contains no {@code &...;} span
     */
    public static String decodeHtml(String source) {
        if (null == source || 0 == source.length()) {
            return source;
        }

        int currentIndex = 0;
        // Allocated lazily so input without any reference is returned as-is.
        StringBuilder result = null;

        while (currentIndex < source.length()) {
            int delimiterStartIndex = source.indexOf('&', currentIndex);
            if (delimiterStartIndex == -1) {
                break;
            }
            int delimiterEndIndex = source.indexOf(';', delimiterStartIndex + 1);
            if (delimiterEndIndex == -1) {
                break;
            }

            // A bare '&' earlier in the text (as in "a & b &amp; c") must not
            // swallow the real reference: start at the '&' closest to the ';'.
            delimiterStartIndex = source.lastIndexOf('&', delimiterEndIndex - 1);

            if (null == result) {
                result = new StringBuilder(source.length());
            }

            // add the text that leads up to this match
            result.append(source, currentIndex, delimiterStartIndex);

            String entity = source.substring(delimiterStartIndex, delimiterEndIndex + 1);
            currentIndex = delimiterEndIndex + 1;

            // entity is at least "&;", so index 1 always exists
            if (entity.charAt(1) == '#') {
                appendNumericEntity(result, entity);
            } else {
                // Held as Object so the lookup does not depend on the map's declared
                // type arguments; append(Object) yields the mapped character's text.
                Object decoded = HTML_DECODE_MAP.get(entity);
                if (decoded != null) {
                    result.append(decoded);
                } else {
                    // if there was no match, add the entity as-is
                    result.append(entity);
                }
            }
        }

        if (null == result) {
            return source;
        }
        if (currentIndex < source.length()) {
            result.append(source, currentIndex, source.length());
        }
        return result.toString();
    }

    /**
     * Appends the character denoted by a numeric reference, or the reference
     * text itself when its number is malformed or not a valid Unicode code point.
     *
     * @param result buffer to append to
     * @param entity the full reference including the leading "&#" and trailing ";"
     */
    private static void appendNumericEntity(StringBuilder result, String entity) {
        int start = 2;
        int radix = 10;
        // entity is at least "&#;", so index 2 always exists
        char marker = entity.charAt(2);
        if (marker == 'X' || marker == 'x') {
            start++;
            radix = 16;
        }
        try {
            int codePoint = Integer.parseInt(entity.substring(start, entity.length() - 1), radix);
            if (Character.isValidCodePoint(codePoint)) {
                // toChars emits a surrogate pair above U+FFFF instead of truncating to 16 bits
                result.append(Character.toChars(codePoint));
            } else {
                result.append(entity);
            }
        } catch (NumberFormatException error) {
            // when the number of the entity can't be parsed, add the entity as-is
            result.append(entity);
        }
    }

    /**
     * Decodes a percent-encoded string as UTF-8.
     *
     * @param url the encoded text; may be null or empty
     * @return the decoded text, or {@code url} unchanged when it is null, empty,
     *         or cannot be decoded
     */
    public static String decodeUrl(String url) {
        if (url == null || url.length() == 0) {
            return url;
        }
        try {
            return URLDecoder.decode(url, "UTF-8");
        } catch (UnsupportedEncodingException ignored) {
            logger.info("Could not decode URL string: " + url + ", will proceed with undecoded string.");
        } catch (IllegalArgumentException ignored) {
            // URLDecoder rejects malformed escapes such as a trailing "%"; fall back
            // to the raw string like the other failure path instead of propagating.
            logger.info("Could not decode URL string: " + url + ", will proceed with undecoded string.");
        }
        return url;
    }

    /**
     * Percent-encodes a string as UTF-8, writing spaces as {@code %20}.
     *
     * @param url the text to encode; may be null or empty
     * @return the encoded text, or {@code url} unchanged when it is null, empty,
     *         or cannot be encoded
     */
    public static String encodeUrl(String url) {
        if (url == null || url.length() == 0) {
            return url;
        }
        try {
            // URLEncoder produces form encoding, where a space becomes '+'. A literal
            // '+' in the input is already emitted as %2B, so every '+' left in the
            // output stands for a space and can safely be rewritten.
            return URLEncoder.encode(url, "UTF-8").replace("+", "%20");
        } catch (UnsupportedEncodingException ignored) {
            logger.info("Could not encode URL string: " + url + ", will proceed with unencoded string.");
            return url;
        }
    }

    /**
     * Encodes each '/'-separated segment of a path with {@code encodeUrl} while
     * leaving the slashes themselves in place.
     *
     * @param url the path to encode; may be null or empty
     * @return the path with every segment encoded, or {@code url} unchanged when
     *         it is null or empty
     */
    public static String encodeUrlPath(String url) {
        if (url == null || url.length() == 0) {
            return url;
        }

        StringBuilder encoded = new StringBuilder();
        int segmentStart = 0;
        int slash = url.indexOf('/', segmentStart);
        // Walk the segments left to right; empty segments (leading, trailing or
        // doubled slashes) pass through encodeUrl unchanged and so are preserved.
        while (slash != -1) {
            encoded.append(encodeUrl(url.substring(segmentStart, slash)));
            encoded.append('/');
            segmentStart = slash + 1;
            slash = url.indexOf('/', segmentStart);
        }
        encoded.append(encodeUrl(url.substring(segmentStart)));
        return encoded.toString();
    }

    /**
     * Collects raw fragments from one section of {@code src}.
     *
     * The section is the text between the first {@code sectionStart} and the next
     * {@code sectionEnd}. Inside it, each fragment runs from an occurrence of
     * {@code startTag} (or, when {@code startTag} is null, from where the previous
     * fragment ended) through the following {@code endTag}, both included. A final
     * fragment that has no {@code endTag} is returned up to the end of the section.
     *
     * @param src          text to search
     * @param sectionStart marker preceding the section
     * @param sectionEnd   marker terminating the section
     * @param startTag     marker opening each fragment, or null to split on
     *                     {@code endTag} alone
     * @param endTag       marker closing each fragment
     * @return the fragments in document order; empty when the section is not found
     */
    public static List<String> extractHtmlTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag) {
        List<String> tags = new ArrayList<String>();

        int sectionBegin = src.indexOf(sectionStart);
        if (sectionBegin == -1) {
            return tags;
        }
        sectionBegin += sectionStart.length();
        int sectionFinish = src.indexOf(sectionEnd, sectionBegin);
        if (sectionFinish == -1) {
            return tags;
        }

        String sectionText = src.substring(sectionBegin, sectionFinish);
        int lastIndex = sectionText.length();
        int endLen = endTag.length();

        int index = (startTag != null) ? sectionText.indexOf(startTag) : 0;

        // index == lastIndex happens with a null startTag once the section has been
        // fully consumed; there is nothing left to extract at that point.
        while (index != -1 && index < lastIndex) {
            int endIndex = sectionText.indexOf(endTag, index);
            if (endIndex == -1) {
                // Unterminated fragment: keep what is there rather than reading
                // endLen characters past the end of the section.
                tags.add(sectionText.substring(index));
                break;
            }
            endIndex += endLen;
            tags.add(sectionText.substring(index, endIndex));

            if (startTag != null) {
                index = sectionText.indexOf(startTag, endIndex);
            } else {
                index = endIndex;
            }
        }
        return tags;
    }

    /**
     * Extracts the first token following {@code findStr}; shorthand for
     * {@code extractTag(src, findStr, 0)}.
     *
     * @param src     text to search
     * @param findStr marker that precedes the wanted value
     * @return whatever the three-argument overload returns for a skip of zero
     */
    public static String extractTag(String src, String findStr) {
        final int noSkip = 0;
        return extractTag(src, findStr, noSkip);
    }

    /**
     * Extracts a token following {@code findStr}, passing {@code "><"} as the
     * separator to the four-argument overload.
     *
     * @param src     text to search
     * @param findStr marker that precedes the wanted value
     * @param skip    number of tokens to discard before the one returned
     * @return whatever the four-argument overload returns
     */
    public static String extractTag(String src, String findStr, int skip) {
        final String angleBrackets = "><";
        return extractTag(src, findStr, skip, angleBrackets);
    }

    /**
     * Extracts a token following {@code findStr} with the dirty-value check of
     * the five-argument overload switched on.
     *
     * @param src       text to search
     * @param findStr   marker that precedes the wanted value
     * @param skip      number of tokens to discard before the one returned
     * @param separator delimiter characters handed to the tokenizer
     * @return whatever the five-argument overload returns with checkDirty = true
     */
    public static String extractTag(String src, String findStr, int skip, String separator) {
        final boolean checkDirty = true;
        return extractTag(src, findStr, skip, separator, checkDirty);
    }

    /**
     * Extracts the token that follows {@code findStr} in {@code src}.
     *
     * The text after the first occurrence of {@code findStr} is split with a
     * {@link StringTokenizer} on the characters in {@code separator};
     * {@code skip} tokens are discarded and the next one is trimmed and
     * HTML-decoded.
     *
     * @param src        text to search
     * @param findStr    marker that precedes the wanted value
     * @param skip       number of tokens to discard before the one returned
     * @param separator  delimiter characters (each character is a delimiter)
     * @param checkDirty when true, a value containing {@code uiv="content-ty},
     *                   {@code cast}, {@code title} or {@code <} is rejected
     * @return the decoded token, or {@code Movie.UNKNOWN} when the marker is
     *         missing, there are not enough tokens, or the value was rejected
     */
    public static String extractTag(String src, String findStr, int skip, String separator, boolean checkDirty) {
        int beginIndex = src.indexOf(findStr);
        if (beginIndex < 0) {
            return Movie.UNKNOWN;
        }

        StringTokenizer st = new StringTokenizer(src.substring(beginIndex + findStr.length()), separator);
        for (int i = 0; i < skip && st.hasMoreTokens(); i++) {
            st.nextToken();
        }
        // Running out of tokens means the value is simply not there; report it the
        // same way as a missing marker instead of throwing NoSuchElementException.
        if (!st.hasMoreTokens()) {
            return Movie.UNKNOWN;
        }

        String value = HTMLTools.decodeHtml(st.nextToken().trim());

        // The parentheses matter: without them "&&" binds tighter than "||" and
        // checkDirty would guard only the first test.
        if (checkDirty
                && (value.indexOf("uiv=\"content-ty") != -1
                || value.indexOf("cast") != -1
                || value.indexOf("title") != -1
                || value.indexOf('<') != -1)) {
            return Movie.UNKNOWN;
        }

        return value;
    }

    /**
     * Returns the trimmed, HTML-decoded text found between the first
     * {@code startStr} and the next {@code endStr} after it.
     *
     * @param src      text to search
     * @param startStr marker preceding the wanted value
     * @param endStr   marker following the wanted value
     * @return the decoded value, or {@code Movie.UNKNOWN} when either marker is
     *         missing or any exception occurs while extracting
     */
    public static String extractTag(String src, String startStr, String endStr) {
        int markerIndex = src.indexOf(startStr);
        if (markerIndex < 0) {
            return Movie.UNKNOWN;
        }

        try {
            int valueStart = markerIndex + startStr.length();
            int valueEnd = src.indexOf(endStr, valueStart);
            if (valueEnd >= 0) {
                String rawValue = src.substring(valueStart, valueEnd);
                return HTMLTools.decodeHtml(rawValue.trim());
            }
            return Movie.UNKNOWN;
        } catch (Exception error) {
            return Movie.UNKNOWN;
        }
    }

    public static List extractTags(String src, String sectionStart) {
        return extractTags(src, sectionStart, "
"); } public static List extractTags(String src, String sectionStart, String sectionEnd) { return extractTags(src, sectionStart, sectionEnd, null, "|"); } public static List extractTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag) { return extractTags(src, sectionStart, sectionEnd, startTag, endTag, true); } public static List extractTags(String src, String sectionStart, String sectionEnd, String startTag, String endTag, boolean forceCloseTag) { ArrayList tags = new ArrayList(); int startIndex = src.indexOf(sectionStart); if (startIndex == -1) { return tags; } startIndex += sectionStart.length(); int endIndex = src.indexOf(sectionEnd, startIndex); if (endIndex == -1) { return tags; } String sectionText = new String(src.substring(startIndex, endIndex)); int lastIndex = sectionText.length(); startIndex = 0; int startLen = 0; int endLen = endTag.length(); if (startTag != null) { startIndex = sectionText.indexOf(startTag); startLen = startTag.length(); } while (startIndex != -1) { startIndex += startLen; if (forceCloseTag) { int close = sectionText.indexOf('>', startIndex); if (close != -1) { startIndex = close + 1; } } endIndex = sectionText.indexOf(endTag, startIndex); if (endIndex == -1) { endIndex = lastIndex; } String text = new String(sectionText.substring(startIndex, endIndex)); tags.add(HTMLTools.decodeHtml(text.trim())); endIndex += endLen; if (endIndex > lastIndex) { break; } if (startTag != null) { startIndex = sectionText.indexOf(startTag, endIndex); } else { startIndex = endIndex; } } return tags; } public static String getTextAfterElem(String src, String findStr) { return getTextAfterElem(src, findStr, 0); } public static String getTextAfterElem(String src, String findStr, int skip) { return getTextAfterElem(src, findStr, skip, 0); } /** * Example: src = "
my text
findStr = "specialID" result = "my text" * * @param src html text * @param findStr string to find in src * @param skip count of found texts to skip * @param fromIndex begin index in src * @return string from html text which is plain text without html tags */ public static String getTextAfterElem(String src, String findStr, int skip, int fromIndex) { int beginIndex = src.indexOf(findStr, fromIndex); if (beginIndex == -1) { return Movie.UNKNOWN; } StringTokenizer st = new StringTokenizer(new String(src.substring(beginIndex + findStr.length())), "<"); int i = 0; while (st.hasMoreElements()) { String elem = st.nextToken().replaceAll(" | ", "").trim(); if (elem.length() != 0 && !elem.endsWith(">") && i++ >= skip) { String[] elems = elem.split(">"); if (elems.length > 1) { return HTMLTools.decodeHtml(elems[1].trim()); } else { return HTMLTools.decodeHtml(elems[0].trim()); } } } return Movie.UNKNOWN; } public static String removeHtmlTags(String src) { return src.replaceAll("\\<.*?>", ""); } public static String stripTags(String s) { Pattern stripTagsRegex = Pattern.compile("([^\\<]*)(?:\\<[^\\>]*\\>)?"); Matcher m = stripTagsRegex.matcher(s); StringBuilder res = new StringBuilder(); while (m.find()) { res.append(m.group(1)); } // Replace escaped spaces String finalRes = res.toString().replaceAll(" ", " "); return finalRes.trim(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy