All downloads are free. Search and download functionality uses the official Maven repository.

jodd.util.HtmlDecoder Maven / Gradle / Ivy

// Copyright (c) 2003-present, Jodd Team (http://jodd.org)
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package jodd.util;

import java.io.Closeable;
import java.io.Flushable;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

/**
 * HTML decoder.
 * <p>
 * Replaces named ({@code &amp;amp;}) and numeric ({@code &amp;#38;}, {@code &amp;#x26;})
 * character references with the characters they stand for. The table of named
 * references is loaded once, in the static initializer, from the
 * {@code HtmlDecoder.properties} resource located next to this class. Each
 * property maps an entity name to one or two comma-separated hexadecimal
 * code points.
 */
public class HtmlDecoder {

    /** Entity name (without {@code &} and {@code ;}) mapped to its replacement characters. */
    private static final Map<String, char[]> ENTITY_MAP;

    /** All entity names, sorted lexicographically; searched by {@link #detectName(char[], int)}. */
    private static final char[][] ENTITY_NAMES;

    static {
        Properties entityReferences = new Properties();

        String propertiesName = HtmlDecoder.class.getSimpleName() + ".properties";

        InputStream is = HtmlDecoder.class.getResourceAsStream(propertiesName);
        if (is == null) {
            // fail with a readable message instead of a wrapped NullPointerException
            throw new IllegalStateException("Entity table resource not found: " + propertiesName);
        }

        try {
            entityReferences.load(is);
        } catch (Exception ex) {
            throw new IllegalStateException(ex);
        } finally {
            close(is);
        }

        ENTITY_MAP = new HashMap<>(entityReferences.size());

        Enumeration<?> keys = entityReferences.propertyNames();
        while (keys.hasMoreElements()) {
            String name = (String) keys.nextElement();
            String values = entityReferences.getProperty(name);
            String[] array = StringUtil.splitc(values, ',');

            // the first value is always used; the second one only when exactly two are given
            StringBuilder chars = new StringBuilder(4);
            appendCodePoint(chars, Integer.parseInt(array[0], 16));
            if (array.length == 2) {
                appendCodePoint(chars, Integer.parseInt(array[1], 16));
            }

            ENTITY_MAP.put(name, chars.toString().toCharArray());
        }

        // create sorted list of entry names

        ENTITY_NAMES = new char[ENTITY_MAP.size()][];

        int i = 0;
        for (String name : ENTITY_MAP.keySet()) {
            ENTITY_NAMES[i++] = name.toCharArray();
        }

        // detectName() narrows the candidate range char by char, so the names
        // must be ordered exactly as String.compareTo orders them
        Arrays.sort(ENTITY_NAMES, new Comparator<char[]>() {
            @Override
            public int compare(char[] o1, char[] o2) {
                return new String(o1).compareTo(new String(o2));
            }
        });
    }

    /**
     * Appends a code point to the builder. Valid code points above U+FFFF are
     * written as a surrogate pair; a value that is not a valid code point is
     * narrowed to a single {@code char}.
     */
    private static void appendCodePoint(StringBuilder out, int codePoint) {
        if (Character.isValidCodePoint(codePoint)) {
            out.appendCodePoint(codePoint);
        } else {
            out.append((char) codePoint);
        }
    }

    /**
     * Decodes HTML text. Assumes that all character references are properly closed with semi-colon.
     * <p>
     * Named references that are not in the entity table, and numeric references
     * that are empty, not a number, or not a valid Unicode code point, are left
     * in the output unchanged. When no semi-colon follows an ampersand, the rest
     * of the input is copied as is.
     *
     * @param html text to decode, must not be {@code null}
     * @return decoded text; the very same instance when the input contains no {@code '&'}
     */
    public static String decode(String html) {

        int ndx = html.indexOf('&');
        if (ndx == -1) {
            return html;
        }

        int len = html.length();
        StringBuilder result = new StringBuilder(len);

        int lastIndex = 0;
        while (ndx != -1) {
            // literal text between the previous reference and this ampersand
            result.append(html, lastIndex, ndx);

            int semicolon = html.indexOf(';', ndx);
            if (semicolon == -1) {
                // nothing ahead can be a closed reference
                lastIndex = ndx;
                break;
            }

            boolean decoded;

            // ndx + 1 <= semicolon, so this index is always in range
            if (html.charAt(ndx + 1) == '#') {
                // decimal/hex
                int codePoint = parseNumericReference(html, ndx, semicolon);
                decoded = codePoint >= 0;
                if (decoded) {
                    result.appendCodePoint(codePoint);
                }
            } else {
                // token
                char[] replacement = ENTITY_MAP.get(html.substring(ndx + 1, semicolon));
                decoded = replacement != null;
                if (decoded) {
                    result.append(replacement);
                }
            }

            if (decoded) {
                lastIndex = semicolon + 1;
            } else {
                // not a reference: keep the ampersand and rescan right after it
                result.append('&');
                lastIndex = ndx + 1;
            }

            ndx = html.indexOf('&', lastIndex);
        }
        result.append(html, lastIndex, len);
        return result.toString();
    }

    /**
     * Parses the number of a numeric character reference.
     *
     * @param html      text being decoded
     * @param ampersand index of the {@code '&'} that is followed by {@code '#'}
     * @param semicolon index of the closing {@code ';'}
     * @return the code point, or {@code -1} when the digits are missing,
     *         malformed, or do not denote a valid Unicode code point
     */
    private static int parseNumericReference(String html, int ampersand, int semicolon) {
        int start = ampersand + 2;    // skip "&#"
        int radix = 10;

        if (start < semicolon) {
            char c = html.charAt(start);
            if ((c == 'x') || (c == 'X')) {
                radix = 16;
                start++;
            }
        }

        if (start >= semicolon) {
            return -1;
        }

        try {
            int codePoint = Integer.parseInt(html.substring(start, semicolon), radix);
            return Character.isValidCodePoint(codePoint) ? codePoint : -1;
        } catch (NumberFormatException ignored) {
            // not a number, so the caller emits the text literally
            return -1;
        }
    }

    /**
     * Mutable search state shared between {@link #detectName(char[], int)} and
     * its anonymous comparator.
     */
    private static final class Ptr {
        /** Position inside the entity name that is currently being matched. */
        public int offset;
        /** Input character to match at {@link #offset}. */
        public char c;
    }

    /**
     * Detects the longest character reference name on given position in char array.
     * <p>
     * The input is consumed one character at a time and, for each character,
     * the range of candidate names in the sorted name table is narrowed using
     * {@code BinarySearchBase.findFirst}/{@code findLast} (defined elsewhere;
     * presumably they return the first/last index in the given range for which
     * {@code compare} yields 0, and a negative value when there is none).
     *
     * @param input characters to inspect
     * @param ndx   index in {@code input} of the first character of the name,
     *              i.e. the one right after {@code '&'}
     * @return the longest entity name found at {@code ndx}, or {@code null}
     *         when none matches or {@code ndx} is at or past the end of the input
     */
    public static String detectName(final char[] input, int ndx) {
        final int len = input.length;
        if (ndx >= len) {
            return null;
        }

        final Ptr ptr = new Ptr();

        int firstIndex = 0;
        int lastIndex = ENTITY_NAMES.length - 1;
        char[] lastName = null;

        BinarySearchBase binarySearch = new BinarySearchBase() {
            @Override
            protected int compare(int index) {
                char[] name = ENTITY_NAMES[index];

                if (ptr.offset >= name.length) {
                    // name is shorter than the prefix matched so far: it sorts before
                    return -1;
                }

                return name[ptr.offset] - ptr.c;
            }
        };

        while (true) {
            ptr.c = input[ndx];

            if (!isAlphaOrDigit(ptr.c)) {
                return nameOrNull(lastName);
            }

            firstIndex = binarySearch.findFirst(firstIndex, lastIndex);
            if (firstIndex < 0) {
                return nameOrNull(lastName);
            }

            char[] element = ENTITY_NAMES[firstIndex];

            if (element.length == ptr.offset + 1) {
                // total match, remember position, continue for finding the longer name
                lastName = element;
            }

            lastIndex = binarySearch.findLast(firstIndex, lastIndex);

            if (firstIndex == lastIndex) {
                // only one element found, check the rest
                for (int i = ptr.offset; i < element.length; i++) {
                    // the input may end before the candidate name does
                    if (ndx == len || element[i] != input[ndx]) {
                        return nameOrNull(lastName);
                    }
                    ndx++;
                }
                return new String(element);
            }

            ptr.offset++;

            ndx++;
            if (ndx == len) {
                return nameOrNull(lastName);
            }
        }
    }

    /**
     * Converts the remembered match to a string, or returns {@code null} when nothing matched.
     */
    private static String nameOrNull(char[] name) {
        return name != null ? new String(name) : null;
    }

    /**
     * Returns replacement chars for given character reference.
     * <p>
     * The returned array is the instance stored in the internal table, not a
     * copy, so callers must not modify it.
     *
     * @param name entity name without the leading {@code '&'} and trailing {@code ';'}
     * @return replacement characters, or {@code null} for an unknown name
     */
    public static char[] lookup(String name) {
        return ENTITY_MAP.get(name);
    }

    /**
     * Returns {@code true} for ASCII letters and ASCII digits.
     */
    public static boolean isAlphaOrDigit(char c) {
        return isDigit(c) || isAlpha(c);
    }

    /**
     * Returns {@code true} for ASCII letters, ASCII digits and the underscore.
     */
    public static boolean isWordChar(char c) {
        return isDigit(c) || isAlpha(c) || (c == '_');
    }

    /**
     * Returns {@code true} for ASCII letters, ASCII digits and any of
     * {@code '_'}, {@code '.'}, {@code '['}, {@code ']'}.
     */
    public static boolean isPropertyNameChar(char c) {
        return isDigit(c) || isAlpha(c) || (c == '_') || (c == '.') || (c == '[') || (c == ']');
    }

    // ---------------------------------------------------------------- RFC

    /**
     * Indicates whether the given character is in the {@code ALPHA} set.
     *
     * @see "RFC 3986, appendix A"
     */
    public static boolean isAlpha(char c) {
        return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
    }

    /**
     * Indicates whether the given character is in the {@code DIGIT} set.
     *
     * @see "RFC 3986, appendix A"
     */
    public static boolean isDigit(char c) {
        return c >= '0' && c <= '9';
    }

    // ---------------------------------------------------------------- silent close

    /**
     * Closes silently the closable object. If it is Flushable, it
     * will be flushed first. No exception will be thrown if an I/O error occurs.
     *
     * @param closeable object to close; {@code null} is ignored
     */
    public static void close(Closeable closeable) {
        if (closeable != null) {
            if (closeable instanceof Flushable) {
                try {
                    ((Flushable) closeable).flush();
                } catch (IOException ignored) {
                    // best-effort flush: still attempt to close below
                }
            }

            try {
                closeable.close();
            } catch (IOException ignored) {
                // silent close by contract
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy