All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.w3c.tidy.TidyUtils Maven / Gradle / Ivy

Go to download

JTidy is a Java port of HTML Tidy, a HTML syntax checker and pretty printer. Like its non-Java cousin, JTidy can be used as a tool for cleaning up malformed and faulty HTML. In addition, JTidy provides a DOM interface to the document that is being processed, which effectively makes you able to use JTidy as a DOM parser for real-world HTML.

There is a newer version: 1.0.5
Show newest version
/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 *
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights.
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 *
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */

package org.w3c.tidy;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;


/**
 * Utility class with handy methods, mainly for String handling or for reproducing c behaviours.
 * @author Fabrizio Giustina
 * @version $Revision $ ($Author $)
 */
public final class TidyUtils
{

    /**
     * char type: digit.
     */
    private static final short DIGIT = 1;

    /**
     * char type: letter.
     */
    private static final short LETTER = 2;

    /**
     * char type: namechar.
     */
    private static final short NAMECHAR = 4;

    /**
     * char type: whitespace.
     */
    private static final short WHITE = 8;

    /**
     * char type: newline.
     */
    private static final short NEWLINE = 16;

    /**
     * char type: lowercase.
     */
    private static final short LOWERCASE = 32;

    /**
     * char type: uppercase.
     */
    private static final short UPPERCASE = 64;

    /**
     * used to classify chars for lexical purposes.
     */
    private static short[] lexmap = new short[128];

    static
    {
        mapStr("\r\n\f", (short) (NEWLINE | WHITE));
        mapStr(" \t", WHITE);
        mapStr("-.:_", NAMECHAR);
        mapStr("0123456789", (short) (DIGIT | NAMECHAR));
        mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE | LETTER | NAMECHAR));
        mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE | LETTER | NAMECHAR));
    }

    /**
     * utility class, don't instantiate.
     */
    private TidyUtils()
    {
        // unused
    }

    /**
     * Converts a int to a boolean.
     * @param value int value
     * @return true if value is != 0
     */
    static boolean toBoolean(int value)
    {
        return value != 0;
    }

    /**
     * convert an int to unsigned (& 0xFF).
     * @param c signed int
     * @return unsigned int
     */
    static int toUnsigned(int c)
    {
        return c & 0xFF;
    }

    /**
     * check if the first String contains the second one.
     * @param s1 full String
     * @param len1 maximum position in String
     * @param s2 String to search for
     * @return true if s1 contains s2 in the range 0-len1
     */
    static boolean wsubstrn(String s1, int len1, String s2)
    {
        int searchIndex = s1.indexOf(s2);
        return searchIndex > -1 && searchIndex <= len1;
    }

    /**
     * check if the first String contains the second one (ignore case).
     * @param s1 full String
     * @param len1 maximum position in String
     * @param s2 String to search for
     * @return true if s1 contains s2 in the range 0-len1
     */
    static boolean wsubstrncase(String s1, int len1, String s2)
    {
        return wsubstrn(s1.toLowerCase(), len1, s2.toLowerCase());
    }

    /**
     * return offset of cc from beginning of s1, -1 if not found.
     * @param s1 String
     * @param len1 maximum offset (values > than lenl are ignored and returned as -1)
     * @param cc character to search for
     * @return index of cc in s1
     */
    static int wstrnchr(String s1, int len1, char cc)
    {
        int indexOf = s1.indexOf(cc);
        if (indexOf < len1)
        {
            return indexOf;
        }

        return -1;
    }

    /**
     * Same as wsubstrn, but without a specified length.
     * @param s1 full String
     * @param s2 String to search for
     * @return true if s2 is found in s2 (case insensitive search)
     */
    static boolean wsubstr(String s1, String s2)
    {
        int i;
        int len1 = s1.length();
        int len2 = s2.length();

        for (i = 0; i <= len1 - len2; ++i)
        {
            if (s2.equalsIgnoreCase(s1.substring(i)))
            {
                return true;
            }
        }

        return false;
    }

    /**
     * Is the character a hex digit?
     * @param c char
     * @return true if he given character is a hex digit
     */
    static boolean isxdigit(char c)
    {
        return Character.isDigit(c) || (Character.toLowerCase(c) >= 'a' && Character.toLowerCase(c) <= 'f');
    }

    /**
     * Check if the string valueToCheck is contained in validValues array (case insesitie comparison).
     * @param validValues array of valid values
     * @param valueToCheck value to search for
     * @return true if valueToCheck is found in validValues
     */
    static boolean isInValuesIgnoreCase(String[] validValues, String valueToCheck)
    {
        int len = validValues.length;
        for (String validValue : validValues)
        {
            if (validValue.equalsIgnoreCase(valueToCheck))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Return true if substring s is in p and isn't all in upper case. This is used to check the case of SYSTEM, PUBLIC,
     * DTD and EN.
     * @param s substring
     * @param p full string
     * @param len how many chars to check in p
     * @return true if substring s is in p and isn't all in upper case
     */
    public static boolean findBadSubString(String s, String p, int len)
    {
        int n = s.length();
        int i = 0;
        String ps;

        while (n < len)
        {
            ps = p.substring(i, i + n);
            if (s.equalsIgnoreCase(ps))
            {
                return (!s.equals(ps));
            }

            ++i;
            --len;
        }

        return false;
    }

    /**
     * Is the given char a valid xml letter?
     * @param c char
     * @return true if the char is a valid xml letter
     */
    static boolean isXMLLetter(char c)
    {
        return ((c >= 0x41 && c <= 0x5a)
            || (c >= 0x61 && c <= 0x7a)
            || (c >= 0xc0 && c <= 0xd6)
            || (c >= 0xd8 && c <= 0xf6)
            || (c >= 0xf8 && c <= 0xff)
            || (c >= 0x100 && c <= 0x131)
            || (c >= 0x134 && c <= 0x13e)
            || (c >= 0x141 && c <= 0x148)
            || (c >= 0x14a && c <= 0x17e)
            || (c >= 0x180 && c <= 0x1c3)
            || (c >= 0x1cd && c <= 0x1f0)
            || (c >= 0x1f4 && c <= 0x1f5)
            || (c >= 0x1fa && c <= 0x217)
            || (c >= 0x250 && c <= 0x2a8)
            || (c >= 0x2bb && c <= 0x2c1)
            || c == 0x386
            || (c >= 0x388 && c <= 0x38a)
            || c == 0x38c
            || (c >= 0x38e && c <= 0x3a1)
            || (c >= 0x3a3 && c <= 0x3ce)
            || (c >= 0x3d0 && c <= 0x3d6)
            || c == 0x3da
            || c == 0x3dc
            || c == 0x3de
            || c == 0x3e0
            || (c >= 0x3e2 && c <= 0x3f3)
            || (c >= 0x401 && c <= 0x40c)
            || (c >= 0x40e && c <= 0x44f)
            || (c >= 0x451 && c <= 0x45c)
            || (c >= 0x45e && c <= 0x481)
            || (c >= 0x490 && c <= 0x4c4)
            || (c >= 0x4c7 && c <= 0x4c8)
            || (c >= 0x4cb && c <= 0x4cc)
            || (c >= 0x4d0 && c <= 0x4eb)
            || (c >= 0x4ee && c <= 0x4f5)
            || (c >= 0x4f8 && c <= 0x4f9)
            || (c >= 0x531 && c <= 0x556)
            || c == 0x559
            || (c >= 0x561 && c <= 0x586)
            || (c >= 0x5d0 && c <= 0x5ea)
            || (c >= 0x5f0 && c <= 0x5f2)
            || (c >= 0x621 && c <= 0x63a)
            || (c >= 0x641 && c <= 0x64a)
            || (c >= 0x671 && c <= 0x6b7)
            || (c >= 0x6ba && c <= 0x6be)
            || (c >= 0x6c0 && c <= 0x6ce)
            || (c >= 0x6d0 && c <= 0x6d3)
            || c == 0x6d5
            || (c >= 0x6e5 && c <= 0x6e6)
            || (c >= 0x905 && c <= 0x939)
            || c == 0x93d
            || (c >= 0x958 && c <= 0x961)
            || (c >= 0x985 && c <= 0x98c)
            || (c >= 0x98f && c <= 0x990)
            || (c >= 0x993 && c <= 0x9a8)
            || (c >= 0x9aa && c <= 0x9b0)
            || c == 0x9b2
            || (c >= 0x9b6 && c <= 0x9b9)
            || (c >= 0x9dc && c <= 0x9dd)
            || (c >= 0x9df && c <= 0x9e1)
            || (c >= 0x9f0 && c <= 0x9f1)
            || (c >= 0xa05 && c <= 0xa0a)
            || (c >= 0xa0f && c <= 0xa10)
            || (c >= 0xa13 && c <= 0xa28)
            || (c >= 0xa2a && c <= 0xa30)
            || (c >= 0xa32 && c <= 0xa33)
            || (c >= 0xa35 && c <= 0xa36)
            || (c >= 0xa38 && c <= 0xa39)
            || (c >= 0xa59 && c <= 0xa5c)
            || c == 0xa5e
            || (c >= 0xa72 && c <= 0xa74)
            || (c >= 0xa85 && c <= 0xa8b)
            || c == 0xa8d
            || (c >= 0xa8f && c <= 0xa91)
            || (c >= 0xa93 && c <= 0xaa8)
            || (c >= 0xaaa && c <= 0xab0)
            || (c >= 0xab2 && c <= 0xab3)
            || (c >= 0xab5 && c <= 0xab9)
            || c == 0xabd
            || c == 0xae0
            || (c >= 0xb05 && c <= 0xb0c)
            || (c >= 0xb0f && c <= 0xb10)
            || (c >= 0xb13 && c <= 0xb28)
            || (c >= 0xb2a && c <= 0xb30)
            || (c >= 0xb32 && c <= 0xb33)
            || (c >= 0xb36 && c <= 0xb39)
            || c == 0xb3d
            || (c >= 0xb5c && c <= 0xb5d)
            || (c >= 0xb5f && c <= 0xb61)
            || (c >= 0xb85 && c <= 0xb8a)
            || (c >= 0xb8e && c <= 0xb90)
            || (c >= 0xb92 && c <= 0xb95)
            || (c >= 0xb99 && c <= 0xb9a)
            || c == 0xb9c
            || (c >= 0xb9e && c <= 0xb9f)
            || (c >= 0xba3 && c <= 0xba4)
            || (c >= 0xba8 && c <= 0xbaa)
            || (c >= 0xbae && c <= 0xbb5)
            || (c >= 0xbb7 && c <= 0xbb9)
            || (c >= 0xc05 && c <= 0xc0c)
            || (c >= 0xc0e && c <= 0xc10)
            || (c >= 0xc12 && c <= 0xc28)
            || (c >= 0xc2a && c <= 0xc33)
            || (c >= 0xc35 && c <= 0xc39)
            || (c >= 0xc60 && c <= 0xc61)
            || (c >= 0xc85 && c <= 0xc8c)
            || (c >= 0xc8e && c <= 0xc90)
            || (c >= 0xc92 && c <= 0xca8)
            || (c >= 0xcaa && c <= 0xcb3)
            || (c >= 0xcb5 && c <= 0xcb9)
            || c == 0xcde
            || (c >= 0xce0 && c <= 0xce1)
            || (c >= 0xd05 && c <= 0xd0c)
            || (c >= 0xd0e && c <= 0xd10)
            || (c >= 0xd12 && c <= 0xd28)
            || (c >= 0xd2a && c <= 0xd39)
            || (c >= 0xd60 && c <= 0xd61)
            || (c >= 0xe01 && c <= 0xe2e)
            || c == 0xe30
            || (c >= 0xe32 && c <= 0xe33)
            || (c >= 0xe40 && c <= 0xe45)
            || (c >= 0xe81 && c <= 0xe82)
            || c == 0xe84
            || (c >= 0xe87 && c <= 0xe88)
            || c == 0xe8a
            || c == 0xe8d
            || (c >= 0xe94 && c <= 0xe97)
            || (c >= 0xe99 && c <= 0xe9f)
            || (c >= 0xea1 && c <= 0xea3)
            || c == 0xea5
            || c == 0xea7
            || (c >= 0xeaa && c <= 0xeab)
            || (c >= 0xead && c <= 0xeae)
            || c == 0xeb0
            || (c >= 0xeb2 && c <= 0xeb3)
            || c == 0xebd
            || (c >= 0xec0 && c <= 0xec4)
            || (c >= 0xf40 && c <= 0xf47)
            || (c >= 0xf49 && c <= 0xf69)
            || (c >= 0x10a0 && c <= 0x10c5)
            || (c >= 0x10d0 && c <= 0x10f6)
            || c == 0x1100
            || (c >= 0x1102 && c <= 0x1103)
            || (c >= 0x1105 && c <= 0x1107)
            || c == 0x1109
            || (c >= 0x110b && c <= 0x110c)
            || (c >= 0x110e && c <= 0x1112)
            || c == 0x113c
            || c == 0x113e
            || c == 0x1140
            || c == 0x114c
            || c == 0x114e
            || c == 0x1150
            || (c >= 0x1154 && c <= 0x1155)
            || c == 0x1159
            || (c >= 0x115f && c <= 0x1161)
            || c == 0x1163
            || c == 0x1165
            || c == 0x1167
            || c == 0x1169
            || (c >= 0x116d && c <= 0x116e)
            || (c >= 0x1172 && c <= 0x1173)
            || c == 0x1175
            || c == 0x119e
            || c == 0x11a8
            || c == 0x11ab
            || (c >= 0x11ae && c <= 0x11af)
            || (c >= 0x11b7 && c <= 0x11b8)
            || c == 0x11ba
            || (c >= 0x11bc && c <= 0x11c2)
            || c == 0x11eb
            || c == 0x11f0
            || c == 0x11f9
            || (c >= 0x1e00 && c <= 0x1e9b)
            || (c >= 0x1ea0 && c <= 0x1ef9)
            || (c >= 0x1f00 && c <= 0x1f15)
            || (c >= 0x1f18 && c <= 0x1f1d)
            || (c >= 0x1f20 && c <= 0x1f45)
            || (c >= 0x1f48 && c <= 0x1f4d)
            || (c >= 0x1f50 && c <= 0x1f57)
            || c == 0x1f59
            || c == 0x1f5b
            || c == 0x1f5d
            || (c >= 0x1f5f && c <= 0x1f7d)
            || (c >= 0x1f80 && c <= 0x1fb4)
            || (c >= 0x1fb6 && c <= 0x1fbc)
            || c == 0x1fbe
            || (c >= 0x1fc2 && c <= 0x1fc4)
            || (c >= 0x1fc6 && c <= 0x1fcc)
            || (c >= 0x1fd0 && c <= 0x1fd3)
            || (c >= 0x1fd6 && c <= 0x1fdb)
            || (c >= 0x1fe0 && c <= 0x1fec)
            || (c >= 0x1ff2 && c <= 0x1ff4)
            || (c >= 0x1ff6 && c <= 0x1ffc)
            || c == 0x2126
            || (c >= 0x212a && c <= 0x212b)
            || c == 0x212e
            || (c >= 0x2180 && c <= 0x2182)
            || (c >= 0x3041 && c <= 0x3094)
            || (c >= 0x30a1 && c <= 0x30fa)
            || (c >= 0x3105 && c <= 0x312c)
            || (c >= 0xac00 && c <= 0xd7a3)
            || (c >= 0x4e00 && c <= 0x9fa5)
            || c == 0x3007
            || (c >= 0x3021 && c <= 0x3029)
            || (c >= 0x4e00 && c <= 0x9fa5)
            || c == 0x3007 || (c >= 0x3021 && c <= 0x3029));
    }

    /**
     * Is the given char valid in xml name?
     * @param c char
     * @return true if the char is a valid xml name char
     */
    static boolean isXMLNamechar(char c)
    {
        return (isXMLLetter(c)
            || c == '.'
            || c == '_'
            || c == ':'
            || c == '-'
            || (c >= 0x300 && c <= 0x345)
            || (c >= 0x360 && c <= 0x361)
            || (c >= 0x483 && c <= 0x486)
            || (c >= 0x591 && c <= 0x5a1)
            || (c >= 0x5a3 && c <= 0x5b9)
            || (c >= 0x5bb && c <= 0x5bd)
            || c == 0x5bf
            || (c >= 0x5c1 && c <= 0x5c2)
            || c == 0x5c4
            || (c >= 0x64b && c <= 0x652)
            || c == 0x670
            || (c >= 0x6d6 && c <= 0x6dc)
            || (c >= 0x6dd && c <= 0x6df)
            || (c >= 0x6e0 && c <= 0x6e4)
            || (c >= 0x6e7 && c <= 0x6e8)
            || (c >= 0x6ea && c <= 0x6ed)
            || (c >= 0x901 && c <= 0x903)
            || c == 0x93c
            || (c >= 0x93e && c <= 0x94c)
            || c == 0x94d
            || (c >= 0x951 && c <= 0x954)
            || (c >= 0x962 && c <= 0x963)
            || (c >= 0x981 && c <= 0x983)
            || c == 0x9bc
            || c == 0x9be
            || c == 0x9bf
            || (c >= 0x9c0 && c <= 0x9c4)
            || (c >= 0x9c7 && c <= 0x9c8)
            || (c >= 0x9cb && c <= 0x9cd)
            || c == 0x9d7
            || (c >= 0x9e2 && c <= 0x9e3)
            || c == 0xa02
            || c == 0xa3c
            || c == 0xa3e
            || c == 0xa3f
            || (c >= 0xa40 && c <= 0xa42)
            || (c >= 0xa47 && c <= 0xa48)
            || (c >= 0xa4b && c <= 0xa4d)
            || (c >= 0xa70 && c <= 0xa71)
            || (c >= 0xa81 && c <= 0xa83)
            || c == 0xabc
            || (c >= 0xabe && c <= 0xac5)
            || (c >= 0xac7 && c <= 0xac9)
            || (c >= 0xacb && c <= 0xacd)
            || (c >= 0xb01 && c <= 0xb03)
            || c == 0xb3c
            || (c >= 0xb3e && c <= 0xb43)
            || (c >= 0xb47 && c <= 0xb48)
            || (c >= 0xb4b && c <= 0xb4d)
            || (c >= 0xb56 && c <= 0xb57)
            || (c >= 0xb82 && c <= 0xb83)
            || (c >= 0xbbe && c <= 0xbc2)
            || (c >= 0xbc6 && c <= 0xbc8)
            || (c >= 0xbca && c <= 0xbcd)
            || c == 0xbd7
            || (c >= 0xc01 && c <= 0xc03)
            || (c >= 0xc3e && c <= 0xc44)
            || (c >= 0xc46 && c <= 0xc48)
            || (c >= 0xc4a && c <= 0xc4d)
            || (c >= 0xc55 && c <= 0xc56)
            || (c >= 0xc82 && c <= 0xc83)
            || (c >= 0xcbe && c <= 0xcc4)
            || (c >= 0xcc6 && c <= 0xcc8)
            || (c >= 0xcca && c <= 0xccd)
            || (c >= 0xcd5 && c <= 0xcd6)
            || (c >= 0xd02 && c <= 0xd03)
            || (c >= 0xd3e && c <= 0xd43)
            || (c >= 0xd46 && c <= 0xd48)
            || (c >= 0xd4a && c <= 0xd4d)
            || c == 0xd57
            || c == 0xe31
            || (c >= 0xe34 && c <= 0xe3a)
            || (c >= 0xe47 && c <= 0xe4e)
            || c == 0xeb1
            || (c >= 0xeb4 && c <= 0xeb9)
            || (c >= 0xebb && c <= 0xebc)
            || (c >= 0xec8 && c <= 0xecd)
            || (c >= 0xf18 && c <= 0xf19)
            || c == 0xf35
            || c == 0xf37
            || c == 0xf39
            || c == 0xf3e
            || c == 0xf3f
            || (c >= 0xf71 && c <= 0xf84)
            || (c >= 0xf86 && c <= 0xf8b)
            || (c >= 0xf90 && c <= 0xf95)
            || c == 0xf97
            || (c >= 0xf99 && c <= 0xfad)
            || (c >= 0xfb1 && c <= 0xfb7)
            || c == 0xfb9
            || (c >= 0x20d0 && c <= 0x20dc)
            || c == 0x20e1
            || (c >= 0x302a && c <= 0x302f)
            || c == 0x3099
            || c == 0x309a
            || (c >= 0x30 && c <= 0x39)
            || (c >= 0x660 && c <= 0x669)
            || (c >= 0x6f0 && c <= 0x6f9)
            || (c >= 0x966 && c <= 0x96f)
            || (c >= 0x9e6 && c <= 0x9ef)
            || (c >= 0xa66 && c <= 0xa6f)
            || (c >= 0xae6 && c <= 0xaef)
            || (c >= 0xb66 && c <= 0xb6f)
            || (c >= 0xbe7 && c <= 0xbef)
            || (c >= 0xc66 && c <= 0xc6f)
            || (c >= 0xce6 && c <= 0xcef)
            || (c >= 0xd66 && c <= 0xd6f)
            || (c >= 0xe50 && c <= 0xe59)
            || (c >= 0xed0 && c <= 0xed9)
            || (c >= 0xf20 && c <= 0xf29)
            || c == 0xb7
            || c == 0x2d0
            || c == 0x2d1
            || c == 0x387
            || c == 0x640
            || c == 0xe46
            || c == 0xec6
            || c == 0x3005
            || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309d && c <= 0x309e) || (c >= 0x30fc && c <= 0x30fe));
    }

    /**
     * Is the given character a single or double quote?
     * @param c char
     * @return true if c is " or '
     */
    static boolean isQuote(int c)
    {
        return (c == '\'' || c == '\"');
    }

    /**
     * Should always be able convert to/from UTF-8, so encoding exceptions are converted to an Error to avoid adding
     * throws declarations in lots of methods.
     * @param str String
     * @return utf8 bytes
     * @see String#getBytes()
     */
    public static byte[] getBytes(String str)
    {
        return str.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Should always be able convert to/from UTF-8, so encoding exceptions are converted to an Error to avoid adding
     * throws declarations in lots of methods.
     * @param bytes byte array
     * @param offset starting offset in byte array
     * @param length length in byte array starting from offset
     * @return same as new String(bytes, offset, length, "UTF8")
     */
    public static String getString(final byte[] bytes, final int offset, final int length) {
        return length == 0 ? null : new String(bytes, offset, Math.min(length, bytes.length - offset),
            StandardCharsets.UTF_8);
    }

    /**
     * Return the last char in string. This is useful when trailing quotemark is missing on an attribute
     * @param str String
     * @return last char in String
     */
    public static int lastChar(String str)
    {
        if (str != null && str.length() > 0)
        {
            return str.charAt(str.length() - 1);
        }

        return 0;
    }

    /**
     * Determines if the specified character is whitespace.
     * @param c char
     * @return true if char is whitespace.
     */
    public static boolean isWhite(char c)
    {
        short m = map(c);
        return TidyUtils.toBoolean(m & WHITE);
    }

    /**
     * Is the given char a digit?
     * @param c char
     * @return true if the given char is a digit
     */
    public static boolean isDigit(char c)
    {
        short m;
        m = map(c);
        return TidyUtils.toBoolean(m & DIGIT);
    }

    /**
     * Is the given char a letter?
     * @param c char
     * @return true if the given char is a letter
     */
    public static boolean isLetter(char c)
    {
        short m;
        m = map(c);
        return TidyUtils.toBoolean(m & LETTER);
    }

    /**
     * Is the given char valid in name? (letter, digit or "-", ".", ":", "_")
     * @param c char
     * @return true if char is a name char.
     */
    public static boolean isNamechar(char c)
    {
        short map = map(c);

        return TidyUtils.toBoolean(map & NAMECHAR);
    }

    /**
     * Determines if the specified character is a lowercase character.
     * @param c char
     * @return true if char is lower case.
     */
    public static boolean isLower(char c)
    {
        short map = map(c);

        return TidyUtils.toBoolean(map & LOWERCASE);
    }

    /**
     * Determines if the specified character is a uppercase character.
     * @param c char
     * @return true if char is upper case.
     */
    public static boolean isUpper(char c)
    {
        short map = map(c);

        return TidyUtils.toBoolean(map & UPPERCASE);
    }

    /**
     * Maps the given character to its lowercase equivalent.
     * @param c char
     * @return lowercase char.
     */
    public static char toLower(char c)
    {
        short m = map(c);

        if (TidyUtils.toBoolean(m & UPPERCASE))
        {
            c = (char) (c + 'a' - 'A');
        }

        return c;
    }

    /**
     * Maps the given character to its uppercase equivalent.
     * @param c char
     * @return uppercase char.
     */
    public static char toUpper(char c)
    {
        short m = map(c);

        if (TidyUtils.toBoolean(m & LOWERCASE))
        {
            c = (char) (c + 'A' - 'a');
        }

        return c;
    }

    /**
     * Fold case of a char.
     * @param c char
     * @param tocaps convert to caps
     * @param xmlTags use xml tags? If true no change will be performed
     * @return folded char
     * TODO check the use of xmlTags parameter
     */
    public static char foldCase(char c, boolean tocaps, boolean xmlTags)
    {

        if (!xmlTags)
        {

            if (tocaps)
            {
                if (isLower(c))
                {
                    c = toUpper(c);
                }
            }
            else
            {
                // force to lower case
                if (isUpper(c))
                {
                    c = toLower(c);
                }
            }
        }

        return c;
    }

    /**
     * Classify chars in String and put them in lexmap.
     * @param str String
     * @param code code associated to chars in the String
     */
    private static void mapStr(String str, short code)
    {
        int c;
        for (int i = 0; i < str.length(); i++)
        {
            c = str.charAt(i);
            lexmap[c] |= code;
        }
    }

    /**
     * Returns the constant which defines the classification of char in lexmap.
     * @param c char
     * @return char type
     */
    private static short map(char c)
    {
        return (c < 128 ? lexmap[c] : 0);
    }

    /**
     * Is the given character encoding supported?
     * @param name character encoding name
     * @return true if encoding is supported, false otherwhise.
     */
    public static boolean isCharEncodingSupported(String name)
    {
        name = EncodingNameMapper.toJava(name);
        if (name == null)
        {
            return false;
        }

        try
        {
            "".getBytes(name);
        }
        catch (UnsupportedEncodingException e)
        {
            return false;
        }
        return true;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy