All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.w3c.tidy.EncodingNameMapper Maven / Gradle / Ivy

/*
 *  Java HTML Tidy - JTidy
 *  HTML parser and pretty printer
 *
 *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
 *  Institute of Technology, Institut National de Recherche en
 *  Informatique et en Automatique, Keio University). All Rights
 *  Reserved.
 *
 *  Contributing Author(s):
 *
 *     Dave Raggett 
 *     Andy Quick  (translation to Java)
 *     Gary L Peskin  (Java development)
 *     Sami Lempinen  (release management)
 *     Fabrizio Giustina 
 *
 *  The contributing author(s) would like to thank all those who
 *  helped with testing, bug fixes, and patience.  This wouldn't
 *  have been possible without all of you.
 *
 *  COPYRIGHT NOTICE:
 *
 *  This software and documentation is provided "as is," and
 *  the copyright holders and contributing author(s) make no
 *  representations or warranties, express or implied, including
 *  but not limited to, warranties of merchantability or fitness
 *  for any particular purpose or that the use of the software or
 *  documentation will not infringe any third party patents,
 *  copyrights, trademarks or other rights.
 *
 *  The copyright holders and contributing author(s) will not be
 *  liable for any direct, indirect, special or consequential damages
 *  arising out of any use of the software or documentation, even if
 *  advised of the possibility of such damage.
 *
 *  Permission is hereby granted to use, copy, modify, and distribute
 *  this source code, or portions hereof, documentation and executables,
 *  for any purpose, without fee, subject to the following restrictions:
 *
 *  1. The origin of this source code must not be misrepresented.
 *  2. Altered versions must be plainly marked as such and must
 *     not be misrepresented as being the original source.
 *  3. This Copyright notice may not be removed or altered from any
 *     source or altered source distribution.
 *
 *  The copyright holders and contributing author(s) specifically
 *  permit, without fee, and encourage the use of this source code
 *  as a component for supporting the Hypertext Markup Language in
 *  commercial products. If you use this source code in a product,
 *  acknowledgment is not required but would be appreciated.
 *
 */
package org.w3c.tidy;

import java.util.HashMap;
import java.util.Map;


/**
 * Maps between Java and IANA character encoding names. Also handles encoding alias used in tidy c.
 * @author Fabrizio Giustina
 * @version $Revision$ ($Author$)
 * @see iana character-sets

 */
public abstract class EncodingNameMapper
{

    /**
     * Map containing uppercase alias - {standard iana, standard java}.
     */
    private static Map encodingNameMap = new HashMap<>();

    static
    {
        encodingNameMap.put("ISO-8859-1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("ISO8859_1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("ISO-IR-100", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("LATIN1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("CSISOLATIN1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("L1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("819", new String[]{"ISO-8859-1", "ISO8859_1"});

        encodingNameMap.put("US-ASCII", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("ASCII", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("ISO-IR-6", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("CSASCII", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("ISO646-US", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("US", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("367", new String[]{"US-ASCII", "ASCII"});

        encodingNameMap.put("UTF-8", new String[]{"UTF-8", "UTF8"});
        encodingNameMap.put("UTF8", new String[]{"UTF-8", "UTF8"});
        encodingNameMap.put("UTF-16", new String[]{"UTF-16", "Unicode"});
        encodingNameMap.put("UNICODE", new String[]{"UTF-16", "Unicode"});
        encodingNameMap.put("UTF16", new String[]{"UTF-16", "Unicode"});
        encodingNameMap.put("UTF16", new String[]{"UTF-16", "Unicode"}); // tidy

        encodingNameMap.put("UTF-16BE", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UNICODEBIG", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UTF16-BE", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UTF-16LE", new String[]{"UTF-16LE", "UnicodeLittle"});
        encodingNameMap.put("UNICODELITTLE", new String[]{"UTF-16LE", "UnicodeLittle"});
        encodingNameMap.put("UTF16-LE", new String[]{"UTF-16LE", "UnicodeLittle"});
        encodingNameMap.put("UTF16BE", new String[]{"UTF-16BE", "UnicodeBig"}); // tidy
        encodingNameMap.put("UTF16LE", new String[]{"UTF-16LE", "UnicodeLittle"}); // tidy

        encodingNameMap.put("BIG5", new String[]{"BIG5", "Big5"});
        encodingNameMap.put("CSBIG5", new String[]{"BIG5", "Big5"});

        encodingNameMap.put("SJIS", new String[]{"SHIFT_JIS", "SJIS"});
        encodingNameMap.put("SHIFT_JIS", new String[]{"SHIFT_JIS", "SJIS"});
        encodingNameMap.put("CSSHIFTJIS", new String[]{"CSSHIFTJIS", "SJIS"});
        encodingNameMap.put("MS_KANJI", new String[]{"MS_KANJI", "SJIS"});
        encodingNameMap.put("SHIFTJIS", new String[]{"SHIFT_JIS", "SJIS"}); // tidy

        encodingNameMap.put("JIS", new String[]{"ISO-2022-JP", "JIS"});
        encodingNameMap.put("ISO-2022-JP", new String[]{"ISO-2022-JP", "JIS"});
        encodingNameMap.put("CSISO2022JP", new String[]{"CSISO2022JP", "JIS"});
        encodingNameMap.put("ISO2022", new String[]{"ISO-2022-JP", "JIS"}); // tidy

        encodingNameMap.put("ISO2022KR", new String[]{"ISO-2022-KR", "ISO2022KR"});
        encodingNameMap.put("ISO-2022-KR", new String[]{"ISO-2022-KR", "ISO2022KR"});
        encodingNameMap.put("CSISO2022KR", new String[]{"CSISO2022KR", "ISO2022KR"});
        encodingNameMap.put("ISO-2022-CN", new String[]{"ISO-2022-CN", "ISO2022CN"});
        encodingNameMap.put("ISO2022CN", new String[]{"ISO-2022-CN", "ISO2022CN"});

        encodingNameMap.put("MACROMAN", new String[]{"macintosh", "MacRoman"}); // tidy
        encodingNameMap.put("MACINTOSH", new String[]{"macintosh", "MacRoman"});
        encodingNameMap.put("MACINTOSH ROMAN", new String[]{"macintosh", "MacRoman"});

        encodingNameMap.put("37", new String[]{"IBM037", "CP037"});
        encodingNameMap.put("273", new String[]{"IBM273", "CP273"});
        encodingNameMap.put("277", new String[]{"IBM277", "CP277"});
        encodingNameMap.put("278", new String[]{"IBM278", "CP278"});
        encodingNameMap.put("280", new String[]{"IBM280", "CP280"});
        encodingNameMap.put("284", new String[]{"IBM284", "CP284"});
        encodingNameMap.put("285", new String[]{"IBM285", "CP285"});
        encodingNameMap.put("290", new String[]{"IBM290", "CP290"});
        encodingNameMap.put("297", new String[]{"IBM297", "CP297"});
        encodingNameMap.put("420", new String[]{"IBM420", "CP420"});
        encodingNameMap.put("424", new String[]{"IBM424", "CP424"});
        encodingNameMap.put("437", new String[]{"IBM437", "CP437"});
        encodingNameMap.put("500", new String[]{"IBM500", "CP500"});
        encodingNameMap.put("775", new String[]{"IBM775", "CP775"});
        encodingNameMap.put("850", new String[]{"IBM850", "CP850"});
        encodingNameMap.put("852", new String[]{"IBM852", "CP852"});
        encodingNameMap.put("CSPCP852", new String[]{"IBM852", "CP852"});
        encodingNameMap.put("855", new String[]{"IBM855", "CP855"});
        encodingNameMap.put("857", new String[]{"IBM857", "CP857"});
        encodingNameMap.put("858", new String[]{"IBM00858", "Cp858"});
        encodingNameMap.put("0858", new String[]{"IBM00858", "Cp858"});
        encodingNameMap.put("860", new String[]{"IBM860", "CP860"});
        encodingNameMap.put("861", new String[]{"IBM861", "CP861"});
        encodingNameMap.put("IS", new String[]{"IBM861", "CP861"});
        encodingNameMap.put("862", new String[]{"IBM862", "CP862"});
        encodingNameMap.put("863", new String[]{"IBM863", "CP863"});
        encodingNameMap.put("864", new String[]{"IBM864", "CP864"});
        encodingNameMap.put("865", new String[]{"IBM865", "CP865"});
        encodingNameMap.put("866", new String[]{"IBM866", "CP866"});
        encodingNameMap.put("868", new String[]{"IBM868", "CP868"});
        encodingNameMap.put("AR", new String[]{"IBM868", "CP868"});
        encodingNameMap.put("869", new String[]{"IBM869", "CP869"});
        encodingNameMap.put("GR", new String[]{"IBM869", "CP869"});
        encodingNameMap.put("870", new String[]{"IBM870", "CP870"});
        encodingNameMap.put("871", new String[]{"IBM871", "CP871"});
        encodingNameMap.put("EBCDIC-CP-IS", new String[]{"IBM871", "CP871"});
        encodingNameMap.put("918", new String[]{"CP918", "CP918"});
        encodingNameMap.put("924", new String[]{"IBM00924", "CP924"});
        encodingNameMap.put("0924", new String[]{"IBM00924", "CP924"});
        encodingNameMap.put("1026", new String[]{"IBM1026", "CP1026"});
        encodingNameMap.put("1047", new String[]{"IBM1047", "Cp1047"});
        encodingNameMap.put("1140", new String[]{"IBM01140", "Cp1140"});
        encodingNameMap.put("1141", new String[]{"IBM01141", "Cp1141"});
        encodingNameMap.put("1142", new String[]{"IBM01142", "Cp1142"});
        encodingNameMap.put("1143", new String[]{"IBM01143", "Cp1143"});
        encodingNameMap.put("1144", new String[]{"IBM01144", "Cp1144"});
        encodingNameMap.put("1145", new String[]{"IBM01145", "Cp1145"});
        encodingNameMap.put("1146", new String[]{"IBM01146", "Cp1146"});
        encodingNameMap.put("1147", new String[]{"IBM01147", "Cp1147"});
        encodingNameMap.put("1148", new String[]{"IBM01148", "Cp1148"});
        encodingNameMap.put("1149", new String[]{"IBM01149", "Cp1149"});
        encodingNameMap.put("1250", new String[]{"WINDOWS-1250", "Cp1250"});
        encodingNameMap.put("1251", new String[]{"WINDOWS-1251", "Cp1251"});
        encodingNameMap.put("1252", new String[]{"WINDOWS-1252", "Cp1252"});
        encodingNameMap.put("WIN1252", new String[]{"WINDOWS-1252", "Cp1252"}); // tidy
        encodingNameMap.put("1253", new String[]{"WINDOWS-1253", "Cp1253"});
        encodingNameMap.put("1254", new String[]{"WINDOWS-1254", "Cp1254"});
        encodingNameMap.put("1255", new String[]{"WINDOWS-1255", "Cp1255"});
        encodingNameMap.put("1256", new String[]{"WINDOWS-1256", "Cp1256"});
        encodingNameMap.put("1257", new String[]{"WINDOWS-1257", "Cp1257"});
        encodingNameMap.put("1258", new String[]{"WINDOWS-1258", "Cp1258"});

        encodingNameMap.put("EUC-JP", new String[]{"EUC-JP", "EUCJIS"});
        encodingNameMap.put("EUCJIS", new String[]{"EUC-JP", "EUCJIS"});
        encodingNameMap.put("EUC-KR", new String[]{"EUC-KR", "KSC5601"});
        encodingNameMap.put("KSC5601", new String[]{"EUC-KR", "KSC5601"});
        encodingNameMap.put("GB2312", new String[]{"GB2312", "GB2312"});
        encodingNameMap.put("CSGB2312", new String[]{"GB2312", "GB2312"});
        encodingNameMap.put("X0201", new String[]{"X0201", "JIS0201"});
        encodingNameMap.put("JIS0201", new String[]{"X0201", "JIS0201"});
        encodingNameMap.put("X0208", new String[]{"X0208", "JIS0208"});
        encodingNameMap.put("JIS0208", new String[]{"X0208", "JIS0208"});
        encodingNameMap.put("ISO-IR-87", new String[]{"ISO-IR-87", "JIS0208"});
        encodingNameMap.put("JIS0208", new String[]{"ISO-IR-87", "JIS0208"});
        encodingNameMap.put("X0212", new String[]{"X0212", "JIS0212"});
        encodingNameMap.put("JIS0212", new String[]{"X0212", "JIS0212"});
        encodingNameMap.put("ISO-IR-159", new String[]{"X0212", "JIS0212"});
        encodingNameMap.put("GB18030", new String[]{"GB18030", "GB18030"});

        encodingNameMap.put("936", new String[]{"GBK", "GBK"});
        encodingNameMap.put("MS936", new String[]{"GBK", "GBK"});

        encodingNameMap.put("MS932", new String[]{"WINDOWS-31J", "MS932"});
        encodingNameMap.put("31J", new String[]{"WINDOWS-31J", "MS932"});
        encodingNameMap.put("CSWINDOWS31J", new String[]{"WINDOWS-31J", "MS932"});
        encodingNameMap.put("TIS-620", new String[]{"TIS-620", "TIS620"});
        encodingNameMap.put("TIS620", new String[]{"TIS-620", "TIS620"});

        encodingNameMap.put("ISO-8859-2", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("ISO8859_2", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("ISO-IR-101", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("LATIN2", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("L2", new String[]{"ISO-8859-2", "ISO8859_2"});

        encodingNameMap.put("ISO-8859-3", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("ISO8859_3", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("ISO-IR-109", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("LATIN3", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("L3", new String[]{"ISO-8859-3", "ISO8859_3"});

        encodingNameMap.put("ISO-8859-4", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("ISO8859_4", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("ISO-IR-110", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("ISO-IR-110", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("L4", new String[]{"ISO-8859-4", "ISO8859_4"});

        encodingNameMap.put("ISO-8859-5", new String[]{"ISO-8859-5", "ISO8859_5"});
        encodingNameMap.put("ISO8859_5", new String[]{"ISO-8859-5", "ISO8859_5"});
        encodingNameMap.put("ISO-IR-144", new String[]{"ISO-8859-5", "ISO8859_5"});
        encodingNameMap.put("CYRILLIC", new String[]{"ISO-8859-5", "ISO8859_5"});

        encodingNameMap.put("ISO-8859-6", new String[]{"ISO-8859-6", "ISO8859_6"});
        encodingNameMap.put("ISO8859_6", new String[]{"ISO-8859-6", "ISO8859_6"});
        encodingNameMap.put("ISO-IR-127", new String[]{"ISO-8859-6", "ISO8859_6"});
        encodingNameMap.put("ARABIC", new String[]{"ISO-8859-6", "ISO8859_6"});

        encodingNameMap.put("ISO-8859-7", new String[]{"ISO-8859-7", "ISO8859_7"});
        encodingNameMap.put("ISO8859_7", new String[]{"ISO-8859-7", "ISO8859_7"});
        encodingNameMap.put("ISO-IR-126", new String[]{"ISO-8859-7", "ISO8859_7"});
        encodingNameMap.put("GREEK", new String[]{"ISO-8859-7", "ISO8859_7"});

        encodingNameMap.put("ISO-8859-8", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("ISO8859_8", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("ISO-8859-8-I", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("ISO-IR-138", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("HEBREW", new String[]{"ISO-8859-8", "ISO8859_8"});

        encodingNameMap.put("ISO-8859-9", new String[]{"ISO-8859-9", "ISO8859_8"});
        encodingNameMap.put("ISO8859_8", new String[]{"ISO-8859-9", "ISO8859_8"});
        encodingNameMap.put("CSISOLATINHEBREW", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("ISO-IR-148", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("LATIN5", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("CSISOLATIN5", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("L5", new String[]{"ISO-8859-9", "ISO8859_9"});

        encodingNameMap.put("ISO-8859-15", new String[]{"ISO-8859-15", "ISO8859_15"});
        encodingNameMap.put("ISO8859_15", new String[]{"ISO-8859-15", "ISO8859_15"});

        encodingNameMap.put("KOI8-R", new String[]{"KOI8-R", "KOI8_R"});
        encodingNameMap.put("KOI8_R", new String[]{"CSKOI8R", "KOI8_R"});
        encodingNameMap.put("CSKOI8R", new String[]{"CSKOI8R", "KOI8_R"});
    }

    /**
     * Convert a Java character encoding name to its IANA equivalent.
     * @param encoding java encoding name or alias
     * @return iana equivalent or null if no match is found.
     */
    public static String toIana(String encoding)
    {
        if (encoding == null)
        {
            return null;
        }

        String[] values = (String[]) encodingNameMap.get(handlecommonAlias(encoding));
        if (values != null)
        {
            return values[0];
        }

        return null;
    }

    /**
     * "Fix" the name for common alias to reduce the number of entries needed in the hashmap. It actually removes CSIBM,
     * CCSID, IBM-, IBM0, CP-0, IBM, CP0, CP-, CP, WINDOWS- prefixes from given name.
     * @param encoding encoding name
     * @return "fixed" encoding.
     */
    private static String handlecommonAlias(String encoding)
    {
        String key = encoding.toUpperCase();

        // handle common alias
        if (key.startsWith("CSIBM") || key.startsWith("CCSID"))
        {
            key = key.substring(5);
        }
        else if (key.startsWith("IBM-") || key.startsWith("IBM0") || key.startsWith("CP-0"))
        {
            key = key.substring(4);
        }
        else if (key.startsWith("IBM") || key.startsWith("CP0") || key.startsWith("CP-"))
        {
            key = key.substring(3);
        }
        else if (key.startsWith("CP"))
        {
            key = key.substring(2);
        }
        else if (key.startsWith("WINDOWS-"))
        {
            key = key.substring(8);
        }
        else if (key.startsWith("ISO_"))
        {
            key = "ISO-" + key.substring(4);
        }

        return key;
    }

    /**
     * Converts an encoding name to the standard java name. Handles IANA names, legacy names used in tidy and different
     * java encoding alias. See http://www.iana.org/assignments/character-sets.
     * @param encoding IANA encoding name or alias
     * @return java equivalent or null if no match is found.
     */
    public static String toJava(String encoding)
    {
        if (encoding == null)
        {
            return null;
        }

        String[] values = (String[]) encodingNameMap.get(handlecommonAlias(encoding));
        if (values != null)
        {
            return values[1];
        }

        return null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy