// org.enhydra.apache.xerces.readers.MIME2Java
/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.enhydra.apache.xerces.readers;
import java.util.Hashtable;
import java.util.Locale;
/**
* MIME2Java is a convenience class which handles conversions between MIME charset names
* and Java encoding names.
* The supported XML encodings are the intersection of XML-supported code sets and those
* supported in JDK 1.1.
*
* <p>MIME charset names are used on <var>xmlEncoding</var> parameters to methods such
* as <code>TXDocument#setEncoding</code> and <code>DTD#setEncoding</code>.
*
* <p>Java encoding names are used on <var>encoding</var> parameters to
* methods such as <code>TXDocument#printWithFormat</code> and <code>DTD#printExternal</code>.
*
*
*
*
* Common Name
*
*
* Use this name in XML files
*
*
* Name Type
*
*
* Xerces converts to this Java Encoder Name
*
*
*
* 8 bit Unicode
*
* UTF-8
*
*
* IANA
*
*
* UTF8
*
*
*
* ISO Latin 1
*
* ISO-8859-1
*
*
* MIME
*
*
* ISO-8859-1
*
*
*
* ISO Latin 2
*
* ISO-8859-2
*
*
* MIME
*
*
* ISO-8859-2
*
*
*
* ISO Latin 3
*
* ISO-8859-3
*
*
* MIME
*
*
* ISO-8859-3
*
*
*
* ISO Latin 4
*
* ISO-8859-4
*
*
* MIME
*
*
* ISO-8859-4
*
*
*
* ISO Latin Cyrillic
*
* ISO-8859-5
*
*
* MIME
*
*
* ISO-8859-5
*
*
*
* ISO Latin Arabic
*
* ISO-8859-6
*
*
* MIME
*
*
* ISO-8859-6
*
*
*
* ISO Latin Greek
*
* ISO-8859-7
*
*
* MIME
*
*
* ISO-8859-7
*
*
*
* ISO Latin Hebrew
*
* ISO-8859-8
*
*
* MIME
*
*
* ISO-8859-8
*
*
*
* ISO Latin 5
*
* ISO-8859-9
*
*
* MIME
*
*
* ISO-8859-9
*
*
*
* EBCDIC: US
*
* ebcdic-cp-us
*
*
* IANA
*
*
* cp037
*
*
*
* EBCDIC: Canada
*
* ebcdic-cp-ca
*
*
* IANA
*
*
* cp037
*
*
*
* EBCDIC: Netherlands
*
* ebcdic-cp-nl
*
*
* IANA
*
*
* cp037
*
*
*
* EBCDIC: Denmark
*
* ebcdic-cp-dk
*
*
* IANA
*
*
* cp277
*
*
*
* EBCDIC: Norway
*
* ebcdic-cp-no
*
*
* IANA
*
*
* cp277
*
*
*
* EBCDIC: Finland
*
* ebcdic-cp-fi
*
*
* IANA
*
*
* cp278
*
*
*
* EBCDIC: Sweden
*
* ebcdic-cp-se
*
*
* IANA
*
*
* cp278
*
*
*
* EBCDIC: Italy
*
* ebcdic-cp-it
*
*
* IANA
*
*
* cp280
*
*
*
* EBCDIC: Spain, Latin America
*
* ebcdic-cp-es
*
*
* IANA
*
*
* cp284
*
*
*
* EBCDIC: Great Britain
*
* ebcdic-cp-gb
*
*
* IANA
*
*
* cp285
*
*
*
* EBCDIC: France
*
* ebcdic-cp-fr
*
*
* IANA
*
*
* cp297
*
*
*
* EBCDIC: Arabic
*
* ebcdic-cp-ar1
*
*
* IANA
*
*
* cp420
*
*
*
* EBCDIC: Hebrew
*
* ebcdic-cp-he
*
*
* IANA
*
*
* cp424
*
*
*
* EBCDIC: Switzerland
*
* ebcdic-cp-ch
*
*
* IANA
*
*
* cp500
*
*
*
* EBCDIC: Roece
*
* ebcdic-cp-roece
*
*
* IANA
*
*
* cp870
*
*
*
* EBCDIC: Yugoslavia
*
* ebcdic-cp-yu
*
*
* IANA
*
*
* cp870
*
*
*
* EBCDIC: Iceland
*
* ebcdic-cp-is
*
*
* IANA
*
*
* cp871
*
*
*
* EBCDIC: Urdu
*
* ebcdic-cp-ar2
*
*
* IANA
*
*
* cp918
*
*
*
* Chinese for PRC, mixed 1/2 byte
*
* gb2312
*
*
* MIME
*
*
* GB2312
*
*
*
* Extended Unix Code, packed for Japanese
*
* euc-jp
*
*
* MIME
*
*
* eucjis
*
*
*
* Japanese: ISO-2022-jp
*
* ISO-2022-jp
*
*
* MIME
*
*
* JIS
*
*
*
* Japanese: Shift JIS
*
* Shift_JIS
*
*
* MIME
*
*
* SJIS
*
*
*
* Japanese Windows: An extension of Shift JIS
*
* Windows-31J
*
*
* MIME
*
*
* MS932 (since JDK 1.2)
*
*
*
* Chinese: Big5
*
* Big5
*
*
* MIME
*
*
* Big5
*
*
*
* Extended Unix Code, packed for Korean
*
* euc-kr
*
*
* MIME
*
*
* iso2022kr
*
*
*
* Cyrillic
*
* koi8-r
*
*
* MIME
*
*
* koi8-r
*
*
*
*
* @version $Id: MIME2Java.java,v 1.2 2005/01/26 08:28:44 jkjome Exp $
* @author TAMURA Kent <[email protected]>
*/
public class MIME2Java {

    // Both tables are keyed by UPPER-CASE names because convert() and reverse()
    // upper-case their argument (locale-independently) before the lookup.
    // They are populated once during class initialisation and never modified.

    /** Upper-case MIME charset name (or alias) -> Java encoding name. */
    private static final Hashtable s_enchash = buildEncodingTable();

    /** Upper-case Java encoding name -> preferred MIME charset name. */
    private static final Hashtable s_revhash = buildReverseTable();

    /** Static utility class; not instantiable. */
    private MIME2Java() {
    }

    /**
     * Registers every name in <code>mimeNames</code> as a key that maps to
     * <code>javaName</code>.
     */
    private static void alias(Hashtable table, String[] mimeNames, String javaName) {
        for (int i = 0; i < mimeNames.length; i++) {
            table.put(mimeNames[i], javaName);
        }
    }

    /**
     * Tells whether the running VM reports a 1.1 version string.
     * A missing or unreadable <code>java.version</code> property is treated
     * as "not 1.1" so that class initialisation can never fail here.
     */
    private static boolean isJdk11() {
        String version;
        try {
            version = System.getProperty("java.version");
        } catch (SecurityException ignored) {
            // Property access denied by a security manager: assume a newer VM.
            return false;
        }
        return version != null
                && (version.equals("1.1") || version.startsWith("1.1."));
    }

    /** Builds the MIME charset name -> Java encoding name table. */
    private static Hashtable buildEncodingTable() {
        Hashtable t = new Hashtable();
        // <MIME charset name or alias in upper case>, <Java encoding name>
        t.put("UTF-8", "UTF8");
        alias(t, new String[] {"US-ASCII", "ISO-IR-6", "ANSI_X3.4-1986",
                "ISO_646.IRV:1991", "ASCII", "ISO646-US", "US", "IBM367",
                "CP367"}, "ASCII");
        alias(t, new String[] {"ISO-8859-1", "ISO-IR-100", "ISO_8859-1",
                "LATIN1", "L1", "IBM819", "CP819"}, "ISO8859_1");
        alias(t, new String[] {"ISO-8859-2", "ISO-IR-101", "ISO_8859-2",
                "LATIN2", "L2"}, "ISO8859_2");
        alias(t, new String[] {"ISO-8859-3", "ISO-IR-109", "ISO_8859-3",
                "LATIN3", "L3"}, "ISO8859_3");
        alias(t, new String[] {"ISO-8859-4", "ISO-IR-110", "ISO_8859-4",
                "LATIN4", "L4"}, "ISO8859_4");
        alias(t, new String[] {"ISO-8859-5", "ISO-IR-144", "ISO_8859-5",
                "CYRILLIC"}, "ISO8859_5");
        alias(t, new String[] {"ISO-8859-6", "ISO-IR-127", "ISO_8859-6",
                "ECMA-114", "ASMO-708", "ARABIC"}, "ISO8859_6");
        alias(t, new String[] {"ISO-8859-7", "ISO-IR-126", "ISO_8859-7",
                "ELOT_928", "ECMA-118", "GREEK", "GREEK8"}, "ISO8859_7");
        alias(t, new String[] {"ISO-8859-8", "ISO-IR-138", "ISO_8859-8",
                "HEBREW"}, "ISO8859_8");
        alias(t, new String[] {"ISO-8859-9", "ISO-IR-148", "ISO_8859-9",
                "LATIN5", "L5"}, "ISO8859_9");
        t.put("ISO-2022-JP", "ISO2022JP");
        // The key must be upper case ("MS_KANJI", not "MS_Kanji"); a
        // mixed-case key can never match the upper-cased lookup string.
        alias(t, new String[] {"SHIFT_JIS", "MS_KANJI"}, "SJIS");
        // MS932 is suitable for Windows-31J, but JDK 1.1.x does not support
        // MS932, so fall back to SJIS there.
        t.put("WINDOWS-31J", isJdk11() ? "SJIS" : "MS932");
        t.put("EUC-JP", "EUC_JP");
        t.put("GB2312", "GB2312");
        t.put("BIG5", "Big5");
        t.put("EUC-KR", "EUC_KR");
        t.put("ISO-2022-KR", "ISO2022KR");
        t.put("KOI8-R", "KOI8_R");
        // NOTE(review): a Java-style name used as a key; presumably kept so a
        // Java name passed in place of a MIME name still resolves -- confirm.
        t.put("ISO8859_1", "8859_1");
        alias(t, new String[] {"EBCDIC-CP-US", "EBCDIC-CP-CA", "EBCDIC-CP-NL",
                "EBCDIC-CP-WT"}, "CP037");
        alias(t, new String[] {"EBCDIC-CP-DK", "EBCDIC-CP-NO"}, "CP277");
        alias(t, new String[] {"EBCDIC-CP-FI", "EBCDIC-CP-SE"}, "CP278");
        t.put("EBCDIC-CP-IT", "CP280");
        t.put("EBCDIC-CP-ES", "CP284");
        t.put("EBCDIC-CP-GB", "CP285");
        t.put("EBCDIC-CP-FR", "CP297");
        t.put("EBCDIC-CP-AR1", "CP420");
        t.put("EBCDIC-CP-HE", "CP424");
        alias(t, new String[] {"EBCDIC-CP-CH", "EBCDIC-CP-BE"}, "CP500");
        t.put("CP-AR", "CP868");
        t.put("CP-GR", "CP869");
        alias(t, new String[] {"EBCDIC-CP-ROECE", "EBCDIC-CP-YU"}, "CP870");
        t.put("EBCDIC-CP-IS", "CP871");
        t.put("EBCDIC-CP-AR2", "CP918");
        // Add support for Cp1252 and its friends
        t.put("WINDOWS-1250", "Cp1250");
        t.put("WINDOWS-1251", "Cp1251");
        t.put("WINDOWS-1252", "Cp1252");
        t.put("WINDOWS-1253", "Cp1253");
        t.put("WINDOWS-1254", "Cp1254");
        t.put("WINDOWS-1255", "Cp1255");
        t.put("WINDOWS-1256", "Cp1256");
        t.put("WINDOWS-1257", "Cp1257");
        t.put("WINDOWS-1258", "Cp1258");
        t.put("TIS-620", "TIS620");
        // j:CNS11643 -> EUC-TW?
        t.put("ISO-2022-CN", "ISO2022CN");
        t.put("X0201", "JIS0201");
        t.put("X0208", "JIS0208");
        alias(t, new String[] {"X0212", "ISO-IR-159"}, "JIS0212");
        return t;
    }

    /**
     * Builds the Java encoding name -> MIME charset name table.
     * Each Java name is registered exactly once, with its preferred MIME
     * name. Registering every alias under the same key (as was done before)
     * silently keeps only the last alias, e.g. "ASCII" -> "CP367".
     */
    private static Hashtable buildReverseTable() {
        Hashtable t = new Hashtable();
        // <Java encoding name in upper case>, <preferred MIME charset name>
        t.put("UTF8", "UTF-8");
        t.put("ASCII", "US-ASCII");
        t.put("ISO8859_1", "ISO-8859-1");
        t.put("ISO8859_2", "ISO-8859-2");
        t.put("ISO8859_3", "ISO-8859-3");
        t.put("ISO8859_4", "ISO-8859-4");
        t.put("ISO8859_5", "ISO-8859-5");
        t.put("ISO8859_6", "ISO-8859-6");
        t.put("ISO8859_7", "ISO-8859-7");
        t.put("ISO8859_8", "ISO-8859-8");
        t.put("ISO8859_9", "ISO-8859-9");
        t.put("ISO2022JP", "ISO-2022-JP");
        t.put("SJIS", "Shift_JIS");
        t.put("MS932", "WINDOWS-31J");
        t.put("EUC_JP", "EUC-JP");
        t.put("GB2312", "GB2312");
        t.put("BIG5", "Big5");
        t.put("EUC_KR", "EUC-KR");
        t.put("ISO2022KR", "ISO-2022-KR");
        t.put("KOI8_R", "KOI8-R");
        t.put("CP037", "EBCDIC-CP-US");
        t.put("CP277", "EBCDIC-CP-DK");
        t.put("CP278", "EBCDIC-CP-FI");
        t.put("CP280", "EBCDIC-CP-IT");
        t.put("CP284", "EBCDIC-CP-ES");
        t.put("CP285", "EBCDIC-CP-GB");
        t.put("CP297", "EBCDIC-CP-FR");
        t.put("CP420", "EBCDIC-CP-AR1");
        t.put("CP424", "EBCDIC-CP-HE");
        t.put("CP500", "EBCDIC-CP-CH");
        t.put("CP868", "CP-AR");
        t.put("CP869", "CP-GR");
        t.put("CP870", "EBCDIC-CP-ROECE");
        t.put("CP871", "EBCDIC-CP-IS");
        t.put("CP918", "EBCDIC-CP-AR2");
        // Cp1252 and friends: the Java names are mixed case ("Cp1252"), but
        // reverse() upper-cases its argument, so the keys are upper case.
        t.put("CP1250", "WINDOWS-1250");
        t.put("CP1251", "WINDOWS-1251");
        t.put("CP1252", "WINDOWS-1252");
        t.put("CP1253", "WINDOWS-1253");
        t.put("CP1254", "WINDOWS-1254");
        t.put("CP1255", "WINDOWS-1255");
        t.put("CP1256", "WINDOWS-1256");
        t.put("CP1257", "WINDOWS-1257");
        t.put("CP1258", "WINDOWS-1258");
        t.put("TIS620", "TIS-620");
        t.put("ISO2022CN", "ISO-2022-CN");
        t.put("JIS0201", "X0201");
        t.put("JIS0208", "X0208");
        t.put("JIS0212", "X0212");
        return t;
    }

    /**
     * Convert a MIME charset name, also known as an XML encoding name, to a
     * Java encoding name.
     *
     * @param mimeCharsetName Case insensitive MIME charset name or one of its
     *        registered aliases, e.g. UTF-8, US-ASCII, ISO-8859-1 ...
     *        ISO-8859-9, ISO-2022-JP, Shift_JIS, Windows-31J, EUC-JP, GB2312,
     *        Big5, EUC-KR, ISO-2022-KR, KOI8-R, the EBCDIC-CP-* names,
     *        WINDOWS-1250 ... WINDOWS-1258, TIS-620 and ISO-2022-CN.
     * @return Java encoding name, or <code>null</code> if
     *         <var>mimeCharsetName</var> is <code>null</code> or unknown.
     * @see #reverse
     */
    public static String convert(String mimeCharsetName) {
        if (mimeCharsetName == null) {
            return null;
        }
        // Locale.ENGLISH keeps the case mapping independent of the default
        // locale (a Turkish default would map 'i' to a dotted capital I).
        return (String) s_enchash.get(mimeCharsetName.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Convert a Java encoding name to its preferred MIME charset name.
     *
     * @param encoding Case insensitive Java encoding name: UTF8, ASCII,
     *        ISO8859_1 ... ISO8859_9, ISO2022JP, SJIS, MS932, EUC_JP, GB2312,
     *        BIG5, EUC_KR, ISO2022KR, KOI8_R, CP037, CP277, CP278, CP280,
     *        CP284, CP285, CP297, CP420, CP424, CP500, CP868, CP869, CP870,
     *        CP871, CP918, CP1250 ... CP1258, TIS620, ISO2022CN, JIS0201,
     *        JIS0208 and JIS0212.
     * @return MIME charset name, or <code>null</code> if <var>encoding</var>
     *         is <code>null</code> or unknown.
     * @see #convert
     */
    public static String reverse(String encoding) {
        if (encoding == null) {
            return null;
        }
        // See convert() for why the upper-casing is pinned to Locale.ENGLISH.
        return (String) s_revhash.get(encoding.toUpperCase(Locale.ENGLISH));
    }
}