All downloads are FREE. Search and download functionality uses the official Maven repository.

com.rtfparserkit.parser.standard.Encoding Maven / Gradle / Ivy

Go to download

Library that provides facilities to allow project information to be manipulated in Java and .Net. Supports a range of data formats: Microsoft Project Exchange (MPX), Microsoft Project (MPP,MPT), Microsoft Project Data Interchange (MSPDI XML), Microsoft Project Database (MPD), Planner (XML), Primavera (PM XML, XER, and database), Asta Powerproject (PP, MDB), Asta Easyplan (PP), Phoenix Project Manager (PPX), FastTrack Schedule (FTS), and the Standard Data Exchange Format (SDEF).

There is a newer version: 13.8.0
Show newest version
/*
 * Copyright 2013 Jon Iles
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.rtfparserkit.parser.standard;

import java.util.HashMap;
import java.util.Map;

/**
 * Represents character encodings which may be encountered in an RTF file.
 *
 * <p>The class holds only static data: a handful of Java encoding names and a
 * lookup table, {@link #LOCALEID_MAPPING}, from a numeric identifier (held as a
 * decimal string) to the name of the Java encoding used for it.
 */
class Encoding
{
   // Java encoding names. The constant names suggest they correspond to the
   // RTF character set selections (ANSI, PC, PCA, Mac); the code which picks
   // between them lives outside this class.
   public static final String ANSI_ENCODING = "Cp1252";
   public static final String PC_ENCODING = "Cp437";
   public static final String PCA_ENCODING = "Cp850";
   public static final String MAC_ENCODING = "MacRoman";

   /**
    * Maps a numeric identifier, expressed as a decimal string, to a Java
    * encoding name.
    *
    * <p>Going by the per-entry comments, the keys are a mixture of Windows code
    * page numbers (for example "1252") and Windows locale identifiers (for
    * example "1049", Russian). A key which is present with a {@code null} value
    * (currently only "65000", UTF-7) denotes an identifier which is recognised
    * but has no supported Java encoding, so callers must be prepared for
    * {@code get} to return {@code null} both for unknown and for unsupported
    * identifiers.
    *
    * <p>The map is populated once by the static initializer below. It is a
    * plain {@link HashMap} and is not wrapped as unmodifiable.
    */
   public static final Map<String, String> LOCALEID_MAPPING = new HashMap<String, String>();
   static
   {
      // Comment lines based on: https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756(v=vs.85).aspx
      // Commented-out rows are identifiers from that table which have no mapping here.

      // 037   IBM037      IBM EBCDIC US-Canada
      // 437   IBM437      OEM United States
      // 500   IBM500      IBM EBCDIC International
      // 708   ASMO-708    Arabic (ASMO 708)
      // 709               Arabic (ASMO-449+, BCON V4)
      // 710               Arabic - Transparent Arabic
      // 720   DOS-720     Arabic (Transparent ASMO); Arabic (DOS)
      // 737   ibm737      OEM Greek (formerly 437G); Greek (DOS)
      // 775   ibm775      OEM Baltic; Baltic (DOS)
      // 850   ibm850      OEM Multilingual Latin 1; Western European (DOS)
      // 852   ibm852      OEM Latin 2; Central European (DOS)
      // 855   IBM855      OEM Cyrillic (primarily Russian)
      // 857   ibm857      OEM Turkish; Turkish (DOS)
      // 858   IBM00858    OEM Multilingual Latin 1 + Euro symbol
      // 860   IBM860      OEM Portuguese; Portuguese (DOS)
      // 861   ibm861      OEM Icelandic; Icelandic (DOS)
      // 862   DOS-862     OEM Hebrew; Hebrew (DOS)
      // 863   IBM863      OEM French Canadian; French Canadian (DOS)
      // 864   IBM864      OEM Arabic; Arabic (864)
      // 865   IBM865      OEM Nordic; Nordic (DOS)
      // 866   cp866       OEM Russian; Cyrillic (DOS)
      // 869   ibm869      OEM Modern Greek; Greek, Modern (DOS)
      // 870   IBM870      IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
      // 874   windows-874 ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
      // 875   cp875       IBM EBCDIC Greek Modern
      LOCALEID_MAPPING.put("932", "SJIS"); // Japanese
      LOCALEID_MAPPING.put("936", "Cp936"); // Simplified Chinese
      LOCALEID_MAPPING.put("949", "Cp949"); // Korean
      // 950   big5  ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
      LOCALEID_MAPPING.put("1025", "Cp1256"); // Arabic (Saudi Arabia)
      LOCALEID_MAPPING.put("1026", "Cp1251"); // Bulgarian
      LOCALEID_MAPPING.put("1028", "Cp950"); // Chinese (Taiwan)
      LOCALEID_MAPPING.put("1029", "Cp1250"); // Czech
      LOCALEID_MAPPING.put("1032", "Cp1253"); // Greek
      LOCALEID_MAPPING.put("1037", "Cp1255"); // Hebrew
      LOCALEID_MAPPING.put("1038", "Cp1250"); // Hungarian
      LOCALEID_MAPPING.put("1041", "SJIS"); // Japanese
      LOCALEID_MAPPING.put("1042", "Cp949"); // Korean
      LOCALEID_MAPPING.put("1045", "Cp1250"); // Polish
      // 1047  IBM01047 IBM EBCDIC Latin 1/Open System
      LOCALEID_MAPPING.put("1048", "Cp1250"); // Romanian
      LOCALEID_MAPPING.put("1049", "Cp1251"); // Russian
      LOCALEID_MAPPING.put("1050", "Cp1250"); // Croatian
      LOCALEID_MAPPING.put("1051", "Cp1250"); // Slovak
      LOCALEID_MAPPING.put("1052", "Cp1250"); // Albanian
      LOCALEID_MAPPING.put("1054", "Cp874"); // Thai
      LOCALEID_MAPPING.put("1055", "Cp1254"); // Turkish
      LOCALEID_MAPPING.put("1056", "Cp1256"); // Urdu
      LOCALEID_MAPPING.put("1058", "Cp1251"); // Ukrainian
      LOCALEID_MAPPING.put("1059", "Cp1251"); // Belarusian
      LOCALEID_MAPPING.put("1060", "Cp1250"); // Slovenian
      LOCALEID_MAPPING.put("1061", "Cp1257"); // Estonian
      LOCALEID_MAPPING.put("1062", "Cp1257"); // Latvian
      LOCALEID_MAPPING.put("1063", "Cp1257"); // Lithuanian
      LOCALEID_MAPPING.put("1065", "Cp1256"); // Farsi
      LOCALEID_MAPPING.put("1066", "Cp1258"); // Vietnamese
      LOCALEID_MAPPING.put("1068", "Cp1254"); // Azeri (Latin)
      LOCALEID_MAPPING.put("1071", "Cp1251"); // FYRO Macedonian
      LOCALEID_MAPPING.put("1087", "Cp1251"); // Kazakh
      LOCALEID_MAPPING.put("1088", "Cp1251"); // Kyrgyz (Cyrillic)
      LOCALEID_MAPPING.put("1091", "Cp1254"); // Uzbek (Latin)
      LOCALEID_MAPPING.put("1092", "Cp1251"); // Tatar
      LOCALEID_MAPPING.put("1104", "Cp1251"); // Mongolian (Cyrillic)
      // 1140  IBM01140 IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
      // 1141  IBM01141 IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
      // 1142  IBM01142 IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
      // 1143  IBM01143 IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
      // 1144  IBM01144 IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
      // 1145  IBM01145 IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
      // 1146  IBM01146 IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
      // 1147  IBM01147 IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
      // 1148  IBM01148 IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
      // 1149  IBM01149 IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
      // 1200  utf-16   Unicode UTF-16, little endian byte order (BMP of ISO 10646)
      // 1201  unicodeFFFE Unicode UTF-16, big endian byte order
      LOCALEID_MAPPING.put("1250", "Cp1250"); // Windows Latin 2 (Central Europe)
      LOCALEID_MAPPING.put("1251", "Cp1251"); // Cyrillic
      LOCALEID_MAPPING.put("1252", "Cp1252"); // Latin
      LOCALEID_MAPPING.put("1253", "Cp1253"); // Greek
      LOCALEID_MAPPING.put("1254", "Cp1254"); // Turkish
      LOCALEID_MAPPING.put("1255", "Cp1255"); // Windows Hebrew
      LOCALEID_MAPPING.put("1256", "Cp1256"); // Windows Arabic
      LOCALEID_MAPPING.put("1257", "Cp1257"); // Baltic
      LOCALEID_MAPPING.put("1258", "Cp1258"); // Vietnamese
      // 1361  Johab Korean (Johab)
      LOCALEID_MAPPING.put("2049", "Cp1256"); // Arabic (Iraq)
      LOCALEID_MAPPING.put("2052", "MS936"); // Chinese (PRC)
      LOCALEID_MAPPING.put("2074", "Cp1250"); // Serbian (Latin)
      LOCALEID_MAPPING.put("2092", "Cp1251"); // Azeri (Cyrillic)
      LOCALEID_MAPPING.put("2115", "Cp1251"); // Uzbek (Cyrillic)
      LOCALEID_MAPPING.put("3073", "Cp1256"); // Arabic (Egypt)
      LOCALEID_MAPPING.put("3076", "Cp950"); // Chinese (Hong Kong S.A.R.)
      LOCALEID_MAPPING.put("3098", "Cp1251"); // Serbian (Cyrillic)
      LOCALEID_MAPPING.put("4097", "Cp1256"); // Arabic (Libya)
      LOCALEID_MAPPING.put("4100", "MS936"); // Chinese (Singapore)
      LOCALEID_MAPPING.put("5121", "Cp1256"); // Arabic (Algeria)
      LOCALEID_MAPPING.put("5124", "Cp950"); // Chinese (Macau S.A.R.)
      LOCALEID_MAPPING.put("6145", "Cp1256"); // Arabic (Morocco)
      LOCALEID_MAPPING.put("7169", "Cp1256"); // Arabic (Tunisia)
      LOCALEID_MAPPING.put("8193", "Cp1256"); // Arabic (Oman)
      LOCALEID_MAPPING.put("9217", "Cp1256"); // Arabic (Yemen)
      LOCALEID_MAPPING.put("10000", "MacRoman"); // Mac Roman
      // 10001 x-mac-japanese Japanese (Mac)
      // 10002 x-mac-chinesetrad MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
      // 10003 x-mac-korean   Korean (Mac)
      // 10004 x-mac-arabic   Arabic (Mac)
      // 10005 x-mac-hebrew   Hebrew (Mac)
      // 10006 x-mac-greek Greek (Mac)
      // 10007 x-mac-cyrillic Cyrillic (Mac)
      // 10008 x-mac-chinesesimp MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
      // 10010 x-mac-romanian Romanian (Mac)
      // 10017 x-mac-ukrainian   Ukrainian (Mac)
      // 10021 x-mac-thai  Thai (Mac)
      // 10029 x-mac-ce MAC Latin 2; Central European (Mac)
      // 10079 x-mac-icelandic   Icelandic (Mac)
      // 10081 x-mac-turkish  Turkish (Mac)
      // 10082 x-mac-croatian Croatian (Mac)
      LOCALEID_MAPPING.put("10241", "Cp1256"); // Arabic (Syria)
      LOCALEID_MAPPING.put("11265", "Cp1256"); // Arabic (Jordan)
      // 12000 utf-32   Unicode UTF-32, little endian byte order
      // 12001 utf-32BE Unicode UTF-32, big endian byte order
      LOCALEID_MAPPING.put("12289", "Cp1256"); // Arabic (Lebanon)
      LOCALEID_MAPPING.put("13313", "Cp1256"); // Arabic (Kuwait)
      LOCALEID_MAPPING.put("14337", "Cp1256"); // Arabic (U.A.E.)
      LOCALEID_MAPPING.put("15361", "Cp1256"); // Arabic (Bahrain)
      LOCALEID_MAPPING.put("16385", "Cp1256"); // Arabic (Qatar)
      // 20000 x-Chinese_CNS  CNS Taiwan; Chinese Traditional (CNS)
      // 20001 x-cp20001   TCA Taiwan
      // 20002 x_Chinese-Eten Eten Taiwan; Chinese Traditional (Eten)
      // 20003 x-cp20003   IBM5550 Taiwan
      // 20004 x-cp20004   TeleText Taiwan
      // 20005 x-cp20005   Wang Taiwan
      // 20105 x-IA5 IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
      // 20106 x-IA5-German   IA5 German (7-bit)
      // 20107 x-IA5-Swedish  IA5 Swedish (7-bit)
      // 20108 x-IA5-Norwegian   IA5 Norwegian (7-bit)
      // 20127 us-ascii US-ASCII (7-bit)
      // 20261 x-cp20261   T.61
      // 20269 x-cp20269   ISO 6937 Non-Spacing Accent
      // 20273 IBM273   IBM EBCDIC Germany
      // 20277 IBM277   IBM EBCDIC Denmark-Norway
      // 20278 IBM278   IBM EBCDIC Finland-Sweden
      // 20280 IBM280   IBM EBCDIC Italy
      // 20284 IBM284   IBM EBCDIC Latin America-Spain
      // 20285 IBM285   IBM EBCDIC United Kingdom
      // 20290 IBM290   IBM EBCDIC Japanese Katakana Extended
      // 20297 IBM297   IBM EBCDIC France
      // 20420 IBM420   IBM EBCDIC Arabic
      // 20423 IBM423   IBM EBCDIC Greek
      // 20424 IBM424   IBM EBCDIC Hebrew
      // 20833 x-EBCDIC-KoreanExtended IBM EBCDIC Korean Extended
      // 20838 IBM-Thai IBM EBCDIC Thai
      // 20866 koi8-r   Russian (KOI8-R); Cyrillic (KOI8-R)
      // 20871 IBM871   IBM EBCDIC Icelandic
      // 20880 IBM880   IBM EBCDIC Cyrillic Russian
      // 20905 IBM905   IBM EBCDIC Turkish
      // 20924 IBM00924 IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
      // 20932 EUC-JP   Japanese (JIS 0208-1990 and 0212-1990)
      // 20936 x-cp20936   Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
      // 20949 x-cp20949   Korean Wansung
      // 21025 cp1025   IBM EBCDIC Cyrillic Serbian-Bulgarian
      // 21027    (deprecated)
      // 21866 koi8-u   Ukrainian (KOI8-U); Cyrillic (KOI8-U)
      // 28591 iso-8859-1  ISO 8859-1 Latin 1; Western European (ISO)
      // 28592 iso-8859-2  ISO 8859-2 Central European; Central European (ISO)
      // 28593 iso-8859-3  ISO 8859-3 Latin 3
      // 28594 iso-8859-4  ISO 8859-4 Baltic
      // 28595 iso-8859-5  ISO 8859-5 Cyrillic
      // 28596 iso-8859-6  ISO 8859-6 Arabic
      // 28597 iso-8859-7  ISO 8859-7 Greek
      // 28598 iso-8859-8  ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
      // 28599 iso-8859-9  ISO 8859-9 Turkish
      // 28603 iso-8859-13 ISO 8859-13 Estonian
      // 28605 iso-8859-15 ISO 8859-15 Latin 9
      // 29001 x-Europa Europa 3
      // 38598 iso-8859-8-i   ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
      // 50220 iso-2022-jp ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
      // 50221 csISO2022JP ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
      // 50222 iso-2022-jp ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
      // 50225 iso-2022-kr ISO 2022 Korean
      // 50227 x-cp50227   ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
      // 50229    ISO 2022 Traditional Chinese
      // 50930    EBCDIC Japanese (Katakana) Extended
      // 50931    EBCDIC US-Canada and Japanese
      // 50933    EBCDIC Korean Extended and Korean
      // 50935    EBCDIC Simplified Chinese Extended and Simplified Chinese
      // 50936    EBCDIC Simplified Chinese
      // 50937    EBCDIC US-Canada and Traditional Chinese
      // 50939    EBCDIC Japanese (Latin) Extended and Japanese
      // 51932 euc-jp   EUC Japanese
      // 51936 EUC-CN   EUC Simplified Chinese; Chinese Simplified (EUC)
      // 51949 euc-kr   EUC Korean
      // 51950    EUC Traditional Chinese
      // 52936 hz-gb-2312  HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
      // 54936 GB18030  Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
      // 57002 x-iscii-de  ISCII Devanagari
      // 57003 x-iscii-be  ISCII Bangla
      // 57004 x-iscii-ta  ISCII Tamil
      // 57005 x-iscii-te  ISCII Telugu
      // 57006 x-iscii-as  ISCII Assamese
      // 57007 x-iscii-or  ISCII Odia
      // 57008 x-iscii-ka  ISCII Kannada
      // 57009 x-iscii-ma  ISCII Malayalam
      // 57010 x-iscii-gu  ISCII Gujarati
      // 57011 x-iscii-pa  ISCII Punjabi
      // The key is registered with a null value on purpose: the identifier is known, but there is no encoding to map it to.
      LOCALEID_MAPPING.put("65000", null); // UTF-7 - not a supported Java encoding, see: http://stackoverflow.com/questions/19861987/java-io-unsupportedencodingexception-unicode-1-1-utf-7
      LOCALEID_MAPPING.put("65001", "UTF-8"); // UTF-8
   }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy