// Source: panda.lang.Charsets (panda-core artifact).
package panda.lang;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* Character encoding names required of every implementation of the Java platform.
*
* From the Java documentation, "Standard charsets":
*
* Every implementation of the Java platform is required to support the following character encodings. Consult the
* release documentation for your implementation to see if any other encodings are supported.
*
* <ul>
* <li>{@code US-ASCII}: Seven-bit ASCII, a.k.a. ISO646-US, a.k.a. the Basic Latin block of the Unicode character set.</li>
* <li>{@code ISO-8859-1}: ISO Latin Alphabet No. 1, a.k.a. ISO-LATIN-1.</li>
* <li>{@code UTF-8}: Eight-bit Unicode Transformation Format.</li>
* <li>{@code UTF-16BE}: Sixteen-bit Unicode Transformation Format, big-endian byte order.</li>
* <li>{@code UTF-16LE}: Sixteen-bit Unicode Transformation Format, little-endian byte order.</li>
* <li>{@code UTF-16}: Sixteen-bit Unicode Transformation Format, byte order specified by a mandatory initial
* byte-order mark (either order accepted on input, big-endian used on output).</li>
* </ul>
*
* This perhaps would best belong in the [lang] project. Even if a similar interface is defined in [lang], it is not
* foreseen that [codec] would be made to depend on [lang].
*
* @see java.nio.charset.Charset
*
*/
public class Charsets {
    // ------------------------------------------------------------------
    // Charset names that are NOT among the standard charsets every Java
    // platform must support; their availability depends on the runtime.
    // ------------------------------------------------------------------

    /** Name of the Japanese {@code MS932} charset (availability depends on the Java runtime). */
    public static final String MS932 = "MS932";

    /** Name of the Japanese {@code Shift_JIS} charset (availability depends on the Java runtime). */
    public static final String Shift_JIS = "Shift_JIS";

    /** Name of the Japanese {@code EUC-JP} charset (availability depends on the Java runtime). */
    public static final String EUC_JP = "EUC-JP";

    /** Name of the Chinese {@code GB2312} charset (availability depends on the Java runtime). */
    public static final String GB2312 = "GB2312";

    /** Name of the Chinese {@code GBK} charset (availability depends on the Java runtime). */
    public static final String GBK = "GBK";

    /** Name of the Chinese {@code Big5} charset (availability depends on the Java runtime). */
    public static final String BIG5 = "Big5";

    // ------------------------------------------------------------------
    // Names of the standard charsets (see java.nio.charset.Charset).
    // ------------------------------------------------------------------

    /**
     * ISO Latin Alphabet No. 1, a.k.a. ISO-LATIN-1.
     * <p>
     * Every implementation of the Java platform is required to support this character encoding.
     */
    public static final String ISO_8859_1 = "ISO-8859-1";

    /**
     * Seven-bit ASCII, also known as ISO646-US, also known as the Basic Latin block of the Unicode
     * character set.
     * <p>
     * Every implementation of the Java platform is required to support this character encoding.
     */
    public static final String US_ASCII = "US-ASCII";

    /**
     * Sixteen-bit Unicode Transformation Format, byte order identified by an optional initial
     * byte-order mark (either order accepted on input, big-endian used on output).
     * <p>
     * Every implementation of the Java platform is required to support this character encoding.
     */
    public static final String UTF_16 = "UTF-16";

    /**
     * Sixteen-bit Unicode Transformation Format, big-endian byte order.
     * <p>
     * Every implementation of the Java platform is required to support this character encoding.
     */
    public static final String UTF_16BE = "UTF-16BE";

    /**
     * Sixteen-bit Unicode Transformation Format, little-endian byte order.
     * <p>
     * Every implementation of the Java platform is required to support this character encoding.
     */
    public static final String UTF_16LE = "UTF-16LE";

    /** Name of the {@code UTF-32BE} charset. Not one of the standard charsets. */
    public static final String UTF_32BE = "UTF-32BE";

    /** Name of the {@code UTF-32LE} charset. Not one of the standard charsets. */
    public static final String UTF_32LE = "UTF-32LE";

    /**
     * Eight-bit Unicode Transformation Format.
     * <p>
     * Every implementation of the Java platform is required to support this character encoding.
     */
    public static final String UTF_8 = "UTF-8";

    // ------------------------------------------------------------------
    // Charset instances, resolved once while this class is initialized.
    // ------------------------------------------------------------------

    /** {@link Charset} instance for {@link #ISO_8859_1}. */
    public static final Charset CS_ISO_8859_1 = Charset.forName(ISO_8859_1);

    /** {@link Charset} instance for {@link #US_ASCII}. */
    public static final Charset CS_US_ASCII = Charset.forName(US_ASCII);

    /** {@link Charset} instance for {@link #UTF_16}. */
    public static final Charset CS_UTF_16 = Charset.forName(UTF_16);

    /** {@link Charset} instance for {@link #UTF_16BE}. */
    public static final Charset CS_UTF_16BE = Charset.forName(UTF_16BE);

    /** {@link Charset} instance for {@link #UTF_16LE}. */
    public static final Charset CS_UTF_16LE = Charset.forName(UTF_16LE);

    /**
     * {@link Charset} instance for {@link #UTF_32BE}. UTF-32 is not a standard charset; if the
     * runtime does not provide it, {@code Charset.forName} throws and this class fails to initialize.
     */
    public static final Charset CS_UTF_32BE = Charset.forName(UTF_32BE);

    /**
     * {@link Charset} instance for {@link #UTF_32LE}. UTF-32 is not a standard charset; if the
     * runtime does not provide it, {@code Charset.forName} throws and this class fails to initialize.
     */
    public static final Charset CS_UTF_32LE = Charset.forName(UTF_32LE);

    /** {@link Charset} instance for {@link #UTF_8}. */
    public static final Charset CS_UTF_8 = Charset.forName(UTF_8);

    /**
     * Returns the given Charset or the default Charset if the given Charset is null.
     *
     * @param charset A charset or null.
     * @return the given Charset or the default Charset if the given Charset is null
     */
    public static Charset toCharset(final Charset charset) {
        return charset == null ? Charset.defaultCharset() : charset;
    }

    /**
     * Returns a Charset for the named charset. If the name is null, returns the default Charset.
     *
     * @param charset The name of the requested charset, may be null.
     * @return a Charset for the named charset
     * @throws java.nio.charset.IllegalCharsetNameException If the given name is not a legal charset name
     * @throws java.nio.charset.UnsupportedCharsetException If the named charset is unavailable
     */
    public static Charset toCharset(final String charset) {
        return charset == null ? Charset.defaultCharset() : Charset.forName(charset);
    }

    /**
     * Returns a Charset for the named charset, falling back to a default instead of throwing.
     * The fallback is used when the name is empty according to {@code Strings.isNotEmpty}, or
     * when {@code Charset.forName} rejects it as illegal or unsupported.
     *
     * @param charset The name of the requested charset, may be null.
     * @param defCharset Charset to fall back to; if null, the platform default Charset is used.
     * @return the named Charset, or the fallback Charset
     */
    public static Charset toCharset(final String charset, Charset defCharset) {
        if (Strings.isNotEmpty(charset)) {
            try {
                return Charset.forName(charset);
            }
            catch (IllegalArgumentException ignored) {
                // IllegalCharsetNameException and UnsupportedCharsetException are both
                // IllegalArgumentExceptions; a bad name deliberately yields the fallback.
            }
        }
        return defCharset == null ? Charset.defaultCharset() : defCharset;
    }

    /**
     * Locale-string to encoding-name map. Keys are {@code Locale.toString()} values (or bare
     * language codes); entries are also added lazily by {@link #charsetFromLocale(Locale)}.
     */
    private static final Map<String, String> charsetMap = new ConcurrentHashMap<String, String>();

    static {
        loadBuiltInCharsetMap();
    }

    /**
     * Loads a preset language-to-encoding map. It assumes the usual character encodings for most
     * languages. The previous content of the encoding map will be lost. The map is cleared and
     * then refilled entry by entry, so a concurrent reader may briefly observe a partial map.
     * This default map currently contains the following mappings:
     * <ul>
     * <li>{@code ISO-8859-1}: ca, da, de, en, es, et, fi, fr, is, it, nl, no, pt, sv</li>
     * <li>{@code ISO-8859-2}: cs, hr, hu, lt, lv, pl, ro, sk, sl, sq</li>
     * <li>{@code ISO-8859-5}: be, bg, mk, ru, sh, sr, uk</li>
     * <li>{@code ISO-8859-6}: ar</li>
     * <li>{@code ISO-8859-7}: el</li>
     * <li>{@code ISO-8859-8}: iw</li>
     * <li>{@code ISO-8859-9}: tr</li>
     * <li>{@code Shift_JIS}: ja</li>
     * <li>{@code EUC-KR}: ko</li>
     * <li>{@code GB2312}: zh</li>
     * <li>{@code Big5}: zh_TW</li>
     * </ul>
     */
    public static void loadBuiltInCharsetMap() {
        charsetMap.clear();
        charsetMap.put("ar", "ISO-8859-6");
        charsetMap.put("be", "ISO-8859-5");
        charsetMap.put("bg", "ISO-8859-5");
        charsetMap.put("ca", "ISO-8859-1");
        charsetMap.put("cs", "ISO-8859-2");
        charsetMap.put("da", "ISO-8859-1");
        charsetMap.put("de", "ISO-8859-1");
        charsetMap.put("el", "ISO-8859-7");
        charsetMap.put("en", "ISO-8859-1");
        charsetMap.put("es", "ISO-8859-1");
        charsetMap.put("et", "ISO-8859-1");
        charsetMap.put("fi", "ISO-8859-1");
        charsetMap.put("fr", "ISO-8859-1");
        charsetMap.put("hr", "ISO-8859-2");
        charsetMap.put("hu", "ISO-8859-2");
        charsetMap.put("is", "ISO-8859-1");
        charsetMap.put("it", "ISO-8859-1");
        charsetMap.put("iw", "ISO-8859-8");
        charsetMap.put("ja", "Shift_JIS");
        charsetMap.put("ko", "EUC-KR");
        charsetMap.put("lt", "ISO-8859-2");
        charsetMap.put("lv", "ISO-8859-2");
        charsetMap.put("mk", "ISO-8859-5");
        charsetMap.put("nl", "ISO-8859-1");
        charsetMap.put("no", "ISO-8859-1");
        charsetMap.put("pl", "ISO-8859-2");
        charsetMap.put("pt", "ISO-8859-1");
        charsetMap.put("ro", "ISO-8859-2");
        charsetMap.put("ru", "ISO-8859-5");
        charsetMap.put("sh", "ISO-8859-5");
        charsetMap.put("sk", "ISO-8859-2");
        charsetMap.put("sl", "ISO-8859-2");
        charsetMap.put("sq", "ISO-8859-2");
        charsetMap.put("sr", "ISO-8859-5");
        charsetMap.put("sv", "ISO-8859-1");
        charsetMap.put("tr", "ISO-8859-9");
        charsetMap.put("uk", "ISO-8859-5");
        charsetMap.put("zh", "GB2312");
        charsetMap.put("zh_TW", "Big5");
    }

    /**
     * Clears language-to-encoding map.
     *
     * @see #loadBuiltInCharsetMap
     * @see #setCharset
     */
    public static void clearCharsetMap() {
        charsetMap.clear();
    }

    /**
     * Associates an encoding name with a locale. The entry is stored under
     * {@code locale.toString()}.
     *
     * @param locale locale, must not be null
     * @param encoding encoding name, must not be null (the backing map rejects null values)
     * @see #clearCharsetMap
     * @see #loadBuiltInCharsetMap
     */
    public static void setCharset(Locale locale, String encoding) {
        charsetMap.put(locale.toString(), encoding);
    }

    /**
     * Gets the preferred character encoding for the given locale. You can associate encodings
     * with locales using {@link #setCharset(Locale, String)} or {@link #loadBuiltInCharsetMap()}.
     * <p>
     * Lookup goes from most to least specific and stops at the first hit:
     * <ol>
     * <li>the full locale string (may include country and variant);</li>
     * <li>language + country, tried only when the locale has a variant;</li>
     * <li>the language alone.</li>
     * </ol>
     * A hit from step 2 or 3 is stored under the full locale string so later lookups match at
     * step 1; such stored entries remain until the map is cleared or reloaded.
     *
     * @param loc the locale, must not be null
     * @return the preferred character encoding for the locale, or {@code null} if none is mapped
     */
    public static String charsetFromLocale(Locale loc) {
        final String fullKey = loc.toString();

        // Try for a full name match (may include country and variant)
        String charset = charsetMap.get(fullKey);
        if (charset != null) {
            return charset;
        }

        if (loc.getVariant().length() > 0) {
            // Drop the variant and retry with language + country.
            charset = charsetMap.get(new Locale(loc.getLanguage(), loc.getCountry()).toString());
        }
        if (charset == null) {
            // Fall back to the language only when nothing more specific matched, so a
            // language + country mapping (e.g. zh_TW) is not overridden by the language one.
            charset = charsetMap.get(loc.getLanguage());
        }
        if (charset != null) {
            charsetMap.put(fullKey, charset);
        }
        return charset;
    }

    /**
     * Tells whether the specified charset name denotes a Unicode (UTF) charset, i.e. whether it
     * starts with {@code "UTF-"} ignoring case, as decided by {@code Strings.startsWithIgnoreCase}.
     *
     * @param charset charset name
     * @return true if the name starts with {@code "UTF-"} (case-insensitive)
     */
    public static boolean isUnicodeCharset(String charset) {
        return Strings.startsWithIgnoreCase(charset, "UTF-");
    }

    /**
     * Tells whether the named charset is supported by this Java runtime.
     *
     * @param charset charset name, may be null or empty
     * @return true if the name is non-empty and {@code Charset.isSupported} accepts it; false for
     *         an empty name, an unsupported charset, or a syntactically illegal charset name
     */
    public static boolean isSupportedCharset(String charset) {
        if (Strings.isNotEmpty(charset)) {
            try {
                return Charset.isSupported(charset);
            }
            catch (IllegalArgumentException ignored) {
                // Charset.isSupported throws IllegalCharsetNameException for malformed names;
                // for a yes/no query a malformed name is simply "not supported".
                return false;
            }
        }
        return false;
    }

    /**
     * Returns the default Charset of this Java virtual machine.
     *
     * @return {@code Charset.defaultCharset()}
     */
    public static Charset defaultCharset() {
        return Charset.defaultCharset();
    }

    /**
     * Returns the given Charset, or the platform default Charset if it is null.
     *
     * @param cs a charset or null
     * @return {@code cs} if non-null, otherwise the platform default Charset
     */
    public static Charset defaultCharset(Charset cs) {
        return cs == null ? defaultCharset() : cs;
    }

    /**
     * Returns the given Charset, or the supplied fallback if it is null.
     *
     * @param cs a charset or null
     * @param def fallback returned when {@code cs} is null (returned as-is, may itself be null)
     * @return {@code cs} if non-null, otherwise {@code def}
     */
    public static Charset defaultCharset(Charset cs, Charset def) {
        return cs == null ? def : cs;
    }

    /**
     * Returns the name of the platform default Charset.
     *
     * @return {@code Charset.defaultCharset().name()}
     */
    public static String defaultEncoding() {
        return Charset.defaultCharset().name();
    }

    /**
     * Returns the given encoding name, or the platform default encoding name if it is null.
     *
     * @param enc an encoding name or null
     * @return {@code enc} if non-null, otherwise the platform default encoding name
     */
    public static String defaultEncoding(String enc) {
        return enc == null ? defaultEncoding() : enc;
    }

    /**
     * Returns the given encoding name, or the supplied fallback if it is null.
     *
     * @param enc an encoding name or null
     * @param def fallback returned when {@code enc} is null (returned as-is, may itself be null)
     * @return {@code enc} if non-null, otherwise {@code def}
     */
    public static String defaultEncoding(String enc, String def) {
        return enc == null ? def : enc;
    }
}