// panda.lang.Charsets Maven / Gradle / Ivy
//
// Go to download
package panda.lang;

import java.nio.charset.Charset;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Charset name constants, pre-resolved {@link Charset} instances and small helpers for resolving
 * charsets by name or by locale.
 *
 * <p>
 * From the Java documentation, <em>Standard charsets</em>: every implementation of the Java
 * platform is required to support the following character encodings. Consult the release
 * documentation for your implementation to see if any other encodings are supported.
 * </p>
 *
 * <ul>
 * <li><code>US-ASCII</code><br>
 * Seven-bit ASCII, a.k.a. ISO646-US, a.k.a. the Basic Latin block of the Unicode character set.</li>
 * <li><code>ISO-8859-1</code><br>
 * ISO Latin Alphabet No. 1, a.k.a. ISO-LATIN-1.</li>
 * <li><code>UTF-8</code><br>
 * Eight-bit Unicode Transformation Format.</li>
 * <li><code>UTF-16BE</code><br>
 * Sixteen-bit Unicode Transformation Format, big-endian byte order.</li>
 * <li><code>UTF-16LE</code><br>
 * Sixteen-bit Unicode Transformation Format, little-endian byte order.</li>
 * <li><code>UTF-16</code><br>
 * Sixteen-bit Unicode Transformation Format, byte order specified by a mandatory initial
 * byte-order mark (either order accepted on input, big-endian used on output).</li>
 * </ul>
 *
 * <p>
 * Every other name declared here (the Japanese and Chinese charsets, UTF-32BE and UTF-32LE) is
 * outside that list and may be missing on a given runtime. Note that {@link #CS_UTF_32BE} and
 * {@link #CS_UTF_32LE} are resolved eagerly with {@link Charset#forName(String)}, so this class
 * cannot be initialized on a runtime that lacks them.
 * </p>
 *
 * <p>
 * The locale-to-encoding table is held in a {@link ConcurrentHashMap}; single lookups and updates
 * are safe to call from several threads, but {@link #loadBuiltInCharsetMap()} is not atomic.
 * </p>
 *
 * @see Charset
 */
public class Charsets {
	/**
	 * Windows code page 932 (Microsoft's Shift_JIS variant), a Japanese charset.
	 * <p>
	 * Not one of the standard charsets; use {@link #isSupportedCharset(String)} to find out
	 * whether the running platform provides it.
	 * </p>
	 */
	public static final String MS932 = "MS932";

	/** Shift_JIS, a Japanese charset. Not one of the standard charsets. */
	public static final String Shift_JIS = "Shift_JIS";

	/** EUC-JP, a Japanese charset. Not one of the standard charsets. */
	public static final String EUC_JP = "EUC-JP";

	/**
	 * GB2312, a Simplified Chinese charset.
	 * <p>
	 * Not one of the standard charsets; use {@link #isSupportedCharset(String)} to find out
	 * whether the running platform provides it.
	 * </p>
	 */
	public static final String GB2312 = "GB2312";

	/** GBK, a Simplified Chinese charset. Not one of the standard charsets. */
	public static final String GBK = "GBK";

	/** Big5, a Traditional Chinese charset. Not one of the standard charsets. */
	public static final String BIG5 = "Big5";

	/**
	 * ISO Latin Alphabet No. 1, a.k.a. ISO-LATIN-1.
	 * <p>
	 * Every implementation of the Java platform is required to support this character encoding.
	 * </p>
	 *
	 * @see Charset
	 */
	public static final String ISO_8859_1 = "ISO-8859-1";

	/**
	 * Seven-bit ASCII, also known as ISO646-US, also known as the Basic Latin block of the Unicode
	 * character set.
	 * <p>
	 * Every implementation of the Java platform is required to support this character encoding.
	 * </p>
	 *
	 * @see Charset
	 */
	public static final String US_ASCII = "US-ASCII";

	/**
	 * Sixteen-bit Unicode Transformation Format, the byte order specified by a mandatory initial
	 * byte-order mark (either order accepted on input, big-endian used on output).
	 * <p>
	 * Every implementation of the Java platform is required to support this character encoding.
	 * </p>
	 *
	 * @see Charset
	 */
	public static final String UTF_16 = "UTF-16";

	/**
	 * Sixteen-bit Unicode Transformation Format, big-endian byte order.
	 * <p>
	 * Every implementation of the Java platform is required to support this character encoding.
	 * </p>
	 *
	 * @see Charset
	 */
	public static final String UTF_16BE = "UTF-16BE";

	/**
	 * Sixteen-bit Unicode Transformation Format, little-endian byte order.
	 * <p>
	 * Every implementation of the Java platform is required to support this character encoding.
	 * </p>
	 *
	 * @see Charset
	 */
	public static final String UTF_16LE = "UTF-16LE";

	/** UTF-32, big-endian byte order. Not one of the standard charsets. */
	public static final String UTF_32BE = "UTF-32BE";

	/** UTF-32, little-endian byte order. Not one of the standard charsets. */
	public static final String UTF_32LE = "UTF-32LE";

	/**
	 * Eight-bit Unicode Transformation Format.
	 * <p>
	 * Every implementation of the Java platform is required to support this character encoding.
	 * </p>
	 *
	 * @see Charset
	 */
	public static final String UTF_8 = "UTF-8";

	/** The {@link Charset} named by {@link #ISO_8859_1}. */
	public static final Charset CS_ISO_8859_1 = Charset.forName(ISO_8859_1);

	/** The {@link Charset} named by {@link #US_ASCII}. */
	public static final Charset CS_US_ASCII = Charset.forName(US_ASCII);

	/** The {@link Charset} named by {@link #UTF_16}. */
	public static final Charset CS_UTF_16 = Charset.forName(UTF_16);

	/** The {@link Charset} named by {@link #UTF_16BE}. */
	public static final Charset CS_UTF_16BE = Charset.forName(UTF_16BE);

	/** The {@link Charset} named by {@link #UTF_16LE}. */
	public static final Charset CS_UTF_16LE = Charset.forName(UTF_16LE);

	/** The {@link Charset} named by {@link #UTF_32BE}; resolved when this class is initialized. */
	public static final Charset CS_UTF_32BE = Charset.forName(UTF_32BE);

	/** The {@link Charset} named by {@link #UTF_32LE}; resolved when this class is initialized. */
	public static final Charset CS_UTF_32LE = Charset.forName(UTF_32LE);

	/** The {@link Charset} named by {@link #UTF_8}. */
	public static final Charset CS_UTF_8 = Charset.forName(UTF_8);

	/**
	 * Returns the given Charset or the default Charset if the given Charset is null.
	 * 
	 * @param charset A charset or null.
	 * @return the given Charset or the default Charset if the given Charset is null
	 */
	public static Charset toCharset(final Charset charset) {
		return charset == null ? Charset.defaultCharset() : charset;
	}

	/**
	 * Returns a Charset for the named charset. If the name is null, return the default Charset.
	 * 
	 * @param charset The name of the requested charset, may be null.
	 * @return a Charset for the named charset
	 * @throws java.nio.charset.IllegalCharsetNameException If the given name is not a legal
	 *             charset name (this includes the empty string)
	 * @throws java.nio.charset.UnsupportedCharsetException If the named charset is unavailable
	 */
	public static Charset toCharset(final String charset) {
		return charset == null ? Charset.defaultCharset() : Charset.forName(charset);
	}

	/**
	 * Returns a Charset for the named charset, falling back to a default instead of throwing.
	 * <p>
	 * The fallback is used when {@code Strings.isNotEmpty(charset)} is false or when the name
	 * cannot be resolved by {@link Charset#forName(String)}.
	 * </p>
	 * 
	 * @param charset The name of the requested charset, may be null.
	 * @param defCharset the fallback charset; if null, the platform default charset is the
	 *            fallback
	 * @return a Charset for the named charset, or the fallback; never null
	 */
	public static Charset toCharset(final String charset, Charset defCharset) {
		if (defCharset == null) {
			defCharset = Charset.defaultCharset();
		}
		if (Strings.isNotEmpty(charset)) {
			try {
				return Charset.forName(charset);
			}
			catch (IllegalArgumentException ignored) {
				// Illegal or unsupported name (both exception types extend
				// IllegalArgumentException): deliberately fall back to the default.
			}
		}
		return defCharset;
	}

	/** Locale key (as produced by {@link Locale#toString()}) to encoding name. */
	private final static Map<String, String> charsetMap = new ConcurrentHashMap<String, String>();

	static {
		loadBuiltInCharsetMap();
	}

	/**
	 * Loads a preset language-to-encoding map. It assumes the usual character encodings for most
	 * languages. The previous content of the encoding map will be lost. This default map currently
	 * contains the following mappings:
	 * 
	 * <pre>
	 * ar     ISO-8859-6
	 * be     ISO-8859-5
	 * bg     ISO-8859-5
	 * ca     ISO-8859-1
	 * cs     ISO-8859-2
	 * da     ISO-8859-1
	 * de     ISO-8859-1
	 * el     ISO-8859-7
	 * en     ISO-8859-1
	 * es     ISO-8859-1
	 * et     ISO-8859-1
	 * fi     ISO-8859-1
	 * fr     ISO-8859-1
	 * he     ISO-8859-8
	 * hr     ISO-8859-2
	 * hu     ISO-8859-2
	 * is     ISO-8859-1
	 * it     ISO-8859-1
	 * iw     ISO-8859-8
	 * ja     Shift_JIS
	 * ko     EUC-KR
	 * lt     ISO-8859-2
	 * lv     ISO-8859-2
	 * mk     ISO-8859-5
	 * nl     ISO-8859-1
	 * no     ISO-8859-1
	 * pl     ISO-8859-2
	 * pt     ISO-8859-1
	 * ro     ISO-8859-2
	 * ru     ISO-8859-5
	 * sh     ISO-8859-5
	 * sk     ISO-8859-2
	 * sl     ISO-8859-2
	 * sq     ISO-8859-2
	 * sr     ISO-8859-5
	 * sv     ISO-8859-1
	 * tr     ISO-8859-9
	 * uk     ISO-8859-5
	 * zh     GB2312
	 * zh_TW  Big5
	 * </pre>
	 * 
	 * <p>
	 * The map is cleared and then refilled entry by entry, so a thread reading it concurrently may
	 * briefly see an incomplete table.
	 * </p>
	 */
	public static void loadBuiltInCharsetMap() {
		charsetMap.clear();
		charsetMap.put("ar", "ISO-8859-6");
		charsetMap.put("be", "ISO-8859-5");
		charsetMap.put("bg", "ISO-8859-5");
		charsetMap.put("ca", "ISO-8859-1");
		charsetMap.put("cs", "ISO-8859-2");
		charsetMap.put("da", "ISO-8859-1");
		charsetMap.put("de", "ISO-8859-1");
		charsetMap.put("el", "ISO-8859-7");
		charsetMap.put("en", "ISO-8859-1");
		charsetMap.put("es", "ISO-8859-1");
		charsetMap.put("et", "ISO-8859-1");
		charsetMap.put("fi", "ISO-8859-1");
		charsetMap.put("fr", "ISO-8859-1");
		charsetMap.put("hr", "ISO-8859-2");
		charsetMap.put("hu", "ISO-8859-2");
		charsetMap.put("is", "ISO-8859-1");
		charsetMap.put("it", "ISO-8859-1");
		// Hebrew: "iw" is the legacy ISO 639 code that older JDKs report from
		// Locale.getLanguage(); newer JDKs report "he". Register both spellings.
		charsetMap.put("iw", "ISO-8859-8");
		charsetMap.put("he", "ISO-8859-8");
		charsetMap.put("ja", "Shift_JIS");
		charsetMap.put("ko", "EUC-KR");
		charsetMap.put("lt", "ISO-8859-2");
		charsetMap.put("lv", "ISO-8859-2");
		charsetMap.put("mk", "ISO-8859-5");
		charsetMap.put("nl", "ISO-8859-1");
		charsetMap.put("no", "ISO-8859-1");
		charsetMap.put("pl", "ISO-8859-2");
		charsetMap.put("pt", "ISO-8859-1");
		charsetMap.put("ro", "ISO-8859-2");
		charsetMap.put("ru", "ISO-8859-5");
		charsetMap.put("sh", "ISO-8859-5");
		charsetMap.put("sk", "ISO-8859-2");
		charsetMap.put("sl", "ISO-8859-2");
		charsetMap.put("sq", "ISO-8859-2");
		charsetMap.put("sr", "ISO-8859-5");
		charsetMap.put("sv", "ISO-8859-1");
		charsetMap.put("tr", "ISO-8859-9");
		charsetMap.put("uk", "ISO-8859-5");
		charsetMap.put("zh", "GB2312");
		charsetMap.put("zh_TW", "Big5");
	}

	/**
	 * Clears language-to-encoding map.
	 * 
	 * @see #loadBuiltInCharsetMap
	 * @see #setCharset
	 */
	public static void clearCharsetMap() {
		charsetMap.clear();
	}

	/**
	 * Sets the character set encoding to use for a given locale. The mapping is stored under
	 * {@code locale.toString()}.
	 * 
	 * @param locale locale, must not be null
	 * @param encoding encoding, must not be null (the backing {@link ConcurrentHashMap} rejects
	 *            null values)
	 * @throws NullPointerException if {@code locale} or {@code encoding} is null
	 * @see #clearCharsetMap
	 * @see #loadBuiltInCharsetMap
	 */
	public static void setCharset(Locale locale, String encoding) {
		charsetMap.put(locale.toString(), encoding);
	}

	/**
	 * Gets the preferred character encoding for the given locale. You can associate encodings with
	 * locales using {@link #setCharset(Locale, String)} or {@link #loadBuiltInCharsetMap()}.
	 * <p>
	 * Lookup order, first hit wins:
	 * </p>
	 * <ol>
	 * <li>the full locale name ({@code loc.toString()}, may include country, variant, script);</li>
	 * <li>language plus country (only when the locale has a country);</li>
	 * <li>language alone.</li>
	 * </ol>
	 * <p>
	 * A hit found through step 2 or 3 is stored back under the full locale name, so later calls
	 * for the same locale are answered by step 1.
	 * </p>
	 * 
	 * @param loc the locale, must not be null
	 * @return the preferred character encoding for the locale, or {@code null} if neither the
	 *         locale nor any of its fallbacks has a mapping
	 * @throws NullPointerException if {@code loc} is null
	 */
	public static String charsetFromLocale(Locale loc) {
		// Try for a full name match (may include country and variant)
		final String key = loc.toString();
		String charset = charsetMap.get(key);
		if (charset != null) {
			return charset;
		}

		// More specific fallback first: language + country. This must not be overwritten by the
		// language-only lookup below (e.g. zh_TW_x has to stay Big5, not become GB2312).
		if (loc.getCountry().length() > 0) {
			Locale l = new Locale(loc.getLanguage(), loc.getCountry());
			charset = charsetMap.get(l.toString());
		}

		// Least specific fallback: language only.
		if (charset == null) {
			charset = charsetMap.get(loc.getLanguage());
		}

		// Remember the resolved value under the full name to shortcut the next lookup.
		if (charset != null) {
			charsetMap.put(key, charset);
		}
		return charset;
	}

	/**
	 * Tells whether the specified charset name denotes a Unicode transformation format, judged
	 * only by the name starting with {@code "UTF-"} (compared case-insensitively via
	 * {@code Strings.startsWithIgnoreCase}).
	 * 
	 * @param charset charset name to test
	 * @return true if the name has the {@code "UTF-"} prefix
	 */
	public static boolean isUnicodeCharset(String charset) {
		return Strings.startsWithIgnoreCase(charset, "UTF-");
	}

	/**
	 * Tells whether the named charset is available on this platform, without ever throwing.
	 * 
	 * @param charset charset name to test, may be null or empty
	 * @return true if {@link Charset#isSupported(String)} reports the charset as supported; false
	 *         if it is unsupported, if the name fails {@code Strings.isNotEmpty}, or if the name
	 *         is not a legal charset name
	 */
	public static boolean isSupportedCharset(String charset) {
		if (Strings.isNotEmpty(charset)) {
			try {
				return Charset.isSupported(charset);
			}
			catch (java.nio.charset.IllegalCharsetNameException ignored) {
				// A syntactically illegal name can never be a supported charset.
				return false;
			}
		}
		return false;
	}

	/**
	 * Returns the platform default charset.
	 * 
	 * @return {@link Charset#defaultCharset()}
	 */
	public static Charset defaultCharset() {
		return Charset.defaultCharset();
	}

	/**
	 * Returns the given charset, or the platform default charset if it is null.
	 * 
	 * @param cs a charset or null
	 * @return {@code cs} if not null, otherwise the platform default charset
	 */
	public static Charset defaultCharset(Charset cs) {
		return cs == null ? defaultCharset() : cs;
	}

	/**
	 * Returns the given charset, or the supplied fallback if it is null.
	 * 
	 * @param cs a charset or null
	 * @param def the fallback, returned as is (may itself be null)
	 * @return {@code cs} if not null, otherwise {@code def}
	 */
	public static Charset defaultCharset(Charset cs, Charset def) {
		return cs == null ? def : cs;
	}

	/**
	 * Returns the name of the platform default charset.
	 * 
	 * @return {@code Charset.defaultCharset().name()}
	 */
	public static String defaultEncoding() {
		return Charset.defaultCharset().name();
	}

	/**
	 * Returns the given encoding name, or the name of the platform default charset if it is null.
	 * 
	 * @param enc an encoding name or null
	 * @return {@code enc} if not null, otherwise the default encoding name
	 */
	public static String defaultEncoding(String enc) {
		return enc == null ? defaultEncoding() : enc;
	}

	/**
	 * Returns the given encoding name, or the supplied fallback if it is null.
	 * 
	 * @param enc an encoding name or null
	 * @param def the fallback, returned as is (may itself be null)
	 * @return {@code enc} if not null, otherwise {@code def}
	 */
	public static String defaultEncoding(String enc, String def) {
		return enc == null ? def : enc;
	}
}
/*
 * Built-in language-to-encoding table (same entries as loadBuiltInCharsetMap()):
 *
 * ar	ISO-8859-6
 * be	ISO-8859-5
 * bg	ISO-8859-5
 * ca	ISO-8859-1
 * cs	ISO-8859-2
 * da	ISO-8859-1
 * de	ISO-8859-1
 * el	ISO-8859-7
 * en	ISO-8859-1
 * es	ISO-8859-1
 * et	ISO-8859-1
 * fi	ISO-8859-1
 * fr	ISO-8859-1
 * hr	ISO-8859-2
 * hu	ISO-8859-2
 * is	ISO-8859-1
 * it	ISO-8859-1
 * iw	ISO-8859-8
 * ja	Shift_JIS
 * ko	EUC-KR
 * lt	ISO-8859-2
 * lv	ISO-8859-2
 * mk	ISO-8859-5
 * nl	ISO-8859-1
 * no	ISO-8859-1
 * pl	ISO-8859-2
 * pt	ISO-8859-1
 * ro	ISO-8859-2
 * ru	ISO-8859-5
 * sh	ISO-8859-5
 * sk	ISO-8859-2
 * sl	ISO-8859-2
 * sq	ISO-8859-2
 * sr	ISO-8859-5
 * sv	ISO-8859-1
 * tr	ISO-8859-9
 * uk	ISO-8859-5
 * zh	GB2312
 * zh_TW	Big5
 */