// org.apache.tika.utils.CharsetUtils, from the tika-core artifact (available via Maven / Gradle / Ivy).
// tika-core is the core Apache Tika™ toolkit library from which all other modules inherit
// functionality. It also includes the core facades for the Tika API.
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.utils;
import static java.util.Locale.ENGLISH;
import java.lang.reflect.Method;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers for turning loosely formatted character set names (as found
 * in document metadata) into {@link Charset} instances.
 *
 * <p>Lookups first consult a table of commonly used charsets and their
 * aliases, then try a few well-known misspellings, then ICU4J's
 * {@code CharsetICU} if that class can be loaded, and finally
 * {@link Charset#forName(String)}.
 */
public class CharsetUtils {

    /**
     * Skips leading spaces and double quotes, then captures the name up to the
     * first space, {@code >}, comma, semicolon or double quote.
     */
    private static final Pattern CHARSET_NAME_PATTERN =
            Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");

    /** Matches any name ending in {@code 8859-<digits>}. */
    private static final Pattern ISO_NAME_PATTERN =
            Pattern.compile(".*8859-(\\d+)");

    /** Matches {@code cp-<digits>}. */
    private static final Pattern CP_NAME_PATTERN =
            Pattern.compile("cp-(\\d+)");

    /** Matches {@code win<digits>} and {@code win-<digits>}. */
    private static final Pattern WIN_NAME_PATTERN =
            Pattern.compile("win-?(\\d+)");

    /**
     * Lower-cased charset names and aliases mapped to their charset.
     * Written only while this class is being initialized; read-only afterwards.
     */
    private static final Map<String, Charset> COMMON_CHARSETS =
            new HashMap<String, Charset>();

    /** {@code CharsetICU.forNameICU(String)}, or null when ICU4J is not available. */
    private static Method getCharsetICU = null;

    /** {@code CharsetICU.isSupported(String)}, or null when it could not be resolved. */
    private static Method isSupportedICU = null;

    static {
        initCommonCharsets(
                "Big5",
                "EUC-JP", "EUC-KR", "x-EUC-TW",
                "GB18030",
                "IBM855", "IBM866",
                "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR",
                "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4",
                "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
                "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15",
                "KOI8-R",
                "x-MacCyrillic",
                "SHIFT_JIS",
                "UTF-8", "UTF-16BE", "UTF-16LE",
                "windows-1251", "windows-1252", "windows-1253", "windows-1255");

        // Common aliases/typos not included in standard charset definitions
        addCommonAlias("iso-8851-1", "iso-8859-1");
        addCommonAlias("windows", "windows-1252");
        addCommonAlias("koi8r", "koi8-r");

        // See if we can load the icu4j CharsetICU class
        Class<?> icuCharset = null;
        try {
            icuCharset = CharsetUtils.class.getClassLoader().loadClass(
                    "com.ibm.icu.charset.CharsetICU");
        } catch (ClassNotFoundException ignored) {
            // ICU4J is optional; without it only the JDK charsets are used
        }
        if (icuCharset != null) {
            try {
                getCharsetICU = icuCharset.getMethod("forNameICU", String.class);
            } catch (Throwable t) {
                // The class is present but lacks the expected lookup method:
                // fail class initialization, as the original code did.
                throw new RuntimeException(t);
            }
            try {
                isSupportedICU = icuCharset.getMethod("isSupported", String.class);
            } catch (Throwable ignored) {
                // isSupported(String) then relies on the JDK check alone
            }
            // TODO: would be nice to somehow log that we
            // successfully found ICU
        }
    }

    /**
     * Registers each named charset, and all of its aliases, in
     * {@link #COMMON_CHARSETS} under lower-cased keys. Names that cannot be
     * resolved on this JVM are skipped.
     *
     * @param names canonical charset names to register
     */
    private static void initCommonCharsets(String... names) {
        for (String name : names) {
            try {
                Charset charset = Charset.forName(name);
                COMMON_CHARSETS.put(name.toLowerCase(ENGLISH), charset);
                for (String alias : charset.aliases()) {
                    COMMON_CHARSETS.put(alias.toLowerCase(ENGLISH), charset);
                }
            } catch (Exception ignored) {
                // Best effort: this charset is not available here, so leave it out
            }
        }
    }

    /**
     * Maps an extra (lower-case) alias to an already registered charset.
     * Does nothing if the target charset was not registered.
     *
     * @param alias        additional lower-case key
     * @param canonicalKey lower-case key of the charset the alias refers to
     */
    private static void addCommonAlias(String alias, String canonicalKey) {
        Charset target = COMMON_CHARSETS.get(canonicalKey);
        if (target != null) {
            COMMON_CHARSETS.put(alias, target);
        }
    }

    /**
     * Safely return whether a charset is supported, without throwing exceptions.
     * ICU4J is consulted first when available; a failure of the ICU call does
     * not prevent the JDK check from running.
     *
     * @param charsetName Name of charset (can be null)
     * @return true if the character set is supported
     */
    public static boolean isSupported(String charsetName) {
        if (isSupportedICU != null) {
            try {
                if (Boolean.TRUE.equals(isSupportedICU.invoke(null, charsetName))) {
                    return true;
                }
            } catch (Exception ignored) {
                // Reflective call failed; fall through to the JDK check
            }
        }
        try {
            return Charset.isSupported(charsetName);
        } catch (Exception e) {
            // Illegal or null names are reported as "not supported"
            return false;
        }
    }

    /**
     * Handle various common charset name errors, and return something
     * that will be considered valid (and is normalized).
     *
     * @param charsetName name of charset to process
     * @return the canonical name of the resolved charset, or null if the name
     *         is null, malformed or cannot be resolved
     */
    public static String clean(String charsetName) {
        try {
            return forName(charsetName).name();
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Returns Charset impl, if one exists. This method
     * optionally uses ICU4J's CharsetICU.forNameICU,
     * if it is found on the classpath, else only uses
     * JDK's builtin Charset.forName.
     *
     * @param name charset name, possibly surrounded by quotes or followed by
     *             trailing characters such as {@code ;} or {@code ,}
     * @return the matching charset
     * @throws IllegalArgumentException if {@code name} is null
     * @throws IllegalCharsetNameException if no name can be extracted, or the
     *         extracted name is "none" or "no"
     * @throws java.nio.charset.UnsupportedCharsetException if thrown by
     *         {@link Charset#forName(String)} for the cleaned-up name
     */
    public static Charset forName(String name) {
        if (name == null) {
            throw new IllegalArgumentException("Charset name must not be null");
        }

        // Get rid of cruft around names, like <>, trailing commas, etc.
        Matcher m = CHARSET_NAME_PATTERN.matcher(name);
        if (!m.matches()) {
            throw new IllegalCharsetNameException(name);
        }
        name = m.group(1);

        String lower = name.toLowerCase(Locale.ENGLISH);
        Charset charset = COMMON_CHARSETS.get(lower);
        if (charset != null) {
            return charset;
        }
        if ("none".equals(lower) || "no".equals(lower)) {
            throw new IllegalCharsetNameException(name);
        }

        String corrected = correctCommonMisspelling(lower);
        if (corrected != null) {
            // Keep the corrected spelling for the ICU/JDK lookups below too
            name = corrected;
            charset = COMMON_CHARSETS.get(name);
            if (charset != null) {
                return charset;
            }
        }

        if (getCharsetICU != null) {
            try {
                Object found = getCharsetICU.invoke(null, name);
                if (found instanceof Charset) {
                    return (Charset) found;
                }
            } catch (Exception ignored) {
                // ICU could not resolve the name; fall back to the JDK
            }
        }

        return Charset.forName(name);
    }

    /**
     * Rewrites a few frequently seen misspellings into the usual form.
     *
     * @param lower lower-cased charset name
     * @return the corrected name, or null if no known misspelling matched
     */
    private static String correctCommonMisspelling(String lower) {
        Matcher iso = ISO_NAME_PATTERN.matcher(lower);
        if (iso.matches()) {
            // Handle "iso 8859-x" error
            return "iso-8859-" + iso.group(1);
        }
        Matcher cp = CP_NAME_PATTERN.matcher(lower);
        if (cp.matches()) {
            // Handle "cp-xxx" error
            return "cp" + cp.group(1);
        }
        Matcher win = WIN_NAME_PATTERN.matcher(lower);
        if (win.matches()) {
            // Handle "winxxx" and "win-xxx" errors
            return "windows-" + win.group(1);
        }
        return null;
    }
}