org.apache.tika.utils.CharsetUtils Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.utils;

import static java.util.Locale.ENGLISH;

import java.lang.reflect.Method;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CharsetUtils {

    private static final Pattern CHARSET_NAME_PATTERN =
            Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");

    private static final Pattern ISO_NAME_PATTERN =
            Pattern.compile(".*8859-(\\d+)");

    private static final Pattern CP_NAME_PATTERN =
            Pattern.compile("cp-(\\d+)");

    private static final Pattern WIN_NAME_PATTERN =
            Pattern.compile("win-?(\\d+)");

    private static final Map COMMON_CHARSETS =
            new HashMap();

    private static Method getCharsetICU = null;
    private static Method isSupportedICU = null;

    private static Map initCommonCharsets(String... names) {
        Map charsets = new HashMap();
        for (String name : names) {
            try {
                Charset charset = Charset.forName(name);
                COMMON_CHARSETS.put(name.toLowerCase(ENGLISH), charset);
                for (String alias : charset.aliases()) {
                    COMMON_CHARSETS.put(alias.toLowerCase(ENGLISH), charset);
                }
            } catch (Exception e) {
                // ignore
            }
        }
        return charsets;
    }

    static {
        initCommonCharsets(
                "Big5",
                "EUC-JP", "EUC-KR", "x-EUC-TW",
                "GB18030",
                "IBM855", "IBM866",
                "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR",
                "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4",
                "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
                "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15",
                "KOI8-R",
                "x-MacCyrillic",
                "SHIFT_JIS",
                "UTF-8", "UTF-16BE", "UTF-16LE",
                "windows-1251", "windows-1252", "windows-1253", "windows-1255");

        // Common aliases/typos not included in standard charset definitions
        COMMON_CHARSETS.put("iso-8851-1", COMMON_CHARSETS.get("iso-8859-1"));
        COMMON_CHARSETS.put("windows", COMMON_CHARSETS.get("windows-1252"));
        COMMON_CHARSETS.put("koi8r", COMMON_CHARSETS.get("koi8-r"));

        // See if we can load the icu4j CharsetICU class
        Class icuCharset = null;
        try  {
            icuCharset = CharsetUtils.class.getClassLoader().loadClass(
                    "com.ibm.icu.charset.CharsetICU");
        }  catch (ClassNotFoundException e) {
        }
        if (icuCharset != null) {
            try {
                getCharsetICU = icuCharset.getMethod("forNameICU", String.class);
            } catch (Throwable t) {
                throw new RuntimeException(t);
            }
            try {
                isSupportedICU = icuCharset.getMethod("isSupported", String.class);
            } catch (Throwable t) {
            }
            // TODO: would be nice to somehow log that we
            // successfully found ICU
        }
    }

    /**
     * Safely return whether  is supported, without throwing exceptions
     * 
     * @param charsetName Name of charset (can be null)
     * @return true if the character set is supported
     */
    public static boolean isSupported(String charsetName) {
        try {
            if (isSupportedICU != null && ((Boolean) isSupportedICU.invoke(null, charsetName)).booleanValue()) {
                return true;
            }
            return Charset.isSupported(charsetName);
        } catch (IllegalCharsetNameException e) {
            return false;
        } catch (IllegalArgumentException e) {
            // null, for example
            return false;
        } catch (Exception e) {
            // Unexpected exception, what to do?
            return false;
        }
    }

    /**
     * Handle various common charset name errors, and return something
     * that will be considered valid (and is normalized)
     * 
     * @param charsetName name of charset to process
     * @return potentially remapped/cleaned up version of charset name
     */
    public static String clean(String charsetName) {
        try {
            return forName(charsetName).name();
        } catch (Exception e) {
            return null;
        }
    }

    /** Returns Charset impl, if one exists.  This method
     *  optionally uses ICU4J's CharsetICU.forNameICU,
     *  if it is found on the classpath, else only uses
     *  JDK's builtin Charset.forName. */
    public static Charset forName(String name) {
        if (name == null) {
            throw new IllegalArgumentException();
        }

        // Get rid of cruft around names, like <>, trailing commas, etc.
        Matcher m = CHARSET_NAME_PATTERN.matcher(name);
        if (!m.matches()) {
            throw new IllegalCharsetNameException(name);
        }
        name = m.group(1);

        String lower = name.toLowerCase(Locale.ENGLISH);
        Charset charset = COMMON_CHARSETS.get(lower);
        if (charset != null) {
            return charset;
        } else if ("none".equals(lower) || "no".equals(lower)) {
            throw new IllegalCharsetNameException(name);
        } else {
            Matcher iso = ISO_NAME_PATTERN.matcher(lower);
            Matcher cp = CP_NAME_PATTERN.matcher(lower);
            Matcher win = WIN_NAME_PATTERN.matcher(lower);
            if (iso.matches()) {
                // Handle "iso 8859-x" error
                name = "iso-8859-" + iso.group(1);
                charset = COMMON_CHARSETS.get(name);
            } else if (cp.matches()) {
                // Handle "cp-xxx" error
                name = "cp" + cp.group(1);
                charset = COMMON_CHARSETS.get(name);
            } else if (win.matches()) {
                // Handle "winxxx" and "win-xxx" errors
                name = "windows-" + win.group(1);
                charset = COMMON_CHARSETS.get(name);
            }
            if (charset != null) {
                return charset;
            }
        }

        if (getCharsetICU != null) {
            try {
                Charset cs = (Charset) getCharsetICU.invoke(null, name);
                if (cs != null) {
                    return cs;
                }
            } catch (Exception e) {
                // ignore
            }
        }

        return Charset.forName(name);
    }
}