All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.basho.riak.client.util.CharsetUtils Maven / Gradle / Ivy

/*
 * This file is provided to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.basho.riak.client.util;

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.basho.riak.client.http.util.Constants;

/**
 * Utils for dealing with byte[], String charset
 * issues, especially since Java 5 is less cool than Java 6 in this respect.
 * 
 * This code is mainly from the Trifork fork of the client and was written by
 * Krestan Krab and/or Erik Søe Sørensen.
 * 
 * @author russell
 */
public class CharsetUtils {
    public static Charset ASCII = Charset.forName("ASCII");
    public static Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
    public static Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * RegEx pattern to get the charset from a content-type value.
     */
    private static final Pattern CHARSET_PATT = Pattern.compile("\\bcharset *= *\"?([^ ;\"]+)\"?", Pattern.CASE_INSENSITIVE);


    /**
     * Attempt to extract a charset from a Map of HTTP
     * headers. Really just pulls a the entry
     * {@link Constants#HDR_CONTENT_LENGTH} from the map and passes it to
     * {@link CharsetUtils#getCharset(String)}
     * 
     * @param headers
     *            a {@link Map} of HTTP headers (or anything, really).
     * @return {@link CharsetUtils#ISO_8859_1} if headers is null,
     *         or result of calling {@link CharsetUtils#getCharset(String)} with
     *         the content-type header from headers
     * @see CharsetUtils#getCharset(String)
     */
    public static Charset getCharset(Map headers) {
        if(headers == null) {
            return ISO_8859_1;
        }
        return getCharset(headers.get(com.basho.riak.client.http.util.Constants.HDR_CONTENT_TYPE));
    }

    /**
     * Attempts to parse the {@link Charset} from a contentType string.
     * 
     * If contentType is null or no charset declaration found, then UTF-8 is
     * returned. If the found Charset declaration is unknown on this platform
     * then a runtime exception is thrown.
     * 
     * @param contentType
     * @return a {@link Charset} parsed from a charset declaration in a
     *         contentType String.
     */
    public static Charset getCharset(String contentType) {
        if (contentType == null) {
            return ISO_8859_1;
        }

        if (com.basho.riak.client.http.util.Constants.CTYPE_JSON_UTF8.equals(contentType)) {
            return UTF_8; // Fast-track
        }

        Matcher matcher = CHARSET_PATT.matcher(contentType);
        if (matcher.find()) {
            String encstr = matcher.group(1);

            if (encstr.equalsIgnoreCase("UTF-8")) {
                return UTF_8; // Fast-track
            } else {
                try {
                    return Charset.forName(encstr.toUpperCase());
                } catch (Exception e) {
                    // ignore //
                }
            }
        }

        return ISO_8859_1;
    }

    /**
     * Get the actual string value declared as the charset in a content-type
     * string, regardless of its validity.
     * 

* NOTE: this is different from getCharset, which will always return a * default value. *

* * @param contentType * the content-type string * @return the verbatim charset declared or null if non-exists */ public static String getDeclaredCharset(String contentType) { if (contentType == null) { return null; } Matcher matcher = CHARSET_PATT.matcher(contentType); if (matcher.find()) { String encstr = matcher.group(1); return encstr; } else { return null; } } /** * Adds the utf-8 charset to a content type. * * @param contentType * @return the contentType with ;charset=utf-8 appended. */ public static String addUtf8Charset(String contentType) { if (contentType == null) { return "text/plain;charset=utf-8"; } Matcher matcher = CHARSET_PATT.matcher(contentType); if (matcher.find()) { // replace what ever content-type with utf8 return contentType.substring(0, matcher.start(1)) + "utf-8" + contentType.substring(matcher.end(1)); } return contentType + ";charset=utf-8"; } /** * Turns a byte[] array into a string in the provided {@link Charset} * * @param bytes * @param charset * @return a String */ public static String asString(byte[] bytes, Charset charset) { if(bytes == null) { return null; } if(charset == null) { throw new IllegalArgumentException("Cannot get bytes without a Charset"); } try { return new String(bytes, charset.name()); } catch (UnsupportedEncodingException e) { throw new IllegalStateException(charset.name() + " must be present", e); } } /** * Turns a byte[] array into a UTF8 string * * @param bytes * @param charset * @return a String */ public static String asUTF8String(byte[] bytes) { return asString(bytes, UTF_8); } /** * Turn a string into an array of bytes using the passed {@link Charset} * * @param string * @param charset * @return a byte[] array */ public static byte[] asBytes(String string, Charset charset) { if(string == null) { return null; } if(charset == null) { throw new IllegalArgumentException("Cannot get bytes without a Charset"); } try { return string.getBytes(charset.name()); } catch (UnsupportedEncodingException e) { //since we are using *actual* charsets, not string lookups, this //should *never* happen. But it is better to throw it up than swallow it. throw new IllegalStateException("Charset present", e); } } /** * Turn a UTF-8 encoded string into an array of bytes * * @param string * @return */ public static byte[] utf8StringToBytes(String string) { return asBytes(string, UTF_8); } /** * Check if a content-type string has a charset field appended. * * @param ctype * the content-type string * @return true if ctype has a charset, false otherwise */ public static boolean hasCharset(String ctype) { if(ctype == null) { return false; } Matcher matcher = CHARSET_PATT.matcher(ctype); if (matcher.find()) { return true; } else { return false; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy