// java.nio.charset.Charset Maven / Gradle / Ivy

/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package java.nio.charset;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.spi.CharsetProvider;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import libcore.icu.NativeConverter;

/**
 * A charset is a named mapping between Unicode characters and byte sequences. Every
 * {@code Charset} can decode, converting a byte sequence into a sequence of characters,
 * and some can also encode, converting a sequence of characters into a byte sequence.
 * Use the method {@link #canEncode} to find out whether a charset supports both.
 *
 * Characters
 * In the context of this class, character always refers to a Java character: a Unicode
 * code point in the range U+0000 to U+FFFF. (Java represents supplementary characters using surrogates.)
 * Not all byte sequences will represent a character, and not
 * all characters can necessarily be represented by a given charset. The method {@link #contains}
 * can be used to determine whether every character representable by one charset can also be
 * represented by another (meaning that a lossless transformation is possible from the contained
 * to the container).
 *
 * 
Encodings
 * There are many possible ways to represent Unicode characters as byte sequences.
 * See UTR#17: Unicode Character Encoding Model
 * for detailed discussion.
 *
 * 
The most important mappings capable of representing every character are the Unicode
 * Transformation Format (UTF) charsets. Of those, UTF-8 and the UTF-16 family are the most
 * common. UTF-8 (described in RFC 3629)
 * encodes a character using 1 to 4 bytes. UTF-16 uses exactly 2 bytes per character (potentially
 * wasting space, but allowing efficient random access into BMP text), and UTF-32 uses
 * exactly 4 bytes per character (trading off even more space for efficient random access into text
 * that includes supplementary characters).
 *
 * 
UTF-16 and UTF-32 encode characters directly, using their code point as a two- or four-byte
 * integer. This means that any given UTF-16 or UTF-32 byte sequence is either big- or
 * little-endian. To assist decoders, Unicode includes a special byte order mark (BOM)
 * character U+FEFF used to determine the endianness of a sequence. The corresponding byte-swapped
 * code point U+FFFE is guaranteed never to be assigned. If a UTF-16 decoder sees
 * {@code 0xfe, 0xff}, for example, it knows it's reading a big-endian byte sequence, while
 * {@code 0xff, 0xfe}, would indicate a little-endian byte sequence.
 *
 * 
UTF-8 can contain a BOM, but since the UTF-8 encoding of a character always uses the same
 * byte sequence, there is no information about endianness to convey. Seeing the bytes
 * corresponding to the UTF-8 encoding of U+FEFF ({@code 0xef, 0xbb, 0xbf}) would only serve to
 * suggest that you're reading UTF-8. Note that BOMs are decoded as the U+FEFF character, and
 * will appear in the output character sequence. This means that a disadvantage to including a BOM
 * in UTF-8 is that most applications that use UTF-8 do not expect to see a BOM. (This is also a
 * reason to prefer UTF-8: it's one less complication to worry about.)
 *
 * 
Because a BOM indicates how the data that follows should be interpreted, a BOM should occur
 * as the first character in a character sequence.
 *
 * 
See the Byte Order Mark (BOM) FAQ for
 * more about dealing with BOMs.
 *
 * 
Endianness and BOM behavior
 *
 * The following tables show the endianness and BOM behavior of the UTF-16 variants.
 *
 * 
This table shows what the encoder writes. "BE" means that the byte sequence is big-endian,
 * "LE" means little-endian. "BE BOM" means a big-endian BOM (that is, {@code 0xfe, 0xff}).
 * 

 * 
 * 
 * 
 * 
 *  Charset   Encoder writes   
 UTF-16BE  BE, no BOM       
 UTF-16LE  LE, no BOM       
 UTF-16    BE, with BE BOM  
 *
 * The next table shows how each variant's decoder behaves when reading a byte sequence.
 * The exact meaning of "failure" in the table is dependent on the
 * {@link CodingErrorAction} supplied to {@link CharsetDecoder#malformedInputAction}, so
 * "BE, failure" means "the byte sequence is treated as big-endian, and a little-endian BOM
 * triggers the malformedInputAction".
 *
 * 
The phrase "includes BOM" means that the output includes the U+FEFF byte order mark character.
 *
 * 

 * 
 * 
 * 
 * 
 *  Charset   BE BOM            LE BOM            No BOM  
 UTF-16BE  BE, includes BOM  BE, failure       BE      
 UTF-16LE  LE, failure       LE, includes BOM  LE      
 UTF-16    BE                LE                BE      
 *
 * Charset names
 * A charset has a canonical name, returned by {@link #name}. Most charsets will
 * also have one or more aliases, returned by {@link #aliases}. A charset can be looked up
 * by canonical name or any of its aliases using {@link #forName}.
 *
 * 
Guaranteed-available charsets
 * The following charsets are available on every Java implementation:
 * 

 * ISO-8859-1
 * 
US-ASCII
 * 
UTF-16
 * 
UTF-16BE
 * 
UTF-16LE
 * 
UTF-8
 * 
 * All of these charsets support both decoding and encoding. The charsets whose names begin
 * "UTF" can represent all characters, as mentioned above. The "ISO-8859-1" and "US-ASCII" charsets
 * can only represent small subsets of these characters. Except when required to do otherwise for
 * compatibility, new code should use one of the UTF charsets listed above. The platform's default
 * charset is UTF-8. (This is in contrast to some older implementations, where the default charset
 * depended on the user's locale.)
 *
 * 
Most implementations will support hundreds of charsets. Use {@link #availableCharsets} or
 * {@link #isSupported} to see what's available. If you intend to use the charset if it's
 * available, just call {@link #forName} and catch the exceptions it throws if the charset isn't
 * available.
 *
 * 
Additional charsets can be made available by configuring one or more charset
 * providers through provider configuration files. Such files are always named
 * as "java.nio.charset.spi.CharsetProvider" and located in the
 * "META-INF/services" directory of one or more classpaths. The files should be
 * encoded in "UTF-8". Each line of their content specifies the class name of a
 * charset provider which extends {@link java.nio.charset.spi.CharsetProvider}.
 * A line should end with '\r', '\n' or '\r\n'. Leading and trailing whitespace
 * is trimmed. Blank lines, and lines (after trimming) starting with "#" which are
 * regarded as comments, are both ignored. Duplicates of names already found are also
 * ignored. Both the configuration files and the provider classes will be loaded
 * using the thread context class loader.
 *
 * 
Although class is thread-safe, the {@link CharsetDecoder} and {@link CharsetEncoder} instances
 * it returns are inherently stateful.
 */
public abstract class Charset implements Comparable {
    private static final HashMap CACHED_CHARSETS = new HashMap();

    private static final Charset DEFAULT_CHARSET = getDefaultCharset();

    private final String canonicalName;

    private final HashSet aliasesSet;

    /**
     * Constructs a Charset object. Duplicated aliases are
     * ignored.
     *
     * @param canonicalName
     *            the canonical name of the charset.
     * @param aliases
     *            an array containing all aliases of the charset. May be null.
     * @throws IllegalCharsetNameException
     *             on an illegal value being supplied for either
     *             canonicalName or for any element of
     *             aliases.
     */
    protected Charset(String canonicalName, String[] aliases) {
        // Check whether the given canonical name is legal.
        checkCharsetName(canonicalName);
        this.canonicalName = canonicalName;

        // Collect and check each unique alias.
        this.aliasesSet = new HashSet();
        if (aliases != null) {
            for (String alias : aliases) {
                checkCharsetName(alias);
                this.aliasesSet.add(alias);
            }
        }
    }

    private static void checkCharsetName(String name) {
        if (name.isEmpty()) {
            throw new IllegalCharsetNameException(name);
        }
        if (!isValidCharsetNameStart(name.charAt(0))) {
            throw new IllegalCharsetNameException(name);
        }
        for (int i = 1; i < name.length(); ++i) {
            if (!isValidCharsetNamePart(name.charAt(i))) {
                throw new IllegalCharsetNameException(name);
            }
        }
    }

    private static boolean isValidCharsetNameStart(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }

    private static boolean isValidCharsetNamePart(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
                c == '-' || c == '.' || c == ':' || c == '_';
    }

    /**
     * Returns an immutable case-insensitive map from canonical names to {@code Charset} instances.
     * If multiple charsets have the same canonical name, it is unspecified which is returned in
     * the map. This method may be slow. If you know which charset you're looking for, use
     * {@link #forName}.
     */
    public static SortedMap availableCharsets() {
        // Start with a copy of the built-in charsets...
        TreeMap charsets = new TreeMap(String.CASE_INSENSITIVE_ORDER);
        for (String charsetName : NativeConverter.getAvailableCharsetNames()) {
            // RoboVM note: Added try-catch to ignore charsets with bad names (e.g. "x-UTF-16,version=1")
            try {
                Charset charset = NativeConverter.charsetForName(charsetName);
                charsets.put(charset.name(), charset);
            } catch (IllegalCharsetNameException e) {
            }
        }

        // Add all charsets provided by all charset providers...
        for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
            Iterator it = charsetProvider.charsets();
            while (it.hasNext()) {
                Charset cs = it.next();
                // A CharsetProvider can't override a built-in Charset.
                if (!charsets.containsKey(cs.name())) {
                    charsets.put(cs.name(), cs);
                }
            }
        }

        return Collections.unmodifiableSortedMap(charsets);
    }

    private static Charset cacheCharset(String charsetName, Charset cs) {
        synchronized (CACHED_CHARSETS) {
            // Get the canonical name for this charset, and the canonical instance from the table.
            String canonicalName = cs.name();
            Charset canonicalCharset = CACHED_CHARSETS.get(canonicalName);
            if (canonicalCharset == null) {
                canonicalCharset = cs;
            }

            // Cache the charset by its canonical name...
            CACHED_CHARSETS.put(canonicalName, canonicalCharset);

            // And the name the user used... (Section 1.4 of http://unicode.org/reports/tr22/ means
            // that many non-alias, non-canonical names are valid. For example, "utf8" isn't an
            // alias of the canonical name "UTF-8", but we shouldn't penalize consistent users of
            // such names unduly.)
            CACHED_CHARSETS.put(charsetName, canonicalCharset);

            // And all its aliases...
            for (String alias : cs.aliasesSet) {
                CACHED_CHARSETS.put(alias, canonicalCharset);
            }

            return canonicalCharset;
        }
    }

    /**
     * Returns a {@code Charset} instance for the named charset.
     *
     * @param charsetName a charset name (either canonical or an alias)
     * @throws IllegalCharsetNameException
     *             if the specified charset name is illegal.
     * @throws UnsupportedCharsetException
     *             if the desired charset is not supported by this runtime.
     */
    public static Charset forName(String charsetName) {
        // Is this charset in our cache?
        Charset cs;
        synchronized (CACHED_CHARSETS) {
            cs = CACHED_CHARSETS.get(charsetName);
            if (cs != null) {
                return cs;
            }
        }

        if (charsetName == null) {
            throw new IllegalCharsetNameException(null);
        }

        // Is this a built-in charset supported by ICU?
        checkCharsetName(charsetName);
        cs = NativeConverter.charsetForName(charsetName);
        if (cs != null) {
            return cacheCharset(charsetName, cs);
        }

        // Does a configured CharsetProvider have this charset?
        for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class)) {
            cs = charsetProvider.charsetForName(charsetName);
            if (cs != null) {
                return cacheCharset(charsetName, cs);
            }
        }

        throw new UnsupportedCharsetException(charsetName);
    }

    /**
     * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
     * which is all pre-nio code claims to throw.
     *
     * @hide internal use only
     */
    public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
        try {
            return Charset.forName(charsetName);
        } catch (Exception cause) {
            UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
            ex.initCause(cause);
            throw ex;
        }
    }

    /**
     * Determines whether the specified charset is supported by this runtime.
     *
     * @param charsetName
     *            the name of the charset.
     * @return true if the specified charset is supported, otherwise false.
     * @throws IllegalCharsetNameException
     *             if the specified charset name is illegal.
     */
    public static boolean isSupported(String charsetName) {
        try {
            forName(charsetName);
            return true;
        } catch (UnsupportedCharsetException ex) {
            return false;
        }
    }

    /**
     * Determines whether this charset is a superset of the given charset. A charset C1 contains
     * charset C2 if every character representable by C2 is also representable by C1. This means
     * that lossless conversion is possible from C2 to C1 (but not necessarily the other way
     * round). It does not imply that the two charsets use the same byte sequences for the
     * characters they share.
     *
     * 
Note that this method is allowed to be conservative, and some implementations may return
     * false when this charset does contain the other charset. Android's implementation is precise,
     * and will always return true in such cases.
     *
     * @param charset
     *            a given charset.
     * @return true if this charset is a super set of the given charset,
     *         false if it's unknown or this charset is not a superset of
     *         the given charset.
     */
    public abstract boolean contains(Charset charset);

    /**
     * Returns a new instance of an encoder for this charset.
     */
    public abstract CharsetEncoder newEncoder();

    /**
     * Returns a new instance of a decoder for this charset.
     */
    public abstract CharsetDecoder newDecoder();

    /**
     * Returns the canonical name of this charset.
     *
     * 
If a charset is in the IANA registry, this will be the MIME-preferred name (a charset
     * may have multiple IANA-registered names). Otherwise the canonical name will begin with "x-"
     * or "X-".
     */
    public final String name() {
        return this.canonicalName;
    }

    /**
     * Returns an unmodifiable set of this charset's aliases.
     */
    public final Set aliases() {
        return Collections.unmodifiableSet(this.aliasesSet);
    }

    /**
     * Returns the name of this charset for the default locale.
     *
     * 
The default implementation returns the canonical name of this charset.
     * Subclasses may return a localized display name.
     */
    public String displayName() {
        return this.canonicalName;
    }

    /**
     * Returns the name of this charset for the specified locale.
     *
     * 
The default implementation returns the canonical name of this charset.
     * Subclasses may return a localized display name.
     */
    public String displayName(Locale l) {
        return this.canonicalName;
    }

    /**
     * Returns true if this charset is known to be registered in the IANA
     * Charset Registry.
     */
    public final boolean isRegistered() {
        return !canonicalName.startsWith("x-") && !canonicalName.startsWith("X-");
    }

    /**
     * Returns true if this charset supports encoding, false otherwise.
     */
    public boolean canEncode() {
        return true;
    }

    /**
     * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from
     * {@code buffer}.
     * This method uses {@code CodingErrorAction.REPLACE}.
     *
     * 
Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
     * for performance.
     *
     * @param buffer
     *            the character buffer containing the content to be encoded.
     * @return the result of the encoding.
     */
    public final ByteBuffer encode(CharBuffer buffer) {
        try {
            return newEncoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE).encode(
                            buffer);
        } catch (CharacterCodingException ex) {
            throw new Error(ex.getMessage(), ex);
        }
    }

    /**
     * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from {@code s}.
     * This method uses {@code CodingErrorAction.REPLACE}.
     *
     * 
Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
     * for performance.
     *
     * @param s the string to be encoded.
     * @return the result of the encoding.
     */
    public final ByteBuffer encode(String s) {
        return encode(CharBuffer.wrap(s));
    }

    /**
     * Returns a new {@code CharBuffer} containing the characters decoded from {@code buffer}.
     * This method uses {@code CodingErrorAction.REPLACE}.
     *
     * Applications should generally create a {@link CharsetDecoder} using {@link #newDecoder}
     * for performance.
     *
     * @param buffer
     *            the byte buffer containing the content to be decoded.
     * @return a character buffer containing the output of the decoding.
     */
    public final CharBuffer decode(ByteBuffer buffer) {
        try {
            return newDecoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE).decode(buffer);
        } catch (CharacterCodingException ex) {
            throw new Error(ex.getMessage(), ex);
        }
    }

    /*
     * -------------------------------------------------------------------
     * Methods implementing parent interface Comparable
     * -------------------------------------------------------------------
     */

    /**
     * Compares this charset with the given charset. This comparison is
     * based on the case insensitive canonical names of the charsets.
     *
     * @param charset
     *            the given object to be compared with.
     * @return a negative integer if less than the given object, a positive
     *         integer if larger than it, or 0 if equal to it.
     */
    public final int compareTo(Charset charset) {
        return this.canonicalName.compareToIgnoreCase(charset.canonicalName);
    }

    /*
     * -------------------------------------------------------------------
     * Methods overriding parent class Object
     * -------------------------------------------------------------------
     */

    /**
     * Determines whether this charset equals to the given object. They are
     * considered to be equal if they have the same canonical name.
     *
     * @param obj
     *            the given object to be compared with.
     * @return true if they have the same canonical name, otherwise false.
     */
    @Override
    public final boolean equals(Object obj) {
        if (obj instanceof Charset) {
            Charset that = (Charset) obj;
            return this.canonicalName.equals(that.canonicalName);
        }
        return false;
    }

    /**
     * Gets the hash code of this charset.
     *
     * @return the hash code of this charset.
     */
    @Override
    public final int hashCode() {
        return this.canonicalName.hashCode();
    }

    /**
     * Gets a string representation of this charset. Usually this contains the
     * canonical name of the charset.
     *
     * @return a string representation of this charset.
     */
    @Override
    public final String toString() {
        return getClass().getName() + "[" + this.canonicalName + "]";
    }

    /**
     * Returns the system's default charset. This is determined during VM startup, and will not
     * change thereafter. On Android, the default charset is UTF-8.
     */
    public static Charset defaultCharset() {
        return DEFAULT_CHARSET;
    }

    private static Charset getDefaultCharset() {
        String encoding = System.getProperty("file.encoding", "UTF-8");
        try {
            return Charset.forName(encoding);
        } catch (UnsupportedCharsetException e) {
            return Charset.forName("UTF-8");
        }
    }
}
/*
 * UTF-16 decoder behavior by byte order mark:
 *
 * Charset	BE BOM	LE BOM	No BOM
 * UTF-16BE	BE, includes BOM	BE, failure	BE
 * UTF-16LE	LE, failure	LE, includes BOM	LE
 * UTF-16	BE	LE	BE
 */