net.sf.saxon.serialize.charcode.UTF8CharacterSet Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
There is a newer version: 12.5
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.serialize.charcode;

import net.sf.saxon.z.IntIterator;

import java.io.ByteArrayOutputStream;

/**
 * This class defines properties of the UTF-8 character set, and supplies static helper
 * methods for converting between Unicode codepoints and their UTF-8 byte representation.
 */

public final class UTF8CharacterSet implements CharacterSet {

    private static final UTF8CharacterSet theInstance = new UTF8CharacterSet();

    /**
     * Private constructor to force the singular instance to be used
     */

    private UTF8CharacterSet() {
    }

    /**
     * Get the singular instance of this class
     *
     * @return the singular instance of this class
     */

    public static UTF8CharacterSet getInstance() {
        return theInstance;
    }

    /**
     * Test whether a character is present in this character set. This implementation
     * accepts every value it is given.
     *
     * @param c the codepoint to be tested
     * @return always true
     */

    @Override
    public boolean inCharset(int c) {
        return true;
    }

    /**
     * Get the canonical name of this character set
     *
     * @return the string "UTF-8"
     */

    /*@NotNull*/
    @Override
    public String getCanonicalName() {
        return "UTF-8";
    }

    /**
     * Static method to generate the UTF-8 representation of a Unicode character
     *
     * @param in  the Unicode character, or the high half of a surrogate pair
     * @param in2 the low half of a surrogate pair (ignored unless the first argument is in the
     *            range for a surrogate pair)
     * @param out an array of at least 4 bytes to hold the UTF-8 representation.
     * @return the number of bytes in the UTF-8 representation; 0 if the first argument is
     *         itself a low surrogate (which is skipped, on the assumption that it was already
     *         consumed together with the preceding high surrogate)
     * @throws IllegalArgumentException if the first argument is a high surrogate and the
     *                                  second argument is not a low surrogate
     */

    public static int getUTF8Encoding(char in, char in2, byte[] out) {
        // See Tony Graham, "Unicode, a Primer", page 92
        int i = in;
        if (i <= 0x7f) {
            // one byte: 0xxxxxxx
            out[0] = (byte) i;
            return 1;
        } else if (i <= 0x7ff) {
            // two bytes: 110yyyyy 10xxxxxx
            out[0] = (byte) (0xc0 | ((in >> 6) & 0x1f));
            out[1] = (byte) (0x80 | (in & 0x3f));
            return 2;
        } else if (i >= 0xd800 && i <= 0xdbff) {
            // surrogate pair: high = 110110wwwwzzzzyy, low = 110111yyyyxxxxxx, uuuuu = wwww + 1
            int j = in2;
            if (!(j >= 0xdc00 && j <= 0xdfff)) {
                throw new IllegalArgumentException("Malformed Unicode Surrogate Pair (" + i + ',' + j + ')');
            }
            byte xxxxxx = (byte) (j & 0x3f);
            byte yyyyyy = (byte) (((i & 0x03) << 4) | ((j >> 6) & 0x0f));
            byte zzzz = (byte) ((i >> 2) & 0x0f);
            byte uuuuu = (byte) (((i >> 6) & 0x0f) + 1);
            // four bytes: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
            out[0] = (byte) (0xf0 | ((uuuuu >> 2) & 0x07));
            out[1] = (byte) (0x80 | ((uuuuu & 0x03) << 4) | zzzz);
            out[2] = (byte) (0x80 | yyyyyy);
            out[3] = (byte) (0x80 | xxxxxx);
            return 4;
        } else if (i >= 0xdc00 && i <= 0xdfff) {
            // second half of surrogate pair - ignore it
            return 0;
        } else {
            // three bytes: 1110zzzz 10yyyyyy 10xxxxxx
            out[0] = (byte) (0xe0 | ((in >> 12) & 0x0f));
            out[1] = (byte) (0x80 | ((in >> 6) & 0x3f));
            out[2] = (byte) (0x80 | (in & 0x3f));
            return 3;
        }
    }

    /**
     * Static method to generate the UTF-8 representation of a sequence of Unicode codepoints
     *
     * @param codePoints  the sequence of Unicode codepoints: must not include surrogates
     * @return the UTF-8 encoding of the characters
     * @throws IllegalStateException if the sequence contains a negative value
     */

    public static byte[] encode(IntIterator codePoints) {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // Six bytes carry 31 payload bits (1 + 5*6), which is enough for any non-negative int,
        // so this buffer cannot overflow. Values above 0x10FFFF are not rejected here: they are
        // written using the same bit pattern extended to five or six bytes.
        final byte[] cpBytes = new byte[6];
        while (codePoints.hasNext()) {
            int cp = codePoints.next();
            if (cp < 0) {
                throw new IllegalStateException("No negative code point allowed");
            } else if (cp < 0x80) {
                baos.write((byte) cp);
            } else {
                // Build the sequence back-to-front: emit continuation bytes (low six bits first)
                // until what remains fits in the payload bits of a lead byte of the current length.
                int bi = 0;
                int lastPrefix = 0xC0;  // lead-byte marker for a two-byte sequence
                int lastMask = 0x1F;    // payload bits available in that lead byte
                for (; ; ) {
                    int b = 0x80 | (cp & 0x3F);
                    cpBytes[bi] = (byte) b;
                    ++bi;
                    cp >>= 6;
                    if ((cp & ~lastMask) == 0) {
                        cpBytes[bi] = (byte) (lastPrefix | cp);
                        ++bi;
                        break;
                    }
                    // One more byte needed: the lead marker gains a 1 bit, the payload loses one
                    lastPrefix = 0x80 | (lastPrefix >> 1);
                    lastMask >>= 1;
                }
                // Write out in the correct (lead byte first) order
                while (bi > 0) {
                    --bi;
                    baos.write(cpBytes[bi]);
                }
            }
        }
        return baos.toByteArray();
    }

    /**
     * Decode a UTF8 character
     *
     * @param in   array of bytes representing a single UTF-8 encoded character
     * @param used number of bytes in the array that are actually used
     * @return the Unicode codepoint of this character
     * @throws IllegalArgumentException if the byte sequence is not a valid UTF-8 representation,
     *                                  or if {@code used} is less than one or exceeds the array length
     */

    public static int decodeUTF8(byte[] in, int used) throws IllegalArgumentException {
        if (used < 1 || used > in.length) {
            throw new IllegalArgumentException(
                    "UTF8 sequence length " + used + " is not valid for an array of " + in.length + " bytes");
        }
        // Accumulate the six payload bits of each continuation byte (10xxxxxx)
        int bottom = 0;
        for (int i = 1; i < used; i++) {
            if ((in[i] & 0xc0) != 0x80) {
                throw new IllegalArgumentException("Byte " + (i + 1) + " in UTF-8 sequence has wrong top bits");
            }
            bottom = (bottom << 6) + (in[i] & 0x3f);
        }
        if ((in[0] & 0x80) == 0) {
            // single byte sequence 0xxxxxxx
            if (used == 1) {
                return in[0];
            } else {
                throw new IllegalArgumentException("UTF8 single byte expected");
            }
        } else if ((in[0] & 0xe0) == 0xc0) {
            // two byte sequence 110yyyyy 10xxxxxx
            if (used != 2) {
                throw new IllegalArgumentException("UTF8 sequence of two bytes expected");
            }
            return ((in[0] & 0x1f) << 6) + bottom;
        } else if ((in[0] & 0xf0) == 0xe0) {
            // three byte sequence 1110zzzz 10yyyyyy 10xxxxxx
            if (used != 3) {
                throw new IllegalArgumentException("UTF8 sequence of three bytes expected");
            }
            return ((in[0] & 0x0f) << 12) + bottom;
        } else if ((in[0] & 0xf8) == 0xf0) {
            // four-byte sequence 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
            if (used != 4) {
                throw new IllegalArgumentException("UTF8 sequence of four bytes expected");
            }
            // three continuation bytes contribute 18 bits, so the lead payload sits above them
            return ((in[0] & 0x07) << 18) + bottom;
        } else {
            throw new IllegalArgumentException("UTF8 invalid first byte");
        }
    }


}