org.exist.util.UTF8 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of exist-core Show documentation
eXist-db NoSQL Database Core
There is a newer version: 6.3.0
/*
 *  `gnu.iou' I/O buffers and utilities.
 *  Copyright (C) 1998, 1999, 2000, 2001, 2002 John Pritchard.
 *
 *  This program is free software; you can redistribute it or modify
 *  it under the terms of the GNU Lesser General Public License as
 *  published by the Free Software Foundation; either version 2.1 of
 *  the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this program; if not, write to the Free
 *  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 *  02111-1307 USA
 */
package org.exist.util;

import javax.annotation.Nullable;

//TODO(AR) For eXist-db 6.0.0, this should be replaced with String.getBytes(UTF_8) and new String(bytes, UTF_8)

/**
 * Static helpers for UTF-8 encoding and decoding.
 *
 * <p>UTF-8 is ASCII-transparent: characters in the seven bit ASCII range are
 * stored as a single, identical byte, while all other characters (Unicode,
 * ISO-8859, ISO-10646, etc.) are stored as multi-byte sequences.</p>
 *
 * <p>Neither a code signature (byte order mark) nor a Java Data I/O style
 * length prefix is written or expected.</p>
 *
 * <p>Note that the {@code encode} and {@code encoded} methods work on single
 * UTF-16 code units and never emit more than three bytes per {@code char};
 * each half of a surrogate pair is therefore written as its own three byte
 * sequence. Only {@link #getUTF8Encoding(char, char, byte[])} produces the
 * four byte form for a surrogate pair. {@code decode} accepts both forms.</p>
 *
 * @author John Pritchard ([email protected])
 */
public class UTF8 {

    // Bit patterns used to build and recognise UTF-8 lead / continuation bytes.
    private static final char b10000000 = (char) 0x80;
    private static final char b11000000 = (char) 0xC0;
    private static final char b11100000 = (char) 0xE0;
    private static final char b11110000 = (char) 0xF0;
    private static final char b11111000 = (char) 0xF8;
    private static final char b11111100 = (char) 0xFC;
    private static final char b11111110 = (char) 0xFE;

    // Bit masks used to extract the payload bits of a byte.
    private static final char b01111111 = (char) 0x7F;
    private static final char b00111111 = (char) 0x3F;
    private static final char b00011111 = (char) 0x1F;
    private static final char b00001111 = (char) 0x0F;
    private static final char b00000111 = (char) 0x07;

    /**
     * Decodes a complete byte array of UTF-8 data.
     *
     * @param code the encoded UTF-8 data, may be {@code null}
     *
     * @return the decoded string, or {@code null} if {@code code} is
     *     {@code null} or empty
     *
     * @throws ArrayIndexOutOfBoundsException if a multi-byte sequence is cut
     *     off by the end of the array
     */
    public static final XMLString decode(@Nullable final byte[] code) {
        if (null == code) {
            return null;
        }
        return decode(code, 0, code.length);
    }

    /**
     * Decodes {@code many} bytes of UTF-8 data, starting at {@code off}, into
     * a newly created {@link XMLString}.
     *
     * @param code the encoded UTF-8 data, may be {@code null}
     * @param off the index of the first byte to decode
     * @param many the number of bytes to decode
     *
     * @return the decoded string, or {@code null} if {@code code} is
     *     {@code null} or empty
     *
     * @throws ArrayIndexOutOfBoundsException if the range, or a multi-byte
     *     sequence within it, extends past the end of the array
     */
    public static final XMLString decode(@Nullable final byte[] code, final int off, final int many) {
        if (null == code || 0 >= code.length) {
            return null;
        }
        return decode(code, off, many, new XMLString(many));
    }

    /**
     * Decodes {@code many} bytes of UTF-8 data, starting at {@code off}, and
     * appends the resulting characters to {@code xs}.
     *
     * <p>A zero byte does not terminate decoding; it is appended like any
     * other ASCII byte. Four byte sequences that denote a supplementary code
     * point are appended as a surrogate pair. The decoder is lenient: a byte
     * that is not a valid lead byte is appended with its high bit cleared,
     * and the obsolete five and six byte forms are consumed with only their
     * low 16 payload bits kept.</p>
     *
     * @param code the encoded UTF-8 data, may be {@code null}
     * @param off the index of the first byte to decode
     * @param many the number of bytes to decode
     * @param xs the string the decoded characters are appended to
     *
     * @return {@code xs}, or {@code null} if {@code code} is {@code null} or
     *     empty
     *
     * @throws ArrayIndexOutOfBoundsException if the range, or a multi-byte
     *     sequence within it, extends past the end of the array
     */
    public static final XMLString decode(@Nullable final byte[] code, final int off, final int many, final XMLString xs) {
        if (null == code || 0 >= code.length) {
            return null;
        }

        final int end = off + many;
        for (int c = off; c < end; c++) {
            final byte cc = code[c];

            if (0 <= cc) {
                // single byte (ASCII)
                xs.append((char) cc);
                continue;
            }

            final char ch;
            if (b11000000 == (cc & b11100000)) {
                // two byte sequence: 110xxxxx 10xxxxxx
                ch = (char) (((cc & b00011111) << 6)
                        | (code[c + 1] & b00111111));
                c += 1;
            } else if (b11100000 == (cc & b11110000)) {
                // three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
                ch = (char) (((cc & b00001111) << 12)
                        | ((code[c + 1] & b00111111) << 6)
                        | (code[c + 2] & b00111111));
                c += 2;
            } else if (b11110000 == (cc & b11111000)) {
                // four byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                final int codePoint = ((cc & b00000111) << 18)
                        | ((code[c + 1] & b00111111) << 12)
                        | ((code[c + 2] & b00111111) << 6)
                        | (code[c + 3] & b00111111);
                c += 3;
                if (Character.isSupplementaryCodePoint(codePoint)) {
                    // does not fit in one char: emit a UTF-16 surrogate pair
                    xs.append(Character.highSurrogate(codePoint));
                    ch = Character.lowSurrogate(codePoint);
                } else {
                    // overlong or out of range value: keep the low 16 bits
                    ch = (char) codePoint;
                }
            } else if (b11111000 == (cc & b11111100)) {
                // obsolete five byte form: only the low 16 payload bits are kept
                ch = (char) (((code[c + 2] & b00111111) << 12)
                        | ((code[c + 3] & b00111111) << 6)
                        | (code[c + 4] & b00111111));
                c += 4;
            } else if (b11111100 == (cc & b11111110)) {
                // obsolete six byte form: only the low 16 payload bits are kept
                ch = (char) (((code[c + 3] & b00111111) << 12)
                        | ((code[c + 4] & b00111111) << 6)
                        | (code[c + 5] & b00111111));
                c += 5;
            } else {
                // not a valid lead byte: append it with the high bit cleared
                ch = (char) (cc & b01111111);
            }
            xs.append(ch);
        }

        return xs;
    }

    /**
     * Encodes a character array in UTF-8.
     *
     * @param str the characters to encode, may be {@code null}
     *
     * @return a new array holding the encoded bytes, or {@code null} if
     *     {@code str} is {@code null} or empty
     */
    public static final byte[] encode(@Nullable final char[] str) {
        if (null == str || 0 >= str.length) {
            return null;
        }
        return encode(str, 0, str.length, null, 0);
    }

    /**
     * Encodes a range of a character array in UTF-8.
     *
     * <p>Warning: the size of {@code bytbuf} is not checked. Use
     * {@link #encoded(char[], int, int)} to determine the size needed.</p>
     *
     * @param str the characters to encode, may be {@code null}
     * @param start the index of the first character to encode
     * @param length the number of characters to encode
     * @param bytbuf the buffer to write to; if {@code null} a buffer of
     *     exactly the encoded size is allocated
     * @param offset the index in the buffer at which writing starts
     *
     * @return the buffer that was written to; {@code bytbuf} unchanged if
     *     {@code str} is {@code null} or {@code length} is not positive
     */
    public static final byte[] encode(@Nullable final char[] str, final int start, final int length, @Nullable byte[] bytbuf, int offset) {
        if (null == str || 0 >= length) {
            return bytbuf;
        }

        if (bytbuf == null) {
            bytbuf = new byte[encoded(str, start, length)];
        }

        final int end = start + length;
        for (int c = start; c < end; c++) {
            offset = putChar(str[c], bytbuf, offset);
        }

        return bytbuf;
    }

    /**
     * Encodes a whole string in UTF-8 into the given buffer.
     *
     * <p>Warning: the size of {@code bytbuf} is not checked. Use
     * {@link #encoded(String)} to determine the size needed.</p>
     *
     * @param str the string to encode, may be {@code null}
     * @param bytbuf the buffer to write to; if {@code null} a buffer of
     *     exactly the encoded size is allocated
     * @param offset the index in the buffer at which writing starts
     *
     * @return the buffer that was written to; {@code bytbuf} unchanged if
     *     {@code str} is {@code null} or empty
     */
    public static final byte[] encode(@Nullable final String str, @Nullable final byte[] bytbuf, final int offset) {
        if (null == str) {
            // str is declared nullable: behave like the other overloads
            return bytbuf;
        }
        return encode(str, 0, str.length(), bytbuf, offset);
    }

    /**
     * Encodes a range of a string in UTF-8.
     *
     * <p>Warning: the size of {@code bytbuf} is not checked. Use
     * {@link #encoded(String)} to determine the size needed.</p>
     *
     * @param str the string to encode, may be {@code null}
     * @param start the index of the first character to encode
     * @param length the number of characters to encode
     * @param bytbuf the buffer to write to; if {@code null} a buffer of
     *     exactly the encoded size is allocated
     * @param offset the index in the buffer at which writing starts
     *
     * @return the buffer that was written to; {@code bytbuf} unchanged if
     *     {@code str} is {@code null} or {@code length} is not positive
     */
    public static final byte[] encode(@Nullable final String str, final int start, final int length, @Nullable final byte[] bytbuf, int offset) {
        if (null == str || 0 >= length) {
            return bytbuf;
        }

        final int end = start + length;
        // mirror the char[] overload: allocate when the caller passed no buffer
        final byte[] target = (bytbuf != null) ? bytbuf : new byte[encodedLength(str, start, end)];
        for (int c = start; c < end; c++) {
            offset = putChar(str.charAt(c), target, offset);
        }

        return target;
    }

    /**
     * Encodes a string in UTF-8.
     *
     * @param s the string to encode, may be {@code null}
     *
     * @return a new array holding the encoded bytes, or {@code null} if
     *     {@code s} is {@code null} or empty
     */
    public static final byte[] encode(@Nullable final String s) {
        if (null == s) {
            return null;
        }
        return encode(s, 0, s.length(), null, 0);
    }

    /**
     * Returns the number of bytes needed to encode the string in UTF-8 with
     * the {@code encode} methods of this class.
     *
     * @param str the string, may be {@code null}
     * @return the length of the encoded string, 0 for {@code null}
     */
    public static final int encoded(@Nullable final String str) {
        if (null == str) {
            return 0;
        }
        return encodedLength(str, 0, str.length());
    }

    /**
     * Returns the number of bytes needed to encode a range of a character
     * array in UTF-8 with the {@code encode} methods of this class.
     *
     * @param str the characters, may be {@code null}
     * @param start the index of the first character
     * @param len the number of characters
     * @return the length of the encoded range, 0 if {@code str} is
     *     {@code null} or {@code len} is not positive
     */
    public static final int encoded(@Nullable final char[] str, final int start, final int len) {
        if (null == str || 0 >= len) {
            return 0;
        }

        int bytlen = 0;
        final int end = start + len;
        for (int c = start; c < end; c++) {
            bytlen += encodedLength(str[c]);
        }
        return bytlen;
    }

    /**
     * Static method to generate the UTF-8 representation of a Unicode character.
     * This particular code is taken from saxon (see http://saxon.sf.net).
     *
     * @param in  the Unicode character, or the high half of a surrogate pair
     * @param in2 the low half of a surrogate pair (ignored unless the first argument is in the
     *            range for a surrogate pair)
     * @param out an array of at least 4 bytes to hold the UTF-8 representation.
     * @return the number of bytes in the UTF-8 representation; 0 if {@code in} is itself the
     *         low half of a surrogate pair
     * @throws IllegalArgumentException if {@code in} is a high surrogate and {@code in2} is not
     *         a low surrogate
     */
    public static int getUTF8Encoding(final char in, final char in2, final byte[] out) {
        // See Tony Graham, "Unicode, a Primer", page 92
        final int i = (int) in;
        if (i <= 0x7f) {
            out[0] = (byte) i;
            return 1;
        }
        if (i <= 0x7ff) {
            out[0] = (byte) (0xc0 | ((in >> 6) & 0x1f));
            out[1] = (byte) (0x80 | (in & 0x3f));
            return 2;
        }
        if (i >= 0xd800 && i <= 0xdbff) {
            // surrogate pair
            final int j = (int) in2;
            if (j < 0xdc00 || j > 0xdfff) {
                throw new IllegalArgumentException("Malformed Unicode Surrogate Pair (" + i + "," + j + ")");
            }
            // the code point is laid out as uuuuu zzzz yyyyyy xxxxxx
            final byte xxxxxx = (byte) (j & 0x3f);
            final byte yyyyyy = (byte) (((i & 0x03) << 4) | ((j >> 6) & 0x0f));
            final byte zzzz = (byte) ((i >> 2) & 0x0f);
            final byte uuuuu = (byte) (((i >> 6) & 0x0f) + 1);
            out[0] = (byte) (0xf0 | ((uuuuu >> 2) & 0x07));
            out[1] = (byte) (0x80 | ((uuuuu & 0x03) << 4) | zzzz);
            out[2] = (byte) (0x80 | yyyyyy);
            out[3] = (byte) (0x80 | xxxxxx);
            return 4;
        }
        if (i >= 0xdc00 && i <= 0xdfff) {
            // second half of surrogate pair - ignore it
            return 0;
        }
        out[0] = (byte) (0xe0 | ((in >> 12) & 0x0f));
        out[1] = (byte) (0x80 | ((in >> 6) & 0x3f));
        out[2] = (byte) (0x80 | (in & 0x3f));
        return 3;
    }

    /** Writes the one to three byte encoding of {@code ch} at {@code offset} and returns the next free index. */
    private static int putChar(final char ch, final byte[] buf, int offset) {
        if (0x7f >= ch) {
            buf[offset++] = (byte) ch;
        } else if (0x7ff >= ch) {
            buf[offset++] = (byte) (b11000000 | ((ch >>> 6) & b00011111));
            buf[offset++] = (byte) (b10000000 | (ch & b00111111));
        } else {
            buf[offset++] = (byte) (b11100000 | ((ch >>> 12) & b00001111));
            buf[offset++] = (byte) (b10000000 | ((ch >>> 6) & b00111111));
            buf[offset++] = (byte) (b10000000 | (ch & b00111111));
        }
        return offset;
    }

    /** Returns the number of bytes {@link #putChar(char, byte[], int)} writes for {@code ch}. */
    private static int encodedLength(final char ch) {
        if (0x7f >= ch) {
            return 1;
        }
        if (0x7ff >= ch) {
            return 2;
        }
        return 3;
    }

    /** Returns the number of bytes needed to encode the characters of {@code str} in {@code [start, end)}. */
    private static int encodedLength(final String str, final int start, final int end) {
        int bytlen = 0;
        for (int c = start; c < end; c++) {
            bytlen += encodedLength(str.charAt(c));
        }
        return bytlen;
    }
}