org.apache.jena.atlas.io.BlockUTF8 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jena-base Show documentation
This module contains non-RDF library code and the common system runtime.
There is a newer version: 5.2.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.atlas.io;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;

/**
 * Convert between bytes and chars, UTF-8 only.
 * <p>
 * This code is just the UTF-8 encoding rules - it does not check for legality
 * of the Unicode data. The standard codecs do check, so they do not round-trip
 * such data with binary compatibility. (Example: a single element of a surrogate
 * pair will be encoded/decoded without loss.)
 * <p>
 * The usual Charset encoders/decoders can be expensive to start up - they are also
 * not thread safe. Sometimes we want to convert 10's of chars and UTF-8 can be
 * done in code with no lookup tables (which, if used, are cache-unfriendly).
 * <p>
 * This code is thread safe: every method is static and the class holds no state.
 * It uses code in the hope that JITting will make it fast if used heavily.
 * <p>
 * Note on supplementary characters: the encoder works one Java {@code char} at a
 * time, so each half of a surrogate pair is written as its own 3-byte sequence.
 * The decoder accepts both that form and regular 4-byte sequences.
 */
public class BlockUTF8 {
    // Looking in java.lang.StringCoding (Sun RT) is illuminating.
    // The actual encode/decode code is in sun.nio.cs.UTF_8.(Decoder|Encoder)
    // which has special cases for ByteBuffer, ByteBuffer with array (needs offsets)
    // and byte[] <-> char[]

    // It seems that chars -> bytes (on <100char strings) is faster with BlockUTF8
    // but the conversion from bytes to string is faster with Java decoders (not by much though).

    /*
     * Bits
     * 7    U+007F      1 to 127              0xxxxxxx
     * 11   U+07FF      128 to 2,047          110xxxxx 10xxxxxx
     * 16   U+FFFF      2,048 to 65,535       1110xxxx 10xxxxxx 10xxxxxx
     * 21   U+1FFFFF    65,536 and up         11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 26   U+3FFFFFF                         111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 31   U+7FFFFFFF                        1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *
     * Only the first four forms are decoded here; the encoder only ever emits the
     * first three because a Java char is 16 bits.
     */

    /**
     * Convert the bytes in the ByteBuffer to characters in the CharBuffer. The
     * CharBuffer must be large enough.
     * <p>
     * Bytes are consumed from the buffer's current position up to its limit;
     * characters are appended at the CharBuffer's current position.
     *
     * @param bb source of UTF-8 bytes
     * @param cb destination for the decoded characters
     */
    public static void toChars(ByteBuffer bb, CharBuffer cb) {
        toCharsBuffer(bb, cb);
    }

    /**
     * Convert characters to UTF-8 bytes in the ByteBuffer. The ByteBuffer must be
     * large enough.
     * <p>
     * Characters are consumed from the CharBuffer's current position up to its
     * limit; bytes are appended at the ByteBuffer's current position.
     *
     * @param cb source of characters
     * @param bb destination for the encoded bytes
     */
    public static void fromChars(CharBuffer cb, ByteBuffer bb) {
        fromCharsBuffer(cb, bb);
    }

    /**
     * Convert a character sequence to UTF-8 bytes in the ByteBuffer. The
     * ByteBuffer must be large enough.
     *
     * @param cs source of characters
     * @param bb destination for the encoded bytes
     */
    public static void fromChars(CharSequence cs, ByteBuffer bb) {
        fromChars(CharBuffer.wrap(cs), bb);
    }

    /**
     * Make a string from UTF-8 bytes in a ByteBuffer.
     *
     * @param bb source of UTF-8 bytes, read from position to limit
     * @return the decoded string
     */
    public static String toString(ByteBuffer bb) {
        // One char per input byte is always enough: every decoded sequence
        // produces no more chars than the number of bytes it consumes
        // (1 byte -> 1 char, ..., 4 bytes -> at most 2 chars).
        int len = bb.remaining();
        CharBuffer cb = CharBuffer.allocate(len);
        toChars(bb, cb);
        return new String(cb.array(), 0, cb.position());
    }

    /**
     * Decode loop: reads one UTF-8 sequence per iteration using relative buffer
     * access and appends the resulting char(s) to {@code cb}.
     */
    private static void toCharsBuffer(ByteBuffer bb, CharBuffer cb) {
        while ( bb.hasRemaining() ) {
            // Java bytes are signed; mask so the value is 0..255 and error
            // messages show the real byte rather than a sign-extended int.
            int x = bb.get() & 0xFF;

            if ( x <= 0x7F ) {
                // 0xxxxxxx => 1 byte.
                // This includes 0x00: a null byte is passed through as the
                // null character (illegal Unicode, Java compatible).
                cb.put((char)x);
                continue;
            }
            if ( (x & 0xE0) == 0xC0 ) {
                // 110xxxxx => 2 bytes.
                // Unrolled common path (rather than calling readMultiBytes).
                if ( !bb.hasRemaining() ) {
                    exception("Premature end to UTF-8 sequence at end of input");
                }
                int x2 = bb.get() & 0xFF;
                // 10xxxxxx => continuation byte
                if ( (x2 & 0xC0) != 0x80 ) {
                    exception("Illegal UTF-8 processing character: 0x%04X", x2);
                }
                // 5 bits of x, 6 bits of x2
                int ch = ((x & 0x1F) << 6) | (x2 & 0x3F);
                cb.put((char)ch);
                continue;
            }
            if ( (x & 0xF0) == 0xE0 ) {
                // 1110xxxx => 3 bytes : 16 bits : not outside 16 bit chars
                int ch = readMultiBytes(bb, x & 0x0F, 3);
                cb.put((char)ch);
                continue;
            }
            if ( (x & 0xF8) == 0xF0 ) {
                // 11110zzz => 4 bytes.
                // The lead byte carries the 3 low bits (mask 0x07).
                int ch = readMultiBytes(bb, x & 0x07, 4);
                if ( !Character.isValidCodePoint(ch) ) {
                    // Lead bytes F4..F7 can produce values above U+10FFFF,
                    // which cannot be represented as Java chars.
                    exception("Illegal UTF-8: code point out of range: 0x%04X", ch);
                }
                cb.put(Character.toChars(ch));
                continue;
            }
            // Stray continuation byte (10xxxxxx) or a 5/6 byte lead byte.
            exception("Illegal UTF-8: 0x%04X", x);
        }
    }

    /**
     * Encode loop: writes each char of {@code cb} as a 1, 2 or 3 byte UTF-8
     * sequence. Surrogates get no special treatment; each is written as a
     * 3-byte sequence in its own right.
     */
    private static void fromCharsBuffer(CharBuffer cb, ByteBuffer bb) {
        // CharBuffers are CharSequences but charAt(i) adds a layer of work,
        // so use relative get().
        while ( cb.hasRemaining() ) {
            char ch = cb.get();

            if ( ch <= 0x7F ) {
                // 7 bits : 0xxxxxxx
                // U+0000 is written as a single 0x00 byte, not as the
                // "modified UTF-8" pair 0xC0 0x80.
                bb.put((byte)ch);
                continue;
            }
            if ( ch <= 0x07FF ) {
                // 11 bits : 110yyyyy 10xxxxxx
                bb.put((byte)(((ch >> 6) & 0x1F) | 0xC0));
                bb.put((byte)((ch & 0x3F) | 0x80));
                continue;
            }
            // A Java char is 16 bit, unsigned, so anything left is in
            // 0x0800..0xFFFF.
            // 16 bits : 1110aaaa 10bbbbbb 10cccccc
            bb.put((byte)(((ch >> 12) & 0x0F) | 0xE0));
            bb.put((byte)(((ch >> 6) & 0x3F) | 0x80));
            bb.put((byte)((ch & 0x3F) | 0x80));
        }
    }

    /**
     * Read the continuation bytes of a multi-byte sequence and assemble the
     * code point.
     *
     * @param input buffer positioned just after the lead byte
     * @param start payload bits taken from the lead byte
     * @param len   total length of the sequence in bytes, including the lead byte
     * @return the assembled value
     */
    private static int readMultiBytes(ByteBuffer input, int start, int len) {
        // We have already read one byte.
        if ( input.remaining() < (len - 1) ) {
            exception("Premature end to UTF-8 sequence at end of input");
        }
        int x = start;
        for ( int i = 0; i < len - 1; i++ ) {
            // Masked so an error message shows the actual byte value.
            int x2 = input.get() & 0xFF;
            // Every continuation byte must look like 10xxxxxx.
            if ( (x2 & 0xC0) != 0x80 ) {
                exception("Illegal UTF-8 processing character: 0x%04X", x2);
            }
            // 6 bits of x2
            x = (x << 6) | (x2 & 0x3F);
        }
        return x;
    }

    /**
     * Format a message and report it as an {@link IOException} via
     * {@code IO.exception}. Does not return - this relies on {@code IO.exception}
     * (defined elsewhere in the project) always throwing.
     */
    private static void exception(String fmt, Object... args) {
        String str = String.format(fmt, args);
        IO.exception(new IOException(str));
    }
}