org.apache.jena.atlas.io.BlockUTF8 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jena-base Show documentation
Show all versions of jena-base Show documentation
This module contains non-RDF library code and the common system runtime.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.atlas.io;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
/**
* Convert between bytes and chars, UTF-8 only.
*
* This code is just the UTF-8 encoding rules - it does not check for legality
* of the Unicode data. The standard codecs do, so they do not round-trip with binary
* compatibility. (Example: a single element of a surrogate pair will
* be encoded/decoded without loss.)
*
* The usual Charset encoders/decoders can be expensive to start up - they are also
* not thread safe. Sometimes we want to convert 10's of chars and UTF-8 can be
* done in code with no lookup tables (which, if used, are cache-unfriendly).
*
* This code is thread safe. It uses code in the hope that JITting will
* make it fast if used heavily.
*/
public class BlockUTF8
{
// Looking in java.lang.StringCoding (Sun RT) is illuminating.
// The actual encode/decode code is in sun.nio.cs.UTF_8.(Decoder|Encoder)
// which has special cases for ByteBuffer, ByteBuffer with array (needs offsets)
// and byte[] <-> char[]
// It seems that chars -> bytes (on <100char strings) is faster with BlockUTF8
// but the conversion from bytes to string is faster with Java decoders (not by much though).
/*
* Bits
* 7 U+007F 1 to 127 0xxxxxxx
* 11 U+07FF 128 to 2,047 110xxxxx 10xxxxxx
* 16 U+FFFF 2,048 to 65,535 1110xxxx 10xxxxxx 10xxxxxx
* 21 U+1FFFFF 65,536 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 26 U+3FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 31 U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
/**
* Convert the bytes in the ByteBuffer to characters in the CharBuffer. The
* CharBuffer must be large enough.
*/
public static void toChars(ByteBuffer bb, CharBuffer cb) {
// if ( bb.hasArray() && cb.hasArray() )
// {
// toCharsArray(bb.array(), cb.array());
// return;
// }
toCharsBuffer(bb, cb);
}
/**
* Convert characters to UTF-8 bytes in the ByteBuffer. The ByteBuffer must be
* large enough.
*/
public static void fromChars(CharBuffer cb, ByteBuffer bb) {
// if ( bb.hasArray() && cb.hasArray() )
// {
// fromCharsArray(cb.array(), bb.array());
// return;
// }
fromCharsBuffer(cb, bb);
}
/** Make a string from UTF-8 bytes in a ByteBuffer */
public static String toString(ByteBuffer bb) {
int len = bb.remaining();
CharBuffer cb = CharBuffer.allocate(len);
toChars(bb, cb);
return new String(cb.array(), 0, cb.position());
}
// Using buffer access.
private static void toCharsBuffer(ByteBuffer bb, CharBuffer cb) {
int idx = bb.position();
int limit = bb.limit();
for (; idx < limit; )
{
int x = bb.get();
if ( x > 0 && x <= 127 ) {
cb.put((char)x);
idx += 1;
continue;
}
if ( x == 0 ) {
// Pass through a null byte as the null character (illegal Unicode, Java compatible).
cb.put((char)x);
idx += 1;
continue;
}
if ( (x & 0xE0) == 0xC0 ) {
// 10 => extension byte
// 110..... => 2 bytes
// Unroll common path
//int ch = readMultiBytes(bb, x & 0x1F, 2);
int x2 = bb.get();
if ( (x2 & 0xC0) != 0x80 )
exception("Illegal UTF-8 processing character: 0x%04X",x2);
// 6 bits of x2
int ch = ( (x&0x1F) << 6) | (x2 & 0x3F);
cb.put((char)ch);
idx += 2 ;
continue;
}
if ( (x & 0xF0) == 0xE0 ) {
// 1110.... => 3 bytes : 16 bits : not outside 16bit chars
int ch = readMultiBytes(bb, x & 0x0F, 3);
cb.put((char)ch);
idx += 3;
continue;
}
if ( (x & 0xF8) == 0xF0 ) {
// Looking like 4 byte character.
// 11110zzz => 4 bytes.
int ch = readMultiBytes(bb, x & 0x08, 4);
char chars[] = Character.toChars(ch);
cb.put(chars);
idx += 4;
continue;
}
exception("Illegal UTF-8: 0x%04X",x);
}
}
private static void toCharsArray(byte[] bytes, char[] chars) {
toCharsBuffer(ByteBuffer.wrap(bytes), CharBuffer.wrap(chars));
}
private static void fromCharsBuffer(CharBuffer cb, ByteBuffer bb) {
// CharBuffers are CharSequences but charAt(i) adds a layer of work.
//int bytesStart = bb.position();
int idx = cb.position();
int limit = cb.limit();
for (; idx < limit; idx++ ) {
char ch = cb.get();
if ( ch != 0 && ch <= 127 ) {
// 7 bits
bb.put((byte)ch);
continue;
}
if ( ch == 0 ) {
// Java.
bb.put((byte)0x00);
// Modified UTF-8.
// bb.put((byte)0xC0);
// bb.put((byte)0x80);
continue;
}
if ( ch <= 0x07FF ) {
// 11 bits : 110yyyyy 10xxxxxx
// int x1 = ( ((ch>>(11-5))&0x7) | 0xC0 );
// outputBytes(out, x1, 2, ch);
int x1 = (((ch >> (11 - 5)) & 0x01F) | 0xC0);
int x2 = ((ch & 0x3F) | 0x80);
bb.put((byte)x1);
bb.put((byte)x2);
continue;
}
if ( ch <= 0xFFFF ) {
// 16 bits : 1110aaaa 10bbbbbb 10cccccc
// int x1 = ( ((ch>>(16-4))&0x7) | 0xE0 );
// outputBytes(out, x1, 3, ch);
int x1 = (((ch >> (16 - 4)) & 0x0F) | 0xE0);
int x2 = (((ch >> 6) & 0x3F) | 0x80);
int x3 = ((ch & 0x3F) | 0x80);
bb.put((byte)x1);
bb.put((byte)x2);
bb.put((byte)x3);
continue;
}
// End of Java.
// A Java char is 16 bit, unsigned, so it is between 0 and 0xFFFF.
// Unicode is defined for 0 to 0x10FFFF
// For reference the full 32 bits encodings are:
if ( ch <= 0x1FFFFF ) {
// 21 bits : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
int x1 = (((ch >> (21 - 3)) & 0x7) | 0xF0);
outputBytes(bb, x1, 4, ch);
continue;
}
if ( ch <= 0x3FFFFFF ) {
// 26 bits : 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
int x1 = (((ch >> (26 - 2)) & 0x3) | 0xF8);
outputBytes(bb, x1, 5, ch);
continue;
}
if ( ch <= 0x7FFFFFFF ) {
// 32 bits : 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
int x1 = (((ch >> (32 - 1)) & 0x1) | 0xFC);
outputBytes(bb, x1, 6, ch);
continue;
}
}
}
public static void fromChars(CharSequence cs, ByteBuffer bb) {
fromChars(CharBuffer.wrap(cs), bb);
}
private static void fromCharsArray(char[] chars, byte[] bytes) {
fromCharsBuffer(CharBuffer.wrap(chars), ByteBuffer.wrap(bytes));
}
private static int readMultiBytes(ByteBuffer input, int start, int len) {
// We have already read one byte.
if ( input.remaining() < (len - 1) )
exception("Premature end to UTF-8 sequence at end of input");
int x = start;
for ( int i = 0; i < len - 1; i++ ) {
int x2 = input.get();
if ( (x2 & 0xC0) != 0x80 )
exception("Illegal UTF-8 processing character: 0x%04X", x2);
// 6 bits of x2
x = (x << 6) | (x2 & 0x3F);
}
return x;
}
/**
* Put bytes to the output ByteBuffer for character ch. The first byte is in x1
* and already has the needed bits set.
*/
private static void outputBytes(ByteBuffer bb, int x1, int byteLength, int ch) {
bb.put((byte)x1);
byteLength--; // remaining bytes
for ( int i = 0; i < byteLength; i++ ) {
// 6 Bits, loop from high to low
int shift = 6 * (byteLength - i - 1);
int x = (ch >> shift) & 0x3F;
x = x | 0x80; // 10xxxxxx
bb.put((byte)x);
}
}
// Does not return
private static void exception(String fmt, Object...args) {
String str = String.format(fmt, args);
IO.exception(new IOException(str));
}
}