// org.apache.jena.atlas.io.InStreamUTF8 Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.atlas.io;
import java.io.ByteArrayInputStream ;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.Reader ;
import org.apache.jena.atlas.AtlasException ;
/**
 * Fast, streaming UTF-8 decoder presented as a {@link Reader} and a {@code CharStream}.
 * <p>
 * Bytes are pulled one at a time from an {@code InputStreamBuffered} and assembled
 * arithmetically into code points. Only code points that fit into a single Java
 * {@code char} (up to {@code U+FFFF}) are returned; a 4-byte sequence, which would need
 * a surrogate pair, is rejected with an {@code AtlasException}.
 */
public final class InStreamUTF8 extends Reader implements CharStream
{
    // TODO Add line and col counts.
    // See arq.utf8.
    // TODO Better ready()/available() in InputStreamBuffered

    // The standard Java way of doing this is via charset decoders.
    // One small disadvantage is that bad UTF-8 does not get flagged as to
    // the byte position of the error.

    // This class collects knowledge of how UTF-8 encoding works;
    // the Java classes are usually slightly faster compared to using
    // this class with an InputStreamBuffered but the difference is small.
    // This class generates meaningful error messages (when line/col added).

    // The Java classes copy-convert a byte buffer into a char buffer.
    // Sometimes, for example in a parser, this isn't a convenient model
    // because the app is looking one character at a time and accumulating
    // the chars until it sees the end of a token of arbitrary length
    // or processes escape sequences.
    //
    // The app might use a StringBuilder so the bytes get copied into
    // a char buffer and out again. Instead, this code assumes the
    // app is in charge of that.

    // UTF-8 (UTF-16) is different from other character sets because
    // the relationship with Java's internal character representation is
    // arithmetic, not a character mapping.

    // TODO: chars > 16 bits -> surrogate pairs.

    /*
     * http://en.wikipedia.org/wiki/UTF-8
     * http://tools.ietf.org/html/rfc3629
     * http://www.ietf.org/rfc/rfc3629.txt
     *
     * Unicode                                   Byte1    Byte2    Byte3    Byte4
     * U+0000–U+007F     0 to 127                0xxxxxxx
     * U+0080–U+07FF     128 to 2,047            110yyyxx 10xxxxxx
     * U+0800–U+FFFF     2,048 to 65,535         1110yyyy 10yyyyxx 10xxxxxx
     * U+10000–U+10FFFF  65,536 to 1,114,111     11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
     *
     * Restricted cases (RFC 3629)
     * 11110101-11110111 F5-F7 245-247 start of 4-byte sequence for codepoint above 10FFFF
     * 11111000-11111011 F8-FB 248-251 start of 5-byte sequence
     * 11111100-11111101 FC-FD 252-253 start of 6-byte sequence
     *
     * Illegal:
     * 11000000-11000001 C0-C1 192-193 Overlong encoding: start of a 2-byte sequence, but code point <= 127
     * 11111110-11111111 FE-FF 254-255 Invalid: not defined by original UTF-8 specification
     */

    // There is some sort of stream decoder backing the Sun implementation
    // of CharsetDecoder (sun.io.StreamDecoder) but it's not on all platforms.
    // I want a known decoder specifically for UTF-8.

    /** Source of raw bytes; assigned once in the constructor. */
    private final InputStreamBuffered input;
    //private long count = 0 ;

    /**
     * Creates a decoder over the given byte stream.
     * If {@code in} is already an {@code InputStreamBuffered} it is used directly,
     * otherwise it is wrapped in a new {@code InputStreamBuffered}.
     *
     * @param in the stream of UTF-8 encoded bytes
     */
    public InStreamUTF8(InputStream in)
    {
        input = (in instanceof InputStreamBuffered)
                ? (InputStreamBuffered) in
                : new InputStreamBuffered(in);
    }

    /**
     * Creates a decoder reading directly from the given buffered stream.
     *
     * @param in the stream of UTF-8 encoded bytes
     */
    public InStreamUTF8(InputStreamBuffered in) { input = in; }

    /**
     * Reports whether the underlying stream says it has bytes available.
     *
     * @return {@code true} if {@code input.available()} is greater than zero
     * @throws IOException if the underlying stream fails
     */
    @Override
    public boolean ready() throws IOException
    {
        return input.available() > 0;
    }

    /**
     * Closes the underlying byte stream.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException
    {
        input.close();
    }

    /** Closes the underlying byte stream via {@code IO.close}, without a checked exception. */
    @Override
    public void closeStream()
    {
        IO.close(input);
    }

    /**
     * Decodes up to {@code len} characters into {@code cbuf}, starting at index {@code off}.
     *
     * @param cbuf destination buffer
     * @param off  index of the first slot to fill
     * @param len  maximum number of characters to decode
     * @return the number of characters stored, or -1 if the input was already
     *         exhausted before any character could be stored
     */
    @Override
    public int read(char[] cbuf, int off, int len) {
        // Doing this on a block of bytes may be faster.
        for ( int i = off ; i < off + len ; i++ ) {
            int x = read();
            if ( x == -1 ) {
                // End of input: report -1 only when nothing at all was read.
                if ( i == off )
                    return -1;
                return (i - off);
            }
            cbuf[i] = (char)x;
        }
        return len;
    }

    /**
     * Decodes the next character.
     *
     * @return the next character as an int, or -1 at end of input
     */
    @Override
    public final int read() {
        int ch = advance(input);
        // if ( ! Character.isDefined(ch) ) throw new
        // AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ;
        return ch;
    }

    /**
     * Next codepoint from this stream, or -1 at end of input.
     * Not necessarily a valid char (this function can be used as a straight UTF-8 decoder).
     */
    @Override
    public final int advance()
    {
        return advance(input);
    }

    /**
     * Next codepoint from the given stream.
     *
     * @param input the stream of UTF-8 encoded bytes
     * @return the decoded codepoint, or -1 if the stream has no more bytes
     */
    public static final int advance(InputStreamBuffered input) {
        int x = input.advance();
        if ( x == -1 )
            return -1;
        return advance(input, x);
    }

    /**
     * Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
     * Not necessarily a valid char (this function can be used as a straight UTF-8 decoder).
     *
     * @param input the stream to take any continuation bytes from
     * @param x     the first byte of the sequence (0-255), or -1 for end of input
     * @return the decoded codepoint, or -1 if {@code x} is -1
     */
    private static final int advance(InputStreamBuffered input, int x) {
        //count++ ;
        // ASCII fast path (also passes the end-of-input marker straight through).
        if ( x == -1 || (x >= 0 && x <= 127) ) {
            // count++ ;
            return x;
        }

        // 10...... => extension byte; never legal as a first byte, rejected below.

        // 110..... => 2 bytes; the lead byte carries 5 payload bits.
        if ( (x & 0xE0) == 0xC0 ) {
            int ch = readMultiBytes(input, x & 0x1F, 2);
            // count += 2 ;
            return ch;
        }

        // 1110.... => 3 bytes : 16 bits : not outside 16 bit chars.
        // The lead byte carries 4 payload bits.
        if ( (x & 0xF0) == 0xE0 ) {
            int ch = readMultiBytes(input, x & 0x0F, 3);
            // count += 3 ;
            // if ( ! Character.isDefined(ch) ) throw new
            // AtlasException(String.format("Undefined codepoint: 0x%04X", ch))
            // ;
            return ch;
        }

        // Looking like a 4 byte character.
        int ch = -2;
        // 11110zzz => 4 bytes; the lead byte carries 3 payload bits (zzz), so the
        // mask is 0x07. (Masking with 0x08 would always give 0 here, because bit 3
        // is known to be clear, and would drop the top bits of the codepoint.)
        if ( (x & 0xF8) == 0xF0 ) {
            ch = readMultiBytes(input, x & 0x07, 4);
            // Oops - need two returns. Character.toChars(ch, chars, 0) ;
            // count += 4 ;
        }
        else {
            // Stray continuation byte or a 5/6-byte lead (F8-FF).
            // NOTE(review): IO.exception is presumably expected to throw; if it ever
            // returns, ch is still -2 and the isDefined check below rejects it.
            IO.exception(new IOException("Illegal UTF-8: " + x));
        }

        // This test will go off. We're processing a 4 byte sequence but Java
        // only supports 16 bit chars.
        if ( ch > Character.MAX_VALUE )
            throw new AtlasException("Out of range character (must use a surrogate pair)");
        if ( !Character.isDefined(ch) )
            throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch));
        return ch;
    }

    /**
     * Reads the continuation bytes of a multi-byte sequence and folds them into a codepoint.
     *
     * @param input the stream to take continuation bytes from
     * @param start the payload bits already extracted from the lead byte
     * @param len   total length of the sequence in bytes, including the lead byte
     * @return the assembled codepoint
     * @throws AtlasException if the input ends mid-sequence or a byte is not of
     *                        the form {@code 10xxxxxx}
     */
    private static int readMultiBytes(InputStreamBuffered input, int start, int len) {
        int x = start;
        // The lead byte has been consumed already, so len-1 bytes remain.
        for ( int i = 0 ; i < len - 1 ; i++ ) {
            int x2 = input.advance();
            if ( x2 == -1 )
                throw new AtlasException("Premature end to UTF-8 sequence at end of input");
            if ( (x2 & 0xC0) != 0x80 )
                //throw new AtlasException("Illegal UTF-8 processing character "+count+": "+x2) ;
                throw new AtlasException(String.format("Illegal UTF-8 processing character: 0x%04X", x2));
            // Append the 6 payload bits of x2.
            x = (x << 6) | (x2 & 0x3F);
        }
        return x;
    }

    /**
     * Decodes a complete UTF-8 byte array into a string.
     *
     * @param bytes UTF-8 encoded bytes
     * @return the decoded string; the empty string when {@code bytes} is empty
     */
    public static String decode(byte[] bytes) {
        try {
            // Each char needs at least one byte, so bytes.length chars is always enough.
            char[] chars = new char[bytes.length];
            InputStream in = new ByteArrayInputStream(bytes);
            Reader r = new InStreamUTF8(in);
            int len = r.read(chars);
            IO.close(r);
            // Reader.read returns -1 when no character could be read (empty input);
            // passing that count to the String constructor would throw.
            if ( len < 0 )
                return "";
            return new String(chars, 0, len);
        }
        catch (IOException ex) {
            IO.exception(ex);
            return null;
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy