org.apache.xerces.impl.io.UTF8Reader Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.xerces.impl.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Locale;
import org.apache.xerces.impl.msg.XMLMessageFormatter;
import org.apache.xerces.util.MessageFormatter;
/**
* A UTF-8 reader.
*
* @xerces.internal
*
* @author Andy Clark, IBM
*
* @version $Id: UTF8Reader.java 718095 2008-11-16 20:00:14Z mrglavas $
*/
public final class UTF8Reader
extends Reader {
//
// Constants
//
/** Default byte buffer size (2048). */
public static final int DEFAULT_BUFFER_SIZE = 2048;
// debugging
/** Debug read. */
private static final boolean DEBUG_READ = false;
//
// Data
//
/** Input stream. */
protected final InputStream fInputStream;
/** Byte buffer. */
protected final byte[] fBuffer;
/** Offset into buffer. */
protected int fOffset;
/** Surrogate character. */
private int fSurrogate = -1;
// message formatter; used to produce localized
// exception messages
private final MessageFormatter fFormatter;
//Locale to use for messages
private final Locale fLocale;
//
// Constructors
//
/**
* Constructs a UTF-8 reader from the specified input stream
* using the default buffer size. Primarily for testing.
*
* @param inputStream The input stream.
*/
public UTF8Reader(InputStream inputStream) {
this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
} // (InputStream, MessageFormatter)
/**
* Constructs a UTF-8 reader from the specified input stream
* using the default buffer size and the given MessageFormatter.
*
* @param inputStream The input stream.
* @param messageFormatter given MessageFormatter
* @param locale Locale to use for messages
*/
public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
Locale locale) {
this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
} // (InputStream, MessageFormatter, Locale)
/**
* Constructs a UTF-8 reader from the specified input stream,
* buffer size and MessageFormatter.
*
* @param inputStream The input stream.
* @param size The initial buffer size.
* @param messageFormatter the formatter for localizing/formatting errors.
* @param locale the Locale to use for messages
*/
public UTF8Reader(InputStream inputStream, int size,
MessageFormatter messageFormatter, Locale locale) {
this(inputStream, new byte[size], messageFormatter, locale);
} // (InputStream, int, MessageFormatter, Locale)
/**
* Constructs a UTF-8 reader from the specified input stream,
* buffer and MessageFormatter.
*
* @param inputStream The input stream.
* @param buffer The byte buffer.
* @param messageFormatter the formatter for localizing/formatting errors.
* @param locale the Locale to use for messages
*/
public UTF8Reader(InputStream inputStream, byte [] buffer,
MessageFormatter messageFormatter, Locale locale) {
fInputStream = inputStream;
fBuffer = buffer;
fFormatter = messageFormatter;
fLocale = locale;
} // (InputStream, byte[], MessageFormatter, Locale)
//
// Reader methods
//
/**
* Read a single character. This method will block until a character is
* available, an I/O error occurs, or the end of the stream is reached.
*
* Subclasses that intend to support efficient single-character input
* should override this method.
*
* @return The character read, as an integer in the range 0 to 16383
* (0x00-0xffff), or -1 if the end of the stream has
* been reached
*
* @exception IOException If an I/O error occurs
*/
public int read() throws IOException {
// decode character
int c = fSurrogate;
if (fSurrogate == -1) {
// NOTE: We use the index into the buffer if there are remaining
// bytes from the last block read. -Ac
int index = 0;
// get first byte
int b0 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b0 == -1) {
return -1;
}
// UTF-8: [0xxx xxxx]
// Unicode: [0000 0000] [0xxx xxxx]
if (b0 < 0x80) {
c = (char)b0;
}
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 2);
}
if ((b1 & 0xC0) != 0x80) {
invalidByte(2, 2, b1);
}
c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
}
// UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
// Unicode: [zzzz yyyy] [yyxx xxxx]
else if ((b0 & 0xF0) == 0xE0) {
int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 3);
}
if ((b1 & 0xC0) != 0x80
|| (b0 == 0xED && b1 >= 0xA0)
|| ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
invalidByte(2, 3, b1);
}
int b2 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b2 == -1) {
expectedByte(3, 3);
}
if ((b2 & 0xC0) != 0x80) {
invalidByte(3, 3, b2);
}
c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
(b2 & 0x003F);
}
// UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
// [1101 11yy] [yyxx xxxx] (low surrogate)
// * uuuuu = wwww + 1
else if ((b0 & 0xF8) == 0xF0) {
int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 4);
}
if ((b1 & 0xC0) != 0x80
|| ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
invalidByte(2, 3, b1);
}
int b2 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b2 == -1) {
expectedByte(3, 4);
}
if ((b2 & 0xC0) != 0x80) {
invalidByte(3, 3, b2);
}
int b3 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b3 == -1) {
expectedByte(4, 4);
}
if ((b3 & 0xC0) != 0x80) {
invalidByte(4, 4, b3);
}
int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
if (uuuuu > 0x10) {
invalidSurrogate(uuuuu);
}
int wwww = uuuuu - 1;
int hs = 0xD800 |
((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
((b2 >> 4) & 0x0003);
int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
c = hs;
fSurrogate = ls;
}
// error
else {
invalidByte(1, 1, b0);
}
}
// use surrogate
else {
fSurrogate = -1;
}
// return character
if (DEBUG_READ) {
System.out.println("read(): 0x"+Integer.toHexString(c));
}
return c;
} // read():int
/**
* Read characters into a portion of an array. This method will block
* until some input is available, an I/O error occurs, or the end of the
* stream is reached.
*
* @param ch Destination buffer
* @param offset Offset at which to start storing characters
* @param length Maximum number of characters to read
*
* @return The number of characters read, or -1 if the end of the
* stream has been reached
*
* @exception IOException If an I/O error occurs
*/
public int read(char ch[], int offset, int length) throws IOException {
// read bytes
int out = offset;
int count = 0;
if (fOffset == 0) {
// adjust length to read
if (length > fBuffer.length) {
length = fBuffer.length;
}
// handle surrogate
if (fSurrogate != -1) {
ch[out++] = (char)fSurrogate;
fSurrogate = -1;
length--;
}
// perform read operation
count = fInputStream.read(fBuffer, 0, length);
if (count == -1) {
return -1;
}
count += out - offset;
}
// skip read; last character was in error
// NOTE: Having an offset value other than zero means that there was
// an error in the last character read. In this case, we have
// skipped the read so we don't consume any bytes past the
// error. By signalling the error on the next block read we
// allow the method to return the most valid characters that
// it can on the previous block read. -Ac
else {
count = fOffset;
fOffset = 0;
}
// convert bytes to characters
final int total = count;
int in;
byte byte1;
final byte byte0 = 0;
for (in = 0; in < total; in++) {
byte1 = fBuffer[in];
if (byte1 >= byte0) {
ch[out++] = (char)byte1;
}
else {
break;
}
}
for ( ; in < total; in++) {
byte1 = fBuffer[in];
// UTF-8: [0xxx xxxx]
// Unicode: [0000 0000] [0xxx xxxx]
if (byte1 >= byte0) {
ch[out++] = (char)byte1;
continue;
}
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
int b0 = byte1 & 0x0FF;
if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 2);
}
count++;
}
if ((b1 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 2, b1);
}
int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
ch[out++] = (char)c;
count -= 1;
continue;
}
// UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
// Unicode: [zzzz yyyy] [yyxx xxxx]
if ((b0 & 0xF0) == 0xE0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 3);
}
count++;
}
if ((b1 & 0xC0) != 0x80
|| (b0 == 0xED && b1 >= 0xA0)
|| ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 3, b1);
}
int b2 = -1;
if (++in < total) {
b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
if (b2 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
expectedByte(3, 3);
}
count++;
}
if ((b2 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
invalidByte(3, 3, b2);
}
int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
(b2 & 0x003F);
ch[out++] = (char)c;
count -= 2;
continue;
}
// UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
// Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
// [1101 11yy] [yyxx xxxx] (low surrogate)
// * uuuuu = wwww + 1
if ((b0 & 0xF8) == 0xF0) {
int b1 = -1;
if (++in < total) {
b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
if (b1 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
expectedByte(2, 4);
}
count++;
}
if ((b1 & 0xC0) != 0x80
|| ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
invalidByte(2, 4, b1);
}
int b2 = -1;
if (++in < total) {
b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
if (b2 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fOffset = 2;
return out - offset;
}
expectedByte(3, 4);
}
count++;
}
if ((b2 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
invalidByte(3, 4, b2);
}
int b3 = -1;
if (++in < total) {
b3 = fBuffer[in] & 0x00FF;
}
else {
b3 = fInputStream.read();
if (b3 == -1) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fOffset = 3;
return out - offset;
}
expectedByte(4, 4);
}
count++;
}
if ((b3 & 0xC0) != 0x80) {
if (out > offset) {
fBuffer[0] = (byte)b0;
fBuffer[1] = (byte)b1;
fBuffer[2] = (byte)b2;
fBuffer[3] = (byte)b3;
fOffset = 4;
return out - offset;
}
invalidByte(4, 4, b2);
}
// decode bytes into surrogate characters
int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
if (uuuuu > 0x10) {
invalidSurrogate(uuuuu);
}
int wwww = uuuuu - 1;
int zzzz = b1 & 0x000F;
int yyyyyy = b2 & 0x003F;
int xxxxxx = b3 & 0x003F;
int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
// set characters
ch[out++] = (char)hs;
if ((count -= 2) <= length) {
ch[out++] = (char)ls;
}
// reached the end of the char buffer; save low surrogate for the next read
else {
fSurrogate = ls;
--count;
}
continue;
}
// error
if (out > offset) {
fBuffer[0] = (byte)b0;
fOffset = 1;
return out - offset;
}
invalidByte(1, 1, b0);
}
// return number of characters converted
if (DEBUG_READ) {
System.out.println("read(char[],"+offset+','+length+"): count="+count);
}
return count;
} // read(char[],int,int)
/**
* Skip characters. This method will block until some characters are
* available, an I/O error occurs, or the end of the stream is reached.
*
* @param n The number of characters to skip
*
* @return The number of characters actually skipped
*
* @exception IOException If an I/O error occurs
*/
public long skip(long n) throws IOException {
long remaining = n;
final char[] ch = new char[fBuffer.length];
do {
int length = ch.length < remaining ? ch.length : (int)remaining;
int count = read(ch, 0, length);
if (count > 0) {
remaining -= count;
}
else {
break;
}
} while (remaining > 0);
long skipped = n - remaining;
return skipped;
} // skip(long):long
/**
* Tell whether this stream is ready to be read.
*
* @return True if the next read() is guaranteed not to block for input,
* false otherwise. Note that returning false does not guarantee that the
* next read will block.
*
* @exception IOException If an I/O error occurs
*/
public boolean ready() throws IOException {
return false;
} // ready()
/**
* Tell whether this stream supports the mark() operation.
*/
public boolean markSupported() {
return false;
} // markSupported()
/**
* Mark the present position in the stream. Subsequent calls to reset()
* will attempt to reposition the stream to this point. Not all
* character-input streams support the mark() operation.
*
* @param readAheadLimit Limit on the number of characters that may be
* read while still preserving the mark. After
* reading this many characters, attempting to
* reset the stream may fail.
*
* @exception IOException If the stream does not support mark(),
* or if some other I/O error occurs
*/
public void mark(int readAheadLimit) throws IOException {
throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
} // mark(int)
/**
* Reset the stream. If the stream has been marked, then attempt to
* reposition it at the mark. If the stream has not been marked, then
* attempt to reset it in some way appropriate to the particular stream,
* for example by repositioning it to its starting point. Not all
* character-input streams support the reset() operation, and some support
* reset() without supporting mark().
*
* @exception IOException If the stream has not been marked,
* or if the mark has been invalidated,
* or if the stream does not support reset(),
* or if some other I/O error occurs
*/
public void reset() throws IOException {
fOffset = 0;
fSurrogate = -1;
} // reset()
/**
* Close the stream. Once a stream has been closed, further read(),
* ready(), mark(), or reset() invocations will throw an IOException.
* Closing a previously-closed stream, however, has no effect.
*
* @exception IOException If an I/O error occurs
*/
public void close() throws IOException {
fInputStream.close();
} // close()
//
// Private methods
//
/** Throws an exception for expected byte. */
private void expectedByte(int position, int count)
throws MalformedByteSequenceException {
throw new MalformedByteSequenceException(fFormatter,
fLocale,
XMLMessageFormatter.XML_DOMAIN,
"ExpectedByte",
new Object[] {Integer.toString(position), Integer.toString(count)});
} // expectedByte(int,int)
/** Throws an exception for invalid byte. */
private void invalidByte(int position, int count, int c)
throws MalformedByteSequenceException {
throw new MalformedByteSequenceException(fFormatter,
fLocale,
XMLMessageFormatter.XML_DOMAIN,
"InvalidByte",
new Object [] {Integer.toString(position), Integer.toString(count)});
} // invalidByte(int,int,int)
/** Throws an exception for invalid surrogate bits. */
private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {
throw new MalformedByteSequenceException(fFormatter,
fLocale,
XMLMessageFormatter.XML_DOMAIN,
"InvalidHighSurrogate",
new Object[] {Integer.toHexString(uuuuu)});
} // invalidSurrogate(int)
} // class UTF8Reader