![JAR search and dependency download from the Maven repository](/logo.png)
jsimple.io.Utf8InputStreamReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jsimple-io Show documentation
Show all versions of jsimple-io Show documentation
JSimple cross platform friendly library
The newest version!
/*
* Copyright (c) 2012-2015, Microsoft Mobile
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package jsimple.io;
import jsimple.util.ByteArrayRange;
import jsimple.util.InvalidFormatException;
import jsimple.util.ProgrammerError;
/**
 * A reader that decodes the bytes of an underlying {@link InputStream} as UTF-8 and delivers them as UTF-16
 * {@code char}s.
 * <p/>
 * This class was based on, and modified from, the Apache Harmony java.io.InputStreamReader class. Unlike the standard
 * Java Reader subclasses, this doesn't throw any checked exceptions.
 * <p/>
 * Decoder state (a partially decoded character, a surrogate pair that didn't fit in the caller's buffer) is kept
 * between calls, so a multi-byte sequence may straddle internal buffer refills and separate {@code read} calls.
 *
 * @author Bret Johnson
 * @see Utf8OutputStreamWriter
 * @since 10/7/12 6:29 PM
 */
public class Utf8InputStreamReader extends Reader {
    private static final int BUFFER_SIZE = 1024;

    private static final int UTF8_ACCEPT = 0;
    private static final int UTF8_REJECT = 1;

    /** Marker meaning {@link #pendingLowSurrogate} holds no character. */
    private static final int NO_PENDING_CHAR = -1;

    private final InputStream inputStream;
    private final byte[] srcBuffer = new byte[BUFFER_SIZE];    // src in original code
    private int srcPosition = BUFFER_SIZE;                     // s in original code
    private int srcEnd = BUFFER_SIZE;                          // src_actual_end in original code
    private int state = UTF8_ACCEPT;                           // 'state' in original code
    private boolean queuedSurrogatePair = false;   // Only set when there's a surrogate pair that wouldn't fit previously
    private int codePoint = 0;                                 // 'codepoint' & 'codep' in original code

    // Low surrogate still owed to the caller after the single-character read() returned the matching high
    // surrogate; NO_PENDING_CHAR when nothing is owed
    private int pendingLowSurrogate = NO_PENDING_CHAR;

    /**
     * Constructs a new {@code Utf8InputStreamReader} that decodes the bytes read from {@code inputStream} as UTF-8.
     *
     * @param inputStream the input stream from which to read characters.
     */
    public Utf8InputStreamReader(InputStream inputStream) {
        this.inputStream = inputStream;
    }

    /**
     * Constructs a new {@code Utf8InputStreamReader} that decodes the bytes of {@code byteArrayRange} as UTF-8. The
     * range is wrapped in a {@link ByteArrayInputStream}.
     *
     * @param byteArrayRange the bytes from which to read characters
     */
    public Utf8InputStreamReader(ByteArrayRange byteArrayRange) {
        this.inputStream = new ByteArrayInputStream(byteArrayRange);
    }

    /**
     * Closes this reader. This implementation closes the source InputStream.
     */
    @Override public void close() {
        inputStream.close();
    }

    /**
     * Reads a single character from this reader and returns it as an integer with the two higher-order bytes set to 0.
     * Returns -1 if the end of the reader has been reached. The byte value is either obtained from converting bytes in
     * this reader's buffer or by first filling the buffer from the source InputStream and then reading from the
     * buffer.
     * <p/>
     * A character outside the Basic Multilingual Plane is returned as two consecutive results: the high surrogate from
     * one call and the low surrogate from the next.
     *
     * @return the character read or -1 if the end of the reader has been reached
     * @throws InvalidFormatException if the stream isn't valid UTF-8
     */
    @Override public int read() {
        // Second half of a surrogate pair whose first half was handed out by the previous call
        if (pendingLowSurrogate != NO_PENDING_CHAR) {
            int low = pendingLowSurrogate;
            pendingLowSurrogate = NO_PENDING_CHAR;
            return low;
        }

        char[] buf = new char[1];
        while (true) {
            int charsRead = read(buf, 0, 1);
            if (charsRead == -1)
                return -1;
            if (charsRead == 1)
                return buf[0];

            // Zero characters were stored. If that's because a surrogate pair can't fit in a one char buffer, split
            // the pair here: return the high surrogate now & remember the low surrogate for the next call.
            // Previously this case returned a bogus 0 character on every call, never making progress.
            if (queuedSurrogatePair) {
                queuedSurrogatePair = false;
                pendingLowSurrogate = lowSurrogate(codePoint);
                return highSurrogate(codePoint);
            }

            // Otherwise the source stream returned 0 bytes without being at end of stream; ask again rather than
            // reporting a character that was never read
        }
    }

    // UTF-8 decoder lookup table
    static final int[] utf8d = {
            // 1st 256 entries (UTF-8 byte lookup)
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
            0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // e0..ef
            0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // f0..ff
            // Remaining 144 entries (next state)
            0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
            1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
            1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
            1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
    };

    /**
     * Reads at most {@code length} characters from this reader and stores them at position {@code offset} in the
     * character array {@code buffer}. Returns the number of characters actually read or -1 if the end of the reader
     * has been reached. The bytes are either obtained from converting bytes in this reader's buffer or by first
     * filling the buffer from the source InputStream and then reading from the buffer.
     * <p/>
     * This method never stores just half of a surrogate pair that it decodes: if only one slot is free when a
     * supplementary character is decoded, the pair is held back and delivered by the next call that has room for
     * both characters. Consequently a call with {@code length < 2} can return 0 without being at end of stream.
     * The one exception is a low surrogate left over from the single-character {@link #read()}, which is stored
     * first so the character sequence stays intact when the two methods are mixed.
     * <p/>
     * 0 is also returned when {@code length} is 0 or when the source stream returns 0 bytes without being at its end.
     * <p/>
     * Once an invalid byte is seen the decoder stays in the reject state, consuming input without producing
     * characters, and the exception is thrown when the end of the source stream is reached.
     *
     * @param buffer the array to store the characters read
     * @param offset the initial position in {@code buffer} to store the characters read from this reader
     * @param length the maximum number of characters to read
     * @return the number of characters read or -1 if the end of the reader has been reached
     * @throws InvalidFormatException if the stream isn't valid UTF-8 or ends in the middle of a character
     * @throws ProgrammerError        if {@code length} is negative
     */
    @Override public int read(char[] buffer, int offset, int length) {
        if (length < 0)
            throw new ProgrammerError("read length parameter can't be negative");

        /*
        The implementation of this method is adapted from the code here: http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
        Basically, I took the "Transcoding to UTF-16 buffer" code and incorporated the performance enhancement
        below it preceded with "For the sample UTF-16 transcoder a more substantial increase in performance can be
        achieved by manually including the decode code in the inner loop; then it is also worthwhile to make code
        points in the US-ASCII range a special case". This code should be fairly high performance, compared with
        alternatives.
        */
        //toUtf16(uint8_t* src, size_t srcBytes, uint16_t* dst, size_t dstWords, ...)
        char[] dest = buffer;              // 'dst' in original code
        int destIndex = offset;            // 'd' in original code
        int destEnd = offset + length;     // 1 past the index of the last char that should be added (dstWords in original code)

        // If the single char read() handed out a high surrogate, its low surrogate has to come first
        if (pendingLowSurrogate != NO_PENDING_CHAR && destIndex < destEnd) {
            dest[destIndex++] = (char) pendingLowSurrogate;
            pendingLowSurrogate = NO_PENDING_CHAR;
        }

        // If there's a queued up surrogate pair, add it first (assuming there's room)
        if (queuedSurrogatePair) {
            if (destEnd - destIndex < 2)   // Return if no room
                return destIndex - offset;

            dest[destIndex++] = (char) highSurrogate(codePoint);
            dest[destIndex++] = (char) lowSurrogate(codePoint);
            queuedSurrogatePair = false;
        }

        // Loop until we run out of space in the dest buffer or until we reach the end of the input stream
        while (true) {
            int destCharsFree = destEnd - destIndex;   // 'dst_words_free' in original code
            if (destCharsFree == 0)                    // If the dest buffer is full, we're done
                return destIndex - offset;

            // If there's no more room in the buffer, then fill it up again
            if (srcPosition == srcEnd) {
                int bytesRead = inputStream.read(srcBuffer);
                if (bytesRead < 1) {
                    if (bytesRead < 0) {   // End of stream
                        if (state == UTF8_REJECT)
                            throw new InvalidFormatException("Invalid UTF-8 encoding--stream contains an invalid UTF-8 byte");
                        else break;
                    }

                    // If the stream for some reason returned 0 characters (not end of stream) when we asked to read
                    // BUFFER_SIZE characters, just return & let the caller call us again. Hopefully, the stream will
                    // return something different in the future
                    return destIndex - offset;
                }

                srcPosition = 0;
                srcEnd = bytesRead;
            }

            // The original code subtracted one here, but we check the surrogate case specially below & don't need to;
            // that simplifies some things
            int srcCurrentEnd = srcPosition + destCharsFree;   // 'src_current_end' in original code
            if (srcEnd < srcCurrentEnd)
                srcCurrentEnd = srcEnd;

            // This inner loop is constructed to only need to do one bounds check, for performance
            while (srcPosition < srcCurrentEnd) {
                int srcByte = (int) srcBuffer[srcPosition++] & 0xFF;   // 'byte' in original code; mask makes it unsigned
                int type = utf8d[srcByte];                             // 'type' in original code

                if (state != UTF8_ACCEPT) {
                    // Continuation of a multi-byte sequence: accumulate 6 more bits & advance the DFA
                    codePoint = (codePoint << 6) | (srcByte & 63);
                    state = utf8d[256 + state * 16 + type];

                    if (state != UTF8_ACCEPT)
                        continue;
                } else if (srcByte > 0x7f) {
                    // Lead byte of a multi-byte sequence: keep just its payload bits
                    codePoint = (srcByte) & (255 >> type);
                    state = utf8d[256 + type];
                    continue;
                } else {
                    // US-ASCII fast path
                    dest[destIndex++] = (char) srcByte;
                    continue;
                }

                // Only reached when a multi-byte sequence has just been completed
                if (codePoint > 0xffff) {
                    // If there's not enough room for both characters, return and get them both next time. We never
                    // add just half of a surrogate
                    if (destEnd - destIndex < 2) {
                        queuedSurrogatePair = true;
                        return destIndex - offset;
                    }

                    // Decode Unicode supplementary characters
                    dest[destIndex++] = (char) highSurrogate(codePoint);
                    dest[destIndex++] = (char) lowSurrogate(codePoint);
                } else dest[destIndex++] = (char) codePoint;
            }
        }

        // At this point we've read until the end of the input stream
        if (state == UTF8_ACCEPT) {
            int charsRead = destIndex - offset;
            return charsRead == 0 ? -1 : charsRead;
        } else if (state == UTF8_REJECT)
            throw new InvalidFormatException("Invalid UTF-8 encoding--stream contains an invalid UTF-8 byte");
        else
            throw new InvalidFormatException("Invalid UTF-8 encoding--stream ends with a partially defined UTF-8 character");
    }

    /**
     * Computes the UTF-16 high (leading) surrogate for a supplementary code point. 0xD7C0 is 0xD800 - (0x10000 >> 10),
     * which folds the subtraction of 0x10000 into the constant.
     */
    private static int highSurrogate(int supplementaryCodePoint) {
        return 0xD7C0 + (supplementaryCodePoint >> 10);
    }

    /**
     * Computes the UTF-16 low (trailing) surrogate for a supplementary code point, from its low 10 bits.
     */
    private static int lowSurrogate(int supplementaryCodePoint) {
        return 0xDC00 + (supplementaryCodePoint & 0x3FF);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy