com.fasterxml.jackson.core.io.UTF8Writer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ehcache Show documentation
Show all versions of ehcache Show documentation
Ehcache is an open source, standards-based cache used to boost performance,
offload the database and simplify scalability. Ehcache is robust, proven and full-featured and
this has made it the most widely-used Java-based cache.
package com.fasterxml.jackson.core.io;
import java.io.*;
public final class UTF8Writer extends Writer
{
final static int SURR1_FIRST = 0xD800;
final static int SURR1_LAST = 0xDBFF;
final static int SURR2_FIRST = 0xDC00;
final static int SURR2_LAST = 0xDFFF;
final private IOContext _context;
private OutputStream _out;
private byte[] _outBuffer;
final private int _outBufferEnd;
private int _outPtr;
/**
* When outputting chars from BMP, surrogate pairs need to be coalesced.
* To do this, both pairs must be known first; and since it is possible
* pairs may be split, we need temporary storage for the first half
*/
private int _surrogate = 0;
public UTF8Writer(IOContext ctxt, OutputStream out)
{
_context = ctxt;
_out = out;
_outBuffer = ctxt.allocWriteEncodingBuffer();
/* Max. expansion for a single char (in unmodified UTF-8) is
* 4 bytes (or 3 depending on how you view it -- 4 when recombining
* surrogate pairs)
*/
_outBufferEnd = _outBuffer.length - 4;
_outPtr = 0;
}
@Override
public Writer append(char c)
throws IOException
{
write(c);
return this;
}
@Override
public void close()
throws IOException
{
if (_out != null) {
if (_outPtr > 0) {
_out.write(_outBuffer, 0, _outPtr);
_outPtr = 0;
}
OutputStream out = _out;
_out = null;
byte[] buf = _outBuffer;
if (buf != null) {
_outBuffer = null;
_context.releaseWriteEncodingBuffer(buf);
}
out.close();
/* Let's 'flush' orphan surrogate, no matter what; but only
* after cleanly closing everything else.
*/
int code = _surrogate;
_surrogate = 0;
if (code > 0) {
illegalSurrogate(code);
}
}
}
@Override
public void flush()
throws IOException
{
if (_out != null) {
if (_outPtr > 0) {
_out.write(_outBuffer, 0, _outPtr);
_outPtr = 0;
}
_out.flush();
}
}
@Override
public void write(char[] cbuf)
throws IOException
{
write(cbuf, 0, cbuf.length);
}
@Override
public void write(char[] cbuf, int off, int len)
throws IOException
{
if (len < 2) {
if (len == 1) {
write(cbuf[off]);
}
return;
}
// First: do we have a leftover surrogate to deal with?
if (_surrogate > 0) {
char second = cbuf[off++];
--len;
write(convertSurrogate(second));
// will have at least one more char
}
int outPtr = _outPtr;
byte[] outBuf = _outBuffer;
int outBufLast = _outBufferEnd; // has 4 'spare' bytes
// All right; can just loop it nice and easy now:
len += off; // len will now be the end of input buffer
output_loop:
for (; off < len; ) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
if (outPtr >= outBufLast) {
_out.write(outBuf, 0, outPtr);
outPtr = 0;
}
int c = cbuf[off++];
// And then see if we have an Ascii char:
if (c < 0x80) { // If so, can do a tight inner loop:
outBuf[outPtr++] = (byte)c;
// Let's calc how many ascii chars we can copy at most:
int maxInCount = (len - off);
int maxOutCount = (outBufLast - outPtr);
if (maxInCount > maxOutCount) {
maxInCount = maxOutCount;
}
maxInCount += off;
ascii_loop:
while (true) {
if (off >= maxInCount) { // done with max. ascii seq
continue output_loop;
}
c = cbuf[off++];
if (c >= 0x80) {
break ascii_loop;
}
outBuf[outPtr++] = (byte) c;
}
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
} else { // 3 or 4 bytes
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) {
outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
_outPtr = outPtr;
illegalSurrogate(c);
}
_surrogate = c;
// and if so, followed by another from next range
if (off >= len) { // unless we hit the end?
break;
}
c = convertSurrogate(cbuf[off++]);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_outPtr = outPtr;
illegalSurrogate(c);
}
outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
}
}
_outPtr = outPtr;
}
@Override
public void write(int c) throws IOException
{
// First; do we have a left over surrogate?
if (_surrogate > 0) {
c = convertSurrogate(c);
// If not, do we start with a surrogate?
} else if (c >= SURR1_FIRST && c <= SURR2_LAST) {
// Illegal to get second part without first:
if (c > SURR1_LAST) {
illegalSurrogate(c);
}
// First part just needs to be held for now
_surrogate = c;
return;
}
if (_outPtr >= _outBufferEnd) { // let's require enough room, first
_out.write(_outBuffer, 0, _outPtr);
_outPtr = 0;
}
if (c < 0x80) { // ascii
_outBuffer[_outPtr++] = (byte) c;
} else {
int ptr = _outPtr;
if (c < 0x800) { // 2-byte
_outBuffer[ptr++] = (byte) (0xc0 | (c >> 6));
_outBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
} else if (c <= 0xFFFF) { // 3 bytes
_outBuffer[ptr++] = (byte) (0xe0 | (c >> 12));
_outBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
_outBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
} else { // 4 bytes
if (c > 0x10FFFF) { // illegal
illegalSurrogate(c);
}
_outBuffer[ptr++] = (byte) (0xf0 | (c >> 18));
_outBuffer[ptr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
_outBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
_outBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
}
_outPtr = ptr;
}
}
@Override
public void write(String str) throws IOException
{
write(str, 0, str.length());
}
@Override
public void write(String str, int off, int len) throws IOException
{
if (len < 2) {
if (len == 1) {
write(str.charAt(off));
}
return;
}
// First: do we have a leftover surrogate to deal with?
if (_surrogate > 0) {
char second = str.charAt(off++);
--len;
write(convertSurrogate(second));
// will have at least one more char (case of 1 char was checked earlier on)
}
int outPtr = _outPtr;
byte[] outBuf = _outBuffer;
int outBufLast = _outBufferEnd; // has 4 'spare' bytes
// All right; can just loop it nice and easy now:
len += off; // len will now be the end of input buffer
output_loop:
for (; off < len; ) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
if (outPtr >= outBufLast) {
_out.write(outBuf, 0, outPtr);
outPtr = 0;
}
int c = str.charAt(off++);
// And then see if we have an Ascii char:
if (c < 0x80) { // If so, can do a tight inner loop:
outBuf[outPtr++] = (byte)c;
// Let's calc how many ascii chars we can copy at most:
int maxInCount = (len - off);
int maxOutCount = (outBufLast - outPtr);
if (maxInCount > maxOutCount) {
maxInCount = maxOutCount;
}
maxInCount += off;
ascii_loop:
while (true) {
if (off >= maxInCount) { // done with max. ascii seq
continue output_loop;
}
c = str.charAt(off++);
if (c >= 0x80) {
break ascii_loop;
}
outBuf[outPtr++] = (byte) c;
}
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
} else { // 3 or 4 bytes
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) {
outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
_outPtr = outPtr;
illegalSurrogate(c);
}
_surrogate = c;
// and if so, followed by another from next range
if (off >= len) { // unless we hit the end?
break;
}
c = convertSurrogate(str.charAt(off++));
if (c > 0x10FFFF) { // illegal, as per RFC 4627
_outPtr = outPtr;
illegalSurrogate(c);
}
outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
}
}
_outPtr = outPtr;
}
/*
/**********************************************************
/* Internal methods
/**********************************************************
*/
/**
* Method called to calculate UTF codepoint, from a surrogate pair.
*/
protected int convertSurrogate(int secondPart)
throws IOException
{
int firstPart = _surrogate;
_surrogate = 0;
// Ok, then, is the second part valid?
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
throw new IOException("Broken surrogate pair: first char 0x"+Integer.toHexString(firstPart)+", second 0x"+Integer.toHexString(secondPart)+"; illegal combination");
}
return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
}
protected static void illegalSurrogate(int code) throws IOException {
throw new IOException(illegalSurrogateDesc(code));
}
protected static String illegalSurrogateDesc(int code)
{
if (code > 0x10FFFF) { // over max?
return "Illegal character point (0x"+Integer.toHexString(code)+") to output; max is 0x10FFFF as per RFC 4627";
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
return "Unmatched first part of surrogate pair (0x"+Integer.toHexString(code)+")";
}
return "Unmatched second part of surrogate pair (0x"+Integer.toHexString(code)+")";
}
// should we ever get this?
return "Illegal character point (0x"+Integer.toHexString(code)+") to output";
}
}