com.fasterxml.jackson.core.io.UTF8Writer Maven / Gradle / Ivy
package com.fasterxml.jackson.core.io;
import java.io.*;
public final class UTF8Writer extends Writer
{
final static int SURR1_FIRST = 0xD800;
final static int SURR1_LAST = 0xDBFF;
final static int SURR2_FIRST = 0xDC00;
final static int SURR2_LAST = 0xDFFF;
/**
* @since 2.17
*/
public static final int SURROGATE_BASE = 0x10000 - UTF8Writer.SURR2_FIRST - (UTF8Writer.SURR1_FIRST << 10);
final private IOContext _context;
private OutputStream _out;
private byte[] _outBuffer;
final private int _outBufferEnd;
private int _outPtr;
/**
* When outputting chars from BMP, surrogate pairs need to be coalesced.
* To do this, both pairs must be known first; and since it is possible
* pairs may be split, we need temporary storage for the first half
*/
private int _surrogate;
public UTF8Writer(IOContext ctxt, OutputStream out)
{
_context = ctxt;
_out = out;
_outBuffer = ctxt.allocWriteEncodingBuffer();
/* Max. expansion for a single char (in unmodified UTF-8) is
* 4 bytes (or 3 depending on how you view it -- 4 when recombining
* surrogate pairs)
*/
_outBufferEnd = _outBuffer.length - 4;
_outPtr = 0;
}
@Override
public Writer append(char c)
throws IOException
{
write(c);
return this;
}
@Override
public void close()
throws IOException
{
if (_out != null) {
if (_outPtr > 0) {
_out.write(_outBuffer, 0, _outPtr);
_outPtr = 0;
}
OutputStream out = _out;
_out = null;
byte[] buf = _outBuffer;
if (buf != null) {
_outBuffer = null;
_context.releaseWriteEncodingBuffer(buf);
}
out.close();
// Let's 'flush' orphan surrogate, no matter what; but only
// after cleanly closing everything else.
int code = _surrogate;
_surrogate = 0;
if (code > 0) {
illegalSurrogate(code);
}
}
_context.close();
}
@Override
public void flush()
throws IOException
{
if (_out != null) {
if (_outPtr > 0) {
_out.write(_outBuffer, 0, _outPtr);
_outPtr = 0;
}
_out.flush();
}
}
@Override
public void write(char[] cbuf)
throws IOException
{
write(cbuf, 0, cbuf.length);
}
@Override
public void write(char[] cbuf, int off, int len)
throws IOException
{
if (len < 2) {
if (len == 1) {
write(cbuf[off]);
}
return;
}
// First: do we have a leftover surrogate to deal with?
if (_surrogate > 0) {
char second = cbuf[off++];
--len;
write(convertSurrogate(second));
// will have at least one more char
}
int outPtr = _outPtr;
byte[] outBuf = _outBuffer;
int outBufLast = _outBufferEnd; // has 4 'spare' bytes
// All right; can just loop it nice and easy now:
len += off; // len will now be the end of input buffer
output_loop:
for (; off < len; ) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
if (outPtr >= outBufLast) {
_out.write(outBuf, 0, outPtr);
outPtr = 0;
}
int c = cbuf[off++];
// And then see if we have an Ascii char:
if (c < 0x80) { // If so, can do a tight inner loop:
outBuf[outPtr++] = (byte)c;
// Let's calc how many ascii chars we can copy at most:
int maxInCount = (len - off);
int maxOutCount = (outBufLast - outPtr);
if (maxInCount > maxOutCount) {
maxInCount = maxOutCount;
}
maxInCount += off;
ascii_loop:
while (true) {
if (off >= maxInCount) { // done with max. ascii seq
continue output_loop;
}
c = cbuf[off++];
if (c >= 0x80) {
break ascii_loop;
}
outBuf[outPtr++] = (byte) c;
}
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
} else { // 3 or 4 bytes
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) {
outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
_outPtr = outPtr;
illegalSurrogate(c);
}
_surrogate = c;
// and if so, followed by another from next range
if (off >= len) { // unless we hit the end?
break;
}
c = convertSurrogate(cbuf[off++]);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_outPtr = outPtr;
illegalSurrogate(c);
}
outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
}
}
_outPtr = outPtr;
}
@Override
public void write(int c) throws IOException
{
// First; do we have a left over surrogate?
if (_surrogate > 0) {
c = convertSurrogate(c);
// If not, do we start with a surrogate?
} else if (c >= SURR1_FIRST && c <= SURR2_LAST) {
// Illegal to get second part without first:
if (c > SURR1_LAST) {
illegalSurrogate(c);
}
// First part just needs to be held for now
_surrogate = c;
return;
}
if (_outPtr >= _outBufferEnd) { // let's require enough room, first
_out.write(_outBuffer, 0, _outPtr);
_outPtr = 0;
}
if (c < 0x80) { // ascii
_outBuffer[_outPtr++] = (byte) c;
} else {
int ptr = _outPtr;
if (c < 0x800) { // 2-byte
_outBuffer[ptr++] = (byte) (0xc0 | (c >> 6));
_outBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
} else if (c <= 0xFFFF) { // 3 bytes
_outBuffer[ptr++] = (byte) (0xe0 | (c >> 12));
_outBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
_outBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
} else { // 4 bytes
if (c > 0x10FFFF) { // illegal
illegalSurrogate(c);
}
_outBuffer[ptr++] = (byte) (0xf0 | (c >> 18));
_outBuffer[ptr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
_outBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
_outBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
}
_outPtr = ptr;
}
}
@Override
public void write(String str) throws IOException
{
write(str, 0, str.length());
}
@Override
public void write(String str, int off, int len) throws IOException
{
if (len < 2) {
if (len == 1) {
write(str.charAt(off));
}
return;
}
// First: do we have a leftover surrogate to deal with?
if (_surrogate > 0) {
char second = str.charAt(off++);
--len;
write(convertSurrogate(second));
// will have at least one more char (case of 1 char was checked earlier on)
}
int outPtr = _outPtr;
byte[] outBuf = _outBuffer;
int outBufLast = _outBufferEnd; // has 4 'spare' bytes
// All right; can just loop it nice and easy now:
len += off; // len will now be the end of input buffer
output_loop:
for (; off < len; ) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
if (outPtr >= outBufLast) {
_out.write(outBuf, 0, outPtr);
outPtr = 0;
}
int c = str.charAt(off++);
// And then see if we have an Ascii char:
if (c < 0x80) { // If so, can do a tight inner loop:
outBuf[outPtr++] = (byte)c;
// Let's calc how many ascii chars we can copy at most:
int maxInCount = (len - off);
int maxOutCount = (outBufLast - outPtr);
if (maxInCount > maxOutCount) {
maxInCount = maxOutCount;
}
maxInCount += off;
ascii_loop:
while (true) {
if (off >= maxInCount) { // done with max. ascii seq
continue output_loop;
}
c = str.charAt(off++);
if (c >= 0x80) {
break ascii_loop;
}
outBuf[outPtr++] = (byte) c;
}
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
} else { // 3 or 4 bytes
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) {
outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
_outPtr = outPtr;
illegalSurrogate(c);
}
_surrogate = c;
// and if so, followed by another from next range
if (off >= len) { // unless we hit the end?
break;
}
c = convertSurrogate(str.charAt(off++));
if (c > 0x10FFFF) { // illegal, as per RFC 4627
_outPtr = outPtr;
illegalSurrogate(c);
}
outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
}
}
_outPtr = outPtr;
}
/*
/**********************************************************
/* Internal methods
/**********************************************************
*/
/**
* Method called to calculate Unicode code-point, from a surrogate pair.
*
* @param secondPart Second UTF-16 unit of surrogate (first part stored in {@code _surrogate})
*
* @return Decoded Unicode point
*
* @throws IOException If surrogate pair is invalid
*/
protected int convertSurrogate(int secondPart)
throws IOException
{
int firstPart = _surrogate;
_surrogate = 0;
// Ok, then, is the second part valid?
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
throw new IOException("Broken surrogate pair: first char 0x"+Integer.toHexString(firstPart)+", second 0x"+Integer.toHexString(secondPart)+"; illegal combination");
}
return (firstPart << 10) + secondPart + UTF8Writer.SURROGATE_BASE;
}
protected static void illegalSurrogate(int code) throws IOException {
throw new IOException(illegalSurrogateDesc(code));
}
protected static String illegalSurrogateDesc(int code)
{
if (code > 0x10FFFF) { // over max?
return "Illegal character point (0x"+Integer.toHexString(code)+") to output; max is 0x10FFFF as per RFC 4627";
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
return "Unmatched first part of surrogate pair (0x"+Integer.toHexString(code)+")";
}
return "Unmatched second part of surrogate pair (0x"+Integer.toHexString(code)+")";
}
// should we ever get this?
return "Illegal character point (0x"+Integer.toHexString(code)+") to output";
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy