com.ibm.icu.charset.CharsetUTF16 Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j-charset Show documentation
Show all versions of icu4j-charset Show documentation
icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2006-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* @author Niti Hantaweepant
*/
class CharsetUTF16 extends CharsetICU {
private static final int SIGNATURE_LENGTH = 2;
private static final byte[] fromUSubstitution_BE = { (byte) 0xff, (byte) 0xfd };
private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff };
private static final byte[] BOM_BE = { (byte) 0xfe, (byte) 0xff };
private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe };
private static final int ENDIAN_XOR_BE = 0;
private static final int ENDIAN_XOR_LE = 1;
private static final int NEED_TO_WRITE_BOM = 1;
private boolean isEndianSpecified;
private boolean isBigEndian;
private int endianXOR;
private byte[] bom;
private byte[] fromUSubstitution;
private int version;
public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
super(icuCanonicalName, javaCanonicalName, aliases);
/* Get the version number (e.g. UTF-16LE,version=1) */
int versionIndex = icuCanonicalName.indexOf("version=");
if (versionIndex > 0) {
version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
} else {
version = 0;
}
this.isEndianSpecified = (this instanceof CharsetUTF16BE || this instanceof CharsetUTF16LE);
this.isBigEndian = !(this instanceof CharsetUTF16LE);
if (isBigEndian) {
this.bom = BOM_BE;
this.fromUSubstitution = fromUSubstitution_BE;
this.endianXOR = ENDIAN_XOR_BE;
} else {
this.bom = BOM_LE;
this.fromUSubstitution = fromUSubstitution_LE;
this.endianXOR = ENDIAN_XOR_LE;
}
// UnicodeBig and UnicodeLittle used to require maxBytesPerChar set to 4 in Java 5 or less,
// but it's no longer necessary for newer Java versions. Java 5 or older runtime is no
// longer supported.
maxBytesPerChar = 2;
minBytesPerChar = 2;
maxCharsPerByte = 1;
}
class CharsetDecoderUTF16 extends CharsetDecoderICU {
private boolean isBOMReadYet;
private int actualEndianXOR;
private byte[] actualBOM;
public CharsetDecoderUTF16(CharsetICU cs) {
super(cs);
}
@Override
protected void implReset() {
super.implReset();
isBOMReadYet = false;
actualBOM = null;
}
@Override
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
/*
* If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
* converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
* are in the current buffer.
*/
if (!isBOMReadYet) {
while (true) {
if (!source.hasRemaining())
return CoderResult.UNDERFLOW;
toUBytesArray[toULength++] = source.get();
if (toULength == 1) {
// on the first byte, we haven't decided whether or not it's bigEndian yet
if ((!isEndianSpecified || isBigEndian)
&& toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
actualBOM = BOM_BE;
actualEndianXOR = ENDIAN_XOR_BE;
} else if ((!isEndianSpecified || !isBigEndian)
&& toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
actualBOM = BOM_LE;
actualEndianXOR = ENDIAN_XOR_LE;
} else {
// we do not have a BOM (and we have toULength==1 bytes)
if (isEndianSpecified && version == 1) {
actualBOM = isBigEndian ? CharsetUTF16.BOM_BE : CharsetUTF16.BOM_LE;
actualEndianXOR = isBigEndian ? CharsetUTF16.ENDIAN_XOR_BE : CharsetUTF16.ENDIAN_XOR_LE;
} else {
actualBOM = null;
actualEndianXOR = endianXOR;
}
break;
}
} else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
return CoderResult.malformedForLength(2);
} else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
// we found a BOM! at last!
// too bad we have to get ignore it now (like it was unwanted or something)
toULength = 0;
break;
} else if (isEndianSpecified || toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
// we do not have a BOM (and we have toULength bytes)
actualBOM = null;
actualEndianXOR = endianXOR;
break;
} else if (toULength == SIGNATURE_LENGTH) {
// we found a BOM! at last!
// too bad we have to get ignore it now (like it was unwanted or something)
toULength = 0;
break;
}
}
isBOMReadYet = true;
}
// now that we no longer need to look for a BOM, let's do some work
// if we have unfinished business
if (toUnicodeStatus != 0) {
CoderResult cr = decodeTrail(source, target, offsets, (char) toUnicodeStatus);
if (cr != null)
return cr;
}
char char16;
while (true) {
while (toULength < 2) {
if (!source.hasRemaining())
return CoderResult.UNDERFLOW;
toUBytesArray[toULength++] = source.get();
}
if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
return CoderResult.malformedForLength(2);
} else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
// we found a BOM! at last!
// too bad we have to get ignore it now (like it was unwanted or something)
toULength = 0;
continue;
}
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
char16 = (char) (((toUBytesArray[0 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[1 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
if (!UTF16.isSurrogate(char16)) {
toULength = 0;
target.put(char16);
} else {
CoderResult cr = decodeTrail(source, target, offsets, char16);
if (cr != null)
return cr;
}
}
}
private final CoderResult decodeTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, char lead) {
if (!UTF16.isLeadSurrogate(lead)) {
// 2 bytes, lead malformed
toUnicodeStatus = 0;
return CoderResult.malformedForLength(2);
}
while (toULength < 4) {
if (!source.hasRemaining()) {
// let this be unfinished business
toUnicodeStatus = lead;
return CoderResult.UNDERFLOW;
}
toUBytesArray[toULength++] = source.get();
}
char trail = (char) (((toUBytesArray[2 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[3 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
if (!UTF16.isTrailSurrogate(trail)) {
// pretend like we didnt read the last 2 bytes
toULength = 2;
source.position(source.position() - 2);
// 2 bytes, lead malformed
toUnicodeStatus = 0;
return CoderResult.malformedForLength(2);
}
toUnicodeStatus = 0;
toULength = 0;
target.put(lead);
if (target.hasRemaining()) {
target.put(trail);
return null;
} else {
/* Put in overflow buffer (not handled here) */
charErrorBufferArray[0] = trail;
charErrorBufferLength = 1;
return CoderResult.OVERFLOW;
}
}
}
class CharsetEncoderUTF16 extends CharsetEncoderICU {
private final byte[] temp = new byte[4];
public CharsetEncoderUTF16(CharsetICU cs) {
super(cs, fromUSubstitution);
fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
}
@Override
protected void implReset() {
super.implReset();
fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
}
@Override
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
CoderResult cr;
/* write the BOM if necessary */
if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
fromUnicodeStatus = 0;
cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
if (cr.isOverflow())
return cr;
}
if (fromUChar32 != 0) {
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
// a note: fromUChar32 will either be 0 or a lead surrogate
cr = encodeChar(source, target, offsets, (char) fromUChar32);
if (cr != null)
return cr;
}
while (true) {
if (!source.hasRemaining())
return CoderResult.UNDERFLOW;
if (!target.hasRemaining())
return CoderResult.OVERFLOW;
cr = encodeChar(source, target, offsets, source.get());
if (cr != null)
return cr;
}
}
private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
int sourceIndex = source.position() - 1;
CoderResult cr;
if (UTF16.isSurrogate(ch)) {
cr = handleSurrogates(source, ch);
if (cr != null)
return cr;
char trail = UTF16.getTrailSurrogate(fromUChar32);
fromUChar32 = 0;
// 4 bytes
temp[0 ^ endianXOR] = (byte) (ch >>> 8);
temp[1 ^ endianXOR] = (byte) (ch);
temp[2 ^ endianXOR] = (byte) (trail >>> 8);
temp[3 ^ endianXOR] = (byte) (trail);
cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
} else {
// 2 bytes
temp[0 ^ endianXOR] = (byte) (ch >>> 8);
temp[1 ^ endianXOR] = (byte) (ch);
cr = fromUWriteBytes(this, temp, 0, 2, target, offsets, sourceIndex);
}
return (cr.isUnderflow() ? null : cr);
}
}
@Override
public CharsetDecoder newDecoder() {
return new CharsetDecoderUTF16(this);
}
@Override
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF16(this);
}
@Override
void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
getNonSurrogateUnicodeSet(setFillIn);
}
}