com.ibm.icu.charset.CharsetDecoderICU Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of icu4j-charset Show documentation
Show all versions of icu4j-charset Show documentation
icu4j-charset is a supplemental library for icu4j, implementing Java Charset SPI.
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import com.ibm.icu.impl.Assert;
/**
* An abstract class that provides framework methods of decoding operations for concrete
* subclasses.
* In the future this class will contain API that will implement converter semantics of ICU4C.
* @stable ICU 3.6
*/
public abstract class CharsetDecoderICU extends CharsetDecoder{
/* Converter-specific status word; cleared to 0 by implReset(). */
int toUnicodeStatus;
/*
 * Input bytes of the sequence currently being processed. When an error is handled,
 * the first toULength bytes are copied into invalidCharBuffer before the callback runs.
 */
byte[] toUBytesArray = new byte[128];
/* NOTE(review): presumably the start offset into toUBytesArray; not read by the code visible here. */
int toUBytesBegin = 0;
/* Number of pending bytes held in toUBytesArray; 0 means no partial sequence is stored. */
int toULength;
/* Output chars that did not fit into the target; decode() drains them first on its next call. */
char[] charErrorBufferArray = new char[128];
/* Number of chars currently stored in charErrorBufferArray. */
int charErrorBufferLength;
/* Cleared by implReset(). NOTE(review): not read by the code visible here. */
int charErrorBufferBegin;
/* Receives a copy of the offending input units and is handed to the error callback. */
char[] invalidCharBuffer = new char[128];
/* Number of units stored in invalidCharBuffer for the most recent error. */
int invalidCharLength;
/**
* Maximum number of indexed bytes
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected static final int EXT_MAX_BYTES = 0x1f;
/* store previous UChars/chars to continue partial matches */
byte[] preToUArray = new byte[EXT_MAX_BYTES];
int preToUBegin;
int preToULength; /* negative: replay */
int preToUFirstLength; /* length of first character */
/* Converter-specific mode; cleared to 0 by implReset(). */
int mode;
/* Opaque context object passed to the error callback; replaced through setToUCallback(). */
Object toUContext = null;
/* Callbacks for the two error kinds; both default to the STOP callback. */
private CharsetCallback.Decoder onUnmappableCharacter = CharsetCallback.TO_U_CALLBACK_STOP;
private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
/*
 * Error dispatcher used by the conversion loop: forwards the error to the callback
 * currently registered for its kind (unmappable character vs. malformed input).
 */
CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
    @Override
    public CoderResult call(CharsetDecoderICU decoder, Object context, ByteBuffer source,
            CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr) {
        // Every result that is not "unmappable" is routed to the malformed-input callback.
        CharsetCallback.Decoder handler =
                cr.isUnmappable() ? onUnmappableCharacter : onMalformedInput;
        return handler.call(decoder, context, source, target, offsets, buffer, length, cr);
    }
};
// exist to keep implOnMalformedInput and implOnUnmappableCharacter from being too recursive
private boolean malformedInputCalled = false;
private boolean unmappableCharacterCalled = false;
/*
* Constructs a CharsetDecoderICU based on the information provided by a CharsetICU object.
* The superclass is given 1/maxCharsPerByte as the average chars-per-byte value and
* cs.maxCharsPerByte as the maximum.
*
* @param cs The CharsetICU object describing the charset to decode.
*/
CharsetDecoderICU(CharsetICU cs) {
super(cs, (1/cs.maxCharsPerByte), cs.maxCharsPerByte);
}
/*
* Is this Decoder allowed to use fallbacks? A fallback mapping is a mapping
* that will convert a byte sequence to a Unicode codepoint sequence, but
* the encoded Unicode codepoint sequence will round trip convert to a different
* byte sequence. In ICU, this can be called a reverse fallback.
* @return Always true in this implementation.
*/
final boolean isFallbackUsed() {
return true;
}
/**
* Fallback is currently always used by icu4j decoders.
* Delegates to {@code isToUUseFallback(true)}.
* @return Always true.
*/
static final boolean isToUUseFallback() {
return isToUUseFallback(true);
}
/**
* Fallback is currently always used by icu4j decoders.
* @param iUseFallback ignored; the result does not depend on it.
* @return Always true.
*/
static final boolean isToUUseFallback(boolean iUseFallback) {
return true;
}
/**
 * Selects the ICU callback that handles malformed (illegal) input sequences.
 * For {@code REPLACE}, the java.nio action is switched to {@code IGNORE} so that
 * java.nio does not perform the replacement itself; the ICU callback chosen here does.
 *
 * @param newAction action to be taken
 * @stable ICU 3.6
 */
@Override
protected final void implOnMalformedInput(CodingErrorAction newAction) {
    // Re-entered through the super.onMalformedInput() call below: stop the recursion here.
    if (malformedInputCalled) {
        return;
    }

    if (CodingErrorAction.REPLACE == newAction) {
        // do not let the nio layer replace; the flag guards the nested invocation
        malformedInputCalled = true;
        super.onMalformedInput(CodingErrorAction.IGNORE);
        malformedInputCalled = false;
    }

    onMalformedInput = getCallback(newAction);
}
/**
 * Selects the ICU callback that handles unmappable characters.
 * For {@code REPLACE}, the java.nio action is switched to {@code IGNORE} so that
 * java.nio does not perform the replacement itself; the ICU callback chosen here does.
 *
 * @param newAction action to be taken
 * @stable ICU 3.6
 */
@Override
protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
    // Re-entered through the super.onUnmappableCharacter() call below: stop the recursion here.
    if (unmappableCharacterCalled) {
        return;
    }

    if (CodingErrorAction.REPLACE == newAction) {
        // do not let the nio layer replace; the flag guards the nested invocation
        unmappableCharacterCalled = true;
        super.onUnmappableCharacter(CodingErrorAction.IGNORE);
        unmappableCharacterCalled = false;
    }

    onUnmappableCharacter = getCallback(newAction);
}
/**
 * Sets the decoder callback and context to be used when an error of the given kind is
 * encountered. You would normally call this twice, once for the malformed and once for the
 * unmappable error. In that case newContext should remain the same, since passing a
 * different newContext replaces the one stored by the previous call.
 * A result that is neither malformed nor unmappable leaves both callbacks untouched.
 *
 * @param err CoderResult identifying the error kind (malformed or unmappable)
 * @param newCallback CharsetCallback.Decoder to install for that error kind
 * @param newContext Object handed to the callback when it is invoked
 * @stable ICU 4.0
 */
public final void setToUCallback(CoderResult err, CharsetCallback.Decoder newCallback, Object newContext) {
    if (err.isMalformed()) {
        onMalformedInput = newCallback;
    } else if (err.isUnmappable()) {
        onUnmappableCharacter = newCallback;
    }
    // Only malformed and unmappable results are handled; anything else changes no callback.

    // Keep the stored context when it already equals the new one.
    boolean contextUnchanged = toUContext != null && toUContext.equals(newContext);
    if (!contextUnchanged) {
        toUContext = newContext;
    }
}
/*
 * Maps a java.nio CodingErrorAction to the matching ICU decoder callback:
 * REPLACE -> substitute, IGNORE -> skip, anything else (REPORT) -> stop.
 */
private static CharsetCallback.Decoder getCallback(CodingErrorAction action) {
    if (action == CodingErrorAction.REPLACE) {
        return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
    }
    if (action == CodingErrorAction.IGNORE) {
        return CharsetCallback.TO_U_CALLBACK_SKIP;
    }
    // CodingErrorAction.REPORT and any other value
    return CharsetCallback.TO_U_CALLBACK_STOP;
}
/* Zero-capacity input buffer passed to decode() when flushing (no further input exists). */
private final ByteBuffer EMPTY = ByteBuffer.allocate(0);
/**
* Flushes any characters saved in the converter's internal buffer and
* resets the converter.
* Implemented as a decode of an empty input buffer with the flush flag set.
* @param out buffer that receives the flushed characters
* @return result of flushing action and completes the decoding all input.
* Returns CoderResult.UNDERFLOW if the action succeeds.
* @stable ICU 3.6
*/
@Override
protected final CoderResult implFlush(CharBuffer out) {
return decode(EMPTY, out, null, true);
}
/**
 * Resets the to-Unicode state of the converter: status word, mode, pending input,
 * overflow output and partial-match (replay) bookkeeping are all cleared to zero.
 * @stable ICU 3.6
 */
@Override
protected void implReset() {
    // converter status and pending input
    toUnicodeStatus = toULength = mode = 0;

    // overflow output buffer
    charErrorBufferBegin = charErrorBufferLength = 0;

    // previous units stored to continue partial matches (negative preToULength means replay)
    preToUBegin = preToULength = preToUFirstLength = 0;
}
/**
* Decodes one or more bytes. The default behaviour of the converter
* is stop and report if an error in input stream is encountered.
* To set different behaviour use {@link CharsetDecoder#onMalformedInput(CodingErrorAction)}.
* This method allows a buffer by buffer conversion of a data stream.
* The state of the conversion is saved between calls to convert.
* Among other things, this means multibyte input sequences can be
* split between calls. If a call to convert results in an Error, the
* conversion may be continued by calling convert again with suitably
* modified parameters. All conversions should be finished with a call to
* the flush method.
* @param in buffer to decode
* @param out buffer to populate with decoded result
* @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
* action succeeds or more input is needed for completing the decoding action.
* @stable ICU 3.6
*/
@Override
protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){
// Fewer bytes available than the converter already holds internally: ask for more input.
if(in.remaining() < toUCountPending()){
return CoderResult.UNDERFLOW;
}
// if (!in.hasRemaining()) {
// toULength = 0;
// return CoderResult.UNDERFLOW;
// }
// Skip the bytes counted as pending in the converter state before decoding.
// NOTE(review): presumably these bytes were left unconsumed in `in` by the previous call
// (see the rewind below) - confirm against the caller's buffer handling.
in.position(in.position() + toUCountPending());
/* do the conversion */
CoderResult ret = decode(in, out, null, false);
// ok was there input held in the previous invocation of decodeLoop
// that resulted in output in this invocation?
// Rewind by the pending count as it stands AFTER decode(); decode() may have changed it,
// so this value can differ from the amount skipped above.
in.position(in.position() - toUCountPending());
return ret;
}
/*
* Implements the ICU semantic for decode operation.
* Concrete converters supply the actual byte-to-char conversion here; it is invoked
* repeatedly by the conversion loop.
* @param in The input byte buffer
* @param out The output character buffer
* @param offsets Optional buffer for source offsets; may be null (decode() passes it through as given)
* @param flush true if, and only if, the invoker can provide no
* additional input bytes beyond those in the given buffer.
* @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
* action succeeds or more input is needed for completing the decoding action.
*/
abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush);
/*
 * Implements the ICU semantic for decode operation.
 * First drains any output left in the overflow buffer by a previous call, then, unless
 * there is nothing left to do, hands over to the conversion loop with callback handling.
 *
 * @param source The input byte buffer
 * @param target The output character buffer
 * @param offsets Optional buffer receiving source offsets; may be null
 * @param flush true if, and only if, the invoker can provide no
 *              additional input bytes beyond those in the given buffer.
 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
 *         action succeeds or more input is needed for completing the decoding action.
 *         Returns CoderResult.OVERFLOW if the target fills up while the overflow buffer
 *         is being drained.
 * @throws IllegalArgumentException if source or target is null
 */
final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
    /* check parameters */
    if (target == null || source == null) {
        throw new IllegalArgumentException();
    }

    /*
     * Note: the ICU4C implementation additionally rejects buffers whose sizes exceed the
     * int32_t range (sizes and offsets are int32_t there). That check has not been ported.
     */

    /* flush the target overflow buffer */
    if (charErrorBufferLength > 0) {
        int i = 0;
        do {
            if (!target.hasRemaining()) {
                /*
                 * The overflow buffer contains too much: move the rest to the front and keep it.
                 * The count is stored as a plain int; narrowing it to byte (as the C port did)
                 * would turn a full 128-char buffer into a negative length.
                 */
                int remaining = charErrorBufferLength - i;
                System.arraycopy(charErrorBufferArray, i, charErrorBufferArray, 0, remaining);
                charErrorBufferLength = remaining;
                return CoderResult.OVERFLOW;
            }

            /* copy the overflow contents to the target */
            target.put(charErrorBufferArray[i++]);
            if (offsets != null) {
                offsets.put(-1); /* no source index available for old output */
            }
        } while (i < charErrorBufferLength);

        /* the overflow buffer is completely copied to the target */
        charErrorBufferLength = 0;
    }

    if (!flush && !source.hasRemaining() && toULength == 0 && preToULength >= 0) {
        /* the overflow buffer is emptied and there is no new input: we are done */
        return CoderResult.UNDERFLOW;
    }

    /*
     * Do not simply return with a buffer overflow error if
     * !flush && the target is full
     * because it is possible that the source will not generate any output.
     * For example, the skip callback may be called;
     * it does not output anything.
     */
    return toUnicodeWithCallback(source, target, offsets, flush);
}
/* Currently, we are not using offsets in ICU4J. */
/* private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
int limit;
int delta, offset;
if(sourceIndex>=0) {
/*
* adjust each offset by adding the previous sourceIndex
* minus the length of the input sequence that caused an
* error, if any
*/
/* delta=sourceIndex-errorInputLength;
} else {
/*
* set each offset to -1 because this conversion function
* does not handle offsets
*/
/* delta=-1;
}
limit=offsets.position()+length;
if(delta==0) {
/* most common case, nothing to do */
/* } else if(delta>0) {
/* add the delta to each offset (but not if the offset is <0) */
/* while(offsets.position()=0) {
offsets.put(offset+delta);
}
//FIXME: ++offsets;
}
} else /* delta<0 */ /* {
/*
* set each offset to -1 because this conversion function
* does not handle offsets
* or the error input sequence started in a previous buffer
*/
/* while(offsets.position()=0) {
/* normal mode */
} else {
/*
* Previous m:n conversion stored source units from a partial match
* and failed to consume all of them.
* We need to "replay" them from a temporary buffer and convert them first.
*/
realSource=source;
realFlush=flush;
realSourceIndex=sourceIndex;
//UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
replayArray.put(preToUArray,0, -preToULength);
source=replayArray;
source.position(0);
source.limit(replayArrayIndex-preToULength);
flush=false;
sourceIndex=-1;
preToULength=0;
}
/*
* loop for conversion and error handling
*
* loop {
* convert
* loop {
* update offsets
* handle end of input
* handle errors/call callback
* }
* }
*/
for(;;) {
/* convert */
cr = decodeLoop(source, target, offsets, flush);
/*
* set a flag for whether the converter
* successfully processed the end of the input
*
* need not check cnv->preToULength==0 because a replay (<0) will cause
* s0) {
updateOffsets(offsets, length, sourceIndex, errorInputLength);
/*
* if a converter handles offsets and updates the offsets
* pointer at the end, then pArgs->offset should not change
* here;
* however, some converters do not handle offsets at all
* (sourceIndex<0) or may not update the offsets pointer
*/
//TODO: pArgs->offsets=offsets+=length;
/* }
if(sourceIndex>=0) {
sourceIndex+=(source.position()-s);
}
} */
if(preToULength<0) {
/*
* switch the source to new replay units (cannot occur while replaying)
* after offset handling and before end-of-input and callback handling
*/
if(realSource==null)
{
realSource=source;
realFlush=flush;
realSourceIndex=sourceIndex;
//UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
replayArray.put(preToUArray,0, -preToULength);
// reset position
replayArray.position(0);
source=replayArray;
source.limit(replayArrayIndex-preToULength);
flush=false;
if((sourceIndex+=preToULength)<0) {
sourceIndex=-1;
}
preToULength=0;
} else {
/* see implementation note before _fromUnicodeWithCallback() */
//agljport:todo U_ASSERT(realSource==NULL);
Assert.assrt(realSource==null);
}
}
/* update pointers */
s=source.position();
//t=target.position();
if(cr.isUnderflow()) {
if(s0) {
/*
* the entire input stream is consumed
* and there is a partial, truncated input sequence left
*/
/* inject an error and continue with callback handling */
cr = CoderResult.malformedForLength(toULength);
calledCallback=false; /* new error condition */
} else {
/* input consumed */
if(flush) {
/*
* return to the conversion loop once more if the flush
* flag is set and the conversion function has not
* successfully processed the end of the input yet
*
* (continue converting by breaking out of only the inner loop)
*/
if(!converterSawEndOfInput) {
break;
}
/* reset the converter without calling the callback function */
implReset();
}
/* done successfully */
return cr;
}
}
/* U_FAILURE(*err) */
{
if( calledCallback || cr.isOverflow() ||
(cr.isMalformed() && cr.isUnmappable())
) {
/*
* the callback did not or cannot resolve the error:
* set output pointers and return
*
* the check for buffer overflow is redundant but it is
* a high-runner case and hopefully documents the intent
* well
*
* if we were replaying, then the replay buffer must be
* copied back into the UConverter
* and the real arguments must be restored
*/
if(realSource!=null) {
int length;
Assert.assrt(preToULength==0);
length = source.limit() - source.position();
if(length>0) {
//UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
source.get(preToUArray, preToUBegin, length);
preToULength=(byte)-length;
}
}
return cr;
}
}
/* copy toUBytes[] to invalidCharBuffer[] */
errorInputLength=invalidCharLength=toULength;
if(errorInputLength>0) {
copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
}
/* set the converter state to deal with the next character */
toULength=0;
/* call the callback function */
cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);
/*
* loop back to the offset handling
*
* this flag will indicate after offset handling
* that a callback was called;
* if the callback did not resolve the error, then we return
*/
calledCallback=true;
}
}
}
/*
 * Returns the number of input units held in the converter's internal state
 * because more input is needed for completing the conversion. This function is
 * useful for mapping semantics of ICU's converter interface to those of iconv,
 * and this information is not needed for normal conversion.
 * The partial-match count (preToULength, whose sign only marks replay) takes
 * precedence over the pending-byte count (toULength).
 * @return The number of units in the state, or 0 if nothing is pending.
 */
/*public*/ int toUCountPending() {
    if (preToULength != 0) {
        // negative values mark a replay; the magnitude is the count
        return preToULength < 0 ? -preToULength : preToULength;
    }
    return toULength > 0 ? toULength : 0;
}
private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
for(int i=srcOffset; i0 && target.hasRemaining()) {
target.put(ucharsArray[ucharsBegin++]);
--length;
}
} else {
/* output with offsets */
while(length>0 && target.hasRemaining()) {
target.put(ucharsArray[ucharsBegin++]);
offsets.put(sourceIndex);
--length;
}
}
/* write overflow */
if(length>0) {
cnv.charErrorBufferLength= 0;
cr = CoderResult.OVERFLOW;
do {
cnv.charErrorBufferArray[cnv.charErrorBufferLength++]=ucharsArray[ucharsBegin++];
} while(--length>0);
}
return cr;
}
/*
* This function will write out the Unicode substitution character to the
* target character buffer.
* Sub classes to override this method if required
* @param decoder
* @param source
* @param target
* @param offsets
* @return A CoderResult object that contains the error result when an error occurs.
*/
/* Note: Currently, this method is not being used because the callback method calls toUWriteUChars with
* the substitution characters. Will leave in here for the time being. To be removed later. (4.0)
*/
/*CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
ByteBuffer source, CharBuffer target,
IntBuffer offsets){
String sub = decoder.replacement();
CharsetICU cs = (CharsetICU) decoder.charset();
if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
char[] subArr = new char[] { 0x1a };
return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub
.length(), target, offsets, source.position());
} else {
return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(),
0, sub.length(), target, offsets, source.position());
}
}*/
/**
 * Returns the maxBytesPerChar value of the Charset that created this decoder.
 * The charset is expected to be a CharsetICU; the value is read from its
 * maxBytesPerChar field.
 * @return maxBytesPerChar
 * @stable ICU 4.8
 */
public final float maxBytesPerChar() {
    CharsetICU owner = (CharsetICU) charset();
    return owner.maxBytesPerChar;
}
}