java.nio.charset.CharsetDecoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jtransc-rt Show documentation
JVM AOT compiler currently generating JavaScript, C++, Haxe, with initial focus on Kotlin and games.
The newest version!
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package java.nio.charset;

import java.nio.BufferOverflowException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;

/**
 * A converter that can convert a byte sequence from a charset into a 16-bit
 * Unicode character sequence.
 * <p>
 * The input byte sequence is wrapped by a
 * {@link java.nio.ByteBuffer ByteBuffer} and the output character sequence is a
 * {@link java.nio.CharBuffer CharBuffer}. A decoder instance should be used in
 * the following sequence, which is referred to as a decoding operation:
 * <ol>
 * <li>invoking the {@link #reset() reset} method to reset the decoder if the
 * decoder has been used;</li>
 * <li>invoking the {@link #decode(ByteBuffer, CharBuffer, boolean) decode}
 * method until no additional input is needed; the {@code endOfInput}
 * parameter must be set to false, the input buffer must be filled and the
 * output buffer must be flushed between invocations;</li>
 * <li>invoking the {@link #decode(ByteBuffer, CharBuffer, boolean) decode}
 * method for the last time, with the {@code endOfInput} parameter
 * set to true;</li>
 * <li>invoking the {@link #flush(CharBuffer) flush} method to flush the
 * output.</li>
 * </ol>
 * <p>
 * The {@link #decode(ByteBuffer, CharBuffer, boolean) decode} method will
 * convert as many bytes as possible, and the process won't stop until the input
 * bytes have run out, the output buffer has been filled or some error has
 * happened. A {@link CoderResult CoderResult} instance will be returned to
 * indicate the stop reason, and the invoker can identify the result and choose
 * further action, which includes filling the input buffer, flushing the output
 * buffer or recovering from an error and trying again.
 * <p>
 * There are two common decoding errors. One is named malformed and it is
 * returned when the input byte sequence is illegal for the current specific
 * charset, the other is named unmappable character and it is returned when a
 * problem occurs mapping a legal input byte sequence to its Unicode character
 * equivalent.
 * <p>
 * Both errors can be handled in three ways, the default one is to report the
 * error to the invoker by a {@link CoderResult CoderResult} instance, and the
 * alternatives are to ignore it or to replace the erroneous input with the
 * replacement string. The replacement string is "\uFFFD" by default and can be
 * changed by invoking {@link #replaceWith(String) replaceWith} method. The
 * invoker of this decoder can choose one way by specifying a
 * {@link CodingErrorAction CodingErrorAction} instance for each error type via
 * {@link #onMalformedInput(CodingErrorAction) onMalformedInput} method and
 * {@link #onUnmappableCharacter(CodingErrorAction) onUnmappableCharacter}
 * method.
 * <p>
 * This is an abstract class and encapsulates many common operations of the
 * decoding process for all charsets. Decoders for a specific charset should
 * extend this class and need only to implement the
 * {@link #decodeLoop(ByteBuffer, CharBuffer) decodeLoop} method for the basic
 * decoding. If a subclass maintains an internal state, it should override the
 * {@link #implFlush(CharBuffer) implFlush} method and the
 * {@link #implReset() implReset} method in addition.
 * <p>
 * This class is not thread-safe.
 *
 * @see java.nio.charset.Charset
 * @see java.nio.charset.CharsetEncoder
 */
public abstract class CharsetDecoder {
	// States of a decoding operation, held in the 'state' field.
	// RESET: the initial value, and the value assigned by reset().
	private static final int RESET = 0;
	// ONGOING: assigned by decode(in, out, false).
	private static final int ONGOING = 1;
	// END_OF_INPUT: assigned by decode(in, out, true); flush() is accepted from this state.
	private static final int END_OF_INPUT = 2;
	// FLUSHED: assigned by flush() once implFlush has returned CoderResult.UNDERFLOW.
	private static final int FLUSHED = 3;

	// The charset handed to the constructor; returned by charset().
	private final Charset charset;

	// Ratios handed to the constructor, which checks that average <= max.
	private final float averageCharsPerByte;
	private final float maxCharsPerByte;

	// Text written to the output in place of erroneous input when the action is REPLACE.
	private String replacementChars = "\ufffd";

	// Current state; one of RESET, ONGOING, END_OF_INPUT or FLUSHED.
	private int state = RESET;

	// Error policies; both start out as REPORT and are changed through
	// onMalformedInput / onUnmappableCharacter.
	private CodingErrorAction malformedInputAction = CodingErrorAction.REPORT;
	private CodingErrorAction unmappableCharacterAction = CodingErrorAction.REPORT;

	/**
	 * Constructs a new CharsetDecoder using the given
	 * Charset, average number and maximum number of characters
	 * created by this decoder for one input byte, and the default replacement
	 * string "\uFFFD".
	 *
	 * @param charset
	 *            the Charset to be used by this decoder.
	 * @param averageCharsPerByte
	 *            the average number of characters created by this decoder for
	 *            one input byte, must be positive.
	 * @param maxCharsPerByte
	 *            the maximum number of characters created by this decoder for
	 *            one input byte, must be positive.
	 * @throws IllegalArgumentException
	 *     if either ratio is NaN, or if
	 *     {@code averageCharsPerByte <= 0 || maxCharsPerByte <= 0 || averageCharsPerByte > maxCharsPerByte}.
	 */
	protected CharsetDecoder(Charset charset, float averageCharsPerByte, float maxCharsPerByte) {
		// Written as !(x > 0) rather than (x <= 0): every comparison with NaN is
		// false, so the "<= 0" form would silently accept a NaN ratio.
		if (!(averageCharsPerByte > 0) || !(maxCharsPerByte > 0)) {
			throw new IllegalArgumentException("averageCharsPerByte and maxCharsPerByte must be positive");
		}
		if (averageCharsPerByte > maxCharsPerByte) {
			throw new IllegalArgumentException("averageCharsPerByte is greater than maxCharsPerByte");
		}
		this.averageCharsPerByte = averageCharsPerByte;
		this.maxCharsPerByte = maxCharsPerByte;
		this.charset = charset;
	}

	/**
	 * Returns the average number of characters created by this decoder for a
	 * single input byte.
	 *
	 * @return the {@code averageCharsPerByte} value given to the constructor.
	 */
	public final float averageCharsPerByte() {
		return averageCharsPerByte;
	}

	/**
	 * Returns the {@link Charset} which this decoder uses.
	 *
	 * @return the charset given to the constructor.
	 */
	public final Charset charset() {
		return charset;
	}

	/**
	 * Convenience method that performs one complete decoding operation.
	 * <p>
	 * The decoder is reset first, then the remaining bytes of the given byte
	 * buffer are decoded with end-of-input signalled, and finally the decoder
	 * is flushed. The characters are collected in a newly allocated buffer
	 * which is enlarged whenever it runs out of space.
	 * <p>
	 * Because the decoder is reset at the start, this method should not be
	 * invoked while another {@code decode} operation is ongoing.
	 *
	 * @param in
	 *            the input buffer.
	 * @return a new CharBuffer containing the characters produced by this
	 *         decoding operation. The buffer has been flipped: its position
	 *         is zero and its limit marks the end of the decoded characters.
	 * @throws MalformedInputException
	 *             if an illegal input byte sequence for this charset was
	 *             encountered, and the action for malformed error is
	 *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}
	 * @throws UnmappableCharacterException
	 *             if a legal but unmappable input byte sequence for this
	 *             charset was encountered, and the action for unmappable
	 *             character error is
	 *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.
	 *             Unmappable means the byte sequence at the input buffer's
	 *             current position cannot be mapped to a Unicode character
	 *             sequence.
	 * @throws CharacterCodingException
	 *             if another exception happened during the decode operation.
	 */
	public final CharBuffer decode(ByteBuffer in) throws CharacterCodingException {
		// First guess at the output size; the buffer grows on demand below.
		int estimatedChars = (int) (in.remaining() * averageCharsPerByte);
		CharBuffer out = CharBuffer.allocate(estimatedChars);

		reset();

		while (state != FLUSHED) {
			CoderResult decoded = decode(in, out, true);
			if (decoded == CoderResult.OVERFLOW) {
				// Flushing into a full buffer is pointless: enlarge it and decode again.
				out = allocateMore(out);
				continue;
			}
			checkCoderResult(decoded);

			CoderResult flushed = flush(out);
			if (flushed == CoderResult.OVERFLOW) {
				// The state is not FLUSHED yet, so the loop runs another round.
				out = allocateMore(out);
				continue;
			}
			checkCoderResult(flushed);
		}

		out.flip();
		return out;
	}

	/*
	 * Converts an error result into the matching CharacterCodingException,
	 * but only when the configured action for that kind of error is REPORT.
	 * Any other result passes through without an exception.
	 */
	private void checkCoderResult(CoderResult result) throws CharacterCodingException {
		boolean reportMalformed =
			result.isMalformed() && malformedInputAction == CodingErrorAction.REPORT;
		if (reportMalformed) {
			throw new MalformedInputException(result.length());
		}

		boolean reportUnmappable =
			result.isUnmappable() && unmappableCharacterAction == CodingErrorAction.REPORT;
		if (reportUnmappable) {
			throw new UnmappableCharacterException(result.length());
		}
	}

	/*
	 * Returns a larger replacement for an output buffer that has run out of
	 * space. A zero-capacity buffer is replaced by a buffer of capacity one;
	 * otherwise the capacity is doubled and the characters written so far are
	 * copied into the new buffer.
	 */
	private CharBuffer allocateMore(CharBuffer output) {
		int oldCapacity = output.capacity();
		if (oldCapacity != 0) {
			CharBuffer grown = CharBuffer.allocate(oldCapacity * 2);
			// Flip so that the written characters become the readable region, then copy them.
			output.flip();
			grown.put(output);
			return grown;
		}
		// Doubling zero would get nowhere, and there is nothing to copy.
		return CharBuffer.allocate(1);
	}

	/**
	 * Decodes bytes starting at the current position of the given input buffer,
	 * and writes the equivalent character sequence into the given output buffer
	 * from its current position.
	 * <p>
	 * The buffers' positions advance as bytes are read and characters are
	 * written, but their limits and marks are kept intact.
	 * <p>
	 * The returned CoderResult tells the invoker why decoding stopped:
	 * <ul>
	 * <li>{@link CoderResult#OVERFLOW CoderResult.OVERFLOW} indicates that
	 * the output buffer has no room left even though not all of the input has
	 * been processed. This method should be called once more with an
	 * {@code out} argument that has space available.</li>
	 * <li>{@link CoderResult#UNDERFLOW CoderResult.UNDERFLOW} indicates that
	 * as many bytes as possible in the input buffer have been decoded. If there
	 * is no further input and no remaining bytes in the input buffer then this
	 * operation may be regarded as complete. Otherwise, this method should be
	 * called once more with additional input.</li>
	 * <li>A {@link CoderResult#malformedForLength(int) malformed input} result
	 * indicates that malformed input has been encountered. The erroneous bytes
	 * start at the input buffer's position and their number is given by the
	 * result's {@link CoderResult#length() length}. This kind of result is
	 * returned only if the malformed action is
	 * {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.</li>
	 * <li>An {@link CoderResult#unmappableForLength(int) unmappable character}
	 * result indicates that an unmappable character has been encountered. The
	 * erroneous bytes start at the input buffer's position and their number is
	 * given by the result's {@link CoderResult#length() length}. This kind of
	 * result is returned only if the unmappable character action is
	 * {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.</li>
	 * </ul>
	 * <p>
	 * When the action for an error is not REPORT, the erroneous bytes are
	 * skipped and decoding continues; if the action is
	 * {@link CodingErrorAction#REPLACE CodingErrorAction.REPLACE} the
	 * replacement string is written to the output first.
	 * <p>
	 * The {@code endOfInput} parameter indicates that the invoker cannot
	 * provide further input. It should be true if and only if the bytes in the
	 * current input buffer are all the input for this decoding operation. When
	 * it is true, any bytes left in the input buffer after the charset-specific
	 * decoding reports underflow are treated as malformed input. Passing false
	 * and then being unable to provide more input is not an error in itself.
	 * <p>
	 * This method invokes the
	 * {@link #decodeLoop(ByteBuffer, CharBuffer) decodeLoop} method to
	 * implement the basic decode logic for a specific charset.
	 *
	 * @param in
	 *            the input buffer.
	 * @param out
	 *            the output buffer.
	 * @param endOfInput
	 *            true if all the input bytes have been provided.
	 * @return a CoderResult instance which indicates the reason
	 *         of termination.
	 * @throws IllegalStateException
	 *             if this decoder has already been flushed, or if end of input
	 *             was signalled earlier and {@code endOfInput} is now false.
	 * @throws CoderMalfunctionError
	 *             if the {@link #decodeLoop(ByteBuffer, CharBuffer) decodeLoop}
	 *             method threw a BufferUnderflowException or
	 *             BufferOverflowException.
	 */
	public final CoderResult decode(ByteBuffer in, CharBuffer out, boolean endOfInput) {
		// Decoding may start or continue from RESET or ONGOING; once end of input
		// has been signalled only further end-of-input calls are accepted.
		boolean mayDecode = state == RESET
			|| state == ONGOING
			|| (endOfInput && state == END_OF_INPUT);
		if (!mayDecode) {
			throw illegalStateException();
		}

		if (endOfInput) {
			state = END_OF_INPUT;
		} else {
			state = ONGOING;
		}

		for (;;) {
			CoderResult outcome;
			try {
				outcome = decodeLoop(in, out);
			} catch (BufferUnderflowException ex) {
				throw new CoderMalfunctionError(ex);
			} catch (BufferOverflowException ex) {
				throw new CoderMalfunctionError(ex);
			}

			if (outcome == CoderResult.OVERFLOW) {
				return outcome;
			}
			if (outcome == CoderResult.UNDERFLOW) {
				if (!endOfInput || !in.hasRemaining()) {
					return outcome;
				}
				// No more input will arrive, so the leftover bytes can never be
				// completed: report all of them as one malformed sequence.
				outcome = CoderResult.malformedForLength(in.remaining());
			}

			// A genuine error: apply the action configured for its kind.
			CodingErrorAction action;
			if (outcome.isUnmappable()) {
				action = unmappableCharacterAction;
			} else {
				action = malformedInputAction;
			}

			if (action == CodingErrorAction.REPORT) {
				return outcome;
			}
			if (action == CodingErrorAction.REPLACE) {
				if (out.remaining() < replacementChars.length()) {
					// The input position is untouched, so the error is seen again on retry.
					return CoderResult.OVERFLOW;
				}
				out.put(replacementChars);
			}
			// Skip the erroneous bytes and carry on decoding.
			in.position(in.position() + outcome.length());
		}
	}

	/**
	 * Decodes bytes into characters. This method is called by the
	 * {@link #decode(ByteBuffer, CharBuffer, boolean) decode} method.
	 * <p>
	 * This method will implement the essential decoding operation, and it won't
	 * stop decoding until either all the input bytes are read, the output
	 * buffer is filled, or some exception is encountered. Then it will return a
	 * CoderResult object indicating the result of current
	 * decoding operation. The rules to construct the CoderResult
	 * are the same as for
	 * {@link #decode(ByteBuffer, CharBuffer, boolean) decode}. When an
	 * exception is encountered in the decoding operation, most implementations
	 * of this method will return a relevant result object to the
	 * {@link #decode(ByteBuffer, CharBuffer, boolean) decode} method, and some
	 * performance optimized implementation may handle the exception and
	 * implement the error action itself.
	 * <p>
	 * The buffers are scanned from their current positions, and their positions
	 * will be modified accordingly, while their marks and limits will be
	 * intact. At most {@link ByteBuffer#remaining() in.remaining()} bytes
	 * will be read, and {@link CharBuffer#remaining() out.remaining()}
	 * characters will be written.
	 * <p>
	 * Note that some implementations may pre-scan the input buffer and return a
	 * CoderResult.UNDERFLOW until it receives sufficient input.
	 * <p>
	 * A BufferUnderflowException or BufferOverflowException thrown from this
	 * method is wrapped in a CoderMalfunctionError by
	 * {@link #decode(ByteBuffer, CharBuffer, boolean) decode}.
	 *
	 * @param in
	 *            the input buffer.
	 * @param out
	 *            the output buffer.
	 * @return a CoderResult instance indicating the result.
	 */
	protected abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out);

	/**
	 * Gets the charset detected by this decoder; this method is optional.
	 * <p>
	 * If implementing an auto-detecting charset, then this decoder returns the
	 * detected charset from this method when it is available. The returned
	 * charset will be the same for the rest of the decode operation.
	 * <p>
	 * If insufficient bytes have been read to determine the charset, an
	 * IllegalStateException will be thrown.
	 * <p>
	 * The default implementation always throws
	 * UnsupportedOperationException, so it should be overridden
	 * by a subclass if needed.
	 *
	 * @return the charset detected by this decoder, or null if it is not yet
	 *         determined.
	 * @throws UnsupportedOperationException
	 *             if this decoder does not implement an auto-detecting charset.
	 * @throws IllegalStateException
	 *             if insufficient bytes have been read to determine the
	 *             charset.
	 */
	public Charset detectedCharset() {
		// Default: auto-detection is not supported.
		throw new UnsupportedOperationException();
	}

	/**
	 * Flushes this decoder.
	 * <p>
	 * This method delegates to {@link #implFlush(CharBuffer) implFlush}. Some
	 * decoders may need to write some characters to the output buffer when they
	 * have read all input bytes; subclasses can override
	 * {@link #implFlush(CharBuffer) implFlush} to perform the writing operation.
	 * <p>
	 * The number of characters written won't be larger than
	 * {@link CharBuffer#remaining() out.remaining()}. If a decoder wants to
	 * write more characters than the output buffer's remaining space allows,
	 * CoderResult.OVERFLOW is returned and this method must be called again
	 * with a character buffer that has more remaining space. Otherwise this
	 * method returns CoderResult.UNDERFLOW, which means the decoding operation
	 * has been completed successfully.
	 * <p>
	 * During the flush, the output buffer's position will be changed
	 * accordingly, while its mark and limit will be intact.
	 *
	 * @param out
	 *            the given output buffer.
	 * @return CoderResult.UNDERFLOW or
	 *         CoderResult.OVERFLOW.
	 * @throws IllegalStateException
	 *             if this decoder is neither at end of input nor already
	 *             flushed.
	 */
	public final CoderResult flush(CharBuffer out) {
		boolean mayFlush = state == END_OF_INPUT || state == FLUSHED;
		if (!mayFlush) {
			throw illegalStateException();
		}

		CoderResult outcome = implFlush(out);
		// Only a completed flush changes the state; after OVERFLOW the caller retries.
		if (outcome == CoderResult.UNDERFLOW) {
			state = FLUSHED;
		}
		return outcome;
	}

	/**
	 * Flushes this decoder. The default implementation does nothing and always
	 * returns CoderResult.UNDERFLOW; this method can be
	 * overridden if needed.
	 * <p>
	 * Called by {@link #flush(CharBuffer) flush}, which marks the decoder as
	 * flushed only when this method returns CoderResult.UNDERFLOW.
	 *
	 * @param out
	 *            the output buffer.
	 * @return CoderResult.UNDERFLOW or
	 *         CoderResult.OVERFLOW.
	 */
	protected CoderResult implFlush(CharBuffer out) {
		return CoderResult.UNDERFLOW;
	}

	/**
	 * Notifies that this decoder's CodingErrorAction specified
	 * for malformed input error has been changed. The default implementation
	 * does nothing; this method can be overridden if needed.
	 * <p>
	 * Called by {@link #onMalformedInput(CodingErrorAction) onMalformedInput}
	 * after the new action has been stored.
	 *
	 * @param newAction
	 *            the new action.
	 */
	protected void implOnMalformedInput(CodingErrorAction newAction) {
		// default implementation is empty
	}

	/**
	 * Notifies that this decoder's CodingErrorAction specified
	 * for unmappable character error has been changed. The default
	 * implementation does nothing; this method can be overridden if needed.
	 * <p>
	 * Called by
	 * {@link #onUnmappableCharacter(CodingErrorAction) onUnmappableCharacter}
	 * after the new action has been stored.
	 *
	 * @param newAction
	 *            the new action.
	 */
	protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
		// default implementation is empty
	}

	/**
	 * Notifies that this decoder's replacement has been changed. The default
	 * implementation does nothing; this method can be overridden if needed.
	 * <p>
	 * Called by {@link #replaceWith(String) replaceWith} after the new
	 * replacement has been validated and stored.
	 *
	 * @param newReplacement
	 *            the new replacement string.
	 */
	protected void implReplaceWith(String newReplacement) {
		// default implementation is empty
	}

	/**
	 * Reset this decoder's charset related state. The default implementation
	 * does nothing; this method can be overridden if needed.
	 * <p>
	 * Called by {@link #reset() reset} after the decoder's own state has been
	 * reset.
	 */
	protected void implReset() {
		// default implementation is empty
	}

	/**
	 * Indicates whether this decoder implements an auto-detecting charset.
	 * The default implementation always returns false.
	 *
	 * @return true if this decoder implements an auto-detecting
	 *         charset.
	 */
	public boolean isAutoDetecting() {
		return false;
	}

	/**
	 * Indicates whether this decoder has detected a charset; this method is
	 * optional.
	 * <p>
	 * If this decoder implements an auto-detecting charset, then this method
	 * may start to return true during decoding operation to indicate that a
	 * charset has been detected in the input bytes and that the charset can be
	 * retrieved by invoking the {@link #detectedCharset() detectedCharset}
	 * method.
	 * <p>
	 * Note that a decoder that implements an auto-detecting charset may still
	 * succeed in decoding a portion of the given input even when it is unable
	 * to detect the charset. For this reason users should be aware that a
	 * false return value does not indicate that no decoding took
	 * place.
	 * <p>
	 * The default implementation always throws an
	 * UnsupportedOperationException; it should be overridden by
	 * a subclass if needed.
	 *
	 * @return true if this decoder has detected a charset.
	 * @throws UnsupportedOperationException
	 *             if this decoder doesn't implement an auto-detecting charset.
	 */
	public boolean isCharsetDetected() {
		// Default: auto-detection is not supported.
		throw new UnsupportedOperationException();
	}

	/**
	 * Returns this decoder's CodingErrorAction when malformed input
	 * occurred during the decoding process.
	 *
	 * @return the current action for malformed input; it is
	 *         {@link CodingErrorAction#REPORT CodingErrorAction.REPORT} until
	 *         changed via {@link #onMalformedInput(CodingErrorAction) onMalformedInput}.
	 */
	public CodingErrorAction malformedInputAction() {
		return malformedInputAction;
	}

	/**
	 * Returns the maximum number of characters which can be created by this
	 * decoder for one input byte.
	 *
	 * @return the {@code maxCharsPerByte} value given to the constructor.
	 */
	public final float maxCharsPerByte() {
		return maxCharsPerByte;
	}

	/**
	 * Changes the action this decoder takes on malformed input errors.
	 * <p>
	 * Once the new action has been stored, the
	 * {@link #implOnMalformedInput(CodingErrorAction) implOnMalformedInput}
	 * method is called with it as argument.
	 *
	 * @param newAction
	 *            the new action on malformed input error.
	 * @return this decoder.
	 * @throws IllegalArgumentException
	 *             if {@code newAction == null}.
	 */
	public final CharsetDecoder onMalformedInput(CodingErrorAction newAction) {
		if (newAction != null) {
			this.malformedInputAction = newAction;
			implOnMalformedInput(newAction);
			return this;
		}
		throw new IllegalArgumentException("newAction == null");
	}

	/**
	 * Changes the action this decoder takes on unmappable character errors.
	 * <p>
	 * Once the new action has been stored, the
	 * {@link #implOnUnmappableCharacter(CodingErrorAction) implOnUnmappableCharacter}
	 * method is called with it as argument.
	 *
	 * @param newAction
	 *            the new action on unmappable character error.
	 * @return this decoder.
	 * @throws IllegalArgumentException
	 *             if {@code newAction == null}.
	 */
	public final CharsetDecoder onUnmappableCharacter(CodingErrorAction newAction) {
		if (newAction != null) {
			this.unmappableCharacterAction = newAction;
			implOnUnmappableCharacter(newAction);
			return this;
		}
		throw new IllegalArgumentException("newAction == null");
	}

	/**
	 * Returns the replacement string, which is never null or empty.
	 *
	 * @return the current replacement string; "\uFFFD" until changed via
	 *         {@link #replaceWith(String) replaceWith}.
	 */
	public final String replacement() {
		return replacementChars;
	}

	/**
	 * Sets the new replacement string.
	 * <p>
	 * The given replacement is validated first; if it is acceptable it becomes
	 * the current replacement and the
	 * {@link #implReplaceWith(String) implReplaceWith} method is then called
	 * with it as argument.
	 *
	 * @param replacement
	 *            the replacement string; it cannot be null, empty, or longer
	 *            than {@link #maxCharsPerByte()}.
	 * @return this decoder.
	 * @throws IllegalArgumentException
	 *             if the given replacement cannot satisfy the requirement
	 *             mentioned above.
	 */
	public final CharsetDecoder replaceWith(String replacement) {
		if (replacement == null) {
			throw new IllegalArgumentException("replacement == null");
		}

		final int length = replacement.length();
		if (length == 0) {
			throw new IllegalArgumentException("replacement.isEmpty()");
		}

		final float limit = maxCharsPerByte();
		if (length > limit) {
			throw new IllegalArgumentException(
				"replacement length > maxCharsPerByte: " + length + " > " + limit);
		}

		this.replacementChars = replacement;
		implReplaceWith(replacement);
		return this;
	}

	/**
	 * Resets this decoder. This method will reset the internal state, and then
	 * calls {@link #implReset} to reset any state related to the
	 * specific charset.
	 *
	 * @return this decoder.
	 */
	public final CharsetDecoder reset() {
		// The generic state goes back to RESET before the charset-specific hook runs.
		state = RESET;
		implReset();
		return this;
	}

	/**
	 * Returns this decoder's CodingErrorAction when an unmappable
	 * character error occurred during the decoding process.
	 *
	 * @return the current action for unmappable characters; it is
	 *         {@link CodingErrorAction#REPORT CodingErrorAction.REPORT} until
	 *         changed via
	 *         {@link #onUnmappableCharacter(CodingErrorAction) onUnmappableCharacter}.
	 */
	public CodingErrorAction unmappableCharacterAction() {
		return unmappableCharacterAction;
	}

	/**
	 * Creates the exception that reports an operation invoked in the wrong
	 * state.
	 * <p>
	 * The exception is returned rather than thrown here, so that the
	 * {@code throw illegalStateException();} statements at the call sites are
	 * what actually raises it, matching this method's declared return type.
	 *
	 * @return a new IllegalStateException whose message contains the current
	 *         value of the state field.
	 */
	private IllegalStateException illegalStateException() {
		return new IllegalStateException("State: " + state);
	}
}