All Downloads are FREE. Search and download functionalities are using the official Maven repository.

java.nio.charset.CharsetEncoder Maven / Gradle / Ivy

/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package java.nio.charset;

import java.nio.BufferOverflowException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.Arrays;

/**
 * Transforms a sequence of 16-bit Java characters to a byte sequence in some encoding.
 *
 * <p>The input character sequence is a {@link java.nio.CharBuffer CharBuffer} and the
 * output byte sequence is a {@link java.nio.ByteBuffer ByteBuffer}.
 *
 * <p>Use {@link #encode(CharBuffer)} to encode an entire {@code CharBuffer} to a
 * new {@code ByteBuffer}, or {@link #encode(CharBuffer, ByteBuffer, boolean)} for more
 * control. When using the latter method, the entire operation proceeds as follows:
 * <ol>
 * <li>Invoke {@link #reset()} to reset the encoder if this instance has been used before.</li>
 * <li>Invoke {@link #encode(CharBuffer, ByteBuffer, boolean) encode} with the {@code endOfInput}
 * parameter set to false until additional input is not needed (as signaled by the return value).
 * The input buffer must be filled and the output buffer must be flushed between invocations.
 * <p>The {@link #encode(CharBuffer, ByteBuffer, boolean) encode} method will
 * convert as many characters as possible, and the process won't stop until the
 * input buffer has been exhausted, the output buffer has been filled, or an
 * error has occurred. A {@link CoderResult CoderResult} instance will be
 * returned to indicate the current state. The caller should fill the input buffer, flush
 * the output buffer, or recover from an error and try again, accordingly.
 * </li>
 * <li>Invoke {@link #encode(CharBuffer, ByteBuffer, boolean) encode} for the last time with
 * {@code endOfInput} set to true.</li>
 * <li>Invoke {@link #flush(ByteBuffer)} to flush remaining output.</li>
 * </ol>
 *
 * <p>There are two classes of encoding error: <i>malformed input</i>
 * signifies that the input character sequence is not legal, while <i>unmappable character</i>
 * signifies that the input is legal but cannot be mapped to a byte sequence (because the charset
 * cannot represent the character, for example).
 *
 * <p>Errors can be handled in three ways. The default is to
 * {@link CodingErrorAction#REPORT report} the error to the caller. The alternatives are to
 * {@link CodingErrorAction#IGNORE ignore} the error or {@link CodingErrorAction#REPLACE replace}
 * the problematic input with the byte sequence returned by {@link #replacement}. The disposition
 * for each of the two kinds of error can be set independently using the {@link #onMalformedInput}
 * and {@link #onUnmappableCharacter} methods.
 *
 * <p>The default replacement bytes depend on the charset but can be overridden using the
 * {@link #replaceWith} method.
 *
 * <p>This class is abstract and encapsulates many common operations of the
 * encoding process for all charsets. Encoders for a specific charset should
 * extend this class and need only to implement the
 * {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop} method for basic
 * encoding. If a subclass maintains internal state, it should also override the
 * {@link #implFlush(ByteBuffer) implFlush} and {@link #implReset() implReset} methods.
 *
 * <p>This class is not thread-safe.
 *
 * @see java.nio.charset.Charset
 * @see java.nio.charset.CharsetDecoder
 */
public abstract class CharsetEncoder {
    /**
     * Life-cycle states of an encoding operation. The constant names are part of the
     * {@code IllegalStateException} message ("State: NAME"), so they must not be renamed.
     */
    private enum State {
        RESET,
        ONGOING,
        END_OF_INPUT,
        FLUSHED
    }

    private final Charset charset;

    private final float averageBytesPerChar;
    private final float maxBytesPerChar;

    private byte[] replacementBytes;

    private State state = State.RESET;

    private CodingErrorAction malformedInputAction = CodingErrorAction.REPORT;
    private CodingErrorAction unmappableCharacterAction = CodingErrorAction.REPORT;

    // decoder instance for this encoder's charset, used for replacement value checking
    private CharsetDecoder decoder;

    /**
     * Constructs a new {@code CharsetEncoder} using the given parameters and
     * the replacement byte array {@code { (byte) '?' }}.
     */
    protected CharsetEncoder(Charset cs, float averageBytesPerChar, float maxBytesPerChar) {
        this(cs, averageBytesPerChar, maxBytesPerChar, new byte[] { (byte) '?' });
    }

    /**
     * Constructs a new {@code CharsetEncoder} using the given
     * {@code Charset}, replacement byte array, average number and
     * maximum number of bytes created by this encoder for one input character.
     *
     * @param cs
     *            the {@code Charset} to be used by this encoder.
     * @param averageBytesPerChar
     *            average number of bytes created by this encoder for one single
     *            input character, must be positive.
     * @param maxBytesPerChar
     *            maximum number of bytes which can be created by this encoder
     *            for one single input character, must be positive.
     * @param replacement
     *            the replacement byte array, cannot be null or empty, its
     *            length cannot be larger than {@code maxBytesPerChar},
     *            and must be a legal replacement, which can be justified by
     *            {@link #isLegalReplacement(byte[]) isLegalReplacement}.
     * @throws IllegalArgumentException
     *             if any parameters are invalid.
     */
    protected CharsetEncoder(Charset cs, float averageBytesPerChar, float maxBytesPerChar,
            byte[] replacement) {
        this(cs, averageBytesPerChar, maxBytesPerChar, replacement, false);
    }

    /**
     * Package-private constructor that can skip replacement validation.
     *
     * @param trusted
     *            if true, {@code replacement} is stored as-is without going through
     *            {@link #replaceWith(byte[])}; otherwise it is validated by that method.
     * @throws IllegalArgumentException
     *             if either size is not positive, if {@code averageBytesPerChar} exceeds
     *             {@code maxBytesPerChar}, or if an untrusted replacement is rejected.
     */
    CharsetEncoder(Charset cs, float averageBytesPerChar, float maxBytesPerChar,
            byte[] replacement, boolean trusted) {
        if (averageBytesPerChar <= 0 || maxBytesPerChar <= 0) {
            throw new IllegalArgumentException(
                    "averageBytesPerChar and maxBytesPerChar must both be positive");
        }
        if (averageBytesPerChar > maxBytesPerChar) {
            throw new IllegalArgumentException(
                    "averageBytesPerChar is greater than maxBytesPerChar");
        }
        this.charset = cs;
        this.averageBytesPerChar = averageBytesPerChar;
        this.maxBytesPerChar = maxBytesPerChar;
        if (trusted) {
            // The RI enforces unnecessary restrictions on the replacement bytes. We trust ICU to
            // know what it's doing. Doing so lets us support ICU's EUC-JP, SCSU, and Shift_JIS.
            this.replacementBytes = replacement;
        } else {
            replaceWith(replacement);
        }
    }

    /**
     * Returns the average number of bytes created by this encoder for a single
     * input character.
     */
    public final float averageBytesPerChar() {
        return averageBytesPerChar;
    }

    /**
     * Tests whether the given character can be encoded by this encoder.
     *
     * <p>Note that this method may change the internal state of this encoder, so
     * it should not be called when another encoding process is ongoing,
     * otherwise it will throw an {@code IllegalStateException}.
     *
     * @throws IllegalStateException if another encode process is ongoing.
     */
    public boolean canEncode(char c) {
        return canEncode(CharBuffer.wrap(new char[] { c }));
    }

    /**
     * Tests whether the given {@code CharSequence} can be encoded by this
     * encoder.
     *
     * <p>Note that this method may change the internal state of this encoder, so
     * it should not be called when another encode process is ongoing, otherwise
     * it will throw an {@code IllegalStateException}.
     *
     * @throws IllegalStateException if another encode process is ongoing.
     */
    public boolean canEncode(CharSequence sequence) {
        CharBuffer cb;
        if (sequence instanceof CharBuffer) {
            // Work on a duplicate so the caller's position and limit are left alone.
            cb = ((CharBuffer) sequence).duplicate();
        } else {
            cb = CharBuffer.wrap(sequence);
        }

        if (state == State.FLUSHED) {
            reset();
        }
        if (state != State.RESET) {
            throw illegalStateException();
        }

        // Temporarily force REPORT for both error kinds so that any problem surfaces as an
        // exception; the caller's actions are restored in the finally block.
        CodingErrorAction originalMalformedInputAction = malformedInputAction;
        CodingErrorAction originalUnmappableCharacterAction = unmappableCharacterAction;
        onMalformedInput(CodingErrorAction.REPORT);
        onUnmappableCharacter(CodingErrorAction.REPORT);
        try {
            encode(cb);
            return true;
        } catch (CharacterCodingException e) {
            return false;
        } finally {
            onMalformedInput(originalMalformedInputAction);
            onUnmappableCharacter(originalUnmappableCharacterAction);
            reset();
        }
    }

    /**
     * Returns the {@link Charset} which this encoder uses.
     */
    public final Charset charset() {
        return charset;
    }

    /**
     * This is a facade method for the encoding operation.
     *
     * <p>This method encodes the remaining character sequence of the given
     * character buffer into a new byte buffer. This method performs a complete
     * encoding operation, resets at first, then encodes, and flushes at last.
     *
     * <p>This method should not be invoked if another encode operation is ongoing.
     *
     * @param in
     *            the input buffer.
     * @return a new {@code ByteBuffer} containing the bytes produced by
     *         this encoding operation. The buffer's limit will be the position
     *         of the last byte in the buffer, and the position will be zero.
     * @throws IllegalStateException
     *             if another encoding operation is ongoing.
     * @throws MalformedInputException
     *             if an illegal input character sequence for this charset is
     *             encountered, and the action for malformed error is
     *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}
     * @throws UnmappableCharacterException
     *             if a legal but unmappable input character sequence for this
     *             charset is encountered, and the action for unmappable
     *             character error is
     *             {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.
     *             Unmappable means the Unicode character sequence at the input
     *             buffer's current position cannot be mapped to an equivalent
     *             byte sequence.
     * @throws CharacterCodingException
     *             if other exception happened during the encode operation.
     */
    public final ByteBuffer encode(CharBuffer in) throws CharacterCodingException {
        // Initial size is only an estimate; the buffer grows on OVERFLOW below.
        int length = (int) (in.remaining() * averageBytesPerChar);
        ByteBuffer out = ByteBuffer.allocate(length);

        reset();

        while (state != State.FLUSHED) {
            CoderResult result = encode(in, out, true);
            if (result == CoderResult.OVERFLOW) {
                out = allocateMore(out);
                continue; // No point trying to flush to an already-full buffer.
            } else {
                checkCoderResult(result);
            }

            result = flush(out);
            if (result == CoderResult.OVERFLOW) {
                out = allocateMore(out);
            } else {
                checkCoderResult(result);
            }
        }

        out.flip();
        return out;
    }

    /**
     * Converts an error result into the matching exception when the corresponding
     * action is {@link CodingErrorAction#REPORT}; does nothing otherwise.
     */
    private void checkCoderResult(CoderResult result) throws CharacterCodingException {
        if (malformedInputAction == CodingErrorAction.REPORT && result.isMalformed()) {
            throw new MalformedInputException(result.length());
        } else if (unmappableCharacterAction == CodingErrorAction.REPORT
                && result.isUnmappable()) {
            throw new UnmappableCharacterException(result.length());
        }
    }

    /**
     * Returns a buffer with twice the capacity of {@code output} (or capacity 1 if
     * {@code output} has capacity 0) holding the bytes written to {@code output} so far.
     */
    private ByteBuffer allocateMore(ByteBuffer output) {
        if (output.capacity() == 0) {
            // Doubling zero would never grow; nothing has been written, so nothing to copy.
            return ByteBuffer.allocate(1);
        }
        ByteBuffer result = ByteBuffer.allocate(output.capacity() * 2);
        output.flip();
        result.put(output);
        return result;
    }

    /**
     * Encodes characters starting at the current position of the given input
     * buffer, and writes the equivalent byte sequence into the given output
     * buffer from its current position.
     *
     * <p>The buffers' position will be changed with the reading and writing
     * operation, but their limits and marks will be kept intact.
     *
     * <p>A {@code CoderResult} instance will be returned according to
     * following rules:
     * <ul>
     * <li>A {@link CoderResult#malformedForLength(int) malformed input} result
     * indicates that some malformed input error was encountered, and the
     * erroneous characters start at the input buffer's position and their
     * number can be got by result's {@link CoderResult#length() length}. This
     * kind of result can be returned only if the malformed action is
     * {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.</li>
     * <li>{@link CoderResult#UNDERFLOW CoderResult.UNDERFLOW} indicates that
     * as many characters as possible in the input buffer have been encoded. If
     * there is no further input and no characters left in the input buffer then
     * this task is complete. If this is not the case then the client should
     * call this method again supplying some more input characters.</li>
     * <li>{@link CoderResult#OVERFLOW CoderResult.OVERFLOW} indicates that the
     * output buffer has been filled, while there are still some characters
     * remaining in the input buffer. This method should be invoked again with a
     * non-full output buffer.</li>
     * <li>A {@link CoderResult#unmappableForLength(int) unmappable character}
     * result indicates that some unmappable character error was encountered,
     * and the erroneous characters start at the input buffer's position and
     * their number can be got by result's {@link CoderResult#length() length}.
     * This kind of result can be returned only on
     * {@link CodingErrorAction#REPORT CodingErrorAction.REPORT}.</li>
     * </ul>
     *
     * <p>The {@code endOfInput} parameter indicates if the invoker can
     * provide further input. This parameter is true if and only if the
     * characters in the current input buffer are all inputs for this encoding
     * operation. Note that it is common and won't cause an error if the invoker
     * sets false and then has no more input available, while it may cause an
     * error if the invoker always sets true in several consecutive invocations.
     * This would make the remaining input to be treated as malformed input.
     *
     * <p>This method invokes the
     * {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop} method to
     * implement the basic encode logic for a specific charset.
     *
     * @param in
     *            the input buffer.
     * @param out
     *            the output buffer.
     * @param endOfInput
     *            true if all the input characters have been provided.
     * @return a {@code CoderResult} instance indicating the result.
     * @throws IllegalStateException
     *             if the encoding operation has already started or no more
     *             input is needed in this encoding process.
     * @throws CoderMalfunctionError
     *             If the {@link #encodeLoop(CharBuffer, ByteBuffer) encodeLoop}
     *             method threw a {@code BufferOverflowException} or
     *             {@code BufferUnderflowException}.
     */
    public final CoderResult encode(CharBuffer in, ByteBuffer out, boolean endOfInput) {
        // Legal entry states: RESET, ONGOING, or END_OF_INPUT when the caller again
        // passes endOfInput == true.
        if (state != State.RESET && state != State.ONGOING
                && !(endOfInput && state == State.END_OF_INPUT)) {
            throw illegalStateException();
        }

        state = endOfInput ? State.END_OF_INPUT : State.ONGOING;

        while (true) {
            CoderResult result;
            try {
                result = encodeLoop(in, out);
            } catch (BufferOverflowException ex) {
                throw new CoderMalfunctionError(ex);
            } catch (BufferUnderflowException ex) {
                throw new CoderMalfunctionError(ex);
            }

            if (result == CoderResult.UNDERFLOW) {
                if (endOfInput && in.hasRemaining()) {
                    // No more input is coming, so whatever is left over is malformed.
                    result = CoderResult.malformedForLength(in.remaining());
                } else {
                    return result;
                }
            } else if (result == CoderResult.OVERFLOW) {
                return result;
            }

            // We have a real error, so do what the appropriate action tells us what to do...
            CodingErrorAction action =
                    result.isUnmappable() ? unmappableCharacterAction : malformedInputAction;
            if (action == CodingErrorAction.REPORT) {
                return result;
            } else if (action == CodingErrorAction.REPLACE) {
                if (out.remaining() < replacementBytes.length) {
                    return CoderResult.OVERFLOW;
                }
                out.put(replacementBytes);
            }
            // IGNORE and REPLACE both skip the offending characters and keep going.
            in.position(in.position() + result.length());
        }
    }

    /**
     * Encodes characters into bytes. This method is called by
     * {@link #encode(CharBuffer, ByteBuffer, boolean) encode}.
     *
     * <p>This method will implement the essential encoding operation, and it won't
     * stop encoding until either all the input characters are read, the output
     * buffer is filled, or some exception is encountered. Then it will
     * return a {@code CoderResult} object indicating the result of the
     * current encoding operation. The rule to construct the
     * {@code CoderResult} is the same as for
     * {@link #encode(CharBuffer, ByteBuffer, boolean) encode}. When an
     * exception is encountered in the encoding operation, most implementations
     * of this method will return a relevant result object to the
     * {@link #encode(CharBuffer, ByteBuffer, boolean) encode} method, and
     * subclasses may handle the exception and
     * implement the error action themselves.
     *
     * <p>The buffers are scanned from their current positions, and their positions
     * will be modified accordingly, while their marks and limits will be
     * intact. At most {@link CharBuffer#remaining() in.remaining()} characters
     * will be read, and {@link ByteBuffer#remaining() out.remaining()} bytes
     * will be written.
     *
     * <p>Note that some implementations may pre-scan the input buffer and return
     * {@code CoderResult.UNDERFLOW} until it receives sufficient input.
     *
     * @param in
     *            the input buffer.
     * @param out
     *            the output buffer.
     * @return a {@code CoderResult} instance indicating the result.
     */
    protected abstract CoderResult encodeLoop(CharBuffer in, ByteBuffer out);

    /**
     * Flushes this encoder.
     *
     * <p>This method will call {@link #implFlush(ByteBuffer) implFlush}. Some
     * encoders may need to write some bytes to the output buffer when they have
     * read all input characters, subclasses can override
     * {@link #implFlush(ByteBuffer) implFlush} to perform writing action.
     *
     * <p>The maximum number of written bytes won't be larger than
     * {@link ByteBuffer#remaining() out.remaining()}. If some encoder wants to
     * write more bytes than the output buffer's available remaining space, then
     * {@code CoderResult.OVERFLOW} will be returned, and this method
     * must be called again with a byte buffer that has free space. Otherwise
     * this method will return {@code CoderResult.UNDERFLOW}, which
     * means one encoding process has been completed successfully.
     *
     * <p>During the flush, the output buffer's position will be changed
     * accordingly, while its mark and limit will be intact.
     *
     * @param out
     *            the given output buffer.
     * @return {@code CoderResult.UNDERFLOW} or
     *         {@code CoderResult.OVERFLOW}.
     * @throws IllegalStateException
     *             if this encoder isn't already flushed or at end of input.
     */
    public final CoderResult flush(ByteBuffer out) {
        if (state != State.FLUSHED && state != State.END_OF_INPUT) {
            throw illegalStateException();
        }
        CoderResult result = implFlush(out);
        if (result == CoderResult.UNDERFLOW) {
            state = State.FLUSHED;
        }
        return result;
    }

    /**
     * Flushes this encoder. The default implementation does nothing and always
     * returns {@code CoderResult.UNDERFLOW}; this method can be
     * overridden if needed.
     *
     * @param out
     *            the output buffer.
     * @return {@code CoderResult.UNDERFLOW} or
     *         {@code CoderResult.OVERFLOW}.
     */
    protected CoderResult implFlush(ByteBuffer out) {
        return CoderResult.UNDERFLOW;
    }

    /**
     * Notifies that this encoder's {@code CodingErrorAction} specified
     * for malformed input error has been changed. The default implementation
     * does nothing; this method can be overridden if needed.
     *
     * @param newAction
     *            the new action.
     */
    protected void implOnMalformedInput(CodingErrorAction newAction) {
        // default implementation is empty
    }

    /**
     * Notifies that this encoder's {@code CodingErrorAction} specified
     * for unmappable character error has been changed. The default
     * implementation does nothing; this method can be overridden if needed.
     *
     * @param newAction
     *            the new action.
     */
    protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
        // default implementation is empty
    }

    /**
     * Notifies that this encoder's replacement has been changed. The default
     * implementation does nothing; this method can be overridden if needed.
     *
     * @param newReplacement
     *            the new replacement byte array.
     */
    protected void implReplaceWith(byte[] newReplacement) {
        // default implementation is empty
    }

    /**
     * Resets this encoder's charset related state. The default implementation
     * does nothing; this method can be overridden if needed.
     */
    protected void implReset() {
        // default implementation is empty
    }

    /**
     * Tests whether the given argument is legal as this encoder's replacement byte
     * array. The given byte array is legal if and only if it can be decoded into
     * characters.
     */
    public boolean isLegalReplacement(byte[] replacement) {
        if (decoder == null) {
            decoder = charset.newDecoder();
            decoder.onMalformedInput(CodingErrorAction.REPORT);
            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        } else {
            // The decoder is cached across calls; start each check from a clean state so a
            // previous (possibly failed) decode cannot influence this one.
            decoder.reset();
        }
        ByteBuffer in = ByteBuffer.wrap(replacement);
        CharBuffer out = CharBuffer.allocate((int) (replacement.length * decoder.maxCharsPerByte()));
        CoderResult result = decoder.decode(in, out, true);
        return !result.isError();
    }

    /**
     * Returns this encoder's {@code CodingErrorAction} when a malformed
     * input error occurred during the encoding process.
     */
    public CodingErrorAction malformedInputAction() {
        return malformedInputAction;
    }

    /**
     * Returns the maximum number of bytes which can be created by this encoder for
     * one input character, must be positive.
     */
    public final float maxBytesPerChar() {
        return maxBytesPerChar;
    }

    /**
     * Sets this encoder's action on malformed input error.
     *
     * This method will call the
     * {@link #implOnMalformedInput(CodingErrorAction) implOnMalformedInput}
     * method with the given new action as argument.
     *
     * @param newAction
     *            the new action on malformed input error.
     * @return this encoder.
     * @throws IllegalArgumentException
     *             if the given newAction is null.
     */
    public final CharsetEncoder onMalformedInput(CodingErrorAction newAction) {
        if (newAction == null) {
            throw new IllegalArgumentException("newAction == null");
        }
        malformedInputAction = newAction;
        implOnMalformedInput(newAction);
        return this;
    }

    /**
     * Sets this encoder's action on unmappable character error.
     *
     * This method will call the
     * {@link #implOnUnmappableCharacter(CodingErrorAction) implOnUnmappableCharacter}
     * method with the given new action as argument.
     *
     * @param newAction
     *            the new action on unmappable character error.
     * @return this encoder.
     * @throws IllegalArgumentException
     *             if the given newAction is null.
     */
    public final CharsetEncoder onUnmappableCharacter(CodingErrorAction newAction) {
        if (newAction == null) {
            throw new IllegalArgumentException("newAction == null");
        }
        unmappableCharacterAction = newAction;
        implOnUnmappableCharacter(newAction);
        return this;
    }

    /**
     * Returns the replacement byte array, which is never null or empty.
     */
    public final byte[] replacement() {
        return replacementBytes;
    }

    /**
     * Sets the new replacement value.
     *
     * This method first checks the given replacement's validity, then changes
     * the replacement value and finally calls the
     * {@link #implReplaceWith(byte[]) implReplaceWith} method with the given
     * new replacement as argument.
     *
     * @param replacement
     *            the replacement byte array, cannot be null or empty, its
     *            length cannot be larger than {@code maxBytesPerChar},
     *            and it must be legal replacement, which can be justified by
     *            calling {@code isLegalReplacement(byte[] replacement)}.
     * @return this encoder.
     * @throws IllegalArgumentException
     *             if the given replacement cannot satisfy the requirement
     *             mentioned above.
     */
    public final CharsetEncoder replaceWith(byte[] replacement) {
        if (replacement == null) {
            throw new IllegalArgumentException("replacement == null");
        }
        if (replacement.length == 0) {
            throw new IllegalArgumentException("replacement.length == 0");
        }
        if (replacement.length > maxBytesPerChar()) {
            throw new IllegalArgumentException("replacement.length > maxBytesPerChar: " +
                    replacement.length + " > " + maxBytesPerChar());
        }
        if (!isLegalReplacement(replacement)) {
            throw new IllegalArgumentException("Bad replacement: " + Arrays.toString(replacement));
        }
        // It seems like a bug, but the RI doesn't clone, and we have tests that check we don't.
        this.replacementBytes = replacement;
        implReplaceWith(replacementBytes);
        return this;
    }

    /**
     * Resets this encoder. This method will reset the internal state and then
     * calls {@link #implReset} to reset any state related to the
     * specific charset.
     */
    public final CharsetEncoder reset() {
        state = State.RESET;
        implReset();
        return this;
    }

    /**
     * Returns this encoder's {@code CodingErrorAction} when unmappable
     * character occurred during encoding process.
     */
    public CodingErrorAction unmappableCharacterAction() {
        return unmappableCharacterAction;
    }

    /**
     * Builds (but does not throw) the exception reporting the current state; callers
     * are expected to write {@code throw illegalStateException()}.
     */
    private IllegalStateException illegalStateException() {
        return new IllegalStateException("State: " + state);
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy