// net.freeutils.charset.EscapedByteLookupCharset (Maven / Gradle / Ivy)
/*
* Copyright © 2005-2015 Amichai Rothman
*
* This file is part of JCharset - the Java Charset package.
*
* JCharset is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* JCharset is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JCharset. If not, see <http://www.gnu.org/licenses/>.
*
* For additional info see http://www.freeutils.net/source/jcharset/
*/
package net.freeutils.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
/**
* The EscapedByteLookupCharset class handles the encoding and
* decoding of simple charsets where the byte-to-char conversion
* is performed using a simple lookup table, with the addition of a special
* escape byte, such that the single byte following it is converted using
* an alternate lookup table.
*
* @author Amichai Rothman
* @since 2007-03-26
*/
public abstract class EscapedByteLookupCharset extends Charset {

    /**
     * Byte-to-char table for regular (unescaped) bytes, indexed by the
     * unsigned byte value; an entry of -1 means the byte has no mapping.
     */
    int[] BYTE_TO_CHAR;

    /**
     * Byte-to-char table for the byte that follows the escape byte, indexed
     * by the unsigned byte value; an entry of -1 means the byte has no mapping.
     */
    int[] BYTE_TO_CHAR_ESCAPED;

    /**
     * Two-level char-to-byte table for regular bytes: the first index is the
     * char's high byte, the second its low byte. A null second-level table or
     * an entry of -1 means the char has no regular mapping.
     */
    int[][] CHAR_TO_BYTE;

    /**
     * Two-level char-to-byte table for escaped bytes, laid out like
     * {@link #CHAR_TO_BYTE}. A hit here is written as the escape byte
     * followed by the looked-up byte.
     */
    int[][] CHAR_TO_BYTE_ESCAPED;

    /** The byte value that switches the following byte to the escaped tables. */
    byte ESCAPE;

    /**
     * Initializes a new charset with the given canonical name and alias
     * set, and byte-to-char/char-to-byte lookup tables.
     *
     * @param canonicalName the canonical name of this charset
     * @param aliases an array of this charset's aliases, or null if it has no aliases
     * @param escape the special escape byte value
     * @param byteToChar a byte-to-char conversion table for this charset
     * @param byteToCharEscaped a byte-to-char conversion table for this charset
     *        for the escaped characters
     * @param charToByte a char-to-byte conversion table for this charset. It can
     *        be generated on-the-fly by calling
     *        {@link ByteLookupCharset#createInverseLookupTable
     *        createInverseLookupTable(byteToChar)}.
     * @param charToByteEscaped a char-to-byte conversion table for this charset
     *        for the escaped characters
     * @throws java.nio.charset.IllegalCharsetNameException
     *         if the canonical name or any of the aliases are illegal
     */
    protected EscapedByteLookupCharset(String canonicalName, String[] aliases,
            byte escape, int[] byteToChar, int[] byteToCharEscaped,
            int[][] charToByte, int[][] charToByteEscaped) {
        super(canonicalName, aliases);
        ESCAPE = escape;
        BYTE_TO_CHAR = byteToChar;
        CHAR_TO_BYTE = charToByte;
        BYTE_TO_CHAR_ESCAPED = byteToCharEscaped;
        CHAR_TO_BYTE_ESCAPED = charToByteEscaped;
    }

    /**
     * Looks up a char in a two-level char-to-byte table.
     *
     * @param table the two-level table, indexed by high byte then low byte
     * @param c the char value to look up
     * @return the mapped byte value, or -1 if the second-level table is
     *         missing or holds -1 for this char
     */
    private static int lookup(int[][] table, int c) {
        int[] row = table[c >> 8];
        return row == null ? -1 : row[c & 0xFF];
    }

    /**
     * Tells whether or not this charset contains the given charset.
     *
     * <p>A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
     * and only if, every character representable in <i>D</i> is also
     * representable in <i>C</i>. If this relationship holds then it is
     * guaranteed that every string that can be encoded in <i>D</i> can also be
     * encoded in <i>C</i> without performing any replacements.
     *
     * <p>That <i>C</i> contains <i>D</i> does not imply that each character
     * representable in <i>C</i> by a particular byte sequence is represented
     * in <i>D</i> by the same byte sequence, although sometimes this is the
     * case.
     *
     * <p>Every charset contains itself.
     *
     * <p>This method computes an approximation of the containment relation:
     * If it returns <tt>true</tt> then the given charset is known to be
     * contained by this charset; if it returns <tt>false</tt>, however, then
     * it is not necessarily the case that the given charset is not contained
     * in this charset.
     *
     * <p>This implementation returns true exactly when the given charset is
     * an instance of this charset's runtime class.
     *
     * @param cs the charset to test
     * @return <tt>true</tt> if the given charset is known to be
     *         contained in this charset
     */
    @Override
    public boolean contains(Charset cs) {
        return this.getClass().isInstance(cs);
    }

    /**
     * Constructs a new decoder for this charset.
     *
     * @return a new decoder for this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Constructs a new encoder for this charset.
     *
     * @return a new encoder for this charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * The Encoder inner class handles the encoding of the
     * charset using the char-to-byte lookup tables.
     */
    protected class Encoder extends CharsetEncoder {

        /**
         * Constructs an Encoder that produces on average one byte and
         * at most two bytes (escape byte plus value byte) per character.
         *
         * @param cs the charset to which this encoder belongs
         */
        protected Encoder(Charset cs) {
            super(cs, 1f, 2f);
        }

        /**
         * Constructs an Encoder.
         *
         * @param cs the charset to which this encoder belongs
         * @param averageBytesPerChar a positive float value indicating the expected
         *        number of bytes that will be produced for each input character
         * @param maxBytesPerChar a positive float value indicating the maximum
         *        number of bytes that will be produced for each input character
         */
        protected Encoder(Charset cs,
                float averageBytesPerChar,
                float maxBytesPerChar) {
            super(cs, averageBytesPerChar, maxBytesPerChar);
        }

        /**
         * Encodes one or more characters into one or more bytes.
         *
         * <p>Each char is first looked up in the regular table; if it is not
         * found there, the escaped table is tried and a hit is written as the
         * escape byte followed by the mapped byte. On overflow or an
         * unmappable char, the offending char is left unread in the input.
         *
         * @param in the input character buffer
         * @param out the output byte buffer
         * @return a coder-result object describing the reason for termination
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            int remaining = in.remaining();
            while (remaining-- > 0) {
                // every char needs at least one output byte
                if (out.remaining() < 1) {
                    return CoderResult.OVERFLOW;
                }
                int c = in.get();
                int b = lookup(CHAR_TO_BYTE, c);
                if (b == -1) {
                    // no regular mapping - try the escaped table
                    b = lookup(CHAR_TO_BYTE_ESCAPED, c);
                    if (b == -1) {
                        // neither table maps this char
                        in.position(in.position() - 1); // unread the char
                        return CoderResult.unmappableForLength(1);
                    }
                    // an escaped char needs two output bytes; write both or neither
                    if (out.remaining() < 2) {
                        in.position(in.position() - 1); // unread the char
                        return CoderResult.OVERFLOW;
                    }
                    out.put(ESCAPE);
                }
                out.put((byte) (b & 0xFF));
            }
            // no more input available
            return CoderResult.UNDERFLOW;
        }
    }

    /**
     * The Decoder inner class handles the decoding of the
     * charset using the byte-to-char lookup tables.
     */
    protected class Decoder extends CharsetDecoder {

        /**
         * Constructs a Decoder that produces on average one char and
         * at most one char per input byte.
         *
         * @param cs the charset to which this decoder belongs
         */
        protected Decoder(Charset cs) {
            super(cs, 1f, 1f);
        }

        /**
         * Constructs a Decoder.
         *
         * @param cs the charset to which this decoder belongs
         * @param averageCharsPerByte a positive float value indicating the expected
         *        number of characters that will be produced for each input byte
         * @param maxCharsPerByte a positive float value indicating the maximum
         *        number of characters that will be produced for each input byte
         */
        protected Decoder(Charset cs,
                float averageCharsPerByte,
                float maxCharsPerByte) {
            super(cs, averageCharsPerByte, maxCharsPerByte);
        }

        /**
         * Decodes one or more bytes into one or more characters.
         *
         * <p>A byte equal to the escape byte causes the next byte to be
         * looked up in the escaped table; any other byte is looked up in the
         * regular table. If the escape byte is the last available input byte,
         * it is left unread and underflow is returned so that decoding can
         * resume once more input arrives. On a malformed result the input
         * position is placed at the start of the offending sequence: one byte
         * for a regular byte, two bytes (escape plus value) for an escaped one.
         *
         * @param in the input byte buffer
         * @param out the output character buffer
         * @return a coder-result object describing the reason for termination
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            int remaining = in.remaining();
            while (remaining-- > 0) {
                // every sequence produces exactly one output char
                if (out.remaining() < 1) {
                    return CoderResult.OVERFLOW;
                }
                int b = in.get();
                int c;
                int consumed = 1; // number of input bytes making up this sequence
                if (b == ESCAPE) {
                    // the escape byte needs a following byte to be meaningful
                    if (remaining-- == 0) {
                        in.position(in.position() - 1); // unread the escape byte
                        return CoderResult.UNDERFLOW;
                    }
                    b = in.get();
                    consumed = 2;
                    c = BYTE_TO_CHAR_ESCAPED[b & 0xFF];
                } else {
                    c = BYTE_TO_CHAR[b & 0xFF];
                }
                if (c == -1) {
                    // unread the whole sequence (including the escape byte, if any)
                    // so the reported length matches the bytes at the input position
                    in.position(in.position() - consumed);
                    return CoderResult.malformedForLength(consumed);
                }
                out.put((char) c);
            }
            // no more input available
            return CoderResult.UNDERFLOW;
        }
    }
}