// net.freeutils.charset.EscapedByteLookupCharset Maven / Gradle / Ivy
//
// Go to download
/*
 *  Copyright © 2005-2015 Amichai Rothman
 *
 *  This file is part of JCharset - the Java Charset package.
 *
 *  JCharset is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  JCharset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with JCharset.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  For additional info see http://www.freeutils.net/source/jcharset/
 */

package net.freeutils.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/**
 * The <b>EscapedByteLookupCharset</b> class handles the encoding and
 * decoding of simple charsets where the byte-to-char conversion
 * is performed using a simple lookup table, with the addition of a special
 * escape byte, such that the single byte following it is converted using
 * an alternate lookup table.
 * <p>
 * In all lookup tables a value of -1 marks an entry that has no mapping.
 *
 * @author  Amichai Rothman
 * @since   2007-03-26
 */
public abstract class EscapedByteLookupCharset extends Charset {

    /** Regular byte-to-char table, indexed by the unsigned byte value. */
    int[] BYTE_TO_CHAR;

    /** Byte-to-char table for the byte following the escape byte, indexed by the unsigned byte value. */
    int[] BYTE_TO_CHAR_ESCAPED;

    /**
     * Regular char-to-byte table: the first index is the high byte of the char,
     * the second is its low byte. A null row means no char in that row is mapped.
     */
    int[][] CHAR_TO_BYTE;

    /**
     * Char-to-byte table for chars that are encoded as the escape byte followed
     * by the looked-up byte; same layout as {@link #CHAR_TO_BYTE}.
     */
    int[][] CHAR_TO_BYTE_ESCAPED;

    /** The escape byte which selects the alternate (escaped) tables. */
    byte ESCAPE;

    /**
     * Initializes a new charset with the given canonical name and alias
     * set, and byte-to-char/char-to-byte lookup tables.
     *
     * @param canonicalName the canonical name of this charset
     * @param aliases an array of this charset's aliases, or null if it has no aliases
     * @param escape the special escape byte value
     * @param byteToChar a byte-to-char conversion table for this charset
     * @param byteToCharEscaped a byte-to-char conversion table for this charset
     *        for the escaped characters
     * @param charToByte a char-to-byte conversion table for this charset. It can
     *        be generated on-the-fly by calling
     *        {@link ByteLookupCharset#createInverseLookupTable
     *        createInverseLookupTable(byteToChar)}.
     * @param charToByteEscaped a char-to-byte conversion table for this charset
     *        for the escaped characters
     * @throws java.nio.charset.IllegalCharsetNameException
     *         if the canonical name or any of the aliases are illegal
     */
    protected EscapedByteLookupCharset(String canonicalName, String[] aliases,
        byte escape, int[] byteToChar, int[] byteToCharEscaped,
        int[][] charToByte, int[][] charToByteEscaped) {
        super(canonicalName, aliases);
        ESCAPE = escape;
        BYTE_TO_CHAR = byteToChar;
        CHAR_TO_BYTE = charToByte;
        BYTE_TO_CHAR_ESCAPED = byteToCharEscaped;
        CHAR_TO_BYTE_ESCAPED = charToByteEscaped;
    }

    /**
     * Tells whether or not this charset contains the given charset.
     *
     * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
     * and only if, every character representable in <i>D</i> is also
     * representable in <i>C</i>.  If this relationship holds then it is
     * guaranteed that every string that can be encoded in <i>D</i> can also be
     * encoded in <i>C</i> without performing any replacements.
     *
     * <p> That <i>C</i> contains <i>D</i> does not imply that each character
     * representable in <i>C</i> by a particular byte sequence is represented
     * in <i>D</i> by the same byte sequence, although sometimes this is the
     * case.
     *
     * <p> Every charset contains itself.
     *
     * <p> This method computes an approximation of the containment relation:
     * If it returns <tt>true</tt> then the given charset is known to be
     * contained by this charset; if it returns <tt>false</tt>, however, then
     * it is not necessarily the case that the given charset is not contained
     * in this charset.
     *
     * <p> This implementation returns <tt>true</tt> only when the given charset
     * is an instance of this charset's own runtime class.
     *
     * @param cs the charset to test for containment
     * @return <tt>true</tt> if, and only if, the given charset
     *         is contained in this charset
     */
    public boolean contains(Charset cs) {
        return this.getClass().isInstance(cs);
    }

    /**
     * Constructs a new decoder for this charset.
     *
     * @return a new decoder for this charset
     */
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Constructs a new encoder for this charset.
     *
     * @return a new encoder for this charset
     *
     * @throws UnsupportedOperationException
     *         if this charset does not support encoding
     */
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * The <b>Encoder</b> inner class handles the encoding of the
     * charset using the char-to-byte lookup tables.
     */
    protected class Encoder extends CharsetEncoder {

        /**
         * Constructs an Encoder that produces an average of one byte
         * and at most two bytes (escape byte + data byte) per char.
         *
         * @param cs the charset to which this encoder belongs
         */
        protected Encoder(Charset cs) {
            super(cs, 1f, 2f);
        }

        /**
         * Constructs an Encoder.
         *
         * @param cs the charset to which this encoder belongs
         * @param averageBytesPerChar a positive float value indicating the expected
         *        number of bytes that will be produced for each input character
         *
         * @param maxBytesPerChar a positive float value indicating the maximum
         *        number of bytes that will be produced for each input character
         */
        protected Encoder(Charset cs,
                 float averageBytesPerChar,
                 float maxBytesPerChar) {
            super(cs, averageBytesPerChar, maxBytesPerChar);
        }

        /**
         * Encodes one or more characters into one or more bytes.
         * <p>
         * Each char is first looked up in the regular table and written as a
         * single byte; if it is not found there, it is looked up in the escaped
         * table and written as the escape byte followed by the looked-up byte.
         * On overflow or on an unmappable char, the input position is left
         * at the char that could not be processed.
         *
         * @param in the input character buffer
         * @param out the output byte buffer
         * @return a coder-result object describing the reason for termination
         */
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            while (in.hasRemaining()) {
                // make sure we have room for at least one output byte
                if (!out.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }
                // remember where this char starts so it can be unread
                int start = in.position();
                int c = in.get();
                // look for corresponding regular byte
                int[] table = CHAR_TO_BYTE[c >> 8];
                int b = table == null ? -1 : table[c & 0xFF];
                if (b == -1) {
                    // look for corresponding escaped byte
                    table = CHAR_TO_BYTE_ESCAPED[c >> 8];
                    b = table == null ? -1 : table[c & 0xFF];
                    if (b == -1) {
                        // there's no regular nor escaped byte - it's unmappable
                        in.position(start); // unread the char
                        return CoderResult.unmappableForLength(1);
                    }
                    // an escaped char needs two output bytes
                    if (out.remaining() < 2) {
                        in.position(start); // unread the char
                        return CoderResult.OVERFLOW;
                    }
                    // write the escape byte (output byte will follow)
                    out.put(ESCAPE);
                }
                // write the output byte
                out.put((byte) (b & 0xFF));
            }
            // no more input available
            return CoderResult.UNDERFLOW;
        }

    }

    /**
     * The <b>Decoder</b> inner class handles the decoding of the
     * charset using the byte-to-char lookup tables.
     */
    protected class Decoder extends CharsetDecoder {

        /**
         * Constructs a Decoder that produces an average of one char
         * and at most one char per byte.
         *
         * @param cs the charset to which this decoder belongs
         */
        protected Decoder(Charset cs) {
            super(cs, 1f, 1f);
        }

        /**
         * Constructs a Decoder.
         *
         * @param cs the charset to which this decoder belongs
         * @param averageCharsPerByte a positive float value indicating the expected
         *        number of characters that will be produced for each input byte
         * @param maxCharsPerByte a positive float value indicating the maximum
         *        number of characters that will be produced for each input byte
         */
        protected Decoder(Charset cs,
                          float averageCharsPerByte,
                          float maxCharsPerByte) {
            super(cs, averageCharsPerByte, maxCharsPerByte);
        }

        /**
         * Decodes one or more bytes into one or more characters.
         * <p>
         * A regular byte is converted using the regular table; the escape byte
         * causes the single byte following it to be converted using the escaped
         * table. If the input ends right after an escape byte, the escape byte
         * is left unread and underflow is returned so that decoding can resume
         * once more input is available. If a byte (or escape sequence) has no
         * mapping, the input position is rewound to the start of the offending
         * sequence and a malformed-input result covering the entire sequence
         * is returned (length 1 for a regular byte, 2 for an escape sequence).
         *
         * @param in the input byte buffer
         * @param out the output character buffer
         * @return a coder-result object describing the reason for termination
         */
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            while (in.hasRemaining()) {
                // make sure we have room for output
                if (!out.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }
                // remember where this sequence starts so it can be unread as a whole
                int start = in.position();
                int b = in.get();
                int c;
                if (b == ESCAPE) {
                    // it's the escape byte - make sure we have the next byte
                    if (!in.hasRemaining()) {
                        in.position(start); // unread the escape byte
                        return CoderResult.UNDERFLOW;
                    }
                    // look for corresponding escaped char
                    b = in.get();
                    c = BYTE_TO_CHAR_ESCAPED[b & 0xFF];
                } else {
                    // look for corresponding regular char
                    c = BYTE_TO_CHAR[b & 0xFF];
                }

                if (c == -1) {
                    // no mapping - the whole sequence read in this iteration is
                    // malformed, so rewind to its first byte (which is the escape
                    // byte for an escape sequence) and report its full length
                    int length = in.position() - start;
                    in.position(start);
                    return CoderResult.malformedForLength(length);
                }
                // write the output char
                out.put((char) c);
            }
            // no more input available
            return CoderResult.UNDERFLOW;
        }
    }

}