// net.freeutils.charset.UTF7Charset Maven / Gradle / Ivy
//
// Go to download
/*
 *  Copyright © 2005-2015 Amichai Rothman
 *
 *  This file is part of JCharset - the Java Charset package.
 *
 *  JCharset is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  JCharset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with JCharset.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  For additional info see http://www.freeutils.net/source/jcharset/
 */

package net.freeutils.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;

/**
 * The UTF7Charset class handles the encoding and decoding of the
 * UTF-7 charset.
 *
 * The encoding and decoding are based on RFC 2152
 * (http://www.ietf.org/rfc/rfc2152.txt)
 *
 * @author Amichai Rothman
 * @since  2005-06-10
 */
public class UTF7Charset extends Charset {

    static final String NAME = "UTF-7";

    static final String[] ALIASES = {
        "UTF7", "UNICODE-1-1-UTF-7", "csUnicode11UTF7", "UNICODE-2-0-UTF-7" };

    // a lookup table for characters that are part of the D Set,
    // indexed by the 7-bit character code (0-127)
    static final boolean[] D_SET = {
        false, false, false, false, false, false, false, false,
        false, true,  true,  false, false, true,  false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        true,  false, false, false, false, false, false, true,
        true,  true,  false, false, true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  false, false, false, false, true,
        false, true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  false, false, false, false, false,
        false, true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  false, false, false, false, false,
    };

    // a lookup table for characters that are part of the O Set,
    // indexed by the 7-bit character code (0-127)
    static final boolean[] O_SET = {
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, true,  true,  true,  true,  true,  true,  false,
        false, false, true,  false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, true,  true,  true,  true,  false,
        true,  false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, true,  false, true,  true,  true,
        true,  false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, true,  true,  true,  false, false,
    };

    // a lookup table for characters that are part of the B Set:
    // maps a 7-bit character code to its 6-bit base64 value,
    // or to -1 if the character is not a base64 character
    static final int[] B_SET = {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
    };

    // an inverse lookup table for characters that are part of the B Set:
    // maps a 6-bit base64 value (0-63) to its character code
    static final int[] B_SET_INVERSE = {
        65,  66,  67,  68,  69,  70,  71,  72,
        73,  74,  75,  76,  77,  78,  79,  80,
        81,  82,  83,  84,  85,  86,  87,  88,
        89,  90,  97,  98,  99,  100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110,
        111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 122, 48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  43,  47,
    };

    // the RFC specifies that the O-set characters may
    // optionally be directly encoded. Whether they are
    // encoded directly or using a shift sequence depends
    // on the value of the optionalDirect flag.
    final boolean optionalDirect;

    // The membership helpers below guard the table lookups: a negative
    // byte or a char >= 0x80 is outside the 128-entry tables and is
    // never a member of any set.

    static boolean isDSet(byte b) {
        return b >= 0 && D_SET[b];
    }

    static boolean isDSet(char c) {
        return c < 0x80 && D_SET[c];
    }

    static boolean isOSet(byte b) {
        return b >= 0 && O_SET[b];
    }

    static boolean isOSet(char c) {
        return c < 0x80 && O_SET[c];
    }

    static boolean isDorOSet(byte b) {
        return b >= 0 && (D_SET[b] || O_SET[b]);
    }

    static boolean isDorOSet(char c) {
        return c < 0x80 && (D_SET[c] || O_SET[c]);
    }

    static boolean isBSet(byte b) {
        return b >= 0 && B_SET[b] != -1;
    }

    static boolean isBSet(char c) {
        return c < 0x80 && B_SET[c] != -1;
    }

    // returns the 6-bit value of the given base64 character byte,
    // or -1 if it is not a base64 character
    static byte fromBase64(byte b) {
        return (byte)(b < 0 ? -1 : B_SET[b]);
    }

    // returns the base64 character byte for the given 6-bit value,
    // or -1 if the value is out of the 0-63 range
    static byte toBase64(byte b) {
        return (byte)(b < 0 || b >= 64 ? -1 : B_SET_INVERSE[b]);
    }

    /**
     * Constructs an instance of the UTF7Charset.
     *
     * O-set characters are not directly encoded.
     */
    public UTF7Charset() {
        this(NAME, ALIASES, false);
    }

    /**
     * Constructs an instance of the UTF7Charset, specifying whether the
     * O-set characters are to be encoded directly or using a shift sequence.
     *
     * @param canonicalName the canonical name of this charset
     * @param aliases an array of this charset's aliases, or null if it has no aliases
     * @param optionalDirect if true, O-set characters are encoded directly,
     *                       otherwise they are encoded using a shift sequence
     * @throws IllegalCharsetNameException
     *         if the canonical name or any of the aliases are illegal
     */
    public UTF7Charset(String canonicalName, String[] aliases, boolean optionalDirect) {
        super(canonicalName, aliases);
        this.optionalDirect = optionalDirect;
    }

    /**
     * Returns whether the given character is encoded directly
     * or using a shift sequence.
     *
     * @param c the character to check
     * @return true if the character is encoded directly,
     *         false if it is encoded using a shift sequence
     */
    boolean isDirect(char c) {
        return c < 0x80 && (D_SET[c] || (optionalDirect && O_SET[c]));
    }

    /**
     * Tells whether or not this charset contains the given charset.
     *
     * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
     * and only if, every character representable in <i>D</i> is also
     * representable in <i>C</i>.  If this relationship holds then it is
     * guaranteed that every string that can be encoded in <i>D</i> can also be
     * encoded in <i>C</i> without performing any replacements.
     *
     * <p> That <i>C</i> contains <i>D</i> does not imply that each character
     * representable in <i>C</i> by a particular byte sequence is represented
     * in <i>D</i> by the same byte sequence, although sometimes this is the
     * case.
     *
     * <p> Every charset contains itself.
     *
     * <p> This method computes an approximation of the containment relation:
     * If it returns true then the given charset is known to be
     * contained by this charset; if it returns false, however, then
     * it is not necessarily the case that the given charset is not contained
     * in this charset.
     *
     * @param charset the given charset
     * @return true if the given charset is an instance of this charset's
     *         class, or is reported as contained by the UTF-16 charset
     */
    @Override
    public boolean contains(Charset charset) {
        return getClass().isInstance(charset) || Charset.forName("UTF-16").contains(charset);
    }

    /**
     * Constructs a new decoder for this charset.
     *
     * @return a new decoder for this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Constructs a new encoder for this charset.
     *
     * @return a new encoder for this charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * The Encoder inner class handles the encoding of the UTF7 charset.
     */
    protected class Encoder extends CharsetEncoder {

        boolean shifted;  // flags whether we are currently in a shift sequence
        char encodedChar; // holds the bits of previous partially encoded char
        int requiredBits; // number of bits required to complete a 6-bit value

        /**
         * Constructs an Encoder.
         *
         * @param charset the charset to which this encoder belongs
         */
        protected Encoder(Charset charset) {
            // worst case for a single char is 5 bytes: '+', two base64
            // bytes, one flushed base64 byte and the terminating '-'
            super(charset, 1f, 5f);
        }

        /**
         * Resets this encoder, clearing any charset-specific internal state.
         */
        @Override
        protected void implReset() {
            shifted = false;
            encodedChar = 0;
            requiredBits = 6; // no partially written 6-bit value is pending
        }

        /**
         * Flushes this encoder, terminating an open shift sequence (if any)
         * by writing its left-over bits followed by an explicit '-'.
         *
         * @param out the output byte buffer
         * @return a coder-result object, either {@link CoderResult#UNDERFLOW} or
         *         {@link CoderResult#OVERFLOW}
         */
        @Override
        protected CoderResult implFlush(ByteBuffer out) {
            if (shifted) {
                if (out.remaining() < 2)
                    return CoderResult.OVERFLOW;
                flushBase64Char(out);
                out.put((byte)'-'); // terminate shift sequence explicitly
                shifted = false; // sequence is closed, nothing left to flush
            }
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes one or more characters into one or more bytes.
         *
         * @param in the input character buffer
         * @param out the output byte buffer
         * @return a coder-result object describing the reason for termination
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            int remaining = in.remaining();

            while (remaining-- > 0) {
                // make sure we have output space (the max we might need)
                if (out.remaining() < 3)
                    return CoderResult.OVERFLOW;

                // get next char
                char c = in.get();

                // if not in shift sequence
                if (!shifted) {
                    // if char is directly encodable, write it as byte directly
                    if (isDirect(c)) {
                        out.put((byte)c);
                    } else if (c == '+') { // specially encoded char
                        out.put((byte)'+').put((byte)'-');
                    } else { // start shift sequence
                        out.put((byte)'+');
                        shifted = true;
                        requiredBits = 6;
                        writeBase64Char(out, c);
                    }
                } else { // shifted
                    if (isDirect(c)) { // direct char
                        // terminate shift sequence
                        shifted = false;
                        flushBase64Char(out);
                        // a following base64 char or '-' would otherwise be
                        // read as part of the shift sequence by a decoder
                        if (isBSet(c) || c == '-') // requires explicit termination
                            out.put((byte)'-');
                        // write direct char
                        out.put((byte)c);
                    } else { // another encoded char
                        writeBase64Char(out, c);
                    }
                }
            }

            return CoderResult.UNDERFLOW;
        }

        /**
         * Writes the base64 bytes representing the given character
         * to the given output ByteBuffer. Bits left over from
         * previously written characters are written first, followed
         * by this character's bits. Similarly, bits left over from
         * this character are saved until the next call to this method.
         *
         * @param out the ByteBuffer to which the base64 bytes are written
         * @param c the character to be written
         */
        void writeBase64Char(ByteBuffer out, char c) {
            int bits = requiredBits; // getfield bytecode optimization
            // complete the pending 6-bit value with this char's top bits
            byte b = (byte)(((encodedChar << bits) & 0x3F) | (c >>> (16 - bits)));
            out.put(toBase64(b));
            b = (byte)((c >>> (10 - bits)) & 0x3F);
            out.put(toBase64(b));

            if (bits != 6) {
                // fewer than 6 bits were taken by the first byte, so a
                // third full 6-bit value is available from this char
                b = (byte)((c >>> (4 - bits)) & 0x3F);
                out.put(toBase64(b));
                requiredBits += 2;
            } else {
                // 12 bits written, 4 bits left over: 2 more are required
                requiredBits = 2;
            }
            encodedChar = c;
        }

        /**
         * Writes any left-over base64 bits.
         *
         * @param out the ByteBuffer to which the base64 bytes are written
         */
        void flushBase64Char(ByteBuffer out) {
            if (requiredBits != 6) { // dump last encoded byte, zero-bit padded
                byte b = (byte)((encodedChar << requiredBits) & 0x3F);
                out.put(toBase64(b));
            }
        }

    } // Encoder class

    /**
     * The Decoder inner class handles the decoding of the UTF7 charset.
     */
    protected class Decoder extends CharsetDecoder {

        boolean shifted; // flags whether we are currently in a shift sequence
        boolean emptyShift; // flags whether the current shift sequence is empty
        char decodedChar; // holds the bits of previous partially decoded char
        int requiredBits; // number of bits required to complete a 16-bit char

        /**
         * Constructs a Decoder.
         *
         * @param charset the charset to which this decoder belongs
         */
        protected Decoder(Charset charset) {
            super(charset, 1f, 1f);
        }

        /**
         * Resets this decoder, clearing any charset-specific internal state.
         */
        @Override
        protected void implReset() {
            shifted = false;
            emptyShift = false;
            decodedChar = 0;
            requiredBits = 16;
        }

        /**
         * Decodes one or more bytes into one or more characters.
         *
         * A byte that cannot be decoded is reported as malformed input
         * of length 1, with the input positioned at that byte. This includes
         * a '+' that is immediately followed by a byte which is neither a
         * base64 character nor '-', which RFC 2152 defines as an ill-formed
         * sequence; the error is reported at the byte following the '+'.
         *
         * @param in the input byte buffer
         * @param out the output character buffer
         * @return a coder-result object describing the reason for termination
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            int val;
            int remaining = in.remaining();

            while (remaining-- > 0) {
                // make sure we have output space
                if (out.remaining() < 1)
                    return CoderResult.OVERFLOW;

                // get next byte
                byte b = in.get();

                // if not in shift sequence
                if (!shifted) {
                    // if byte is in set D or O, write it as char
                    if (isDorOSet(b)) {
                        out.put((char)b);
                    } else if (b == '+') { // start shift sequence
                        shifted = true;
                        emptyShift = true;
                        requiredBits = 16;
                    } else { // invalid byte
                        return malformedAtLastByte(in);
                    }
                } else if ((val = fromBase64(b)) != -1) { // valid base64 byte
                    // get bits from shift sequence byte
                    emptyShift = false;
                    // 6 is the max number of bits we can get from a single input byte
                    int bits = requiredBits >= 6 ? 6 : requiredBits;
                    // add new bits to currently decoded char
                    decodedChar = (char)((decodedChar << bits) | (val >> (6 - bits)));
                    requiredBits -= bits;
                    // check if we're done decoding a full 16-bit char
                    if (requiredBits == 0) {
                        // output it
                        out.put(decodedChar);
                        // and start off next char with remaining bits
                        requiredBits = 10 + bits; // 16 - (6 - bits)
                        decodedChar = (char)val; // save the extra bits for later
                    }
                } else { // terminating a shift sequence
                    shifted = false;
                    // any leftover bits when terminating the shift sequence
                    // are discarded if they are zero, or invalid if they are nonzero
                    if ((char)(decodedChar << requiredBits) != 0)
                        return malformedAtLastByte(in);
                    // process implicit or explicit shift sequence termination
                    if (b == '-') {
                        if (emptyShift) // a "+-" sequence outputs a '+'
                            out.put('+');
                        // otherwise shift ends, and '-' is absorbed
                    } else if (emptyShift || !isDorOSet(b)) {
                        // either a '+' followed directly by a byte that is
                        // neither base64 nor '-' (ill-formed per RFC 2152),
                        // or a terminating byte that is not a valid direct char
                        return malformedAtLastByte(in);
                    } else {
                        // output regular char that ended base64 sequence
                        out.put((char)b);
                    }
                }
            }

            return CoderResult.UNDERFLOW;
        }

        /**
         * Repositions the input at the byte that was just read and
         * returns a malformed-input result covering that single byte.
         *
         * @param in the input byte buffer, positioned just past the bad byte
         * @return a malformed-input coder-result of length 1
         */
        private CoderResult malformedAtLastByte(ByteBuffer in) {
            in.position(in.position() - 1); // position input at error byte
            return CoderResult.malformedForLength(1);
        }
    } // Decoder class

}