All downloads are free. Search and download functionality uses the official Maven repository.

net.freeutils.charset.UTF7Charset Maven / Gradle / Ivy

/*
 *  Copyright © 2005-2015 Amichai Rothman
 *
 *  This file is part of JCharset - the Java Charset package.
 *
 *  JCharset is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  JCharset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with JCharset.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  For additional info see http://www.freeutils.net/source/jcharset/
 */

package net.freeutils.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;

/**
 * The UTF7Charset class handles the encoding and decoding of the
 * UTF-7 charset.
 *
 * The encoding and decoding are based on RFC 2152
 * (http://www.ietf.org/rfc/rfc2152.txt)
 *
 * @author Amichai Rothman
 * @since  2005-06-10
 */
public class UTF7Charset extends Charset {

    /** The canonical name of this charset. */
    static final String NAME = "UTF-7";

    /** The aliases under which this charset is also known. */
    static final String[] ALIASES = {
        "UTF7", "UNICODE-1-1-UTF-7", "csUnicode11UTF7", "UNICODE-2-0-UTF-7" };

    // a lookup table (indexed by 7-bit character code) for characters
    // that are part of the D Set; as defined here it also includes
    // space, tab, CR and LF, which are always written directly
    static final boolean[] D_SET = {
        false, false, false, false, false, false, false, false,
        false, true,  true,  false, false, true,  false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        true,  false, false, false, false, false, false, true,
        true,  true,  false, false, true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  false, false, false, false, true,
        false, true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  false, false, false, false, false,
        false, true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  true,  true,  true,  true,  true,
        true,  true,  true,  false, false, false, false, false,
    };

    // a lookup table (indexed by 7-bit character code) for characters
    // that are part of the O Set
    static final boolean[] O_SET = {
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, true,  true,  true,  true,  true,  true,  false,
        false, false, true,  false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, true,  true,  true,  true,  false,
        true,  false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, true,  false, true,  true,  true,
        true,  false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false,
        false, false, false, true,  true,  true,  false, false,
    };

    // a lookup table for characters that are part of the B Set:
    // maps a 7-bit character code to its 6-bit base64 value,
    // or -1 if the character is not a base64 character
    static final int[] B_SET = {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
    };

    // an inverse lookup table for characters that are part of the B Set:
    // maps a 6-bit base64 value to its character code
    static final int[] B_SET_INVERSE = {
        65,  66,  67,  68,  69,  70,  71,  72,
        73,  74,  75,  76,  77,  78,  79,  80,
        81,  82,  83,  84,  85,  86,  87,  88,
        89,  90,  97,  98,  99,  100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110,
        111, 112, 113, 114, 115, 116, 117, 118,
        119, 120, 121, 122, 48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  43,  47,
    };

    // the RFC specifies that the O-set characters may
    // optionally be directly encoded. Whether they are
    // encoded directly or using a shift sequence depends
    // on the value of the optionalDirect flag.
    final boolean optionalDirect;

    /** Returns whether the given byte is a D-set character (negative bytes never are). */
    static boolean isDSet(byte b) {
        return b >= 0 && D_SET[b];
    }

    /** Returns whether the given char is a D-set character (chars above 0x7F never are). */
    static boolean isDSet(char c) {
        return c < 0x80 && D_SET[c];
    }

    /** Returns whether the given byte is an O-set character (negative bytes never are). */
    static boolean isOSet(byte b) {
        return b >= 0 && O_SET[b];
    }

    /** Returns whether the given char is an O-set character (chars above 0x7F never are). */
    static boolean isOSet(char c) {
        return c < 0x80 && O_SET[c];
    }

    /** Returns whether the given byte is in either the D set or the O set. */
    static boolean isDorOSet(byte b) {
        return b >= 0 && (D_SET[b] || O_SET[b]);
    }

    /** Returns whether the given char is in either the D set or the O set. */
    static boolean isDorOSet(char c) {
        return c < 0x80 && (D_SET[c] || O_SET[c]);
    }

    /** Returns whether the given byte is a B-set (base64 alphabet) character. */
    static boolean isBSet(byte b) {
        return b >= 0 && B_SET[b] != -1;
    }

    /** Returns whether the given char is a B-set (base64 alphabet) character. */
    static boolean isBSet(char c) {
        return c < 0x80 && B_SET[c] != -1;
    }

    /**
     * Converts a base64 alphabet character to its 6-bit value.
     *
     * @param b the base64 character
     * @return the 6-bit value, or -1 if the byte is not a base64 character
     */
    static byte fromBase64(byte b) {
        return (byte)(b < 0 ? -1 : B_SET[b]);
    }

    /**
     * Converts a 6-bit value to its base64 alphabet character.
     *
     * @param b the 6-bit value
     * @return the base64 character, or -1 if the value is not in the range 0-63
     */
    static byte toBase64(byte b) {
        return (byte)(b < 0 || b >= 64 ? -1 : B_SET_INVERSE[b]);
    }

    /**
     * Constructs an instance of the UTF7Charset.
     *
     * O-set characters are not directly encoded.
     */
    public UTF7Charset() {
        this(NAME, ALIASES, false);
    }

    /**
     * Constructs an instance of the UTF7Charset, specifying whether the
     * O-set characters are to be encoded directly or using a shift sequence.
     *
     * @param canonicalName the canonical name of this charset
     * @param aliases an array of this charset's aliases, or null if it has no aliases
     * @param optionalDirect if true, O-set characters are encoded directly,
     *                       otherwise they are encoded using a shift sequence
     * @throws IllegalCharsetNameException
     *         if the canonical name or any of the aliases are illegal
     */
    public UTF7Charset(String canonicalName, String[] aliases, boolean optionalDirect) {
        super(canonicalName, aliases);
        this.optionalDirect = optionalDirect;
    }

    /**
     * Returns whether the given character is encoded directly
     * or using a shift sequence.
     *
     * @param c the character to check
     * @return true if the character is encoded directly,
     *         false if it is encoded using a shift sequence
     */
    boolean isDirect(char c) {
        return c < 0x80 && (D_SET[c] || (optionalDirect && O_SET[c]));
    }

    /**
     * Tells whether or not this charset contains the given charset.
     *
     * <p>A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
     * and only if, every character representable in <i>D</i> is also
     * representable in <i>C</i>. If this relationship holds then it is
     * guaranteed that every string that can be encoded in <i>D</i> can also be
     * encoded in <i>C</i> without performing any replacements.
     *
     * <p>That <i>C</i> contains <i>D</i> does not imply that each character
     * representable in <i>C</i> by a particular byte sequence is represented
     * in <i>D</i> by the same byte sequence, although sometimes this is the
     * case.
     *
     * <p>Every charset contains itself.
     *
     * <p>This method computes an approximation of the containment relation:
     * If it returns {@code true} then the given charset is known to be
     * contained by this charset; if it returns {@code false}, however, then
     * it is not necessarily the case that the given charset is not contained
     * in this charset.
     *
     * @param charset the charset to check
     * @return {@code true} if, and only if, the given charset
     *         is contained in this charset
     */
    @Override
    public boolean contains(Charset charset) {
        return getClass().isInstance(charset)
            || Charset.forName("UTF-16").contains(charset);
    }

    /**
     * Constructs a new decoder for this charset.
     *
     * @return a new decoder for this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Constructs a new encoder for this charset.
     *
     * @return a new encoder for this charset
     *
     * @throws UnsupportedOperationException
     *         if this charset does not support encoding
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * The Encoder inner class handles the encoding of the UTF7 charset.
     */
    protected class Encoder extends CharsetEncoder {

        boolean shifted;  // flags whether we are currently in a shift sequence
        char encodedChar; // holds the bits of previous partially encoded char
        int requiredBits; // number of bits required to complete a 6-bit value

        /**
         * Constructs an Encoder.
         *
         * @param charset the charset to which this encoder belongs
         */
        protected Encoder(Charset charset) {
            super(charset, 1f, 5f);
        }

        /**
         * Resets this encoder, clearing any charset-specific internal state.
         */
        @Override
        protected void implReset() {
            shifted = false;
            // the fields below are re-initialized whenever a shift sequence
            // starts, but are cleared here too so no stale state survives a reset
            encodedChar = 0;
            requiredBits = 6;
        }

        /**
         * Flushes this encoder. If a shift sequence is still open, any
         * left-over bits are written and the sequence is explicitly
         * terminated with a '-'.
         *
         * @param out the output byte buffer
         * @return a coder-result object, either {@link CoderResult#UNDERFLOW} or
         *         {@link CoderResult#OVERFLOW}
         */
        @Override
        protected CoderResult implFlush(ByteBuffer out) {
            if (shifted) {
                // at most one left-over base64 byte plus the terminator
                if (out.remaining() < 2) {
                    return CoderResult.OVERFLOW;
                }
                flushBase64Char(out);
                out.put((byte)'-'); // terminate shift sequence explicitly
                shifted = false; // sequence is closed; a repeated flush writes nothing
            }
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes one or more characters into one or more bytes.
         *
         * @param in the input character buffer
         * @param out the output byte buffer
         * @return a coder-result object describing the reason for termination
         */
        @Override
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            while (in.hasRemaining()) {
                // make sure we have output space (the max we might need
                // for a single char in any of the branches below is 3 bytes)
                if (out.remaining() < 3) {
                    return CoderResult.OVERFLOW;
                }
                // get next char
                char c = in.get();
                if (!shifted) { // not in shift sequence
                    if (isDirect(c)) {
                        // directly encoded char is written as a single byte
                        out.put((byte)c);
                    } else if (c == '+') {
                        // specially encoded char
                        out.put((byte)'+').put((byte)'-');
                    } else {
                        // start shift sequence
                        out.put((byte)'+');
                        shifted = true;
                        requiredBits = 6;
                        writeBase64Char(out, c);
                    }
                } else if (isDirect(c)) { // shifted, followed by a direct char
                    // terminate shift sequence
                    shifted = false;
                    flushBase64Char(out);
                    // a following base64 char or '-' would otherwise be read
                    // as part of the sequence, so it requires explicit termination
                    if (isBSet(c) || c == '-') {
                        out.put((byte)'-');
                    }
                    // write direct char
                    out.put((byte)c);
                } else { // shifted, followed by another encoded char
                    writeBase64Char(out, c);
                }
            }
            return CoderResult.UNDERFLOW;
        }

        /**
         * Writes the base64 bytes representing the given character
         * to the given output ByteBuffer. Bits left over from
         * previously written characters are written first, followed
         * by this character's bits. Similarly, bits left over from
         * this character are saved until the next call to this method.
         *
         * @param out the ByteBuffer to which the base64 bytes are written
         * @param c the character to be written
         */
        void writeBase64Char(ByteBuffer out, char c) {
            int bits = requiredBits; // getfield bytecode optimization
            // complete the pending 6-bit value with this char's top bits
            byte b = (byte)(((encodedChar << bits) & 0x3F) | (c >>> (16 - bits)));
            out.put(toBase64(b));
            b = (byte)((c >>> (10 - bits)) & 0x3F);
            out.put(toBase64(b));
            if (bits != 6) {
                // fewer than 6 bits were taken by the first byte,
                // so a third full 6-bit value fits in this char
                b = (byte)((c >>> (4 - bits)) & 0x3F);
                out.put(toBase64(b));
                requiredBits += 2;
            } else {
                requiredBits = 2;
            }
            encodedChar = c;
        }

        /**
         * Writes any left-over base64 bits.
         *
         * @param out the ByteBuffer to which the base64 bytes are written
         */
        void flushBase64Char(ByteBuffer out) {
            if (requiredBits != 6) {
                // dump last encoded byte, zero-bit padded
                byte b = (byte)((encodedChar << requiredBits) & 0x3F);
                out.put(toBase64(b));
            }
        }

    } // Encoder class

    /**
     * The Decoder inner class handles the decoding of the UTF7 charset.
     */
    protected class Decoder extends CharsetDecoder {

        boolean shifted;    // flags whether we are currently in a shift sequence
        boolean emptyShift; // flags whether the current shift sequence is empty
        char decodedChar;   // holds the bits of previous partially decoded char
        int requiredBits;   // number of bits required to complete a 16-bit char

        /**
         * Constructs a Decoder.
         *
         * @param charset the charset to which this decoder belongs
         */
        protected Decoder(Charset charset) {
            super(charset, 1f, 1f);
        }

        /**
         * Resets this decoder, clearing any charset-specific internal state.
         */
        @Override
        protected void implReset() {
            shifted = false;
            // the fields below are re-initialized whenever a shift sequence
            // starts, but are cleared here too so no stale state survives a reset
            emptyShift = false;
            decodedChar = 0;
            requiredBits = 16;
        }

        /**
         * Decodes one or more bytes into one or more characters.
         *
         * @param in the input byte buffer
         * @param out the output character buffer
         * @return a coder-result object describing the reason for termination
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            while (in.hasRemaining()) {
                // make sure we have output space
                // (a single input byte yields at most one char)
                if (out.remaining() < 1) {
                    return CoderResult.OVERFLOW;
                }
                // get next byte
                byte b = in.get();
                if (!shifted) { // not in shift sequence
                    if (isDorOSet(b)) {
                        // byte in set D or O is written as char
                        out.put((char)b);
                    } else if (b == '+') {
                        // start shift sequence
                        shifted = true;
                        emptyShift = true;
                        requiredBits = 16;
                    } else {
                        return malformedAtPreviousByte(in);
                    }
                    continue;
                }

                int val = fromBase64(b);
                if (val != -1) { // valid base64 byte
                    // get bits from shift sequence byte
                    emptyShift = false;
                    // 6 is the max number of bits we can get from a single input byte
                    int bits = requiredBits >= 6 ? 6 : requiredBits;
                    // add new bits to currently decoded char
                    decodedChar = (char)((decodedChar << bits) | (val >> (6 - bits)));
                    requiredBits -= bits;
                    // check if we're done decoding a full 16-bit char
                    if (requiredBits == 0) {
                        // output it
                        out.put(decodedChar);
                        // and start off next char with remaining bits
                        requiredBits = 10 + bits; // 16 - (6 - bits)
                        decodedChar = (char)val; // save the extra bits for later
                    }
                } else { // terminating a shift sequence
                    shifted = false;
                    // any leftover bits when terminating the shift sequence
                    // are discarded if they are zero, or invalid if they are nonzero
                    if ((char)(decodedChar << requiredBits) != 0) {
                        return malformedAtPreviousByte(in);
                    }
                    // process implicit or explicit shift sequence termination
                    if (b == '-') {
                        if (emptyShift) { // a "+-" sequence outputs a '+'
                            out.put('+');
                        }
                        // otherwise shift ends, and '-' is absorbed
                    } else if (isDorOSet(b)) {
                        // output regular char that ended base64 sequence
                        out.put((char)b);
                    } else {
                        return malformedAtPreviousByte(in);
                    }
                }
            }
            return CoderResult.UNDERFLOW;
        }

        /**
         * Rewinds the input by one byte, so that it is positioned at the
         * byte that was just read, and reports that byte as malformed.
         *
         * @param in the input byte buffer
         * @return a malformed-input coder-result of length 1
         */
        private CoderResult malformedAtPreviousByte(ByteBuffer in) {
            in.position(in.position() - 1); // position input at error byte
            return CoderResult.malformedForLength(1); // invalid byte
        }

    } // Decoder class

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy