net.freeutils.charset.ByteLookupCharset Maven / Gradle / Ivy

Go to download
/*
 *  Copyright © 2005-2015 Amichai Rothman
 *
 *  This file is part of JCharset - the Java Charset package.
 *
 *  JCharset is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  JCharset is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with JCharset.  If not, see .
 *
 *  For additional info see http://www.freeutils.net/source/jcharset/
 */

package net.freeutils.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/**
 * The ByteLookupCharset class handles the encoding and decoding of
 * single-byte charsets where the byte-to-char conversion is performed
 * using a simple lookup table.
 *
 * @author  Amichai Rothman
 * @since   2005-06-30
 */
public abstract class ByteLookupCharset extends Charset {

    int[] BYTE_TO_CHAR;
    int[][] CHAR_TO_BYTE;

    /**
     * Returns whether the running JDK version is 1.5 or higher.
     *
     * @return true if running in JDK 1.5 or higher, false otherwise.
     */
    static boolean isJDK15() {
        float version;
        try {
            version = Float.parseFloat(System.getProperty("java.class.version"));
        } catch (Exception e) {
            version = 0;
        }
        return version >= 49.0; // 49.0 is the class version of JDK 1.5
    }

    /**
     * Returns a copy of the given array in which several items
     * are modified.
     *
     * @param src the array to mutate
     * @param ind the array indices in which the values will be modified
     * @param val the respective values to place in these indices
     * @return the mutated array
     */
    protected static int[] mutate(int[] src, int[] ind, int[] val) {
        int[] mut = new int[src.length];
        System.arraycopy(src, 0, mut, 0, src.length);
        for (int i = 0; i < ind.length; i++)
            mut[ind[i]] = val[i];
        return mut;
    }

    /**
     * Creates an inverse lookup table for the given byte-to-char lookup table.
     *
     * The returned table contains 256 tables, one per high-order byte of a
     * potential character to be converted (unused ones are null), and each
     * such table can be indexed using the character's low-order byte, to
     * obtain the actual converted byte value.
     * A null table in the top level table, or a -1 within a lower level table,
     * both indicate that there is no legal mapping for the given character.
     *
     * @param chars a lookup table which holds the character value
     *        that each byte value (0-255) is converted to.
     * @return the created inverse lookup (char-to-byte) table.
     */
    public static int[][] createInverseLookupTable(int[] chars) {
        int[][] tables = new int[256][];
        for (int i = 0; i < 256; i++) {
            int c = chars[i];
            if (c > -1)
                updateInverseLookupTable(tables, c, i);
        }
        return tables;
    }

    /**
     * Updates an inverse lookup table with an additional mapping,
     * replacing a previous mapping of the same value if it exists.
     *
     * @param tables the inverse lookup table to update
     *        (see {@link #createInverseLookupTable})
     * @param c the character to map
     * @param b the byte value to which c is mapped, or -1 to mark an illegal mapping
     * @return the updated inverse lookup (char-to-byte) table
     */
    public static int[][] updateInverseLookupTable(int[][] tables, int c, int b) {
        int high = (c >>> 8) & 0xFF;
        int low = c & 0xFF;
        int[] table = tables[high];
        if (b > -1) {
            if (table == null) {
                table = new int[256];
                for (int j = 0; j < table.length; j++)
                    table[j] = -1;
                tables[high] = table;
            }
            table[low] = b;
        } else {
            if (table != null)
                table[low] = -1;
        }
        return tables;
    }

    /**
     * Updates an inverse lookup table with additional mappings,
     * replacing previous mappings of the same values if they exists.
     *
     * @param tables the inverse lookup table to update
     *        (see {@link #createInverseLookupTable})
     * @param chars the characters to map
     * @param bytes the respective byte values to which the chars are mapped,
     *        or -1 to mark an illegal mapping
     * @return the updated inverse lookup (char-to-byte) table
     */
    public static int[][] updateInverseLookupTable(int[][] tables, int[] chars, int[] bytes) {
        for (int i = 0; i < chars.length; i++)
            updateInverseLookupTable(tables, chars[i], bytes[i]);
        return tables;
    }

    /**
     * Returns a string containing Java definitions of the inverse lookup
     * table returned by getInverseLookupTable for the given byte-to-char
     * lookup table.
     *
     * This is a convenient utility method for design-time building
     * of charsets based on lookup table mapping, as an alternative to
     * creating these inverse lookup tables on-the-fly.
     *
     * @param chars a lookup table which holds the character value
     *        that each byte value (0-255) is converted to.
     * @return the Java definitions of the created inverse lookup
     *         (char-to-byte) table.
     */
    public static String createInverseLookupTableDefinition(int[] chars) {
        int[][] tables = createInverseLookupTable(chars);
        StringBuffer sb = new StringBuffer();
        int nulls = 0;
        sb.append("static final int[][] CHAR_TO_BYTE = {\n\t");
        for (int i = 0; i < tables.length; i++) {
            int[] table = tables[i];
            if (table == null) {
                if (nulls++ % 8 == 0 && nulls > 1)
                    sb.append("\n\t");
                sb.append("null, ");
            } else {
                if (nulls > 0)
                    sb.append("\n\t");
                nulls = 0;
                sb.append("{ // high byte = 0x");
                if (i < 0x10)
                    sb.append('0');
                sb.append(Integer.toHexString(i));
                sb.append("\n\t");
                for (int j = 0; j < table.length; j++) {
                    if (table[j] == -1) {
                        sb.append("  -1, ");
                    } else {
                        sb.append("0x");
                        if (table[j] < 0x10)
                            sb.append('0');
                        sb.append(Integer.toHexString(table[j])).append(", ");
                    }
                    if ((j + 1) % 8 == 0)
                        sb.append("\n\t");
                }
                sb.append("}, \n\t");
            }
        }
        sb.append("\n\t};");
        return sb.toString();
    }

    /**
     * Initializes a new charset with the given canonical name and alias
     * set, and byte-to-char/char-to-byte lookup tables.
     *
     * @param canonicalName the canonical name of this charset
     * @param aliases an array of this charset's aliases, or null if it has no aliases
     * @param byteToChar a byte-to-char conversion table for this charset
     * @param charToByte a char-to-byte conversion table for this charset. It can
     *        be generated on-the-fly by calling createInverseLookupTable(byteToChar).
     * @throws java.nio.charset.IllegalCharsetNameException
     *         if the canonical name or any of the aliases are illegal
     */
    protected ByteLookupCharset(String canonicalName, String[] aliases,
        int[] byteToChar, int[][] charToByte) {
        super(canonicalName, aliases);
        BYTE_TO_CHAR = byteToChar;
        CHAR_TO_BYTE = charToByte;
    }

    /**
     * Tells whether or not this charset contains the given charset.
     *
     *  A charset C is said to contain a charset D if,
     * and only if, every character representable in D is also
     * representable in C.  If this relationship holds then it is
     * guaranteed that every string that can be encoded in D can also be
     * encoded in C without performing any replacements.
     *
     * 
 That C contains D does not imply that each character
     * representable in C by a particular byte sequence is represented
     * in D by the same byte sequence, although sometimes this is the
     * case.
     *
     * 
 Every charset contains itself.
     *
     *  This method computes an approximation of the containment relation:
     * If it returns true then the given charset is known to be
     * contained by this charset; if it returns false, however, then
     * it is not necessarily the case that the given charset is not contained
     * in this charset.
     *
     * @return true if, and only if, the given charset
     *         is contained in this charset
     */
    public boolean contains(Charset cs) {
        return this.getClass().isInstance(cs);
    }

    /**
     * Constructs a new decoder for this charset.
     *
     * @return a new decoder for this charset
     */
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Constructs a new encoder for this charset.
     *
     * @return a new encoder for this charset
     *
     * @throws UnsupportedOperationException
     *         if this charset does not support encoding
     */
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * The Encoder inner class handles the encoding of the
     * charset using the lookup table.
     */
    protected class Encoder extends CharsetEncoder {

        /**
         * Constructs an Encoder.
         *
         * @param cs the charset to which this encoder belongs
         */
        protected Encoder(Charset cs) {
            super(cs, 1f, 1f);
        }

        /**
         * Encodes one or more characters into one or more bytes.
         *
         * @param in the input character buffer
         * @param out the output byte buffer
         * @return a coder-result object describing the reason for termination
         */
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            int b, c;
            int[][] lookup = CHAR_TO_BYTE; // getfield bytecode optimization
            int[] table;
            int remaining = in.remaining();
            while (remaining-- > 0) {
                if (out.remaining() < 1)
                    return CoderResult.OVERFLOW; // we need exactly one byte per char
                c = in.get();
                table = lookup[c >>> 8];
                b =  table == null ? -1 : table[c & 0xFF];
                if (b == -1) {
                    in.position(in.position() - 1);
                    return CoderResult.unmappableForLength(1);
                }
                out.put((byte)(b & 0xFF));
            }
            return CoderResult.UNDERFLOW;
        }

    }

    /**
     * The Decoder inner class handles the decoding of the
     * charset using the inverse lookup table.
     */
    protected class Decoder extends CharsetDecoder {

        /**
         * Constructs a Decoder.
         *
         * @param cs the charset to which this decoder belongs
         */
        protected Decoder(Charset cs) {
            super(cs, 1f, 1f);
        }

        /**
         * Decodes one or more bytes into one or more characters.
         *
         * @param in the input byte buffer
         * @param out the output character buffer
         * @return a coder-result object describing the reason for termination
         */
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            int c;
            int[] lookup = BYTE_TO_CHAR; // getfield bytecode optimization
            int remaining = in.remaining();
            while (remaining-- > 0) {
                if (out.remaining() < 1)
                    return CoderResult.OVERFLOW; // we need exactly one char per byte
                c = lookup[in.get() & 0xFF];
                if (c == -1) {
                    in.position(in.position() - 1);
                    return CoderResult.malformedForLength(1);
                }
                out.put((char)c);
            }
            return CoderResult.UNDERFLOW;
        }
    }

}