// org.jwat.common.UTF8 Maven / Gradle / Ivy
//
// Go to download
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.common;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/**
 * Small class to decode and encode UTF-8 characters. Decoding also keeps
 * track of encoding errors. Processes one UTF-8 character at a time.
 * The decoding method reports encoding validity in the
 * {@link #bValidChar} field.
 *
 * <pre>
 *    Char. number range  |        UTF-8 octet sequence
 *       (hexadecimal)    |              (binary)
 *    --------------------+---------------------------------------------
 *    0000 0000-0000 007F | 0xxxxxxx
 *    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 *    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 *    0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * </pre>
 *
 * Instances carry the result of the last decode in public fields, so a
 * single instance must not be shared between concurrent decoders.
 *
 * @author nicl
 */
public class UTF8 {

    /** Highest character number that has a UTF-8 encoding (see table above). */
    private static final int MAX_CODE_POINT = 0x0010FFFF;

    /** Complete or partial UTF-8 character, depending on conversion errors. */
    public int utf8_c;

    /**
     * Bytes read from the input stream in the last decoding process.
     * The initial character handed to {@link #readUtf8(int, InputStream)}
     * is not included, only the octets fetched from the stream.
     */
    public byte[] chars_read;

    /** UTF-8 validity status on last read character. */
    public boolean bValidChar = false;

    /**
     * Given a character and an input stream returns the next decoded UTF-8
     * character. The encoded UTF-8 character is between 1 and 4 bytes long.
     * In order to preserve the validity and character value, the character is
     * returned by the method and its validity is available through the
     * {@link #bValidChar} field.
     * <p>
     * A character is flagged invalid when the lead octet is not a legal
     * UTF-8 lead octet, when a continuation octet is malformed, when the
     * stream ends prematurely, when the value is encoded with more octets
     * than necessary (overlong form), or when the decoded value exceeds
     * 0x10FFFF.
     * @param c initial character
     * @param in input stream used to read extra UTF-8 encoded data
     * @return the decoded character; 0 if the lead octet is invalid; the
     * partially decoded value if a continuation octet is invalid; or -1 if
     * the stream ended before the character was complete
     * @throws IOException if an i/o error occurs while reading
     */
    public int readUtf8(int c, InputStream in) throws IOException {
        ByteArrayOutputStream charsOut = new ByteArrayOutputStream(4);
        int octetsRead;
        int octetsExpected;
        utf8_c = 0;
        bValidChar = false;
        if ((c & 0x80) == 0x00) {
            // US-ASCII/UTF-8: 0000 0000-0000 007F | 0xxxxxxx
            bValidChar = true;
            utf8_c = c;
        } else {
            octetsRead = 1;
            bValidChar = true;
            if ((c & 0xE0) == 0xC0) {
                // UTF-8: 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
                utf8_c = c & 0x1F;
                octetsExpected = 2;
            } else if ((c & 0xF0) == 0xE0) {
                // UTF-8: 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
                utf8_c = c & 0x0F;
                octetsExpected = 3;
            } else if ((c & 0xF8) == 0xF0) {
                // UTF-8: 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                utf8_c = c & 0x07;
                octetsExpected = 4;
            } else {
                // Invalid UTF-8 lead octet (stray continuation or 11111xxx).
                utf8_c = 0;
                octetsRead = 0;
                octetsExpected = 0;
                bValidChar = false;
            }
            // Read the remaining octets.
            while (bValidChar && octetsRead < octetsExpected) {
                c = in.read();
                if (c == -1) {
                    // EOF in the middle of a character.
                    bValidChar = false;
                    chars_read = charsOut.toByteArray();
                    return -1;
                }
                // Record the octet even when it turns out to be invalid,
                // since it has been consumed from the stream.
                charsOut.write(c);
                if ((c & 0xC0) == 0x80) {
                    utf8_c = (utf8_c << 6) | (c & 0x3F);
                    ++octetsRead;
                } else {
                    // Invalid UTF-8 continuation octet.
                    bValidChar = false;
                }
            }
            // All octets present: verify the value fits the sequence length.
            if (octetsRead == octetsExpected) {
                switch (octetsExpected) {
                case 2:
                    if (utf8_c < 0x00000080) {
                        // Overlong encoding.
                        bValidChar = false;
                    }
                    break;
                case 3:
                    // NOTE(review): surrogate values (D800-DFFF) are not
                    // rejected here, mirroring writeUtf8 which encodes them.
                    if (utf8_c < 0x00000800) {
                        // Overlong encoding.
                        bValidChar = false;
                    }
                    break;
                case 4:
                    if (utf8_c < 0x00010000 || utf8_c > MAX_CODE_POINT) {
                        // Overlong encoding, or a value beyond 0x10FFFF
                        // which has no UTF-8 encoding.
                        bValidChar = false;
                    }
                    break;
                default:
                    break;
                }
            }
            c = utf8_c;
        }
        chars_read = charsOut.toByteArray();
        return c;
    }

    /**
     * UTF-8 encodes a character and outputs it onto the stream.
     * Returns the number of bytes used to encode the character.
     * @param c character to UTF-8 encode, in the range 0 to 0x10FFFF
     * @param out UTF-8 output stream
     * @return the number of bytes used to encode the character
     * @throws IOException if the character is outside the encodable range
     * or if an i/o error occurs while writing
     */
    public int writeUtf8(int c, OutputStream out) throws IOException {
        // Negative values must be rejected explicitly: they would otherwise
        // pass the "< 0x80" test and be written as a single garbage byte.
        if (c < 0 || c > MAX_CODE_POINT) {
            throw new IOException("Character (0x" + Integer.toHexString(c) + ") not UTF-8 encodable!");
        }
        if (c < 0x00000080) {
            // US-ASCII/UTF-8: 0000 0000-0000 007F | 0xxxxxxx
            out.write(c);
            return 1;
        }
        final int octets;
        if (c < 0x00000800) {
            // UTF-8: 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
            out.write((c >> 6) | 0xC0);
            octets = 2;
        } else if (c < 0x00010000) {
            // UTF-8: 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            out.write((c >> 12) | 0xE0);
            octets = 3;
        } else {
            // UTF-8: 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            out.write((c >> 18) | 0xF0);
            octets = 4;
        }
        // Emit the continuation octets, most significant 6-bit group first.
        for (int shift = 6 * (octets - 2); shift >= 0; shift -= 6) {
            out.write(((c >> shift) & 0x3F) | 0x80);
        }
        return octets;
    }

}