All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.googlecode.d2j.util.Utf8Utils Maven / Gradle / Ivy

There is a newer version: 2.25.11
Show newest version
/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * As per the Apache license requirements, this file has been modified
 * from its original state.
 *
 * Such modifications are Copyright (C) 2010 Ben Gruver, and are released
 * under the original license
 */

package com.googlecode.d2j.util;

import java.io.IOException;
import java.io.Writer;

/**
 * Static helpers for working with Java-style ("modified") UTF-8 and for escaping strings.
 *
 * <p>
 * The byte conversions implement the Java-style UTF-8 variant: the character {@code '\0'} is stored as the two-byte
 * sequence {@code C0 80}, and every UTF-16 code unit (including each half of a surrogate pair) is encoded on its own
 * using at most three bytes. Four-byte UTF-8 sequences are therefore never produced and are rejected when decoding.
 *
 * <p>
 * The escape helpers render characters as printable ASCII using backslash escapes.
 */
public final class Utf8Utils {

    /**
     * Converts a string into its Java-style UTF-8 form. Java-style UTF-8 differs from normal UTF-8 in the handling of
     * character '\0' and surrogate pairs.
     * 
     * @param string
     *            non-null; the string to convert
     * @return non-null; the UTF-8 bytes for it
     */
    public static byte[] stringToUtf8Bytes(String string) {
        int len = string.length();
        byte[] bytes = new byte[len * 3]; // Worst case is three bytes per char; avoids reallocating.
        int outAt = 0;

        for (int i = 0; i < len; i++) {
            char c = string.charAt(i);
            if ((c != 0) && (c < 0x80)) {
                // Plain ASCII (except NUL) is a single byte.
                bytes[outAt++] = (byte) c;
            } else if (c < 0x800) {
                // NUL and 0x80..0x7ff use the two-byte form; NUL becomes C0 80.
                bytes[outAt++] = (byte) (((c >> 6) & 0x1f) | 0xc0);
                bytes[outAt++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                // Everything else, surrogate halves included, uses the three-byte form.
                bytes[outAt++] = (byte) (((c >> 12) & 0x0f) | 0xe0);
                bytes[outAt++] = (byte) (((c >> 6) & 0x3f) | 0x80);
                bytes[outAt++] = (byte) ((c & 0x3f) | 0x80);
            }
        }

        // Trim the worst-case buffer down to the bytes actually written.
        byte[] result = new byte[outAt];
        System.arraycopy(bytes, 0, result, 0, outAt);
        return result;
    }

    /**
     * Scratch buffer shared by every call to {@link #utf8BytesToString}; it only ever grows. Because it is shared
     * and unsynchronized, {@link #utf8BytesToString} is not thread-safe.
     */
    private static char[] tempBuffer = null;

    /**
     * Converts an array of Java-style UTF-8 bytes into a string.
     * 
     * This method uses a global buffer to avoid having to allocate one every time, so it is *not* thread-safe
     * 
     * @param bytes
     *            non-null; the bytes to convert
     * @param start
     *            the start index of the utf8 string to convert
     * @param length
     *            the length of the utf8 string to convert, not including any null-terminator that might be present
     * @return non-null; the converted string
     * @throws IllegalArgumentException
     *             if the bytes contain a bare zero byte, a truncated or malformed multi-byte sequence, an overlong
     *             encoding, or a lead byte of the form 10XXXXXX or 1111XXXX
     */
    public static String utf8BytesToString(byte[] bytes, int start, int length) {
        // Each decoded char consumes at least one byte, so "length" chars is always enough room.
        if (tempBuffer == null || tempBuffer.length < length) {
            tempBuffer = new char[length];
        }
        char[] chars = tempBuffer;
        int outAt = 0;

        // "length" counts the bytes still to be consumed; "at" is advanced inside each case.
        for (int at = start; length > 0; /* at */) {
            int v0 = bytes[at] & 0xFF;
            char out;
            switch (v0 >> 4) {
            case 0x00:
            case 0x01:
            case 0x02:
            case 0x03:
            case 0x04:
            case 0x05:
            case 0x06:
            case 0x07: {
                // 0XXXXXXX -- single-byte encoding
                length--;
                if (v0 == 0) {
                    // A single zero byte is illegal.
                    return throwBadUtf8(v0, at);
                }
                out = (char) v0;
                at++;
                break;
            }
            case 0x0c:
            case 0x0d: {
                // 110XXXXX -- two-byte encoding
                length -= 2;
                if (length < 0) {
                    // The sequence runs past the end of the requested range.
                    return throwBadUtf8(v0, at);
                }
                int v1 = bytes[at + 1] & 0xFF;
                if ((v1 & 0xc0) != 0x80) {
                    return throwBadUtf8(v1, at + 1);
                }
                int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
                if ((value != 0) && (value < 0x80)) {
                    /*
                     * This should have been represented with one-byte encoding. (Zero is exempt because it is
                     * legitimately written in two-byte form.)
                     */
                    return throwBadUtf8(v1, at + 1);
                }
                out = (char) value;
                at += 2;
                break;
            }
            case 0x0e: {
                // 1110XXXX -- three-byte encoding
                length -= 3;
                if (length < 0) {
                    // The sequence runs past the end of the requested range.
                    return throwBadUtf8(v0, at);
                }
                int v1 = bytes[at + 1] & 0xFF;
                if ((v1 & 0xc0) != 0x80) {
                    return throwBadUtf8(v1, at + 1);
                }
                int v2 = bytes[at + 2] & 0xFF;
                // The third byte must be a continuation byte too; check v2 itself, not v1 again.
                if ((v2 & 0xc0) != 0x80) {
                    return throwBadUtf8(v2, at + 2);
                }
                int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) | (v2 & 0x3f);
                if (value < 0x800) {
                    /*
                     * This should have been represented with one- or two-byte encoding.
                     */
                    return throwBadUtf8(v2, at + 2);
                }
                out = (char) value;
                at += 3;
                break;
            }
            default: {
                // 10XXXXXX, 1111XXXX -- illegal
                return throwBadUtf8(v0, at);
            }
            }
            chars[outAt] = out;
            outAt++;
        }

        // Copy out of the shared buffer so the result is independent of later calls.
        return new String(chars, 0, outAt);
    }

    /**
     * Helper for {@link #utf8BytesToString}, which throws the right exception for a bogus utf-8 byte.
     * 
     * @param value
     *            the byte value
     * @param offset
     *            the file offset
     * @return never
     * @throws IllegalArgumentException
     *             always thrown
     */
    private static String throwBadUtf8(int value, int offset) {
        throw new IllegalArgumentException("bad utf-8 byte " + String.format("%02x", value) + " at offset "
                + String.format("%08x", offset));
    }

    /**
     * Writes a single character to {@code writer} in escaped, ASCII-only form.
     *
     * <p>
     * Printable ASCII (0x20 to 0x7e) is written unchanged, except that single quote, double quote and backslash are
     * preceded by a backslash. Newline, carriage return and tab are written as their two-character backslash
     * escapes. Every other character is written as a backslash, the letter u and four lowercase hexadecimal digits.
     *
     * @param writer
     *            non-null; where to write the escaped form
     * @param c
     *            the character to escape
     * @throws IOException
     *             if the writer fails
     */
    public static void writeEscapedChar(Writer writer, char c) throws IOException {
        if ((c >= ' ') && (c < 0x7f)) {
            if ((c == '\'') || (c == '\"') || (c == '\\')) {
                writer.write('\\');
            }
            writer.write(c);
            return;
        }

        switch (c) {
        case '\n':
            writer.write("\\n");
            return;
        case '\r':
            writer.write("\\r");
            return;
        case '\t':
            writer.write("\\t");
            return;
        default:
            break;
        }

        // Anything else: four-digit hexadecimal escape, most significant nibble first.
        writer.write("\\u");
        writer.write(Character.forDigit(c >> 12, 16));
        writer.write(Character.forDigit((c >> 8) & 0x0f, 16));
        writer.write(Character.forDigit((c >> 4) & 0x0f, 16));
        writer.write(Character.forDigit(c & 0x0f, 16));
    }

    /**
     * Writes every character of {@code value} to {@code writer} in escaped form, using the same rules as
     * {@link #writeEscapedChar}.
     *
     * @param writer
     *            non-null; where to write the escaped form
     * @param value
     *            non-null; the string to escape
     * @throws IOException
     *             if the writer fails
     */
    public static void writeEscapedString(Writer writer, String value) throws IOException {
        for (int i = 0; i < value.length(); i++) {
            writeEscapedChar(writer, value.charAt(i));
        }
    }

    /**
     * Returns the escaped form of {@code value}, using the same rules as {@link #writeEscapedChar}.
     *
     * @param value
     *            non-null; the string to escape
     * @return non-null; the escaped string
     */
    public static String escapeString(String value) {
        int len = value.length();
        // Start with some headroom for escapes; the builder grows if more is needed.
        StringBuilder sb = new StringBuilder(len * 3 / 2);

        for (int i = 0; i < len; i++) {
            appendEscapedChar(sb, value.charAt(i));
        }

        return sb.toString();
    }

    /**
     * Appends the escaped form of one character to a builder; the in-memory counterpart of
     * {@link #writeEscapedChar}.
     */
    private static void appendEscapedChar(StringBuilder sb, char c) {
        if ((c >= ' ') && (c < 0x7f)) {
            if ((c == '\'') || (c == '\"') || (c == '\\')) {
                sb.append('\\');
            }
            sb.append(c);
            return;
        }

        switch (c) {
        case '\n':
            sb.append("\\n");
            return;
        case '\r':
            sb.append("\\r");
            return;
        case '\t':
            sb.append("\\t");
            return;
        default:
            break;
        }

        // Anything else: four-digit hexadecimal escape, most significant nibble first.
        sb.append("\\u");
        sb.append(Character.forDigit(c >> 12, 16));
        sb.append(Character.forDigit((c >> 8) & 0x0f, 16));
        sb.append(Character.forDigit((c >> 4) & 0x0f, 16));
        sb.append(Character.forDigit(c & 0x0f, 16));
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy