All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jboss.marshalling.UTFUtils Maven / Gradle / Ivy

/*
 * JBoss, Home of Professional Open Source
 * Copyright 2008, JBoss Inc., and individual contributors as indicated
 * by the @authors tag. See the copyright.txt in the distribution for a
 * full listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */

package org.jboss.marshalling;

import java.io.IOException;
import java.io.UTFDataFormatException;
import java.io.EOFException;

/**
 * Handy utility methods for dealing with strings in the modified UTF-8 format.
 * @apiviz.exclude
 */
public final class UTFUtils {
    private static final String INVALID_BYTE = "Invalid byte";
    private static final String MALFORMED = "Malformed UTF-8 sequence";

    private UTFUtils() {
    }

    private static final int UTF_BUFS_CHAR_CNT = 256;
    private static final int UTF_BUFS_BYTE_CNT = UTF_BUFS_CHAR_CNT * 3;

    private static final class BytesHolder extends ThreadLocal {
        protected byte[] initialValue() {
            return new byte[UTF_BUFS_BYTE_CNT];
        }
    }

    private static final BytesHolder BYTES_HOLDER = new BytesHolder();

    /**
     * Get the number of bytes used by the modified UTF-8 encoded form of the given string.  If the length is
     * greater than {@code 65536}, an exception is thrown.
     *
     * @param s the string
     * @return the length
     * @throws UTFDataFormatException if the string is longer than {@code 65536} characters
     * @see java.io.DataInput#readUTF()
     */
    public static int getShortUTFLength(final String s) throws UTFDataFormatException {
        final int length = s.length();
        int l = 0;
        for (int i = 0; i < length; i ++) {
            final char c = s.charAt(i);
            if (c > 0 && c <= 0x7f) {
                l ++;
            } else if (c <= 0x07ff) {
                l += 2;
            } else {
                l += 3;
            }
            if (l > 65535) {
                throw new UTFDataFormatException("String is too long for writeUTF");
            }
        }
        return l;
    }

    /**
     * Get the number of bytes used by the modified UTF-8 encoded form of the given string.
     *
     * @param s the string
     * @return the length
     * @see java.io.DataInput#readUTF()
     */
    public static long getLongUTFLength(final String s) {
        final int length = s.length();
        long l = 0;
        for (int i = 0; i < length; i ++) {
            final char c = s.charAt(i);
            if (c > 0 && c <= 0x7f) {
                l ++;
            } else if (c <= 0x07ff) {
                l += 2L;
            } else {
                l += 3L;
            }
        }
        return l;
    }

    /**
     * Write the modified UTF-8 form of the given string to the given output.
     *
     * @param output the output to write to
     * @param s the string
     * @throws IOException if an I/O error occurs
     * @see java.io.DataOutput#writeUTF(String)
     */
    public static void writeUTFBytes(final ByteOutput output, final String s) throws IOException {
        final byte[] byteBuf = BYTES_HOLDER.get();

        final int length = s.length();

        int strIdx = 0;
        int byteIdx = 0;
        while (strIdx < length) {
            final char c = s.charAt(strIdx ++);
            if (c > 0 && c <= 0x7f) {
                byteBuf[byteIdx ++] = (byte) c;
            } else if (c <= 0x07ff) {
                byteBuf[byteIdx ++] = (byte)(0xc0 | 0x1f & c >> 6);
                byteBuf[byteIdx ++] = (byte)(0x80 | 0x3f & c);
            } else {
                byteBuf[byteIdx ++] = (byte)(0xe0 | 0x0f & c >> 12);
                byteBuf[byteIdx ++] = (byte)(0x80 | 0x3f & c >> 6);
                byteBuf[byteIdx ++] = (byte)(0x80 | 0x3f & c);
            }
            if (byteIdx > UTF_BUFS_BYTE_CNT - 4) {
                output.write(byteBuf, 0, byteIdx);
                byteIdx = 0;
            }
        }
        if (byteIdx > 0) {
            output.write(byteBuf, 0, byteIdx);
        }
    }

    /**
     * Read the given number of characters from the given byte input.  The length given is in characters,
     * NOT in bytes.
     *
     * @param input the byte source
     * @param len the number of characters to read
     * @return the string
     * @throws IOException if an I/O error occurs
     * @see java.io.DataInput#readUTF()
     */
    public static String readUTFBytes(final ByteInput input, final int len) throws IOException {
        final byte[] byteBuf = BYTES_HOLDER.get();
        final char[] chars = new char[len];
        int i = 0, cnt = 0, charIdx = 0;
        while (charIdx < len) {
            if (i == cnt) {
                cnt = input.read(byteBuf, 0, Math.min(UTF_BUFS_BYTE_CNT, len - charIdx));
                if (cnt < 0) {
                    throw new EOFException();
                }
                i = 0;
            }
            final int a = byteBuf[i++] & 0xff;
            if (a < 0x80) {
                // low bit clear
                chars[charIdx ++] = (char) a;
            } else if (a < 0xc0) {
                throw new UTFDataFormatException(INVALID_BYTE);
            } else if (a < 0xe0) {
                if (i == cnt) {
                    cnt = input.read(byteBuf, 0, Math.min(UTF_BUFS_BYTE_CNT, len - charIdx));
                    if (cnt < 0) {
                        throw new EOFException();
                    }
                    i = 0;
                }
                final int b = byteBuf[i ++] & 0xff;
                if ((b & 0xc0) != 0x80) {
                    throw new UTFDataFormatException(INVALID_BYTE);
                }
                chars[charIdx ++] = (char) ((a & 0x1f) << 6 | b & 0x3f);
            } else if (a < 0xf0) {
                if (i == cnt) {
                    cnt = input.read(byteBuf, 0, Math.min(UTF_BUFS_BYTE_CNT, len - charIdx));
                    if (cnt < 0) {
                        throw new EOFException();
                    }
                    i = 0;
                }
                final int b = byteBuf[i ++] & 0xff;
                if ((b & 0xc0) != 0x80) {
                    throw new UTFDataFormatException(INVALID_BYTE);
                }
                if (i == cnt) {
                    cnt = input.read(byteBuf, 0, Math.min(UTF_BUFS_BYTE_CNT, len - charIdx));
                    if (cnt < 0) {
                        throw new EOFException();
                    }
                    i = 0;
                }
                final int c = byteBuf[i ++] & 0xff;
                if ((c & 0xc0) != 0x80) {
                    throw new UTFDataFormatException(INVALID_BYTE);
                }
                chars[charIdx ++] = (char) ((a & 0x0f) << 12 | (b & 0x3f) << 6 | c & 0x3f);
            } else {
                throw new UTFDataFormatException(INVALID_BYTE);
            }
        }
        return String.valueOf(chars);
    }

    /**
     * Read the given number of characters from the given byte input.  The length given is in bytes.
     *
     * @param input the byte source
     * @param len the number of bytes to read
     * @return the string
     * @throws IOException if an I/O error occurs
     * @see java.io.DataInput#readUTF()
     */
    public static String readUTFBytesByByteCount(final ByteInput input, final long len) throws IOException {
        final StringBuilder builder = new StringBuilder();
        for (long i = 0; i < len; i ++) {
            final int a = input.read();
            if (a < 0) {
                throw new EOFException("Expected " + (len - i) + " more bytes");
            } else if (a == 0) {
                builder.append('\0');
            } else if (a < 0x80) {
                builder.append((char) a);
            } else if (a < 0xc0) {
                throw new UTFDataFormatException(INVALID_BYTE);
            } else if (a < 0xe0) {
                if (++i < len) {
                    final int b = input.read();
                    if (b == -1) {
                        throw new EOFException("Expected " + (len - i) + " more bytes");
                    } else if ((b & 0xc0) != 0x80) {
                        throw new UTFDataFormatException(INVALID_BYTE);
                    }
                    builder.append((char) ((a & 0x1f) << 6 | b & 0x3f));
                } else {
                    throw new UTFDataFormatException(MALFORMED);
                }
            } else if (a < 0xf0) {
                if (++i < len) {
                    final int b = input.read();
                    if (b == -1) {
                        throw new EOFException("Expected " + (len - i) + " more bytes");
                    } else if ((b & 0xc0) != 0x80) {
                        throw new UTFDataFormatException(INVALID_BYTE);
                    }
                    if (++i < len) {
                        final int c1 = input.read();
                        if (c1 == -1) {
                            throw new EOFException("Expected " + (len - i) + " more bytes");
                        } else if ((c1 & 0xc0) != 0x80) {
                            throw new UTFDataFormatException(INVALID_BYTE);
                        }
                        builder.append((char) ((a & 0x0f) << 12 | (b & 0x3f) << 6 | c1 & 0x3f));
                    } else {
                        throw new UTFDataFormatException(MALFORMED);
                    }
                } else {
                    throw new UTFDataFormatException(MALFORMED);
                }
            } else {
                throw new UTFDataFormatException(INVALID_BYTE);
            }
        }
        return builder.toString();
    }

    /**
     * Read a null-terminated modified UTF-8 string from the given byte input.  Bytes are read until a 0 is found or
     * until the end of the stream, whichever comes first.
     *
     * @param input the input
     * @return the string
     * @throws IOException if an I/O error occurs
     * @see java.io.DataInput#readUTF()
     */
    public static String readUTFZBytes(final ByteInput input) throws IOException {
        final StringBuilder builder = new StringBuilder();
        for (;;) {
            final int c = readUTFChar(input);
            if (c == -1) {
                return builder.toString();
            }
            builder.append((char) c);
        }
    }

    private static int readUTFChar(final ByteInput input) throws IOException {
        final int a = input.read();
        if (a < 0) {
            throw new EOFException();
        } else if (a == 0) {
            return -1;
        } else if (a < 0x80) {
            return (char)a;
        } else if (a < 0xc0) {
            throw new UTFDataFormatException(INVALID_BYTE);
        } else if (a < 0xe0) {
            final int b = input.read();
            if (b == -1) {
                throw new EOFException();
            } else if ((b & 0xc0) != 0x80) {
                throw new UTFDataFormatException(INVALID_BYTE);
            }
            return (a & 0x1f) << 6 | b & 0x3f;
        } else if (a < 0xf0) {
            final int b = input.read();
            if (b == -1) {
                throw new EOFException();
            } else if ((b & 0xc0) != 0x80) {
                throw new UTFDataFormatException(INVALID_BYTE);
            }
            final int c = input.read();
            if (c == -1) {
                throw new EOFException();
            } else if ((c & 0xc0) != 0x80) {
                throw new UTFDataFormatException(INVALID_BYTE);
            }
            return (a & 0x0f) << 12 | (b & 0x3f) << 6 | c & 0x3f;
        } else {
            throw new UTFDataFormatException(INVALID_BYTE);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy