io.protostuff.parser.TextFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of protostuff-parser Show documentation
parser for .proto files
There is a newer version: 3.1.40
//========================================================================
//Copyright 2007-2009 David Yu [email protected]
//------------------------------------------------------------------------
//Licensed under the Apache License, Version 2.0 (the "License");
//you may not use this file except in compliance with the License.
//You may obtain a copy of the License at 
//http://www.apache.org/licenses/LICENSE-2.0
//Unless required by applicable law or agreed to in writing, software
//distributed under the License is distributed on an "AS IS" BASIS,
//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//See the License for the specific language governing permissions and
//limitations under the License.
//========================================================================

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package io.protostuff.parser;

import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;

/**
 * Provide ascii text parsing and formatting support for proto2 instances. The implementation largely follows
 * google/protobuf/text_format.cc.
 * 
 * @author [email protected] Wenbo Zhu
 * @author [email protected] Kenton Varda
 * @author David Yu
 */
public final class TextFormat
{
    private TextFormat()
    {
        // Static utility class; not instantiable.
    }

    static final Charset UTF8 = Charset.forName("UTF-8"), ISO_8859_1 = Charset.forName("ISO-8859-1");

    // =================================================================
    // Utility functions
    //
    // Some of these methods are package-private because Descriptors.java uses
    // them.

    /**
     * Escapes bytes in the format used in protocol buffer text format, which is the same as the format used for C
     * string literals. All bytes that are not printable 7-bit ASCII characters (0x20 through 0x7e) are escaped, as
     * well as backslash, single-quote, and double-quote characters. Bytes for which no short-hand escape sequence is
     * defined are escaped using 3-digit octal sequences.
     * <p>
     * The bytes are read with absolute gets from index {@code 0} up to (but excluding) {@code input.limit()}; the
     * buffer's position is neither consulted nor modified.
     * 
     * @param input
     *            the bytes to escape
     * @return a new builder holding the escaped, pure-ASCII representation
     */
    static StringBuilder escapeBytes(ByteBuffer input)
    {
        final int length = input.limit();
        final StringBuilder builder = new StringBuilder(length);
        for (int i = 0; i < length; i++)
        {
            final byte b = input.get(i);
            switch (b)
            {
            // Java does not recognize \a or \v, apparently.
                case 0x07:
                    builder.append("\\007");
                    break;
                case '\b':
                    builder.append("\\b");
                    break;
                case '\f':
                    builder.append("\\f");
                    break;
                case '\n':
                    builder.append("\\n");
                    break;
                case '\r':
                    builder.append("\\r");
                    break;
                case '\t':
                    builder.append("\\t");
                    break;
                case 0x0b:
                    builder.append("\\013");
                    break;
                case '\\':
                    builder.append("\\\\");
                    break;
                case '\'':
                    builder.append("\\\'");
                    break;
                case '"':
                    builder.append("\\\"");
                    break;
                default:
                    // Only 0x20 (space) through 0x7e (tilde) are printable. DEL (0x7f), the remaining
                    // control bytes and every byte with the high bit set (negative as a Java byte) are
                    // written as a 3-digit octal escape.
                    if (b >= 0x20 && b < 0x7f)
                    {
                        builder.append((char) b);
                    }
                    else
                    {
                        // The masks discard the sign-extension bits introduced when b is widened to int.
                        builder.append('\\');
                        builder.append((char) ('0' + ((b >>> 6) & 3)));
                        builder.append((char) ('0' + ((b >>> 3) & 7)));
                        builder.append((char) ('0' + (b & 7)));
                    }
                    break;
            }
        }
        return builder;
    }

    /**
     * Un-escape a byte sequence as escaped using {@link #escapeBytes(ByteBuffer)}. Octal escapes of one to three
     * digits and one- or two-digit hex escapes (starting with "\x") are also recognized.
     * <p>
     * Characters that are not part of an escape sequence are narrowed to a single byte with a plain cast, so only
     * their low 8 bits are kept. Likewise an octal escape larger than {@code \377} is truncated to its low 8 bits.
     * 
     * @param input
     *            the escaped text
     * @return a buffer wrapping a freshly allocated array, with position {@code 0} and its limit set to the number of
     *         decoded bytes
     * @throws InvalidEscapeSequenceException
     *             if a backslash is the last character, is followed by an unknown escape character, or "\x" is not
     *             followed by a hex digit
     */
    static ByteBuffer unescapeBytes(final CharSequence input)
    {
        int pos = 0, len = input.length();
        // Every escape sequence is at least as long as the byte it decodes to, so len bytes always suffice.
        final byte[] result = new byte[len];
        ByteBuffer buffer = ByteBuffer.wrap(result);
        for (int i = 0; i < len; i++)
        {
            char c = input.charAt(i);
            if (c == '\\')
            {
                if (i + 1 < len)
                {
                    ++i;
                    c = input.charAt(i);
                    if (isOctal(c))
                    {
                        // Octal escape: one mandatory digit followed by up to two optional ones.
                        int code = digitValue(c);
                        if (i + 1 < len && isOctal(input.charAt(i + 1)))
                        {
                            ++i;
                            code = code * 8 + digitValue(input.charAt(i));
                        }
                        if (i + 1 < len && isOctal(input.charAt(i + 1)))
                        {
                            ++i;
                            code = code * 8 + digitValue(input.charAt(i));
                        }
                        result[pos++] = (byte) code;
                    }
                    else
                    {
                        switch (c)
                        {
                            case 'a':
                                result[pos++] = 0x07;
                                break;
                            case 'b':
                                result[pos++] = '\b';
                                break;
                            case 'f':
                                result[pos++] = '\f';
                                break;
                            case 'n':
                                result[pos++] = '\n';
                                break;
                            case 'r':
                                result[pos++] = '\r';
                                break;
                            case 't':
                                result[pos++] = '\t';
                                break;
                            case 'v':
                                result[pos++] = 0x0b;
                                break;
                            case '\\':
                                result[pos++] = '\\';
                                break;
                            case '\'':
                                result[pos++] = '\'';
                                break;
                            case '"':
                                result[pos++] = '\"';
                                break;

                            case 'x':
                                // Hex escape: one mandatory digit followed by one optional digit.
                                int code = 0;
                                if (i + 1 < len && isHex(input.charAt(i + 1)))
                                {
                                    ++i;
                                    code = digitValue(input.charAt(i));
                                }
                                else
                                {
                                    throw new InvalidEscapeSequenceException(
                                            "Invalid escape sequence: '\\x' with no digits");
                                }
                                if (i + 1 < len && isHex(input.charAt(i + 1)))
                                {
                                    ++i;
                                    code = code * 16 + digitValue(input.charAt(i));
                                }
                                result[pos++] = (byte) code;
                                break;

                            default:
                                throw new InvalidEscapeSequenceException(
                                        "Invalid escape sequence: '\\" + c + '\'');
                        }
                    }
                }
                else
                {
                    throw new InvalidEscapeSequenceException(
                            "Invalid escape sequence: '\\' at end of string.");
                }
            }
            else
            {
                result[pos++] = (byte) c;
            }
        }
        buffer.limit(pos);
        return buffer;
    }

    /**
     * Thrown by {@link TextFormat#unescapeBytes(CharSequence)} and {@link TextFormat#unescapeText(String)} when an
     * invalid escape sequence is seen.
     */
    static class InvalidEscapeSequenceException extends RuntimeException
    {
        private static final long serialVersionUID = -8164033650142593305L;

        InvalidEscapeSequenceException(final String description)
        {
            super(description);
        }
    }

    /**
     * Like {@link #escapeBytes(ByteBuffer)}, but escapes a text string. The string is first encoded as ISO-8859-1
     * (one byte per character), then each byte is escaped individually.
     * <p>
     * Characters that cannot be represented in ISO-8859-1 are substituted by the charset's replacement byte during
     * encoding, as specified by {@link String#getBytes(Charset)}.
     * <p>
     * NOTE(review): upstream protobuf encodes as UTF-8 here; ISO-8859-1 is kept because it mirrors
     * {@link #unescapeText(String)} - confirm this is intended.
     * 
     * @param input
     *            the text to escape
     * @return the escaped, pure-ASCII text
     */
    static String escapeText(final String input)
    {
        return escapeBytes(ByteBuffer.wrap(input.getBytes(ISO_8859_1))).toString();
    }

    /**
     * Un-escape a text string as escaped using {@link #escapeText(String)}. Two-digit hex escapes (starting with "\x")
     * are also recognized. The decoded bytes are interpreted as ISO-8859-1.
     * 
     * @param input
     *            the escaped text
     * @return the decoded text
     * @throws InvalidEscapeSequenceException
     *             if {@code input} contains an invalid escape sequence
     */
    static String unescapeText(String input)
    {
        final ByteBuffer buffer = unescapeBytes(input);
        // The third constructor argument is a length, not an end index, hence remaining() rather than limit().
        return new String(buffer.array(), buffer.position(), buffer.remaining(), ISO_8859_1);
    }

    /**
     * Is this an octal digit?
     */
    private static boolean isOctal(final char c)
    {
        return '0' <= c && c <= '7';
    }

    /**
     * Is this a hex digit?
     */
    private static boolean isHex(final char c)
    {
        return ('0' <= c && c <= '9') ||
                ('a' <= c && c <= 'f') ||
                ('A' <= c && c <= 'F');
    }

    /**
     * Interpret a character as a digit (in any base up to 36) and return the numeric value. This is like
     * {@code Character.digit()} but we don't accept non-ASCII digits. The caller must have verified that the character
     * is a digit or ASCII letter; the result is meaningless for any other character.
     */
    private static int digitValue(final char c)
    {
        if ('0' <= c && c <= '9')
        {
            return c - '0';
        }
        else if ('a' <= c && c <= 'z')
        {
            return c - 'a' + 10;
        }
        else
        {
            return c - 'A' + 10;
        }
    }

    /**
     * Parse a 32-bit signed integer from the text. Unlike the Java standard {@code Integer.parseInt()}, this function
     * recognizes the prefixes "0x" and "0" to signify hexadecimal and octal numbers, respectively.
     * 
     * @throws NumberFormatException
     *             if the text is not a valid number or does not fit in 32 signed bits
     */
    static int parseInt32(final String text) throws NumberFormatException
    {
        return (int) parseInteger(text, true, false);
    }

    /**
     * Parse a 32-bit unsigned integer from the text. Unlike the Java standard {@code Integer.parseInt()}, this function
     * recognizes the prefixes "0x" and "0" to signify hexadecimal and octal numbers, respectively. The result is
     * coerced to a (signed) {@code int} when returned since Java has no unsigned integer type.
     * 
     * @throws NumberFormatException
     *             if the text is negative, is not a valid number or does not fit in 32 unsigned bits
     */
    static int parseUInt32(final String text) throws NumberFormatException
    {
        return (int) parseInteger(text, false, false);
    }

    /**
     * Parse a 64-bit signed integer from the text. Unlike the Java standard {@code Long.parseLong()}, this function
     * recognizes the prefixes "0x" and "0" to signify hexadecimal and octal numbers, respectively.
     * 
     * @throws NumberFormatException
     *             if the text is not a valid number or does not fit in 64 signed bits
     */
    static long parseInt64(final String text) throws NumberFormatException
    {
        return parseInteger(text, true, true);
    }

    /**
     * Parse a 64-bit unsigned integer from the text. Unlike the Java standard {@code Long.parseLong()}, this function
     * recognizes the prefixes "0x" and "0" to signify hexadecimal and octal numbers, respectively. The result is
     * coerced to a (signed) {@code long} when returned since Java has no unsigned long type.
     * 
     * @throws NumberFormatException
     *             if the text is negative, is not a valid number or does not fit in 64 unsigned bits
     */
    static long parseUInt64(final String text) throws NumberFormatException
    {
        return parseInteger(text, false, true);
    }

    /**
     * Shared implementation of the four {@code parse*} methods.
     * <p>
     * Accepted syntax is an optional leading "-" (signed variants only), followed by an optional radix prefix ("0x" for
     * hexadecimal, a leading "0" for octal), followed by the digits.
     * 
     * @param text
     *            the number to parse
     * @param isSigned
     *            whether negative values are allowed and the signed range applies
     * @param isLong
     *            {@code true} for a 64-bit range, {@code false} for a 32-bit range
     * @return the parsed value; unsigned values above the signed maximum wrap to negative numbers
     * @throws NumberFormatException
     *             if the text is malformed or out of range for the requested type
     */
    private static long parseInteger(final String text,
            final boolean isSigned,
            final boolean isLong)
            throws NumberFormatException
    {
        int pos = 0;

        boolean negative = false;
        if (text.startsWith("-", pos))
        {
            if (!isSigned)
            {
                throw new NumberFormatException("Number must be positive: " + text);
            }
            ++pos;
            negative = true;
        }

        int radix = 10;
        if (text.startsWith("0x", pos))
        {
            pos += 2;
            radix = 16;
        }
        else if (text.startsWith("0", pos))
        {
            radix = 8;
        }

        final String numberText = text.substring(pos);

        // Long.parseLong and BigInteger both accept a sign of their own. Once a '-' or a "0x" prefix has
        // been consumed, a further sign would be silently honoured ("--5" -> 5, "0x-1" -> -1 even for the
        // unsigned variants), so reject it explicitly.
        if (pos > 0 && numberText.length() > 0)
        {
            final char first = numberText.charAt(0);
            if (first == '-' || first == '+')
            {
                throw new NumberFormatException("Invalid number: " + text);
            }
        }

        long result = 0;
        if (numberText.length() < 16)
        {
            // Can safely assume no overflow.
            result = Long.parseLong(numberText, radix);
            if (negative)
            {
                result = -result;
            }

            // Check bounds.
            // No need to check for 64-bit numbers since they'd have to be 16 chars
            // or longer to overflow.
            if (!isLong)
            {
                if (isSigned)
                {
                    if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE)
                    {
                        throw new NumberFormatException(
                                "Number out of range for 32-bit signed integer: " + text);
                    }
                }
                else
                {
                    if (result >= (1L << 32) || result < 0)
                    {
                        throw new NumberFormatException(
                                "Number out of range for 32-bit unsigned integer: " + text);
                    }
                }
            }
        }
        else
        {
            BigInteger bigValue = new BigInteger(numberText, radix);
            if (negative)
            {
                bigValue = bigValue.negate();
            }

            // Check bounds. bitLength() excludes the sign bit, so a signed N-bit value needs at most
            // N - 1 bits and an unsigned (non-negative) one at most N bits.
            if (!isLong)
            {
                if (isSigned)
                {
                    if (bigValue.bitLength() > 31)
                    {
                        throw new NumberFormatException(
                                "Number out of range for 32-bit signed integer: " + text);
                    }
                }
                else
                {
                    if (bigValue.bitLength() > 32)
                    {
                        throw new NumberFormatException(
                                "Number out of range for 32-bit unsigned integer: " + text);
                    }
                }
            }
            else
            {
                if (isSigned)
                {
                    if (bigValue.bitLength() > 63)
                    {
                        throw new NumberFormatException(
                                "Number out of range for 64-bit signed integer: " + text);
                    }
                }
                else
                {
                    if (bigValue.bitLength() > 64)
                    {
                        throw new NumberFormatException(
                                "Number out of range for 64-bit unsigned integer: " + text);
                    }
                }
            }

            result = bigValue.longValue();
        }

        return result;
    }
}