com.amazon.ion.impl._Private_IonTextAppender Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of ion-java Show documentation
A Java implementation of the Amazon Ion data notation.
The newest version!
/*
 * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.amazon.ion.impl;

import static com.amazon.ion.impl._Private_IonConstants.MAX_LONG_TEXT_SIZE;
import static com.amazon.ion.impl._Private_IonConstants.isHighSurrogate;
import static com.amazon.ion.impl._Private_IonConstants.isLowSurrogate;
import static com.amazon.ion.impl._Private_IonConstants.makeUnicodeScalar;

import com.amazon.ion.Decimal;
import com.amazon.ion.impl.Base64Encoder.TextStream;
import com.amazon.ion.system.IonTextWriterBuilder;
import com.amazon.ion.util._Private_FastAppendable;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.Flushable;
import java.io.IOException;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.nio.charset.Charset;


/**
 * NOT FOR APPLICATION USE!
 * 
 * Generic text sink that enables optimized output of both ASCII and UTF-16.
 */
public final class _Private_IonTextAppender
    implements Closeable, Flushable
{
    /**
     * Tells whether {@code v} lies in the range 0 through 0xFF inclusive.
     */
    private static boolean is8bitValue(int v)
    {
        // Any set bit above the low eight puts v outside 0..0xFF; the
        // unsigned shift also rejects negative values.
        return (v >>> 8) == 0;
    }

    /**
     * Tells whether {@code codePoint} is one of the ASCII digits
     * {@code '0'} through {@code '9'}.
     */
    private static boolean isDecimalDigit(int codePoint)
    {
        if (codePoint < '0') {
            return false;
        }
        return codePoint <= '9';
    }

    /** Indexed by code point (0..255): true if it may begin an identifier. */
    private static final boolean[] IDENTIFIER_START_CHAR_FLAGS  = new boolean[256];
    /** Indexed by code point (0..255): true if it may appear inside an identifier. */
    private static final boolean[] IDENTIFIER_FOLLOW_CHAR_FLAGS = new boolean[256];
    static
    {
        // Letters, underscore and dollar sign are legal in every position.
        final String legalAnywhere =
            "abcdefghijklmnopqrstuvwxyz" +
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
            "_$";
        for (int idx = 0; idx < legalAnywhere.length(); idx++) {
            final char ch = legalAnywhere.charAt(idx);
            IDENTIFIER_START_CHAR_FLAGS[ch]  = true;
            IDENTIFIER_FOLLOW_CHAR_FLAGS[ch] = true;
        }

        // Digits are legal only after the first character.
        for (char digit = '0'; digit <= '9'; digit++) {
            IDENTIFIER_FOLLOW_CHAR_FLAGS[digit] = true;
        }
    }

    /**
     * Tells whether {@code codePoint} may be the first character of an
     * unquoted identifier symbol: an ASCII letter, {@code '_'} or {@code '$'}.
     */
    public static boolean isIdentifierStart(int codePoint) {
        // Values outside 0..0xFF are never identifier characters and must
        // not be used as a table index.
        if (!is8bitValue(codePoint)) {
            return false;
        }
        return IDENTIFIER_START_CHAR_FLAGS[codePoint];
    }

    /**
     * Tells whether {@code codePoint} may appear within an unquoted
     * identifier symbol: an ASCII letter or digit, {@code '_'} or {@code '$'}.
     */
    public static boolean isIdentifierPart(int codePoint) {
        // Values outside 0..0xFF are never identifier characters and must
        // not be used as a table index.
        if (!is8bitValue(codePoint)) {
            return false;
        }
        return IDENTIFIER_FOLLOW_CHAR_FLAGS[codePoint];
    }

    public static final boolean[] OPERATOR_CHAR_FLAGS;
    static
    {
        final char[] operatorChars = {
            '<', '>', '=', '+', '-', '*', '&', '^', '%',
            '~', '/', '?', '.', ';', '!', '|', '@', '`', '#'
           };

        OPERATOR_CHAR_FLAGS = new boolean[256];

        for (int ii=0; iinull, true, or false.
     * <p>
     * This does not check for non-identifier keywords such as
     * {@code null.int}.
     *
     * @param text the symbol to check.
     * @return true if the text is an identifier keyword.
     */
    public static boolean isIdentifierKeyword(CharSequence text)
    {
        final int length = text.length();
        if (length == 0)
        {
            return false;
        }

        final char first = text.charAt(0);

        if (first == '$')
        {
            // A '$' followed by one or more decimal digits (and nothing
            // else) is treated as a keyword; a lone "$" is not.
            if (length == 1)
            {
                return false;
            }
            for (int i = 1; i < length; i++)
            {
                if (!isDecimalDigit(text.charAt(i)))
                {
                    return false;
                }
            }
            return true;
        }

        // The remaining keywords are fixed words, so dispatch on length
        // and compare the characters directly.
        switch (length) {
        case 3:
            return first == 'n'
                && text.charAt(1) == 'a'
                && text.charAt(2) == 'n';
        case 4:
            if (first == 'n')
            {
                return text.charAt(1) == 'u'
                    && text.charAt(2) == 'l'
                    && text.charAt(3) == 'l';
            }
            if (first == 't')
            {
                return text.charAt(1) == 'r'
                    && text.charAt(2) == 'u'
                    && text.charAt(3) == 'e';
            }
            return false;
        case 5:
            return first == 'f'
                && text.charAt(1) == 'a'
                && text.charAt(2) == 'l'
                && text.charAt(3) == 's'
                && text.charAt(4) == 'e';
        default:
            return false;
        }
    }


    /**
     * Determines whether the text of a symbol requires (single) quotes.
     * <p>
     * Empty text and identifier keywords always need quoting. Otherwise the
     * text is left unquoted only if it is entirely an identifier, or if
     * {@code quoteOperators} is false and it consists entirely of operator
     * characters.
     *
     * @param symbol the symbol text to examine; must not be null.
     * @param quoteOperators indicates whether the caller wants operators to be
     * quoted; if true then operator symbols like {@code !=}
     * will return true.
     * @return true if the given symbol requires quoting.
     *
     * @throws NullPointerException
     *         if {@code symbol} is null.
     */
    public static boolean symbolNeedsQuoting(CharSequence symbol,
                                             boolean      quoteOperators)
    {
        final int length = symbol.length();

        // The empty symbol must be rendered as ''.
        if (length == 0)
        {
            return true;
        }
        // Text matching an Ion keyword, e.g. 'false', must be quoted.
        if (isIdentifierKeyword(symbol))
        {
            return true;
        }

        final char first = symbol.charAt(0);

        // Surrogates are neither identifier-start nor operator characters,
        // so text beginning with one falls through to the final return.

        if (!quoteOperators && isOperatorPart(first))
        {
            // Unquoted only if every character is an operator character.
            // No need to look for escapes: all operator characters are ASCII.
            for (int i = 0; i < length; i++)
            {
                if (!isOperatorPart(symbol.charAt(i)))
                {
                    return true;
                }
            }
            return false;
        }

        if (isIdentifierStart(first))
        {
            // Unquoted only if every character is an identifier character.
            // isIdentifierPart accepts just ASCII letters, digits, '_' and
            // '$', which already rules out quotes, control characters and
            // anything above U+007E.
            for (int i = 0; i < length; i++)
            {
                if (!isIdentifierPart(symbol.charAt(i)))
                {
                    return true;
                }
            }
            return false;
        }

        // quote by default
        return true;
    }


    /**
     * Prints an Ion symbol, adding single quotes and escapes only when
     * {@link #symbolNeedsQuoting(CharSequence, boolean)} (with operators
     * quoted) says they are required.
     *
     * @param text the symbol text; null is printed as {@code null.symbol}.
     * @throws IOException if the underlying append fails.
     */
    public final void printSymbol(CharSequence text)
        throws IOException
    {
        if (text == null)
        {
            appendAscii("null.symbol");
            return;
        }

        if (!symbolNeedsQuoting(text, true))
        {
            // Unquoted symbols contain only ASCII identifier characters.
            appendAscii(text);
            return;
        }

        appendAscii('\'');
        printCodePoints(text, SYMBOL_ESCAPE_CODES);
        appendAscii('\'');
    }

    /**
     * Prints an Ion symbol wrapped in single quotes, escaping its content
     * as needed, regardless of whether the quotes are required.
     *
     * @param text the symbol text; null is printed as {@code null.symbol}.
     * @throws IOException if the underlying append fails.
     */
    public final void printQuotedSymbol(CharSequence text)
        throws IOException
    {
        if (text == null)
        {
            appendAscii("null.symbol");
            return;
        }

        appendAscii('\'');
        printCodePoints(text, SYMBOL_ESCAPE_CODES);
        appendAscii('\'');
    }

    /**
     * Writes {@code text}, replacing characters with escape sequences
     * where required.
     * <p>
     * Runs of unescaped characters below U+0100 are written in bulk via
     * {@code appendAscii}. Each remaining character is handled individually:
     * escaped through the {@code escapes} table, written as a {@code \}u or
     * {@code \}U style hex escape, or passed through as UTF-16, depending on
     * its range and on the {@code escapeNonAscii} setting.
     *
     * @param text the UTF-16 text to write.
     * @param escapes table indexed by character value (the code only indexes
     * it with values below 0x100); a non-null entry is the escape text for
     * that character.
     *
     * @throws IOException if the underlying append fails.
     * @throws IllegalArgumentException if {@code text} contains an unpaired
     * leading or trailing surrogate.
     */
    private final void printCodePoints(CharSequence text, String[] escapes)
        throws IOException
    {
        int len = text.length();
        // NOTE: i is also advanced inside the body (to skip a bulk-written
        // span, and to consume the low half of a surrogate pair).
        for (int i = 0; i < len; ++i)
        {
            // Find a span of non-escaped ASCII code points so we can write
            // them as quickly as possible.
            char c = 0;
            int j;
            for (j = i; j < len; ++j) {
                c = text.charAt(j);
                // The escapes array always includes U+80 through U+FF.
                if (c >= 0x100 || escapes[c] != null)
                {
                    // c is escaped and/or outside ASCII range.
                    if (j > i) {
                        // Flush the plain span [i, j) and move i onto c.
                        appendAscii(text, i, j);
                        i = j;
                    }
                    break;
                }
            }
            if (j == len) {
                // we've reached the end of sequence; append it and break
                appendAscii(text, i, j);
                break;
            }

            // We've found a code point that's escaped and/or non-ASCII.
            // At this point i is the index of c.

            if (c < 0x80)
            {
                // An escaped ASCII character.
                assert escapes[c] != null;
                appendAscii(escapes[c]);
            }
            else if (c < 0x100)
            {
                // Non-ASCII LATIN-1; we will have an escape sequence but may
                // not use it.
                assert escapes[c] != null;

                // Always escape the C1 control codes U+80 through U+9F.
                if (escapeNonAscii || c <= 0x9F) {
                    appendAscii(escapes[c]);
                } else {
                    appendUtf16(c);
                }
            }
            else if (c < 0xD800 || c >= 0xE000)
            {
                // Not LATIN-1, but still in the BMP.
                String s = Integer.toHexString(c);
                if (escapeNonAscii) {
                    // Four hex digits, left-padded with zeros.
                    appendAscii(HEX_4_PREFIX);
                    appendAscii(ZERO_PADDING[4 - s.length()]);
                    appendAscii(s);
                } else {
                    appendUtf16(c);
                }
            }
            else if (isHighSurrogate(c))
            {
                // Outside the BMP! High surrogate must be followed by low.
                char c2;
                // ++i moves onto the expected low surrogate; the loop's own
                // increment then steps past the whole pair.
                if (++i == len || !isLowSurrogate(c2 = text.charAt(i))) {
                    String message =
                        "text is invalid UTF-16. It contains an unmatched " +
                        "leading surrogate 0x" + Integer.toHexString(c) +
                        " at index " + (i-1);
                    throw new IllegalArgumentException(message);
                }
                if (escapeNonAscii) {
                    // Eight hex digits, left-padded with zeros.
                    int cp = makeUnicodeScalar(c, c2);
                    String s = Integer.toHexString(cp);
                    appendAscii(HEX_8_PREFIX);
                    appendAscii(ZERO_PADDING[8 - s.length()]);
                    appendAscii(s);
                } else {
                    appendUtf16Surrogate(c, c2);
                }
            }
            else
            {
                // unmatched low surrogate
                assert isLowSurrogate(c);

                String message =
                    "text is invalid UTF-16. It contains an unmatched " +
                    "trailing surrogate 0x" + Integer.toHexString(c) +
                    " at index " + i;
                throw new IllegalArgumentException(message);
            }
        }
    }


    //=========================================================================
    // Numeric scalars


    /** Scratch space for digits; ONLY FOR USE BY {@link #printInt(long)}. */
    private final char[] _fixedIntBuffer = new char[MAX_LONG_TEXT_SIZE];

    /**
     * Prints {@code value} as a base-10 Ion int.
     *
     * @param value the integer to print.
     * @throws IOException if the underlying append fails.
     */
    public void printInt(long value)
        throws IOException
    {
        final boolean negative = value < 0;
        int pos = _fixedIntBuffer.length;

        // Fill the buffer from the right, least significant digit first.
        // The remainder is taken before any negation so Long.MIN_VALUE,
        // which has no positive counterpart, is handled correctly.
        do {
            final int digit = Math.abs((int) (value % 10));
            _fixedIntBuffer[--pos] = (char) ('0' + digit);
            value /= 10;
        } while (value != 0);

        if (negative) {
            _fixedIntBuffer[--pos] = '-';
        }

        // Using CharBuffer avoids copying the _fixedIntBuffer into a String
        appendAscii(CharBuffer.wrap(_fixedIntBuffer),
                    pos,
                    _fixedIntBuffer.length);
    }


    /**
     * Prints {@code value} as a base-10 Ion int.
     *
     * @param value the integer to print; null is printed as {@code null.int}.
     * @throws IOException if the underlying append fails.
     */
    public void printInt(BigInteger value)
        throws IOException
    {
        final String rendered =
            (value == null) ? "null.int" : bigIntegerToString(value);
        appendAscii(rendered);
    }


    /**
     * Prints an Ion decimal.
     * <p>
     * The sign is written first, then the unscaled digits, with the decimal
     * point or exponent placed according to the value's scale. When
     * {@code _options._decimal_as_float} is set the value is always written
     * as digits followed by an {@code e} exponent.
     *
     * @param _options supplies the {@code _decimal_as_float} setting.
     * @param value the decimal to print; null is printed as
     * {@code null.decimal}.
     * @throws IOException if the underlying append fails.
     */
    public void printDecimal(_Private_IonTextWriterBuilder _options,
                             BigDecimal                    value)
        throws IOException
    {
        if (value == null)
        {
            appendAscii("null.decimal");
            return;
        }

        // Write the sign ourselves and continue with the magnitude only.
        BigInteger magnitude = value.unscaledValue();
        if (value.signum() < 0)
        {
            appendAscii('-');
            magnitude = magnitude.negate();
        }
        else if (value instanceof Decimal
              && ((Decimal) value).isNegativeZero())
        {
            // Neither BigInteger nor BigDecimal can represent negative
            // zero, but Ion can, so the sign has to be emitted here.
            appendAscii('-');
        }

        final String digits = bigIntegerToString(magnitude);
        final int digitCount = digits.length();

        final int scale = value.scale();
        final int exponent = -scale;

        if (_options._decimal_as_float)
        {
            appendAscii(digits);
            appendAscii('e');
            appendAscii(Integer.toString(exponent));
            return;
        }

        if (exponent == 0)
        {
            appendAscii(digits);
            appendAscii('.');
            return;
        }

        if (exponent > 0)
        {
            // We cannot move the decimal point to the right, adding
            // rightmost zeros, because that would alter the precision.
            appendAscii(digits);
            appendAscii('d');
            appendAscii(Integer.toString(exponent));
            return;
        }

        // exponent < 0: avoid printing small negative exponents using a
        // heuristic adapted from http://speleotrove.com/decimal/daconvs.html
        final int adjustedExponent = digitCount - 1 - scale;
        if (adjustedExponent >= 0)
        {
            // The point falls inside the digit string.
            final int pointIndex = digitCount - scale;
            appendAscii(digits, 0, pointIndex);
            appendAscii('.');
            appendAscii(digits, pointIndex, digitCount);
        }
        else if (adjustedExponent >= -6)
        {
            // At most five zeros are needed between "0." and the digits.
            appendAscii("0.");
            appendAscii("00000", 0, scale - digitCount);
            appendAscii(digits);
        }
        else
        {
            appendAscii(digits);
            appendAscii("d-");
            appendAscii(Integer.toString(scale));
        }
    }


    /**
     * Prints an Ion float.
     * <p>
     * Zeros are written as {@code 0e0} or {@code -0e0}. NaN and the
     * infinities are written as {@code nan}, {@code +inf} and {@code -inf},
     * or as {@code null} when {@code _options._float_nan_and_inf_as_null}
     * is set. Every other value is based on {@link Double#toString(double)}
     * and always carries an exponent.
     *
     * @param _options supplies the {@code _float_nan_and_inf_as_null} setting.
     * @param value the float to print.
     * @throws IOException if the underlying append fails.
     */
    public void printFloat(_Private_IonTextWriterBuilder _options, double value)
        throws IOException
    {
        // shortcut zero cases
        if (value == 0.0)
        {
            // Double.compare distinguishes the two zeros; == does not.
            final boolean positiveZero = (Double.compare(value, 0d) == 0);
            appendAscii(positiveZero ? "0e0" : "-0e0");
            return;
        }

        final boolean notANumber = Double.isNaN(value);
        if (notANumber
            || value == Double.POSITIVE_INFINITY
            || value == Double.NEGATIVE_INFINITY)
        {
            final String special;
            if (_options._float_nan_and_inf_as_null) {
                special = "null";
            } else if (notANumber) {
                special = "nan";
            } else if (value == Double.POSITIVE_INFINITY) {
                special = "+inf";
            } else {
                special = "-inf";
            }
            appendAscii(special);
            return;
        }

        // Double.toString() forces a digit after the decimal point.
        // Remove it when it's not meaningful.
        final String rendered = Double.toString(value);
        if (rendered.endsWith(".0"))
        {
            appendAscii(rendered, 0, rendered.length() - 2);
            appendAscii("e0");
            return;
        }

        appendAscii(rendered);
        if (rendered.indexOf('E') < 0)
        {
            // No exponent in the rendering, so add one.
            appendAscii("e0");
        }
    }

    /**
     * Prints a possibly-null Ion float.
     *
     * @param _options passed through to
     * {@link #printFloat(_Private_IonTextWriterBuilder, double)}.
     * @param value the float to print; null is printed as {@code null.float}.
     * @throws IOException if the underlying append fails.
     */
    public void printFloat(_Private_IonTextWriterBuilder _options, Double value)
        throws IOException
    {
        if (value != null)
        {
            printFloat(_options, value.doubleValue());
            return;
        }
        appendAscii("null.float");
    }


    //=========================================================================
    // LOBs


    /**
     * Prints an Ion blob as base64 text.
     * <p>
     * The content is wrapped in <code>{{ }}</code>, with one space of inner
     * padding when pretty-printing, or in double quotes when
     * {@code _options._blob_as_string} is set.
     *
     * @param _options supplies the pretty-print and blob-as-string settings.
     * @param value the bytes to print; null is printed as {@code null.blob}.
     * @param start index of the first byte to print.
     * @param len number of bytes to print.
     * @throws IOException if reading the encoder or the underlying append
     * fails.
     */
    public void printBlob(_Private_IonTextWriterBuilder _options,
                          byte[] value, int start, int len)
        throws IOException
    {
        if (value == null)
        {
            appendAscii("null.blob");
            return;
        }

        @SuppressWarnings("resource")
        TextStream encoder =
            new TextStream(new ByteArrayInputStream(value, start, len));

        final boolean pretty = _options.isPrettyPrintOn();
        final boolean asString = _options._blob_as_string;

        // base64 encoding is 6 bits per char so
        // it evens out at 3 bytes in 4 characters
        final char[] chunk = new char[pretty ? 80 : 400];
        final CharBuffer chunkView = CharBuffer.wrap(chunk);

        // Opening delimiter.
        if (asString)
        {
            appendAscii('"');
        }
        else
        {
            appendAscii("{{");
            if (pretty)
            {
                appendAscii(' ');
            }
        }

        // Copy the encoded text through, one buffer-load at a time.
        // TODO is it better to fill up the CharBuffer before outputting?
        int count;
        while ((count = encoder.read(chunk, 0, chunk.length)) >= 1)
        {
            appendAscii(chunkView, 0, count);
        }

        // Closing delimiter.
        if (asString)
        {
            appendAscii('"');
        }
        else
        {
            if (pretty)
            {
                appendAscii(' ');
            }
            appendAscii("}}");
        }
    }


    /**
     * Writes the bytes {@code value[start]} up to but excluding
     * {@code value[end]}, one character per byte, substituting the escape
     * text for any byte that has an entry in {@code escapes}.
     *
     * @param value the source bytes.
     * @param start index of the first byte to write.
     * @param end index just past the last byte to write.
     * @param escapes table indexed by unsigned byte value; a non-null entry
     * is the escape text for that byte.
     * @throws IOException if the underlying append fails.
     */
    private void printClobBytes(byte[] value, int start, int end,
                                String[] escapes)
        throws IOException
    {
        int index = start;
        while (index < end) {
            // Mask so the byte is treated as unsigned (0..255).
            final char unit = (char) (value[index++] & 0xff);
            final String escape = escapes[unit];
            if (escape == null) {
                appendAscii(unit);
            } else {
                appendAscii(escape);
            }
        }
    }


    /**
     * Prints an Ion clob.
     * <p>
     * The content is written as a quoted string of escaped bytes, wrapped in
     * <code>{{ }}</code> unless {@code _options._clob_as_string} is set.
     * Three renderings are possible:
     * <ul>
     *   <li>JSON string escapes, when both {@code _clob_as_string} and
     *       {@code _string_as_json} are set;</li>
     *   <li>a triple-quoted long string, when the long-string threshold is
     *       positive and {@code len} exceeds it;</li>
     *   <li>an ordinary double-quoted string otherwise.</li>
     * </ul>
     *
     * @param _options supplies the clob/string rendering settings, the
     * long-string threshold, and the pretty-print setting.
     * @param value the bytes to print; null is printed as {@code null.clob}.
     * @param start index of the first byte to print.
     * @param len number of bytes to print.
     * @throws IOException if the underlying append fails.
     */
    public void printClob(_Private_IonTextWriterBuilder _options,
                          byte[] value, int start, int len)
        throws IOException
    {
        if (value == null)
        {
            appendAscii("null.clob");
            return;
        }

        final boolean json =
            _options._clob_as_string && _options._string_as_json;

        // Compare the threshold against the number of bytes actually being
        // printed, not the size of the backing array: a short slice of a
        // large array must not be rendered as a long string.
        final int threshold = _options.getLongStringThreshold();
        final boolean longString = (0 < threshold && threshold < len);

        if (!_options._clob_as_string)
        {
            appendAscii("{{");
            if (_options.isPrettyPrintOn())
            {
                appendAscii(' ');
            }
        }

        final int end = start + len;
        if (json)
        {
            appendAscii('"');
            printClobBytes(value, start, end, JSON_ESCAPE_CODES);
            appendAscii('"');
        }
        else if (longString)
        {
            // This may escape more often than is necessary, but doing it
            // minimally is very tricky. Must be sure to account for
            // quotes at the end of the content.

            // TODO Account for NL versus CR+NL streams
            appendAscii(TRIPLE_QUOTES);
            printClobBytes(value, start, end, LONG_STRING_ESCAPE_CODES);
            appendAscii(TRIPLE_QUOTES);
        }
        else
        {
            appendAscii('"');
            printClobBytes(value, start, end, STRING_ESCAPE_CODES);
            appendAscii('"');
        }

        if (!_options._clob_as_string)
        {
            if (_options.isPrettyPrintOn())
            {
                appendAscii(' ');
            }
            appendAscii("}}");
        }
    }

    /**
     * Convert {@link BigInteger} to a {@link String}.
     * <p>
     * The current implementation of {@link BigInteger#toString()} is suboptimal
     * speed-wise, as it uses non-native division for arbitrary-sized integers
     * to convert the binary representation to a string.
     * This is inefficient for integers that can fit into a {@link Long}, where
     * native division can be used to convert the long to its string representation,
     * which is currently implemented in {@link Long#toString()}.
     * <p>
     * This method delegates to {@link Long#toString()} if it's possible to
     * do so, which results in a speedup from 2x to 5x.
     *
     * @param value the integer to convert
     * @return the string representation in base 10
     */
    private String bigIntegerToString(final BigInteger value)
    {
        // bitLength() excludes the sign bit, so any value needing fewer
        // than 64 bits is within the range of a long.
        final boolean fitsInLong = value.bitLength() < 64;
        return fitsInLong
            ? Long.toString(value.longValue())
            : value.toString();
    }
}