All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.amazon.ion.impl.IonReaderTextRawTokensX Maven / Gradle / Ivy

There is a newer version: 1.11.9
Show newest version
/*
 * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.amazon.ion.impl;

import static com.amazon.ion.impl.IonTokenConstsX.CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1;
import static com.amazon.ion.impl.IonTokenConstsX.CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2;
import static com.amazon.ion.impl.IonTokenConstsX.CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3;
import static com.amazon.ion.impl.IonTokenConstsX.CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1;
import static com.amazon.ion.impl.IonTokenConstsX.CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2;
import static com.amazon.ion.impl.IonTokenConstsX.CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3;
import static com.amazon.ion.util.IonTextUtils.printCodePointAsString;

import com.amazon.ion.IonException;
import com.amazon.ion.IonType;
import com.amazon.ion.UnexpectedEofException;
import com.amazon.ion.impl.IonTokenConstsX.CharacterSequence;
import com.amazon.ion.impl.UnifiedSavePointManagerX.SavePoint;
import com.amazon.ion.util.IonTextUtils;
import java.io.IOException;

/**
 * Tokenizer for the Ion text parser in IonTextIterator. This
 * reads bytes and returns the interesting tokens it recognizes
 * or an error.  While, currently, this does UTF-8 decoding
 * as it goes that is unnecessary.  The main entry point is
 * lookahead(n) which gets the token type n tokens ahead (0
 * is the next token).  The token's type, its starting offset
 * in the input stream and its ending offset in the input stream
 * are cached, so lookahead() can be called repeatedly with
 * little overhead.  This supports a 7 token lookahead and requires
 * a "recompile" to change this limit.  (this could be "fixed"
 * but seems unnecessary at this time - the limit is in
 * IonTextTokenizer._token_lookahead_size which is 1 larger than
 * the size of the lookahead allowed)  Tokens are consumed by
 * a call to consumeToken, or the helper consumeTokenAsString.
 * The informational interfaces - getValueStart(), getValueEnd()
 * getValueAsString() can be used to get the contents of the
 * value once the caller has decided how to use it.
 *
 *  This is a copy and paste from IonTextTokenize on the introduction of
 *  the new input abstraction IonInputStream as the source of characters
 *  and bytes for the reader.
 *
 *  This variation does NOT make local copies of the tokens.  It does
 *  start "marking" at the beginning of the token and the end.  The stream
 *  will buffer the input until the mark is released.
 *
 *  The result is that only the most recent token is available to the
 *  calling reader.
 *
 */
final class IonReaderTextRawTokensX
{
    static final boolean _debug = false;

    /**
     * An {@link Appendable} that throws away everything handed to it.
     * Each overload ignores its arguments and returns this same instance.
     */
    private static final Appendable NULL_APPENDABLE = new Appendable()
    {
        public Appendable append(CharSequence ignored) throws IOException
        {
            // nothing is recorded
            return this;
        }

        public Appendable append(CharSequence ignored, int from, int to) throws IOException
        {
            // nothing is recorded
            return this;
        }

        public Appendable append(char ignored) throws IOException
        {
            // nothing is recorded
            return this;
        }
    };

    // Base64 support constants.  The table and terminator are taken from
    // Base64Encoder (presumably char -> 6 bit value, and the padding
    // character - confirm against Base64Encoder).
    static final int   BASE64_EOF = 128; // still a byte, not -1, none of the low 6 bits on
    static final int[] BASE64_CHAR_TO_BIN = Base64Encoder.Base64EncodingCharToInt;
    static final int   BASE64_TERMINATOR_CHAR = Base64Encoder.Base64EncodingTerminator;

    /** source of characters; assigned once by the constructor */
    private UnifiedInputStreamX  _stream = null;
    /** most recent token type, as recorded by next_token_finish(); -1 until then */
    private int                 _token = -1;
    /** are we at the beginning of this token (false == done with it) */
    private boolean             _unfinished_token;
    /** current line number; incremented by line_count() on every line terminator */
    private long                _line_count;
    /** stream position that getLineOffset() subtracts to produce the offset within the line */
    private long                _line_starting_position;
    /** true when the two *_cached fields below hold a snapshot that line_count_unread() may restore */
    private boolean             _line_count_has_cached = false;
    /** snapshot of _line_count taken by line_count() just before it is incremented */
    private long                _line_count_cached;
    /** snapshot of _line_starting_position taken by line_count() just before it is moved */
    private long                _line_offset_cached;

    /** number of base64 decoded bytes in the stack, used to decode base64 */
    private int                 _base64_prefetch_count;
    /**
     * since this "stack" will only 0-2 bytes deep, we'll just shift them
     * into an int
     */
    private int                 _base64_prefetch_stack;


    /**
     * Creates a tokenizer over the given input, with position reporting
     * starting at line 1, column 1.
     *
     * @param iis wrapped input stream that supplies the characters
     */
    public IonReaderTextRawTokensX(UnifiedInputStreamX iis) {
        this(iis, 1L, 1L);
    }

    public IonReaderTextRawTokensX(UnifiedInputStreamX iis, long starting_line,
                                   long starting_column)
    {
        _stream = iis;
        _line_count = starting_line;
        _line_starting_position = _stream.getPosition() - starting_column;
    }

    /**
     * Closes the underlying input stream.
     *
     * @throws IOException if the stream fails to close
     */
    public void close() throws IOException
    {
        this._stream.close();
    }

    /** @return the most recently recorded token type */
    public int getToken()
    {
        return this._token;
    }
    /** @return the current line number */
    public long getLineNumber()
    {
        return this._line_count;
    }
    /**
     * @return the distance between the stream's current position and the
     *         recorded start of the current line
     */
    public long getLineOffset() {
        return _stream.getPosition() - _line_starting_position;
    }

    /** @return the input stream this tokenizer reads from */
    UnifiedInputStreamX getSourceStream()
    {
        return _stream;
    }

    /**
     * @return true when the underlying input is not flagged as a stream
     *         (its {@code _is_stream} flag is false)
     */
    public final boolean isBufferedInput()
    {
        return !_stream._is_stream;
    }

    /**
     * Builds a human readable description of the current input position,
     * suitable for appending to an error message.
     *
     * @return text of the form " at line N offset M"
     */
    protected String input_position() {
        StringBuilder description = new StringBuilder(" at line ");
        description.append(getLineNumber());
        description.append(" offset ");
        description.append(getLineOffset());
        return description.toString();
    }
    /** @return true while the current token's content has not been consumed yet */
    public final boolean isUnfinishedToken()
    {
        return this._unfinished_token;
    }

    /**
     * Marks the current token as fully consumed and drops any base64
     * bytes that were prefetched while decoding it.
     */
    public final void tokenIsFinished() {
        this._base64_prefetch_count = 0;
        this._unfinished_token = false;
    }

    //
    //  character routines to fetch characters and
    //  handle look ahead and line counting and such
    //
    /**
     * Reads the next character from the stream.  A carriage return or
     * line feed is routed through {@link #line_count(int)}, so the caller
     * receives the matching newline pseudo character and the line
     * bookkeeping is updated.
     *
     * @return the character read, a newline pseudo character, or whatever
     *         the stream reports at end of input
     * @throws IOException if the underlying stream fails
     */
    protected final int read_char() throws IOException
    {
        final int ch = _stream.read();
        switch (ch) {
        case '\r':
        case '\n':
            return line_count(ch);
        default:
            return ch;
        }
    }

    /**
     * NOT for use outside of string/symbol/clob!
     * Reads one character, reports it through error() when it belongs to
     * the prohibited set, and absorbs backslash-NL pairs, returning
     * {@link #CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1} etc.
     *
     * @param prohibitedCharacters characters that are not allowed here
     * @return the character read or a newline pseudo character
     * @throws IOException if the underlying stream fails
     */
    protected final int read_string_char(ProhibitedCharacters prohibitedCharacters) throws IOException
    {
        final int raw = _stream.read();
        if (prohibitedCharacters.includes(raw)) {
            error("invalid character [" + printCodePointAsString(raw) + "]");
        }

        // including the backslash here is what makes us eat ALL slash-newlines
        final boolean may_start_newline = (raw == '\\' || raw == '\n' || raw == '\r');
        return may_start_newline ? line_count(raw) : raw;
    }

    /**
     * Pushes a previously read character back onto the stream.
     * <p>
     * Non-negative values are pushed back unchanged.  Negative values are
     * the pseudo characters produced by {@link #line_count(int)} (or EOF):
     * each is expanded back into the raw characters it stood for, pushed
     * in reverse order so they will be read again in the original order.
     * <p>
     * Every newline pseudo character - plain or backslash-escaped - made
     * line_count() advance the line counter, so every one of them rolls
     * the counter back through {@link #line_count_unread(int)} here.
     * Without that, re-reading an escaped newline would count the same
     * line twice.
     *
     * @param c the character, newline pseudo character or EOF to push back
     */
    private final void unread_char(int c)
    {
        if (c >= 0) {
            _stream.unread(c);
            return;
        }

        switch (c) {
        case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1:
            line_count_unread(c);
            _stream.unread('\n');
            break;
        case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2:
            line_count_unread(c);
            _stream.unread('\r');
            break;
        case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
            line_count_unread(c);
            _stream.unread('\n');
            _stream.unread('\r');
            break;
        case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
            line_count_unread(c);
            _stream.unread('\n');
            _stream.unread('\\');
            break;
        case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
            line_count_unread(c);
            _stream.unread('\r');
            _stream.unread('\\');
            break;
        case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
            line_count_unread(c);
            _stream.unread('\n');
            _stream.unread('\r');
            _stream.unread('\\');
            break;
        case UnifiedInputStreamX.EOF:
            _stream.unread(UnifiedInputStreamX.EOF);
            break;
        default:
            assert false
                : "INVALID SPECIAL CHARACTER ENCOUNTERED: " + c;
        }
    }

    /**
     * Rolls the line bookkeeping back to the snapshot taken by
     * line_count(), if one is available.  The snapshot is single use:
     * once restored it is marked as consumed.
     *
     * @param c the newline pseudo character being unread
     * @return c, unchanged
     */
    private final int line_count_unread(int c) {
        assert( c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3
             || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
             || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
             || c == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3
             || c == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2
             || c == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1
        );

        if (!_line_count_has_cached) {
            // nothing saved, so there is nothing to restore
            return c;
        }

        _line_starting_position = _line_offset_cached;
        _line_count = _line_count_cached;
        _line_count_has_cached = false;
        return c;
    }
    /**
     * Converts a line terminator (or a backslash followed by one) into
     * the matching newline pseudo character and advances the line
     * bookkeeping.  A backslash that is not followed by a line terminator
     * is returned as is, with the look-ahead character pushed back and no
     * bookkeeping change.
     *
     * @param c one of '\\', '\r' or '\n', already read from the stream
     * @return the newline pseudo character, or '\\' when no newline follows
     * @throws IOException if the underlying stream fails
     * @throws IllegalStateException if c is none of the three characters
     */
    private final int line_count(int c) throws IOException
    {
        final int sequence;

        if (c == '\\') {
            // possible slash-newline; if so both parts are consumed here
            final int second = _stream.read();
            if (second == '\n') {
                // Unix and new Mac (also Unix)
                sequence = CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1;
            }
            else if (second == '\r') {
                // DOS or old Mac
                final int third = _stream.read();
                if (third == '\n') {
                    sequence = CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3;
                }
                else {
                    unread_char(third);
                    sequence = CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2;
                }
            }
            else {
                // just a slash - hand it back and let someone else deal with it
                unread_char(second);
                return c;
            }
        }
        else if (c == '\r') {
            // '\r' alone or '\r\n'
            final int second = _stream.read();
            if (second == '\n') {
                sequence = CHAR_SEQ_NEWLINE_SEQUENCE_3;
            }
            else {
                unread_char(second);
                sequence = CHAR_SEQ_NEWLINE_SEQUENCE_2;
            }
        }
        else if (c == '\n') {
            sequence = CHAR_SEQ_NEWLINE_SEQUENCE_1;
        }
        else {
            throw new IllegalStateException();
        }

        // Snapshot the bookkeeping first so that one unread of a line
        // terminator can be undone.  Only a single level is kept, which
        // is enough: whitespace is only unread when it is a delimiter,
        // and then only one instance of it.
        _line_count_cached = _line_count;
        _line_offset_cached = _line_starting_position;
        _line_count_has_cached = true;

        _line_count++;
        // the first character of a line is reported as 1, not 0
        _line_starting_position = _stream.getPosition() - 1;

        return sequence;
    }

    /**
     * Checks whether the next token is a double colon.  Leading
     * whitespace is always consumed.  When "::" follows, both colons are
     * consumed; otherwise the one or two characters read are pushed back.
     *
     * @return true if the next token is a double colon, false otherwise
     * @throws IOException if the underlying stream fails
     */
    public final boolean skipDoubleColon() throws IOException
    {
        final int first = skip_over_whitespace();
        if (first == ':') {
            final int second = read_char();
            if (second == ':') {
                return true;
            }
            unread_char(second);
        }
        // first is either the lone ':' or whatever else we ran into
        unread_char(first);
        return false;
    }


    /**
     * Looks for a ".typename" suffix directly after a null keyword.
     * When a recognized type name follows the dot, the dot and the name
     * are consumed and the type keyword id is returned, with only the
     * terminating character pushed back.  When the next character is not
     * a dot it is pushed back and KEYWORD_none is returned.  In every
     * other case the collected look-ahead is handed to
     * peekNullTypeSymbolUndo().
     *
     * @return the keyword id of the type name, KEYWORD_none when no dot
     *         follows, or the result of the undo path
     * @throws IOException if the underlying stream fails
     */
    public final int peekNullTypeSymbol() throws IOException
    {
        // the '.' has to follow the 'null' immediately
        int next = read_char();
        if (next != '.') {
            unread_char(next);
            return IonTokenConstsX.KEYWORD_none;
        }

        // collect everything read after the dot so that it can be handed
        // to the undo routine if this turns out not to be a type name
        final int limit = IonTokenConstsX.TN_MAX_NAME_LENGTH + 1;
        final int[] consumed = new int[limit];
        int consumed_count = 0;
        int candidates = IonTokenConstsX.KW_ALL_BITS;

        while (consumed_count < limit) {
            next = read_char();
            consumed[consumed_count++] = next;

            final int letter = IonTokenConstsX.typeNameLetterIdx(next);
            if (letter < 1) {
                if (!IonTokenConstsX.isValidTerminatingCharForInf(next)) {
                    return peekNullTypeSymbolUndo(consumed, consumed_count);
                }
                // a valid end of the constant: stop and see what we matched
                break;
            }

            candidates &= IonTokenConstsX.typeNamePossibilityMask(consumed_count - 1, letter);
            if (candidates == 0) {
                // no type name matches the letters seen so far
                return peekNullTypeSymbolUndo(consumed, consumed_count);
            }
        }

        // A valid terminator may have arrived before any name was complete
        // ("in)" matches the start of "int"), so the length takes part in
        // the final lookup as well.
        final int keyword = IonTokenConstsX.typeNameKeyWordFromMask(candidates, consumed_count - 1);
        if (keyword != IonTokenConstsX.KEYWORD_unrecognized) {
            // the name is accepted; only the character that stopped us goes back
            unread_char(next);
        }
        else {
            peekNullTypeSymbolUndo(consumed, consumed_count);
        }
        return keyword;
    }
    private final int peekNullTypeSymbolUndo(int[] read_ahead, int read_count)
    {
        String type_error = "";
        for (int ii=0; ii': case '*': case '=': case '^': case '&': case '|':
        case '~': case ';': case '!': case '?': case '@': case '%': case '`':
            unread_char(c);
            return next_token_finish(IonTokenConstsX.TOKEN_SYMBOL_OPERATOR, true);
        case '"':
            return next_token_finish(IonTokenConstsX.TOKEN_STRING_DOUBLE_QUOTE, true);
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '$': case '_':
            unread_char(c);
            return next_token_finish(IonTokenConstsX.TOKEN_SYMBOL_IDENTIFIER, true);
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            t = scan_for_numeric_type(c);
            unread_char(c);
            return next_token_finish(t, true);
        case '-':
            // see if we have a number or what might be an extended symbol
            c2 = read_char();
            unread_char(c2);
            if (IonTokenConstsX.isDigit(c2)) {
                t = scan_negative_for_numeric_type(c);
                unread_char(c);
                return next_token_finish(t, true);
            }
            else if (peek_inf_helper(c)) // this will consume the inf if it succeeds
            {
                return next_token_finish(IonTokenConstsX.TOKEN_FLOAT_MINUS_INF, false);
            }
            else {
                unread_char(c);
                return next_token_finish(IonTokenConstsX.TOKEN_SYMBOL_OPERATOR, true);
            }
        default:
            bad_token_start(c); // throws
        }
        throw new IonException("invalid state: next token switch shouldn't exit");
    }
    /**
     * Records the token that was just recognized.
     *
     * @param token the token type
     * @param content_is_waiting true when the token's content is still
     *        waiting in the stream to be consumed
     * @return the token type that was recorded
     */
    private final int next_token_finish(int token, boolean content_is_waiting) {
        this._unfinished_token = content_is_waiting;
        this._token = token;
        return token;
    }

    /**
     * The possible reactions to a '/' met while skipping whitespace.
     */
    private enum CommentStrategy
    {
        /**
         * Consume the whole comment, whichever of the two forms it has.
         */
        IGNORE
        {
            @Override
            boolean onComment(IonReaderTextRawTokensX tokenizer)
                throws IOException
            {
                final int second = tokenizer.read_char();
                if (second == '/') {
                    tokenizer.skip_single_line_comment();
                    return true;
                }
                if (second == '*') {
                    tokenizer.skip_block_comment();
                    return true;
                }
                // a lone slash is not a comment
                tokenizer.unread_char(second);
                return false;
            }
        },
        /**
         * Report an error if the slash really does start a comment.
         */
        ERROR
        {
            @Override
            boolean onComment(IonReaderTextRawTokensX tokenizer)
                throws IOException
            {
                final int second = tokenizer.read_char();
                final boolean starts_comment = (second == '/' || second == '*');
                if (!starts_comment) {
                    tokenizer.unread_char(second);
                    return false;
                }
                tokenizer.error("Illegal comment");
                return false;
            }
        },
        /**
         * Leave the slash alone and stop: it may be a valid blob character.
         */
        BREAK
        {
            @Override
            boolean onComment(IonReaderTextRawTokensX tokenizer)
                throws IOException
            {
                return false;
            }
        };

        /**
         * Called when positioned after the first '/'.
         * @param tokenizer the tokenizer whose input is being scanned
         * @return true if a valid comment was found and skipped, otherwise false
         * @throws IOException if reading from the tokenizer's stream fails
         */
        abstract boolean onComment(IonReaderTextRawTokensX tokenizer) throws IOException;
    }

    /**
     * Skips whitespace and comments, then reads one character.
     * @return the first character after the whitespace and comments
     * @throws IOException if the underlying stream fails
     */
    private final int skip_over_whitespace() throws IOException
    {
        skip_whitespace(CommentStrategy.IGNORE);
        return read_char();
    }

    /**
     * Skips whitespace, handling any comment with the supplied strategy,
     * then reads one character.
     * @param commentStrategy what to do when a '/' is encountered
     * @return the first character after the skipped input
     * @throws IOException if the underlying stream fails
     */
    private final int skip_over_whitespace(CommentStrategy commentStrategy) throws IOException
    {
        skip_whitespace(commentStrategy);
        final int next = read_char();
        return next;
    }

    /**
     * Whitespace skipping for a lob whose kind is not known yet.  Stops
     * at a '/' character and leaves it to the blob validation.
     * @return the first character after the whitespace
     * @throws IOException if the underlying stream fails
     */
    private final int skip_over_lob_whitespace() throws IOException
    {
        // same treatment as blob whitespace
        return skip_over_whitespace(CommentStrategy.BREAK);
    }

    /**
     * Whitespace skipping inside a blob.  A '/' is never treated as the
     * start of a comment here because it is a valid Base64 character.
     * @return the first character after the whitespace
     * @throws IOException if the underlying stream fails
     */
    private final int skip_over_blob_whitespace() throws IOException
    {
        final int next = skip_over_whitespace(CommentStrategy.BREAK);
        return next;
    }

    /**
     * Whitespace skipping between the clob string and the closing
     * braces.  A comment in this position is reported as an error.
     * @return the first character after the whitespace
     * @throws IOException if the underlying stream fails
     */
    private final int skip_over_clob_whitespace() throws IOException
    {
        final int next = skip_over_whitespace(CommentStrategy.ERROR);
        return next;
    }

    /**
     * Skips whitespace and comments, leaving the stream positioned at
     * the start of the next token.
     * @return true if any whitespace or comment was skipped
     * @throws IOException if the underlying stream fails
     */
    protected final boolean skip_whitespace() throws IOException
    {
        final boolean skipped_any = skip_whitespace(CommentStrategy.IGNORE);
        return skipped_any;
    }

    /**
     * Skips spaces, tabs and newline sequences, and hands every '/' to
     * the given strategy.  Scanning stops at end of input, at any other
     * character, or at a '/' the strategy does not accept as a comment.
     * The character that stopped the scan is pushed back, so the stream
     * ends up at the start of the next token.
     * @param commentStrategy what to do when a '/' is encountered
     * @return true if whitespace was skipped and/or comments ignored
     * @throws IOException if the underlying stream fails
     */
    private final boolean skip_whitespace(CommentStrategy commentStrategy) throws IOException
    {
        boolean skipped_any = false;
        int ch;

        for (;;) {
            ch = read_char();

            // new line normalization and counting is handled in read_char
            final boolean is_blank =
                   ch == ' '
                || ch == '\t'
                || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1
                || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2
                || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3
                || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
                || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
                || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3;

            if (is_blank) {
                skipped_any = true;
                continue;
            }
            if (ch == '/' && commentStrategy.onComment(this)) {
                skipped_any = true;
                continue;
            }
            // end of input, a real token character, or a slash that is
            // not a comment
            break;
        }

        unread_char(ch);
        return skipped_any;
    }

    /**
     * Consumes the remainder of a single line comment: everything up to
     * and including the next newline sequence, or up to end of input.
     * @throws IOException if the underlying stream fails
     */
    private final void skip_single_line_comment() throws IOException
    {
        boolean done;
        do {
            // new line normalization and counting is handled in read_char
            final int ch = read_char();
            done = ch == -1
                || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1
                || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2
                || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3
                || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
                || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
                || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3;
        } while (!done);
    }

    /**
     * Consumes the remainder of a block comment, returning once the
     * closing star-slash pair has been read.  End of input inside the
     * comment is reported through bad_token_start().
     * @throws IOException if the underlying stream fails
     */
    private final void skip_block_comment() throws IOException
    {
        // true while the previous character(s) formed an unbroken run of '*'
        boolean after_star = false;

        for (;;) {
            final int ch = this.read_char();

            if (after_star) {
                if (ch == '/') {
                    return;
                }
                // another '*' keeps the run going; anything else ends the
                // run and is simply dropped
                after_star = (ch == '*');
                continue;
            }

            if (ch == '*') {
                after_star = true;
            }
            else if (ch == -1) {
                bad_token_start(ch);
            }
        }
    }

    /**
     * Peeks ahead for two single quotes, which would complete a triple
     * quote once the first quote has been read.  On success both quotes
     * stay consumed; on failure everything that was read is pushed back.
     * @return true if the next two characters are single quotes
     * @throws IOException if the underlying stream fails
     */
    private final boolean is_2_single_quotes_helper() throws IOException
    {
        final int first = read_char();
        if (first == '\'') {
            final int second = read_char();
            if (second == '\'') {
                return true;
            }
            unread_char(second);
        }
        // first is either the single quote or the character that was not one
        unread_char(first);
        return false;
    }

    /**
     * Checks whether the input continues with "inf" followed by a value
     * terminating character.  On success "inf" stays consumed and only
     * the terminator is pushed back.  On failure every character read
     * here is pushed back.
     *
     * @param c the sign character the caller has already read; anything
     *        other than '+' or '-' yields false without reading
     * @return true if "inf" plus a terminator follows the sign
     * @throws IOException if the underlying stream fails
     */
    private final boolean peek_inf_helper(int c) throws IOException
    {
        if (c != '+' && c != '-') return false;

        final int[] expected = { 'i', 'n', 'f' };
        final int[] seen = new int[expected.length + 1];
        int seen_count = 0;

        boolean is_inf = true;
        for (int ii = 0; ii < expected.length; ii++) {
            final int ch = read_char();
            seen[seen_count++] = ch;
            if (ch != expected[ii]) {
                is_inf = false;
                break;
            }
        }

        if (is_inf) {
            final int terminator = read_char();
            seen[seen_count++] = terminator;
            if (is_value_terminating_character(terminator)) {
                unread_char(terminator);
                return true;
            }
        }

        // no match: give everything back, last character read first
        while (seen_count > 0) {
            unread_char(seen[--seen_count]);
        }
        return false;
    }

    /**
     * Looks a short way ahead of the first digit of a number to guess
     * its numeric type.
     *
     * After a leading '0' the next character alone can identify hex,
     * binary, decimal or float, or a plain int when it terminates the
     * value.  Otherwise four digits followed by '-' or 'T' are treated
     * as a timestamp.  In every other case TOKEN_UNKNOWN_NUMERIC is
     * returned.
     *
     * Everything read here is pushed back before returning.  The first
     * digit, c1, is not: the caller unreads it if it needs to.
     *
     * @param c1 first char of number read by caller
     * @return numeric token type
     * @throws IOException if the underlying stream fails
     */
    private final int scan_for_numeric_type(int c1) throws IOException
    {
        if (!IonTokenConstsX.isDigit(c1)) {
            error(String.format("Expected digit, got U+%04X", c1));
        }

        final int[] lookahead = new int[6];
        int lookahead_count = 0;
        int type = IonTokenConstsX.TOKEN_UNKNOWN_NUMERIC;

        int ch = read_char();
        lookahead[lookahead_count++] = ch;

        if (c1 == '0') {
            // a radix or exponent marker right after the zero settles it
            switch (ch) {
            case 'x': case 'X':
                type = IonTokenConstsX.TOKEN_HEX;
                break;
            case 'b': case 'B':
                type = IonTokenConstsX.TOKEN_BINARY;
                break;
            case 'd': case 'D':
                type = IonTokenConstsX.TOKEN_DECIMAL;
                break;
            case 'e': case 'E':
                type = IonTokenConstsX.TOKEN_FLOAT;
                break;
            case '.':
                // there may still be an 'e' further along, so the type
                // is not known yet
                break;
            default:
                if (is_value_terminating_character(ch)) {
                    type = IonTokenConstsX.TOKEN_INT;
                }
                break;
            }
        }

        if (type == IonTokenConstsX.TOKEN_UNKNOWN_NUMERIC) {
            // count digits, c1 included, reading one character past each
            // digit found until there are four of them
            int digit_count = 1;
            while (digit_count < 4 && IonTokenConstsX.isDigit(ch)) {
                digit_count++;
                ch = read_char();
                lookahead[lookahead_count++] = ch;
            }
            if (digit_count == 4 && (ch == '-' || ch == 'T')) {
                // dddd- or ddddT looks like a timestamp (or invalid input)
                type = IonTokenConstsX.TOKEN_TIMESTAMP;
            }
        }

        // give back whatever was read here, last character first
        for (int ii = lookahead_count - 1; ii >= 0; ii--) {
            unread_char(lookahead[ii]);
        }

        return type;
    }

    /**
     * Decides whether the given character ends a value.  A '/' counts
     * only when it starts a comment, which requires a one character peek
     * (the peeked character is always pushed back).  Newline sequences
     * always terminate.  Everything else is decided by
     * IonTextUtils.isNumericStop().
     *
     * @param c the character to test, already read by the caller
     * @return true if c terminates a value
     * @throws IOException if the underlying stream fails
     */
    private final boolean is_value_terminating_character(int c)
        throws IOException
    {
        if (c == '/') {
            final int following = read_char();
            unread_char(following);  // we never "keep" this character
            return following == '/' || following == '*';
        }

        // new line normalization and counting is handled in read_char
        final boolean is_newline =
               c == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1
            || c == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2
            || c == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3
            || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
            || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
            || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3;
        if (is_newline) {
            return true;
        }

        return IonTextUtils.isNumericStop(c);
    }

    /**
     * Variant of scan_for_numeric_type for input that started with a
     * minus sign.  The caller passes in the '-' it has already read; this
     * method reads the character after it, classifies the value, and then
     * pushes that following character back.  The '-' itself is NOT unread
     * here - that is left to the caller.
     * <p>
     * A timestamp classification is reported through bad_token.
     *
     * @param c the '-' read by the caller
     * @return numeric token type
     * @throws IOException
     */
    private final int scan_negative_for_numeric_type(int c) throws IOException
    {
        assert(c == '-');

        final int first = read_char();
        final int token = scan_for_numeric_type(first);
        if (token == IonTokenConstsX.TOKEN_TIMESTAMP) {
            bad_token(first);
        }

        // put back the character after the sign; the caller still has
        // to unread the '-'
        unread_char(first);
        return token;
    }

    // TODO: need new test cases since stepping out over values
    //       (or next-ing over them) is quite different from
    //       fully parsing them.  It is generally more lenient
    //       and that may not be best.

    /**
     * Copies a previously marked run of input into the StringBuilder
     * without interpreting escapes.  The caller is expected to have set a
     * save point so that an EOF (-1) is reported at the end of the marked
     * region, which is what ends the copy.
     * <p>
     * Escaped newline sequences are dropped.  Characters outside the
     * 7-bit range are decoded through read_large_char_sequence, and code
     * points that need it are appended as a surrogate pair.
     */
    protected void load_raw_characters(StringBuilder sb) throws IOException
    {
        // The first character read is discarded before copying begins.
        // NOTE(review): presumably this steps over a leading delimiter
        // that is part of the marked region - confirm against callers.
        read_char();

        for (;;) {
            int ch = read_char();

            if (ch == -1) {
                // end of the marked region
                return;
            }
            if (ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
             || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
             || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3
            ) {
                // WAS: case IonTokenConstsX.ESCAPED_NEWLINE_SEQUENCE:
                continue;
            }

            if (!IonTokenConstsX.is7bitValue(ch)) {
                ch = read_large_char_sequence(ch);
            }
            if (IonUTF8.needsSurrogateEncoding(ch)) {
                sb.append(IonUTF8.highSurrogate(ch));
                ch = IonUTF8.lowSurrogate(ch);
            }
            sb.append((char)ch);
        }
    }

    /**
     * Skips the rest of a struct by delegating to skip_over_container
     * with '}' as the closing delimiter.
     *
     * @throws IOException if reading the input fails
     */
    protected void skip_over_struct() throws IOException
    {
        skip_over_container('}');
    }
    /**
     * Skips the rest of a list by delegating to skip_over_container
     * with ']' as the closing delimiter.
     *
     * @throws IOException if reading the input fails
     */
    protected void skip_over_list() throws IOException
    {
        skip_over_container(']');
    }
    /**
     * Skips the rest of an s-expression by delegating to
     * skip_over_container with ')' as the closing delimiter.
     *
     * @throws IOException if reading the input fails
     */
    protected void skip_over_sexp() throws IOException
    {
        skip_over_container(')');
    }
    /**
     * Consumes input until the closing delimiter of the current container
     * has been read.  Nested containers are skipped recursively, and
     * quoted strings, quoted symbols and lobs are skipped as units so that
     * delimiters inside them are not mistaken for container structure.
     * Any other character is simply ignored.
     *
     * @param terminator the closing delimiter to stop at; one of
     *  '}', ']' or ')'
     * @throws IOException if reading the input fails
     */
    private void skip_over_container(int terminator) throws IOException
    {
        assert( terminator == '}' || terminator == ']' || terminator == ')' );
        int c;

        for (;;) {
            c = skip_over_whitespace();
            switch (c) {
            case -1:
                // no break here: presumably unexpected_eof() always
                // throws, otherwise control falls into the next case
                unexpected_eof();
            case '}':
            case ']':
            case ')':
                // a closing delimiter other than ours is ignored
                if (c == terminator) { // no point is checking this on every char
                    return;
                }
                break;
            case '"':
                skip_double_quoted_string_helper();
                break;
            case '\'':
                // either the start of a triple quoted string or a
                // single quoted symbol
                if (is_2_single_quotes_helper()) {
                    skip_triple_quoted_string(null);
                }
                else {
                    // the skip returns the character after the closing
                    // quote, which is put back for the next iteration
                    c = skip_single_quoted_string(null);
                    unread_char(c);
                }
                break;
            case '(':
                skip_over_container(')');
                break;
            case '[':
                skip_over_container(']');
                break;
            case '{':
                // this consumes lobs as well since the double
                // braces count correctly and the contents
                // of either clobs or blobs will be just content
                c = read_char();
                if (c == '{') {
                    // 2nd '{' - it's a lob of some sort - let's find out what sort
                    c = skip_over_lob_whitespace();

                    int lobType;
                    if (c == '"') {
                        // clob, double quoted
                        lobType = IonTokenConstsX.TOKEN_STRING_DOUBLE_QUOTE;
                    }
                    else if (c == '\'') {
                        // clob, triple quoted - or error
                        if (!is_2_single_quotes_helper()) {
                            error("invalid single quote in lob content");
                        }
                        lobType = IonTokenConstsX.TOKEN_STRING_TRIPLE_QUOTE;
                    }
                    else {
                        // blob
                        unread_char(c);
                        lobType = IonTokenConstsX.TOKEN_OPEN_DOUBLE_BRACE;
                    }

                    skip_over_lob(lobType, null);
                }
                else if (c == '}') {
                    // do nothing, we just opened and closed an empty struct
                    // move on, there's nothing to see here ...
                }
                else {
                    // an ordinary nested struct; put back its first
                    // character and skip to the matching '}'
                    unread_char(c);
                    skip_over_container('}');
                }
                break;
            default:
                break;
            }
        }
    }

    /**
     * Skips over a number written in decimal digits: an optional leading
     * '-', a run of digits, an optional '.' followed by more digits, and
     * an optional exponent introduced by d, D, e or E with an optional
     * sign.  It cannot be a hex value or a valid timestamp.
     * <p>
     * The character following the number must be a value terminator,
     * otherwise bad_token is called.
     *
     * @param sp save point whose end is marked (with offset -1) once the
     *  number has been consumed; may be null
     * @return the character read immediately after the number
     * @throws IOException if reading the input fails
     */
    private int skip_over_number(SavePoint sp) throws IOException
    {
        int next = read_char();

        // step past a leading minus sign, if there is one
        if (next == '-') {
            next = read_char();
        }

        // integer digits
        next = skip_over_digits(next);

        // optional fraction digits
        if (next == '.') {
            next = skip_over_digits(read_char());
        }

        // optional exponent, with its optional sign
        switch (next) {
        case 'd':
        case 'D':
        case 'e':
        case 'E':
            next = read_char();
            if (next == '-' || next == '+') {
                next = read_char();
            }
            next = skip_over_digits(next);
            break;
        default:
            break;
        }

        if (!is_value_terminating_character(next)) {
            bad_token(next);
        }
        if (sp != null) {
            sp.markEnd(-1);
        }
        return next;
    }
    /**
     * Skips over an integer: an optional leading '-' followed by a run of
     * digits.  The character after the digits must be a value terminator,
     * otherwise bad_token is called.
     *
     * @param sp save point whose end is marked (with offset -1) once the
     *  integer has been consumed; may be null
     * @return the character read immediately after the integer
     * @throws IOException if reading the input fails
     */
    private int skip_over_int(SavePoint sp) throws IOException
    {
        int first = read_char();
        if (first == '-') {
            first = read_char();
        }

        final int stop = skip_over_digits(first);
        if (!is_value_terminating_character(stop)) {
            bad_token(stop);
        }
        if (sp != null) {
            sp.markEnd(-1);
        }
        return stop;
    }
    /**
     * Reads forward for as long as the current character is a digit.
     *
     * @param c the first character to test
     * @return the first non-digit character; this is {@code c} itself
     *  when {@code c} is not a digit
     * @throws IOException if reading the input fails
     */
    private int skip_over_digits(int c) throws IOException
    {
        int current = c;
        for (; IonTokenConstsX.isDigit(current); current = read_char()) {
            // nothing to do, the digits are simply discarded
        }
        return current;
    }

    /**
     * Skips over a number written with a radix prefix: an optional '-',
     * a '0', the radix prefix character, and then the numeric content as
     * consumed by readNumeric.  The character after the number must be a
     * value terminator, otherwise bad_token is called.
     *
     * @param sp save point whose end is marked (with offset -1) once the
     *  number has been consumed; may be null
     * @param radix the radix whose prefix and digits are expected
     * @return the character read immediately after the number
     * @throws IOException if reading the input fails
     */
    private int skipOverRadix(SavePoint sp, Radix radix) throws IOException
    {
        int lead = read_char();
        if (lead == '-') {
            lead = read_char();
        }
        assert(lead == '0');

        // the character after the zero has to be the radix prefix
        final int prefix = read_char();
        radix.assertPrefix(prefix);

        final int stop = readNumeric(NULL_APPENDABLE, radix);
        if (!is_value_terminating_character(stop)) {
            bad_token(stop);
        }
        if (sp != null) {
            sp.markEnd(-1);
        }

        return stop;
    }

    /**
     * Skips over a decimal value; this is a straight delegation to
     * skip_over_number.
     *
     * @param sp save point passed through to skip_over_number; may be null
     * @return the character read immediately after the value
     * @throws IOException if reading the input fails
     */
    private int skip_over_decimal(SavePoint sp) throws IOException
    {
        return skip_over_number(sp);
    }
    /**
     * Skips over a float value; this is a straight delegation to
     * skip_over_number.
     *
     * @param sp save point passed through to skip_over_number; may be null
     * @return the character read immediately after the value
     * @throws IOException if reading the input fails
     */
    private int skip_over_float(SavePoint sp) throws IOException
    {
        return skip_over_number(sp);
    }
    /**
     * Skips over a timestamp, checking only its overall shape (digit
     * counts and separators), not the values of its fields.
     * <p>
     * For the short forms yyyyT and yyyy-mmT the save point end is marked
     * with offset 0 and the next non-whitespace character is returned.
     * For every other form the timestamp must be followed by a value
     * terminating character, the save point end is marked with offset -1,
     * and that following character is returned.
     *
     * @param sp save point whose end is marked once the timestamp has
     *  been consumed; may be null
     * @return the lookahead character read after the timestamp
     * @throws IOException if reading the input fails
     */
    private int skip_over_timestamp(SavePoint sp) throws IOException
    {
        // we know we have dddd- or ddddT we don't know what follows
        // it should be dddd-mm
        int c = skip_timestamp_past_digits(4);
        if (c == 'T') {
            // yyyyT
            if (sp != null) {
                sp.markEnd(0);
            }
            return skip_over_whitespace(); // prefetch
        }
        if (c != '-') {
            error("invalid timestamp encountered");
        }
        // yyyy-mmT
        // yyyy-mm-ddT
        // yyyy-mm-ddT+hh:mm
        // yyyy-mm-ddThh:mm+hh:mm
        // yyyy-mm-ddThh:mm:ss+hh:mm
        // yyyy-mm-ddThh:mm:ss.dddd+hh:mm
        // yyyy-mm-ddThh:mmZ
        // yyyy-mm-ddThh:mm:ssZ
        // yyyy-mm-ddThh:mm:ss.ddddZ
        c = skip_timestamp_past_digits(2);
        if (c == 'T') {
            // yyyy-mmT
            if (sp != null) {
                sp.markEnd(0);
            }
            return skip_over_whitespace(); // prefetch
        }
        skip_timestamp_validate(c, '-');
        c = skip_timestamp_past_digits(2);
        if ( c != 'T' ) {
            // yyyy-mm-dd with no time part
            return skip_timestamp_finish(c, sp);
        }
        c = read_char();
        if (!IonTokenConstsX.isDigit(c)) {
            // yyyy-mm-ddT
            return skip_timestamp_finish(skip_optional_timestamp_offset(c), sp);
        }
        // one hour digit already read above
        c = skip_timestamp_past_digits(1);
        if (c != ':') {
            bad_token(c);
        }
        c = skip_timestamp_past_digits(2);
        if (c != ':') {
            // yyyy-mm-ddThh:mm?
            return skip_timestamp_offset_or_z(c, sp);
        }
        c = skip_timestamp_past_digits(2);
        if (c != '.') {
            // yyyy-mm-ddThh:mm:ss?
            return skip_timestamp_offset_or_z(c, sp);
        }
        // fractional seconds: any run of digits after the '.' is skipped
        // (an empty run is not rejected here)
        c = read_char();
        if (IonTokenConstsX.isDigit(c)) {
            c = skip_over_digits(c);
        }
        // yyyy-mm-ddThh:mm:ss.ddd?

        return skip_timestamp_offset_or_z(c, sp);
    }

    /**
     * Common ending for skipping a timestamp: checks that the lookahead
     * character terminates a value (calling bad_token if not) and marks
     * the end of the save point with offset -1.
     *
     * @param c the character read immediately after the timestamp
     * @param sp save point to mark; may be null
     * @return {@code c}, unchanged
     * @throws IOException if reading the input fails
     */
    private int skip_timestamp_finish(int c, SavePoint sp) throws IOException {
        final boolean terminated = is_value_terminating_character(c);
        if (!terminated) {
            bad_token(c);
        }
        if (null != sp) {
            sp.markEnd(-1);
        }
        return c;
    }
    /**
     * Skips a timestamp offset of the form +hh:mm or -hh:mm when the
     * given character is its sign; otherwise nothing is consumed.
     *
     * @param c the character that may be the sign of an offset
     * @return the character after the offset, or {@code c} itself when
     *  no offset is present
     * @throws IOException if reading the input fails
     */
    private int skip_optional_timestamp_offset(int c) throws IOException
    {
        if (c != '-' && c != '+') {
            // no offset here
            return c;
        }

        // hours, which have to be followed by a colon
        final int separator = skip_timestamp_past_digits(2);
        if (separator != ':') {
            bad_token( separator );
        }
        // minutes
        return skip_timestamp_past_digits(2);
    }
    /**
     * Skips the mandatory offset at the end of a timestamp that has a
     * time part: either +hh:mm / -hh:mm or a single 'Z' / 'z'.  Any other
     * character is reported through bad_token.  The skip is then
     * completed by skip_timestamp_finish.
     *
     * @param c the character at which the offset is expected to start
     * @param sp save point passed on to skip_timestamp_finish; may be null
     * @return the character read immediately after the timestamp
     * @throws IOException if reading the input fails
     */
    private int skip_timestamp_offset_or_z(int c, SavePoint sp) throws IOException
    {
        int next = c;

        switch (next) {
        case '-':
        case '+':
            // hours, colon, minutes
            next = skip_timestamp_past_digits(2);
            if (next != ':') {
                bad_token( next );
            }
            next = skip_timestamp_past_digits(2);
            break;
        case 'Z':
        case 'z':
            next = read_char();
            break;
        default:
            bad_token(next);
            break;
        }

        return skip_timestamp_finish(next, sp);
    }
    /**
     * Checks that a separator character found while skipping a timestamp
     * is the one that was expected, reporting an error through
     * {@code error} when it is not.
     *
     * @param c the character actually read
     * @param expected the character required at this position
     */
    private final void skip_timestamp_validate(int c, int expected) {
        if (c != expected) {
            // the parenthetical in the message is now closed; the
            // original text left it unbalanced
            error("invalid character '"+(char)c+
                  "' encountered in timestamp (when '"+(char)expected+
                  "' was expected)");
        }
    }

    /**
     * Helper method for skipping embedded digits inside a timestamp value.
     * This overload skips exactly the number indicated, and errors if a
     * non-digit is encountered.
     *
     * @param len the exact number of digits to skip
     * @return the character read immediately after the digits
     * @throws IOException if reading the input fails
     */
    private final int skip_timestamp_past_digits(int len) throws IOException
    {
        // special case of the other overload, with min == max
        return skip_timestamp_past_digits(len, len);
    }

    /**
     * Helper method for skipping embedded digits inside a timestamp value.
     * This overload skips at least {@code min} and at most {@code max}
     * digits, and reports an error if a non-digit is encountered among
     * the first {@code min} characters read.
     *
     * @param min the number of digits that must be present
     * @param max the largest number of digits that will be skipped
     * @return the first character read that was not skipped as a digit
     * @throws IOException if reading the input fails
     */
    private final int skip_timestamp_past_digits(int min, int max)
        throws IOException
    {
        int remaining = max;

        // the mandatory part: every one of these has to be a digit
        for (int required = min; required > 0; required--, remaining--) {
            final int ch = read_char();
            if (!IonTokenConstsX.isDigit(ch)) {
                error("invalid character '"+(char)ch+"' encountered in timestamp");
            }
        }

        // the optional part: stop at the first non-digit
        for (; remaining > 0; remaining--) {
            final int ch = read_char();
            if (!IonTokenConstsX.isDigit(ch)) {
                return ch;
            }
        }

        // max digits consumed; hand back the character that follows
        return read_char();
    }
    /**
     * Reads the text of a numeric value (int, hex, binary, decimal, float
     * or timestamp) into the buffer and reports which Ion type it is.
     * <p>
     * The value may start with a '+' or '-' sign followed by a digit.  A
     * leading "0" followed by a radix prefix is handed to loadRadixValue,
     * and four digits followed by '-' or 'T' are handed to load_timestamp.
     * The lookahead character that ends the value is pushed back before
     * returning (see load_finish_number).
     *
     * @param sb receives the text of the value
     * @return the Ion type of the value that was read
     * @throws IOException if reading the input fails
     */
    protected IonType load_number(StringBuilder sb) throws IOException
    {
        boolean has_sign = false;
        int     t, c;

        // this reads int, float, decimal and timestamp strings
        // anything starting with a +, a - or a digit
        //case '0': case '1': case '2': case '3': case '4':
        //case '5': case '6': case '7': case '8': case '9':
        //case '-': case '+':

        //start_pos = _stream.getPosition();
        c = read_char();
        has_sign = ((c == '-') || (c == '+'));
        if (has_sign) {
            // if there is a sign character, we just consume it
            // here and get whatever is next in line
            sb.append((char)c);
            c = read_char();
        }

        // first leading digit - to look for hex and
        // to make sure that there is at least 1 digit (or
        // this isn't really a number)
        if (!IonTokenConstsX.isDigit(c)) {
            // if it's not a digit, this isn't a number
            // the only non-digit it could have been was a
            // sign character, and we'll have read past that
            // by now
            // TODO this will be a confusing error message,
            // but I can't figure out when it will be reached.
            bad_token(c);
        }

        // the first digit is a special case
        boolean starts_with_zero = (c == '0');
        if (starts_with_zero) {
            // if it's a leading 0 check for a hex or binary value
            int c2 = read_char();
            if (Radix.HEX.isPrefix(c2)) {
                sb.append((char)c);
                c = loadRadixValue(sb, has_sign, c2, Radix.HEX);
                return load_finish_number(sb, c, IonTokenConstsX.TOKEN_HEX);
            } else if (Radix.BINARY.isPrefix(c2)) {
                sb.append((char) c);
                c = loadRadixValue(sb, has_sign, c2, Radix.BINARY);
                return load_finish_number(sb, c, IonTokenConstsX.TOKEN_BINARY);
            }
            // not a radix prefix, back up and try again
            unread_char(c2);
        }

        // remaining (after the first, c is the first) leading digits
        c = load_digits(sb, c);

        if (c == '-' || c == 'T') {
            // this better be a timestamp and it starts with a 4 digit
            // year followed by a dash and no leading sign
            if (has_sign) {
                error("Numeric value followed by invalid character: "
                      + sb + (char)c);
            }
            int len = sb.length();
            if (len != 4) {
                error("Numeric value followed by invalid character: "
                      + sb + (char)c);
            }
            IonType tt = load_timestamp(sb, c);
            return tt;
        }

        if (starts_with_zero) {
            // Ion doesn't allow leading zeros, so make sure our buffer only
            // has one character.
            int len = sb.length();
            if (has_sign) {
                len--; // we don't count the sign
            }
            if (len != 1) {
                error("Invalid leading zero in number: " + sb);
            }
        }

        if (c == '.') {
            // so if it's a float of some sort
            // mark it as at least a DECIMAL
            // and read the "fraction" digits
            sb.append((char)c);
            c = read_char();
            c = load_digits(sb, c);
            t = IonTokenConstsX.TOKEN_DECIMAL;
        }
        else {
            t = IonTokenConstsX.TOKEN_INT;
        }

        // see if we have an exponential as in 2d+3
        // ('e' / 'E' makes the value a float, 'd' / 'D' a decimal)
        if (c == 'e' || c == 'E') {
            t = IonTokenConstsX.TOKEN_FLOAT;
            sb.append((char)c);
            c = load_exponent(sb);  // the unused lookahead char
        }
        else if (c == 'd' || c == 'D') {
            t = IonTokenConstsX.TOKEN_DECIMAL;
            sb.append((char)c);
            c = load_exponent(sb);
        }
        return load_finish_number(sb, c, t);
    }

    /**
     * Common ending for loading a numeric value: checks that the
     * lookahead character terminates a value, pushes that character back,
     * and maps the token type to its Ion type.
     *
     * @param numericText the text read so far, used in the error message
     * @param c the lookahead character read after the value
     * @param token the token type of the value
     * @return the Ion type that corresponds to {@code token}
     * @throws IOException if reading the input fails
     */
    private final IonType load_finish_number(CharSequence numericText, int c,
                                             int token)
    throws IOException
    {
        // all forms of numeric need to stop someplace rational
        final boolean terminated = is_value_terminating_character(c);
        if (!terminated) {
            error("Numeric value followed by invalid character: "
                  + numericText + (char)c);
        }

        // the lookahead is not part of the value, so it goes back
        unread_char(c);
        return IonTokenConstsX.ion_type_of_scalar(token);
    }
    /**
     * Loads the part of a number that follows the exponent marker: an
     * optional sign, a run of digits, and optionally a '.' followed by
     * more digits.  Everything consumed is appended to the buffer.
     *
     * @param sb receives the exponent text
     * @return the lookahead character that was not used, so that the
     *  caller can unread it
     * @throws IOException if reading the input fails
     */
    private final int load_exponent(StringBuilder sb) throws IOException
    {
        int next = read_char();

        final boolean signed = (next == '-' || next == '+');
        if (signed) {
            sb.append((char)next);
            next = read_char();
        }
        next = load_digits(sb, next);

        if (next == '.') {
            sb.append((char)next);
            next = load_digits(sb, read_char());
        }
        return next;
    }

    /**
     * Accumulates digits into the buffer, starting with the given
     * character.  When that character is a digit it is appended and the
     * remainder is read through readNumeric.
     *
     * @return the first non-digit character on the input. Could be the
     *  given character if it's not a digit.
     *
     * @see IonTokenConstsX#isDigit(int)
     */
    private final int load_digits(StringBuilder sb, int c) throws IOException
    {
        if (IonTokenConstsX.isDigit(c)) {
            sb.append((char) c);
            return readNumeric(sb, Radix.DECIMAL, NumericState.DIGIT);
        }

        // nothing to load, hand the character straight back
        return c;
    }

    /**
     * Reads exactly {@code len} digits from the input and appends them
     * to the buffer.  A non-digit is reported through bad_token.
     * <p>
     * A {@code len} of zero or less reads nothing.  (The earlier
     * switch-based form fell through from its default label and read
     * four digits in that case.)
     *
     * @param sb receives the digits
     * @param len the number of digits to read
     * @throws IOException if reading the input fails
     */
    private final void load_fixed_digits(StringBuilder sb, int len)
        throws IOException
    {
        for (int remaining = len; remaining > 0; remaining--) {
            int c = read_char();
            if (!IonTokenConstsX.isDigit(c)) bad_token(c);
            sb.append((char)c);
        }
    }
    /**
     * Loads the remainder of a timestamp into the buffer.  The caller has
     * already loaded the 4 digit year and read the '-' or 'T' that
     * follows it.
     * <p>
     * The forms accepted here are yyyyT, yyyy-mmT, yyyy-mm-dd,
     * yyyy-mm-ddT, and yyyy-mm-ddThh:mm[:ss[.fraction]] followed by an
     * offset, which is 'Z', 'z', or a sign with hh:mm.  An offset is
     * required whenever a time is present.
     *
     * @param sb buffer holding the year; receives the rest of the text
     * @param c the '-' or 'T' read after the year
     * @return the Ion type for a timestamp token, as produced by
     *  load_finish_number
     * @throws IOException if reading the input fails
     */
    private final IonType load_timestamp(StringBuilder sb, int c)
        throws IOException
    {
        // we read the year in our caller, we should only be
        // here if we read 4 digits and then a dash or a 'T'
        assert (c == '-' || c == 'T');

        sb.append((char)c);

        // if it's 'T' we're done: yyyyT
        if (c == 'T') {
            c = read_char(); // because we'll unread it before we return
            return load_finish_number(sb, c, IonTokenConstsX.TOKEN_TIMESTAMP);
        }

        // read month
        load_fixed_digits(sb, 2);

        c = read_char();
        if (c == 'T') {
            // yyyy-mmT
            sb.append((char)c);
            c = read_char(); // because we'll unread it before we return
            return load_finish_number(sb, c, IonTokenConstsX.TOKEN_TIMESTAMP);
        }
        if (c != '-') bad_token(c);

        // read day
        sb.append((char)c);
        load_fixed_digits(sb, 2);

        // look for the 'T', otherwise we're done (and happy about it)
        c = read_char();
        if (c != 'T') {
            return load_finish_number(sb, c, IonTokenConstsX.TOKEN_TIMESTAMP);
        }

        // so either we're done or we must have at least hours and minutes
        // hour
        sb.append((char)c);
        c = read_char();
        if (!IonTokenConstsX.isDigit(c)) {
            // yyyy-mm-ddT with no time
            return load_finish_number(sb, c, IonTokenConstsX.TOKEN_TIMESTAMP);
        }
        sb.append((char)c);
        load_fixed_digits(sb,1); // we already read the first digit
        c = read_char();
        if (c != ':') bad_token(c);

        // minutes
        sb.append((char)c);
        load_fixed_digits(sb, 2);
        c = read_char();
        if (c == ':') {
            // seconds are optional
            // and first we'll have the whole seconds
            sb.append((char)c);
            load_fixed_digits(sb, 2);
            c = read_char();
            if (c == '.') {
                sb.append((char)c);
                c = read_char();
                // Per spec and W3C Note http://www.w3.org/TR/NOTE-datetime
                // We require at least one digit after the decimal point.
                if (!IonTokenConstsX.isDigit(c)) {
                    expected_but_found("at least one digit after timestamp's decimal point", c);
                }
                c = load_digits(sb,c);
            }
        }

        // since we have a time, we have to have a timezone of some sort
        // the timezone offset starts with a '+' '-' 'Z' or 'z'
        if (c == 'z' || c == 'Z') {
            sb.append((char)c);
            // read ahead since we'll check for a valid ending in a bit
            c = read_char();
        }
        else if (c == '+' || c == '-') {
            // then ... hours of time offset
            sb.append((char)c);
            load_fixed_digits(sb, 2);
            c = read_char();
            if (c != ':') {
                // those hours need their minutes if it wasn't a 'z'
                // (above) then it has to be a +/- hours { : minutes }
                bad_token(c);
            }
            // and finally the *not* optional minutes of time offset
            sb.append((char)c);
            load_fixed_digits(sb, 2);
            c = read_char();
        }
        else {
            // some sort of offset is required with a time value
            // if it wasn't a 'z' (above) then it has to be a +/- hours { : minutes }
            bad_token(c);
        }
        return load_finish_number(sb, c, IonTokenConstsX.TOKEN_TIMESTAMP);
    }

    /**
     * Loads the body of a number written with a radix prefix.  The prefix
     * character is checked against the radix and appended to the buffer,
     * and the rest is read through readNumeric.
     *
     * @param sb receives the prefix character and the numeric content
     * @param has_sign not used by this method
     * @param c2 the radix prefix character, already read by the caller
     * @param radix the radix of the value
     * @return the value returned by readNumeric
     * @throws IOException if reading the input fails
     */
    private final int loadRadixValue(StringBuilder sb, boolean has_sign, int c2, Radix radix)
        throws IOException
    {
        radix.assertPrefix(c2);
        sb.append((char) c2);

        return readNumeric(sb, radix);
    }

    /**
     * Skips over an identifier symbol.  The first character is consumed
     * unconditionally, then reading continues for as long as the
     * characters are valid symbol characters.
     *
     * @param sp save point whose end is marked (with offset 0) once the
     *  symbol has been consumed; may be null
     * @return the first character read that is not a valid symbol character
     * @throws IOException if reading the input fails
     */
    private final int skip_over_symbol_identifier(SavePoint sp) throws IOException
    {
        int next;
        do {
            next = read_char();
        } while (IonTokenConstsX.isValidSymbolCharacter(next));

        if (sp != null) {
            sp.markEnd(0);
        }
        return next;
    }

    /**
     * Loads an identifier symbol into the buffer.  Characters are
     * appended for as long as they are valid symbol characters; the
     * character that ends the symbol is pushed back onto the input.
     *
     * @param sb receives the text of the symbol
     * @throws IOException if reading the input fails
     */
    protected void load_symbol_identifier(StringBuilder sb) throws IOException
    {
        for (;;) {
            final int next = read_char();
            if (!IonTokenConstsX.isValidSymbolCharacter(next)) {
                // not part of the symbol, so it goes back
                unread_char(next);
                return;
            }
            sb.append((char)next);
        }
    }

    /**
     * Skips over an operator symbol, or over +inf / -inf when
     * peek_inf_helper recognizes one.
     *
     * @param sp save point whose end is marked (with offset 0) once the
     *  symbol has been consumed; may be null
     * @return the character read immediately after the symbol
     * @throws IOException if reading the input fails
     */
    private int skip_over_symbol_operator(SavePoint sp) throws IOException
    {
        int next = read_char();

        // lookahead for +inf and -inf
        // (peek_inf_helper consumes the inf if it succeeds)
        if (!peek_inf_helper(next)) {
            assert(IonTokenConstsX.isValidExtendedSymbolCharacter(next));

            // not +/- inf, so just run through the operator characters
            while (IonTokenConstsX.isValidExtendedSymbolCharacter(next)) {
                next = read_char();
            }
        }
        else {
            // the inf is already consumed; only the lookahead
            // character is left to fetch
            next = read_char();
        }

        if (sp != null) {
            sp.markEnd(0);
        }
        return next;
    }
    /**
     * Loads an operator symbol into the buffer.  A sign that
     * peek_inf_helper recognizes as the start of +inf / -inf is loaded as
     * the sign followed by "inf"; otherwise characters are appended for
     * as long as they are valid extended symbol characters and the
     * character that ends the symbol is pushed back.
     *
     * @param sb receives the text of the symbol
     * @throws IOException if reading the input fails
     */
    protected void load_symbol_operator(StringBuilder sb) throws IOException
    {
        int next = read_char();

        // lookahead for +inf and -inf
        // this will consume the inf if it succeeds
        final boolean is_sign = (next == '+' || next == '-');
        if (is_sign && peek_inf_helper(next)) {
            sb.append((char)next).append("inf");
            return;
        }

        assert(IonTokenConstsX.isValidExtendedSymbolCharacter(next));

        // not +/- inf, so the characters are just read normally
        for (; IonTokenConstsX.isValidExtendedSymbolCharacter(next); next = read_char()) {
            sb.append((char)next);
        }
        unread_char(next);
    }
    /**
     * Skips over the content of a single quoted symbol up to and
     * including its closing quote.  The character after a backslash is
     * consumed without being examined, so an escaped quote does not end
     * the symbol.
     *
     * @param sp save point whose end is marked (with offset -1) when the
     *  closing quote is found; may be null
     * @return the next character beyond the token
     * @throws IOException if reading the input fails
     */
    private final int skip_single_quoted_string(SavePoint sp) throws IOException
    {
        // the position should always be correct here
        // since there's no reason to lookahead into a
        // quoted symbol

        for (;;) {
            final int ch = read_string_char(ProhibitedCharacters.NONE);

            if (ch == -1) {
                // as in the switch this replaces, control carries on
                // into the closing quote handling should this return
                unexpected_eof();
            }
            if (ch == -1 || ch == '\'') {
                if (sp != null) {
                   sp.markEnd(-1);
                }
                return read_char(); // Return the next character beyond the token
            }
            if (ch == '\\') {
                // step over whatever is escaped
                read_char();
            }
        }
    }

    /**
     * Loads the content of a single quoted symbol into the buffer,
     * processing escape sequences.  Loading stops at the closing quote
     * or at EOF.
     * <p>
     * When this is not a clob, characters outside the 7-bit range are
     * decoded through read_large_char_sequence, surrogate pairing is
     * checked, and code points that need it are appended as a surrogate
     * pair.
     *
     * @param sb receives the content
     * @param is_clob true when the content belongs to a clob
     * @return the character that ended the load: the closing quote or -1
     * @throws IOException if reading the input fails
     */
    protected int load_single_quoted_string(StringBuilder sb, boolean is_clob)
        throws IOException
    {
        int c;
        boolean expectLowSurrogate = false;

        for (;;) {
            c = read_string_char(ProhibitedCharacters.NONE);
            switch (c) {
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
                // escaped newlines contribute nothing to the content
                continue;
            case -1:
            case '\'':
                if (!is_clob) {
                    // reports a high surrogate left without its low half
                    check_for_low_surrogate(c, expectLowSurrogate);
                }
                return c;
            // new line normalization and counting is handled in read_char
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
                // no break here: presumably bad_token() always throws,
                // otherwise control falls into the next case
                bad_token(c);
            case '\\':
                // TODO why not read_char_escaped() ?
                //  That's how load_double_quoted_string works.
                c = read_char();
                c = read_escaped_char_content_helper(c, is_clob);
                break;
            default:
                if (!is_clob && !IonTokenConstsX.is7bitValue(c)) {
                    c = read_large_char_sequence(c);
                }
            }
            // if this isn't a clob we need to decode UTF8 and
            // handle surrogate encoding (otherwise we don't care)
            if (!is_clob) {
                expectLowSurrogate = check_for_low_surrogate(c, expectLowSurrogate);

                if (IonUTF8.needsSurrogateEncoding(c)) {
                    sb.append(IonUTF8.highSurrogate(c));
                    c = IonUTF8.lowSurrogate(c);
                } else {
                    expectLowSurrogate = IonUTF8.isHighSurrogate(c);
                }
            }
            // NOTE(review): in the clob case a character for which
            // is8bitValue() is true is rejected - confirm that helper's
            // meaning against the intended clob rules
            else if (IonTokenConstsX.is8bitValue(c)) {
                bad_token(c);
            }
            sb.append((char)c);
        }
    }

    /**
     * Skips over the content of a double quoted string up to and
     * including its closing quote, then marks the end of the save point
     * with offset -1.
     *
     * @param sp save point to mark; may be null
     * @throws IOException if reading the input fails
     */
    private void skip_double_quoted_string(SavePoint sp) throws IOException
    {
        skip_double_quoted_string_helper();
        if (sp != null) {
            sp.markEnd(-1);
        }
    }

    /**
     * Reads forward to the closing quote of a double quoted string.  The
     * character after a backslash is consumed without being examined, so
     * an escaped quote does not end the string.  EOF and unescaped
     * newlines inside the string are reported as errors.
     *
     * @throws IOException if reading the input fails
     */
    private final void skip_double_quoted_string_helper() throws IOException
    {
        int c;
        for (;;) {
            c = read_string_char(ProhibitedCharacters.NONE);
            switch (c) {
            // the first two groups have no break; they rely on the
            // error calls throwing
            case -1:
                unexpected_eof(); // throws
            // new line normalization and counting is handled in read_char
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
                bad_token(c); // throws
            case '"':
                return;
            case '\\':
                // step over whatever is escaped
                c = read_char();
                break;
            }
        }
    }

    /**
     * Loads the content of a double quoted string into the buffer,
     * processing escape sequences.  Loading stops at the closing quote
     * or at EOF.
     * <p>
     * When this is not a clob, characters outside the 7-bit range are
     * decoded through read_large_char_sequence, surrogate pairing is
     * checked, and code points that need it are appended as a surrogate
     * pair.
     *
     * @param sb receives the content
     * @param is_clob true when the content belongs to a clob
     * @return the character that ended the load: the closing quote or -1
     * @throws IOException if reading the input fails
     */
    protected int load_double_quoted_string(StringBuilder sb, boolean is_clob)
        throws IOException
    {
        int c;
        boolean expectLowSurrogate = false;

        for (;;) {
            c = read_string_char(ProhibitedCharacters.SHORT_CHAR);
            switch (c) {
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
                // escaped newlines contribute nothing to the content
                continue;
            case -1:
            case '"':
                if (!is_clob) {
                    // reports a high surrogate left without its low half
                    check_for_low_surrogate(c, expectLowSurrogate);
                }
                return c;
            // new line normalization and counting is handled in read_char
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
                // no break here: presumably bad_token() always throws,
                // otherwise control falls into the next case
                bad_token(c);
            case '\\':
                c = read_char_escaped(c, is_clob);
                break;
            default:
                if (!is_clob && !IonTokenConstsX.is7bitValue(c)) {
                    c = read_large_char_sequence(c);
                }
                break;
            }
            // if this isn't a clob we need to decode UTF8 and
            // handle surrogate encoding (otherwise we don't care)
            if (!is_clob) {
                expectLowSurrogate = check_for_low_surrogate(c, expectLowSurrogate);

                if (IonUTF8.needsSurrogateEncoding(c)) {
                    sb.append(IonUTF8.highSurrogate(c));
                    c = IonUTF8.lowSurrogate(c);
                } else {
                    expectLowSurrogate = IonUTF8.isHighSurrogate(c);
                }
            }
            sb.append((char)c);
        }
    }

    /**
     * Checks surrogate pairing for the character just read.  A low
     * surrogate that was not expected, or the absence of one that was
     * expected, is reported as an error.
     *
     * @param c the character just read
     * @param expectLowSurrogate true when the previous character was a
     *  high surrogate
     * @return always false; callers assign this to their
     *  "expect low surrogate" flag
     */
    private boolean check_for_low_surrogate(int c, boolean expectLowSurrogate) throws IonException
    {
        final boolean isLow = IonUTF8.isLowSurrogate(c);

        if (isLow && !expectLowSurrogate) {
            error("unexpected low surrogate " + printCodePointAsString(c));
        }
        else if (!isLow && expectLowSurrogate) {
            expected_but_found("a low surrogate", c);
        }

        // whether the pair was completed or nothing was pending, there
        // is no low surrogate outstanding any more
        return false;
    }

    /**
     * Reads a single logical character from the content of a double
     * quoted string.
     * <p>
     * A closing quote is pushed back and reported as
     * CHAR_SEQ_STRING_TERMINATOR.  A backslash is resolved through
     * read_char_escaped.  When this is not a clob, a character outside
     * the 7-bit range is decoded through read_large_char_sequence.
     *
     * @param is_clob true when the content belongs to a clob, in which
     *  case a character above 127 is rejected
     * @return the character read, the string terminator marker, or -1
     * @throws IOException if reading the input fails
     */
    protected int read_double_quoted_char(boolean is_clob) throws IOException
    {
        final int ch = read_char();
        if (is_clob && ch > 127) {
            throw new IonReaderTextTokenException("non ASCII character in clob: " + ch);
        }

        if (ch == '"') {
            // leave the quote on the input and report the end of the string
            unread_char(ch);
            return CharacterSequence.CHAR_SEQ_STRING_TERMINATOR;
        }
        if (ch == -1) {
            return ch;
        }
        if (ch == '\\') {
            return read_char_escaped(ch, is_clob);
        }
        if (!is_clob && !IonTokenConstsX.is7bitValue(ch)) {
            return read_large_char_sequence(ch);
        }
        return ch;
    }

    /**
     * Skip to the end of a triple quoted string sequence, ignoring any
     * comments encountered between triple quoted string elements.
     * Delegates to the two argument overload with CommentStrategy.IGNORE.
     *
     * @param sp save point whose end is marked as segments are closed;
     *  may be null
     * @throws IOException if reading the input fails
     */
    private void skip_triple_quoted_string(SavePoint sp) throws IOException
    {
        skip_triple_quoted_string(sp, CommentStrategy.IGNORE);
    }

    /**
     * Skip to the end of a triple quoted string sequence within a clob,
     * erroring on any comments encountered between triple quoted string
     * elements.  Delegates to skip_triple_quoted_string with
     * CommentStrategy.ERROR.
     *
     * @param sp save point whose end is marked as segments are closed;
     *  may be null
     * @throws IOException if reading the input fails
     */
    private void skip_triple_quoted_clob_string(SavePoint sp) throws IOException
    {
        skip_triple_quoted_string(sp, CommentStrategy.ERROR);
    }

    /**
     * Skips to the end of a triple quoted string, following it across
     * adjacent segments. Must be called after the opening three quotes
     * have been consumed. On return the first character after the last
     * segment (and any whitespace following it) has been pushed back.
     *
     * @param sp save point whose end is marked while scanning; may be null
     * @param commentStrategy passed to skip_over_whitespace when looking
     *        for a following segment
     * @throws IOException
     */
    private void skip_triple_quoted_string(SavePoint sp, CommentStrategy commentStrategy) throws IOException
    {
        // starts AFTER the 3 quotes have been consumed
        int c;
        for (;;) {
            c = read_char();
            switch (c) {
            case -1:
                // unexpected_eof() always throws, so control never
                // actually falls through into the quote case
                unexpected_eof();
            case '\'':
                // NOTE(review): when the character read here is not a quote
                // it is consumed without being re-dispatched, so a backslash
                // directly after a lone quote does not get the escape
                // handling below - confirm this is acceptable
                c = read_char();
                if (c == '\'') { // 2nd quote
                    c = read_char(); // possibly the 3rd
                    if (sp != null) {
                        // NOTE(review): presumably -3 places the end before the
                        // three characters just read - confirm against SavePoint.markEnd
                        sp.markEnd(-3);
                    }
                    if (c == '\'') { // it is the 3rd quote - end of this segment
                        c = skip_over_whitespace(commentStrategy);
                        if (c == '\'' && is_2_single_quotes_helper()) {
                            // there's another segment so read the next segment as well
                            break;
                        }
                        // end of last segment
                        unread_char(c);
                        return;
                    }
                }
                break;
            case '\\':
                // consume the character after the backslash so that an
                // escaped quote is not mistaken for part of a terminator
                c = read_char();
               break;
            }
        }
    }

    /**
     * Reads the content of a triple quoted string (all of its segments)
     * and appends it to {@code sb}.
     *
     * @param sb receives the decoded characters
     * @param is_clob true when the string is the body of a clob; surrogate
     *        checking and surrogate encoding are skipped in that case
     * @return the value that ended the read: either
     *         {@code CHAR_SEQ_STRING_TERMINATOR} or {@code CHAR_SEQ_EOF}
     * @throws IOException
     */
    protected int load_triple_quoted_string(StringBuilder sb, boolean is_clob)
        throws IOException
    {
        int c;
        // true while the last appended char was a high surrogate that
        // still needs its low surrogate
        boolean expectLowSurrogate = false;

        for (;;) {
            c = read_triple_quoted_char(is_clob);
            switch(c) {
            case CharacterSequence.CHAR_SEQ_STRING_TERMINATOR:
            case CharacterSequence.CHAR_SEQ_EOF: // was EOF
                if (!is_clob) {
                    // errors out if a high surrogate was left unpaired
                    check_for_low_surrogate(c, expectLowSurrogate);
                }
                return c;
            // new line normalization and counting is handled in read_char
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1:
                c = '\n';
                break;
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2:
                // TODO: uncomment if we don't want to normalize end of line: c = '\r';
                c = '\n';
                break;
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
                // TODO: uncomment if we don't want to normalize end of line: sb.append('\r');
                c = '\n';
                break;
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
                // an escaped newline contributes nothing to the value
                continue;
            case CharacterSequence.CHAR_SEQ_STRING_NON_TERMINATOR:
                // boundary between two segments: nothing is appended, but a
                // pending high surrogate may not span the boundary
                if (!is_clob) {
                    expectLowSurrogate = check_for_low_surrogate(c, expectLowSurrogate);
                }
                continue;
            default:
                break;
            }
            // if this isn't a clob we need to decode UTF8 and
            // handle surrogate encoding (otherwise we don't care)
            if (!is_clob) {
                expectLowSurrogate = check_for_low_surrogate(c, expectLowSurrogate);

                if (IonUTF8.needsSurrogateEncoding(c)) {
                    // append the high half now; the low half is appended below
                    sb.append(IonUTF8.highSurrogate(c));
                    c = IonUTF8.lowSurrogate(c);
                } else {
                    expectLowSurrogate = IonUTF8.isHighSurrogate(c);
                }
            }
            sb.append((char)c);
        }
    }

    /**
     * Reads one logical character of a triple quoted string or clob.
     * <ul>
     *   <li>a closing triple quote that is followed (after whitespace) by
     *       another opening triple quote yields
     *       {@code CHAR_SEQ_STRING_NON_TERMINATOR}</li>
     *   <li>any other closing triple quote yields
     *       {@code CHAR_SEQ_STRING_TERMINATOR}, with the look-ahead
     *       character pushed back</li>
     *   <li>a backslash is resolved through {@code read_char_escaped}</li>
     *   <li>newline markers, escaped newline markers and -1 are passed
     *       through unchanged</li>
     *   <li>outside of clobs, a value that is not 7 bit is completed by
     *       {@code read_large_char_sequence}</li>
     * </ul>
     *
     * @param is_clob true when the enclosing value is a clob
     * @return the character or one of the markers described above
     * @throws IOException
     */
    protected int read_triple_quoted_char(boolean is_clob) throws IOException
    {
        final int ch = read_string_char(ProhibitedCharacters.LONG_CHAR);
        if (is_clob && ch > 127) {
            throw new IonReaderTextTokenException("non ASCII character in clob: " + ch);
        }

        if (ch == '\'') {
            if (!is_2_single_quotes_helper()) {
                // a lone quote is ordinary content
                return ch;
            }
            // the closing triple quote has been consumed; look past any
            // whitespace for the opening triple quote of another segment
            final int lookahead = skip_over_whitespace();
            if (lookahead == '\'' && is_2_single_quotes_helper()) {
                // positioned just before the first char of the next segment
                return CharacterSequence.CHAR_SEQ_STRING_NON_TERMINATOR;
            }
            // that was the last segment, give back what was read too far
            unread_char(lookahead);
            return CharacterSequence.CHAR_SEQ_STRING_TERMINATOR;
        }

        if (ch == '\\') {
            return read_char_escaped(ch, is_clob);
        }

        final boolean passThrough =
               ch == -1
            || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
            || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
            || ch == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3
            || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1
            || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2
            || ch == CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3;
        if (passThrough || is_clob || IonTokenConstsX.is7bitValue(ch)) {
            return ch;
        }
        return read_large_char_sequence(ch);
    }

    /**
     * Skips the content of a lob, choosing the skip routine from the token
     * that opened the content. Skips over the closing }} too.
     *
     * @param lobToken token that started the lob content
     * @param sp save point handed on to the skip routine
     * @throws IOException
     */
    protected void skip_over_lob(int lobToken, SavePoint sp) throws IOException {
        if (lobToken == IonTokenConstsX.TOKEN_STRING_DOUBLE_QUOTE) {
            skip_double_quoted_string(sp);
            skip_clob_close_punctuation();
            return;
        }
        if (lobToken == IonTokenConstsX.TOKEN_STRING_TRIPLE_QUOTE) {
            skip_triple_quoted_clob_string(sp);
            skip_clob_close_punctuation();
            return;
        }
        if (lobToken == IonTokenConstsX.TOKEN_OPEN_DOUBLE_BRACE) {
            skip_over_blob(sp);
            return;
        }
        error("unexpected token "+IonTokenConstsX.getTokenName(lobToken)+
              " encountered for lob content");
    }

    /**
     * Loads the content of a lob into {@code sb}, choosing the load
     * routine from the token that opened the content.
     *
     * @param lobToken token that started the lob content
     * @param sb receives the loaded content
     * @throws IOException
     */
    protected void load_clob(int lobToken, StringBuilder sb) throws IOException
    {
        if (lobToken == IonTokenConstsX.TOKEN_STRING_DOUBLE_QUOTE) {
            load_double_quoted_string(sb, true);
        }
        else if (lobToken == IonTokenConstsX.TOKEN_STRING_TRIPLE_QUOTE) {
            load_triple_quoted_string(sb, true);
        }
        else if (lobToken == IonTokenConstsX.TOKEN_OPEN_DOUBLE_BRACE) {
            load_blob(sb);
        }
        else {
            error("unexpected token "+IonTokenConstsX.getTokenName(lobToken)+
                  " encountered for lob content");
        }
    }

    /**
     * Resolves an escape sequence to the character it stands for. Escaped
     * newlines produce no character: the next string character is read
     * and processed in their place.
     *
     * @param c the value that triggered the call (the callers in this
     *        file pass the backslash they just read)
     * @param is_clob true when reading clob content; the result must then
     *        be an 8 bit value
     * @return the resolved character, or {@code CHAR_SEQ_EOF}
     * @throws IOException
     */
    private final int read_char_escaped(int c, boolean is_clob)
        throws IOException
    {
        for (;;) {
            switch (c) {
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
            case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
                // loop again, we don't want empty escape chars
                c = read_string_char(ProhibitedCharacters.NONE);
                continue;
            case '\\':
                // read the character that selects the escape
                c = read_char();
                if (c < 0) {
                    unexpected_eof();
                }
                c = read_escaped_char_content_helper(c, is_clob);
                if (c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
                 || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
                 || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3
                ) {
                    // loop again, we don't want empty escape chars
                    c = read_string_char(ProhibitedCharacters.NONE);
                    continue;
                }
                if (c == IonTokenConstsX.ESCAPE_NOT_DEFINED) bad_escape_sequence();
                break;
            default:
                // reached when the character read after an escaped newline
                // is an ordinary character
                if (!is_clob && !IonTokenConstsX.is7bitValue(c)) {
                    c = read_large_char_sequence(c);
                }
                break;
            }
            break; // at this point we have a post-escaped character to return to the caller
        }

        if (c == CharacterSequence.CHAR_SEQ_EOF) return c;
        if (is_clob && !IonTokenConstsX.is8bitValue(c)) {
            error("invalid character ["+ printCodePointAsString(c)+"] in CLOB");
        }
        return c;
    }

    /**
     * Completes a character whose first unit is not a 7 bit value. For
     * byte data the remaining UTF-8 bytes are read. Otherwise, when
     * {@code c} is a high surrogate that is directly followed by a low
     * surrogate, the two are combined into one scalar.
     *
     * @param c the first unit of the character
     * @return the completed character, or {@code c} itself when there is
     *         nothing to combine
     * @throws IOException
     */
    private final int read_large_char_sequence(int c) throws IOException
    {
        if (_stream._is_byte_data) {
            return read_ut8_sequence(c);
        }
        if (!_Private_IonConstants.isHighSurrogate(c)) {
            return c;
        }

        final int following = read_char();
        if (_Private_IonConstants.isLowSurrogate(following)) {
            return _Private_IonConstants.makeUnicodeScalar(c, following);
        }
        // we don't always pair up surrogates here
        // our caller does that
        unread_char(following);
        return c;
    }
    /**
     * Reads the remaining bytes of a UTF-8 sequence and combines them
     * with the first byte into a scalar value.
     *
     * @param c the first byte of the sequence; must not be a 7 bit value
     * @return the decoded scalar; {@code c} itself when the sequence
     *         length is 1
     * @throws IOException
     */
    private final int read_ut8_sequence(int c) throws IOException
    {
        // this should have the high order bit set
        assert !IonTokenConstsX.is7bitValue(c);

        final int byteCount = IonUTF8.getUTF8LengthFromFirstByte(c);
        if (byteCount == 1) {
            return c;
        }
        if (byteCount == 2) {
            final int second = read_char();
            return IonUTF8.twoByteScalar(c, second);
        }
        if (byteCount == 3) {
            final int second = read_char();
            final int third = read_char();
            return IonUTF8.threeByteScalar(c, second, third);
        }
        if (byteCount == 4) {
            final int second = read_char();
            final int third = read_char();
            final int fourth = read_char();
            return IonUTF8.fourByteScalar(c, second, third, fourth);
        }
        error("invalid UTF8 starting byte");
        return c;
    }

    /**
     * Skips the content of a blob up to and including both closing
     * curly braces, marking the end of {@code sp} along the way.
     *
     * @param sp save point to mark; may be null
     * @throws IOException
     */
    private void skip_over_blob(SavePoint sp) throws IOException
    {
        int ch = skip_over_blob_whitespace();
        while (ch != UnifiedInputStreamX.EOF && ch != '}') {
            ch = skip_over_blob_whitespace();
        }
        final boolean sawFirstBrace = (ch == '}');

        if (sp != null) {
            // we don't care about these last 2 closing curly braces
            // but we may have seen one of them already
            sp.markEnd(sawFirstBrace ? -1 : 0);
        }

        // stopping on anything but the first '}' means the input ran out
        if (!sawFirstBrace) {
            unexpected_eof();
        }

        ch = read_char();
        if (ch < 0) {
            unexpected_eof();
        }
        if (ch != '}') {
            error("improperly closed BLOB, "
                + IonTextUtils.printCodePointAsString(ch)
                + " encountered when '}' was expected");
        }

        if (sp != null) {
            sp.markEnd();
        }
    }
    /**
     * Decodes the base64 content of a blob, appending every decoded byte
     * to {@code sb} as one char, and then consumes the second closing
     * curly brace (decoding stops at the first one).
     *
     * @param sb receives one char per decoded byte
     * @throws IOException
     */
    protected void load_blob(StringBuilder sb) throws IOException {
        int c;

        for (;;) {
            c = read_base64_byte();
            if (c == UnifiedInputStreamX.EOF) {
                break;
            }
            // the cast matters: StringBuilder.append(int) would append the
            // decimal digits of the byte value rather than a single char
            sb.append((char) c);
        }
        // did we hit EOF or the first '}' ?
        if (_stream.isEOF()) unexpected_eof();

        c = read_char();
        if (c < 0) {
            unexpected_eof();
        }
        if (c != '}') {
            String message = "improperly closed BLOB, "
                           + IonTextUtils.printCodePointAsString(c)
                           + " encountered when '}' was expected";
            error(message);
        }
    }

    /**
     * Resolves the character that follows a backslash.
     *
     * @param c1 the character read directly after the backslash, or one
     *        of the newline sequence markers
     * @param is_clob true when reading clob content, where the
     *        {@code \}{@code u} and {@code \U} escapes are not allowed
     * @return the replacement character, the value of a hex escape, or
     *         the escaped newline marker matching a newline marker
     * @throws IOException
     */
    private final int read_escaped_char_content_helper(int c1, boolean is_clob)
        throws IOException
    {
        if (c1 < 0) {
            switch (c1) {
            // new line normalization and counting is handled in read_char
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_1:
                return CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1;
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_2:
                return CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2;
            case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
                return CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3;
            default:
                bad_escape_sequence(c1);
            }
        }
        if (!IonTokenConstsX.isValidEscapeStart(c1)) {
            bad_escape_sequence(c1);
        }
        int c2 = IonTokenConstsX.escapeReplacementCharacter(c1);
        switch (c2) {
        case IonTokenConstsX.ESCAPE_NOT_DEFINED:
            assert false
                : "invalid escape start characters (line " + ((char)c1)
                + " should have been removed by isValid";
            break;
        case IonTokenConstsX.ESCAPE_LITTLE_U:
            if (is_clob) {
                // report the character from the input; c2 only holds the
                // internal marker for this kind of escape
                bad_escape_sequence(c1);
            }
            c2 = read_hex_escape_sequence_value(4);
            break;
        case IonTokenConstsX.ESCAPE_BIG_U:
            if (is_clob) {
                // as above, c1 is the character the user actually wrote
                bad_escape_sequence(c1);
            }
            c2 = read_hex_escape_sequence_value(8);
            break;
        case IonTokenConstsX.ESCAPE_HEX:
            c2 = read_hex_escape_sequence_value(2);
            break;
        }
        return c2;
    }
    /**
     * Reads exactly {@code len} hex digits and returns the value they
     * encode.
     * <p>
     * A character that is not a hex digit is reported as an error here.
     * Previously -1 was returned for it (the same value that signals end
     * of input) and the "invalid hex digit" check after the loop could
     * never be reached. A value that is negative (overflow of an 8 digit
     * escape) or above {@link Character#MAX_CODE_POINT} is rejected as
     * well, so the result can never collide with the negative marker
     * values used by the callers.
     *
     * @param len number of hex digits to read
     * @return the decoded value, in the range 0..0x10FFFF
     * @throws IOException
     */
    private final int read_hex_escape_sequence_value(int len) throws IOException
    {
        int hexchar = 0;
        for (int remaining = len; remaining > 0; remaining--) {
            int c = read_char();
            if (c < 0) {
                unexpected_eof();
            }
            int d = IonTokenConstsX.hexDigitValue(c);
            if (d < 0) {
                String message = "invalid hex digit ["
                    + IonTextUtils.printCodePointAsString(c)
                    + "] in escape sequence";
                error(message);
            }
            hexchar = (hexchar << 4) + d;
        }
        if (hexchar < 0 || hexchar > Character.MAX_CODE_POINT) {
            error("escape sequence does not encode a valid Unicode code point");
        }
        return hexchar;
    }

    /**
     * Returns the next decoded byte of base64 content. Bytes that were
     * decoded ahead of time are handed out first; when none are waiting
     * the next group of input characters is decoded.
     *
     * @return the next byte, or whatever {@code read_base64_byte_helper}
     *         returns when no prefetched byte is waiting
     * @throws IOException
     */
    public final int read_base64_byte() throws IOException
    {
        if (_base64_prefetch_count >= 1) {
            // take the low byte of the stack and shift the rest down
            final int next = _base64_prefetch_stack & 0xff;
            _base64_prefetch_stack >>= 8;
            _base64_prefetch_count--;
            return next;
        }
        return read_base64_byte_helper();
    }
    /**
     * Decodes the next group of four base64 characters. The first decoded
     * byte is returned; the remaining zero to two bytes are placed on
     * {@code _base64_prefetch_stack}, lowest byte first, for later calls
     * of {@code read_base64_byte}.
     * <p>
     * The terminator character is only accepted as trailing padding of a
     * group (in the last one or the last two positions). Any other
     * placement is reported as an error instead of being decoded as if
     * it were data.
     *
     * @return the first decoded byte of the group, or
     *         {@code UnifiedInputStreamX.EOF} when the content ends (end of
     *         input or a closing curly brace)
     * @throws IOException
     */
    private final int read_base64_byte_helper() throws IOException
    {
        // if there's any data left to read (the normal case)
        // we'll read 4 characters off the input source and
        // generate 1-3 bytes to return to the user.  That
        // will be 1 byte returned immediately and 0-2 bytes
        // put on the _base64_prefetch_stack to return later

        int c = skip_over_blob_whitespace();
        if (c == UnifiedInputStreamX.EOF || c == '}') {
            // we'll figure how which is which by check the stream for eof
            return UnifiedInputStreamX.EOF;
        }

        int c1 = read_base64_getchar_helper(c);
        int c2 = read_base64_getchar_helper();
        int c3 = read_base64_getchar_helper();
        int c4 = read_base64_getchar_helper();

        // c1 can not be a terminator (that is rejected while reading it),
        // so only the 2nd position and a terminator in the 3rd position
        // that is followed by data are left to check
        if (c2 == BASE64_EOF || (c3 == BASE64_EOF && c4 != BASE64_EOF)) {
            error("invalid base64 image - misplaced terminator characters ['=']");
        }

        int b1, len = decode_base64_length(c1, c2, c3, c4);

        _base64_prefetch_stack = 0;
        _base64_prefetch_count = len - 1;
        switch (len) {
        default:
            String message =
                "invalid binhex sequence encountered at offset"+input_position();
            throw new IonReaderTextTokenException(message);
        case 3:
            int b3  = decode_base64_byte3(c1, c2, c3, c4);
            _base64_prefetch_stack = (b3 << 8) & 0xff00;
            // fall through
        case 2:
            int b2  = decode_base64_byte2(c1, c2, c3, c4);
            _base64_prefetch_stack |= (b2 & 0xff);
            // fall through
        case 1:
            b1 = decode_base64_byte1(c1, c2, c3, c4);
            // fall through
        }
        return b1;
    }
    /**
     * Converts the first character of a base64 group. The terminator
     * character is not allowed in this position.
     *
     * @param c a character that has already been read; expected to be
     *        neither end of input nor a closing curly brace
     * @return the converted character
     * @throws IOException
     */
    private final int read_base64_getchar_helper(int c) throws IOException {
        final boolean atEnd = (c == UnifiedInputStreamX.EOF) || (c == '}');
        assert !atEnd;

        if (atEnd) {
            return UnifiedInputStreamX.EOF;
        }
        if (c == BASE64_TERMINATOR_CHAR) {
            error("invalid base64 image - excess terminator characters ['=']");
        }
        return read_base64_getchar_helper2(c);
    }
    /**
     * Reads and converts a further character of a base64 group, skipping
     * blob whitespace first. Running into the end of input or a closing
     * curly brace here is an error.
     *
     * @return the converted character
     * @throws IOException
     */
    private final int read_base64_getchar_helper() throws IOException {
        final int next = skip_over_blob_whitespace();
        final boolean ranOut = (next == UnifiedInputStreamX.EOF) || (next == '}');
        if (ranOut) {
            error("invalid base64 image - too short");
        }
        return read_base64_getchar_helper2(next);
    }
    /**
     * Converts a base64 character to its value using
     * {@code BASE64_CHAR_TO_BIN}.
     *
     * @param c the character to convert; expected to be neither end of
     *        input nor a closing curly brace
     * @return the value of the character, or {@code BASE64_EOF} for the
     *         terminator character
     * @throws IOException
     */
    private final int read_base64_getchar_helper2(int c) throws IOException {
        assert c != UnifiedInputStreamX.EOF && c != '}';

        if (c == BASE64_TERMINATOR_CHAR) {
            // we're using a new EOF here since the '=' is in range
            // of 0-63 (6 bits) and we don't want to confuse it with
            // the normal EOF
            return BASE64_EOF;
        }

        final int value = BASE64_CHAR_TO_BIN[c & 0xff];
        final boolean usable =
            value != UnifiedInputStreamX.EOF && IonTokenConstsX.is8bitValue(c);
        if (usable) {
            return value;
        }
        throw new IonReaderTextTokenException(
            "invalid character "
            + Character.toString((char)c)
            + " encountered in base64 value at "
            + input_position());
    }
    /**
     * Determines how many bytes a group of four converted base64
     * characters encodes, based on which trailing positions hold
     * {@code BASE64_EOF}.
     *
     * @return 3 when c4 is data, otherwise 2 when c3 is data, otherwise 1
     */
    private final static int decode_base64_length(int c1, int c2, int c3, int c4) {
        if (c4 != BASE64_EOF) {
            return 3;
        }
        return (c3 != BASE64_EOF) ? 2 : 1;
    }
    /**
     * Computes the first byte of a base64 group: the 6 bits of c1
     * followed by the 2 highest (of 6) bits of c2.
     */
    private final static int decode_base64_byte1(int c1, int c2, int c3, int c4) {
        if (_debug) assert(decode_base64_length(c1, c2, c3, c4) >= 1);
        final int upperSix = (c1 << 2) & 0xfc;
        final int lowerTwo = (c2 >> 4) & 0x03;
        return upperSix | lowerTwo;
    }
    /**
     * Computes the second byte of a base64 group: the 4 low bits of c2
     * followed by the 4 high (of 6) bits of c3.
     */
    private final static int decode_base64_byte2(int c1, int c2, int c3, int c4) {
        if (_debug) assert(decode_base64_length(c1, c2, c3, c4) >= 2);
        final int upperFour = (c2 << 4) & 0xf0;
        final int lowerFour = (c3 >> 2) & 0x0f;
        return (upperFour | lowerFour) & 0xff;
    }
    /**
     * Computes the third byte of a base64 group: the 2 low bits of c3
     * followed by all 6 bits of c4.
     */
    private final static int decode_base64_byte3(int c1, int c2, int c3, int c4) {
        if (_debug) assert(decode_base64_length(c1, c2, c3, c4) >= 3);
        final int upperTwo = (c3 & 0x03) << 6;
        final int lowerSix = c4 & 0x3f;
        return (upperTwo | lowerSix) & 0xff;
    }

    /**
     * Starts the given save point, handing it the current line count and
     * the starting position of the current line.
     *
     * @param sp a save point that is clear
     * @throws IOException
     */
    protected void save_point_start(SavePoint sp) throws IOException
    {
        assert sp != null && sp.isClear();

        final long currentLine = _line_count;
        final long currentLineStart = _line_starting_position;
        sp.start(currentLine, currentLineStart);
    }
    /**
     * Makes the given save point the active one and switches the line
     * bookkeeping of this tokenizer to the values recorded at the start
     * of the save point.
     *
     * @param sp a save point that is defined
     * @throws IOException
     */
    protected void save_point_activate(SavePoint sp) throws IOException
    {
        assert sp != null && sp.isDefined();

        // the current values are pushed along with the save point so that
        // popping it later gives back the correct line & char
        final long lineBefore = _line_count;
        final long lineStartBefore = _line_starting_position;
        _stream._save_points.savePointPushActive(sp, lineBefore, lineStartBefore);

        _line_count = sp.getStartLineNumber();
        _line_starting_position = sp.getStartLineStart();
    }
    /**
     * Pops the given active save point and sets the line bookkeeping of
     * this tokenizer to the "prev" values held by the save point.
     *
     * @param sp a save point that is active
     * @throws IOException
     */
    protected void save_point_deactivate(SavePoint sp) throws IOException
    {
        assert sp != null && sp.isActive();

        _stream._save_points.savePointPopActive(sp);

        _line_count = sp.getPrevLineNumber();
        _line_starting_position = sp.getPrevLineStart();
    }

    /**
     * Always throws an {@link IonReaderTextTokenException} whose message
     * is {@code message} followed by the current input position.
     */
    protected final void error(String message)
    {
        throw new IonReaderTextTokenException(message + input_position());
    }
    /**
     * Always throws an {@link UnexpectedEofException} that includes the
     * current input position.
     */
    protected final void unexpected_eof()
    {
        throw new UnexpectedEofException(
            "unexpected EOF encountered "+input_position());
    }
    /**
     * Always throws an {@link IonReaderTextTokenException} reporting a
     * bad escape character at the current input position.
     */
    protected final void bad_escape_sequence()
    {
        throw new IonReaderTextTokenException(
            "bad escape character encountered "+input_position());
    }
    /**
     * Always throws an {@link IonReaderTextTokenException} reporting
     * {@code c} as a bad escape character at the current input position.
     *
     * @param c the character to include in the message
     */
    protected final void bad_escape_sequence(int c)
    {
        final String shown = printCodePointAsString(c);
        throw new IonReaderTextTokenException(
            "bad escape character '"+shown+"' encountered "+input_position());
    }
    /**
     * Always throws an {@link IonReaderTextTokenException} reporting that
     * {@code c} can not start a token. The message shows both the numeric
     * value and the printed form of {@code c}.
     *
     * @param c the character to include in the message
     */
    protected final void bad_token_start(int c)
    {
        final String shown = printCodePointAsString(c);
        throw new IonReaderTextTokenException(
            "bad character ["+c+", "+shown+
            "] encountered where a token was supposed to start "+
            input_position());
    }
    /**
     * Always throws an {@link IonReaderTextTokenException} reporting
     * {@code c} as a bad character at the current input position.
     *
     * @param c the character to include in the message
     */
    protected final void bad_token(int c)
    {
        final String shown = IonTextUtils.printCodePointAsString(c);
        throw new IonReaderTextTokenException(
            "a bad character " + shown + " was encountered "+input_position());
    }

    /**
     * Always throws an {@link IonReaderTextTokenException} stating what
     * was expected and which character was found instead.
     *
     * @param expected description of what should have been there
     * @param c the character that was found
     */
    protected final void expected_but_found(String expected, int c)
    {
        final String found = IonTextUtils.printCodePointAsString(c);
        throw new IonReaderTextTokenException(
            "Expected " + expected + " but found " + found + input_position());
    }

    /**
     * The exception thrown by the error reporting methods of this
     * tokenizer. It adds nothing to {@link IonException} but its type.
     */
    public static class IonReaderTextTokenException
        extends IonException
    {
        private static final long serialVersionUID = 1L;

        /**
         * @param msg the message of the exception
         */
        IonReaderTextTokenException(String msg)
        {
            super(msg);
        }
    }

    /**
     * Character classes that can be handed to {@code read_string_char}.
     * Each constant answers, through {@link #includes(int)}, whether a
     * given character belongs to the class.
     */
    private enum ProhibitedCharacters {
        /** The control characters, except tab, vertical tab and form feed. */
        SHORT_CHAR
        {
            @Override
            boolean includes(int c)
            {
                if (!isControlCharacter(c)) {
                    return false;
                }
                return !isWhitespace(c);
            }
        },

        /**
         * The control characters, except tab, vertical tab, form feed,
         * line feed and carriage return.
         */
        LONG_CHAR
        {
            @Override
            boolean includes(int c)
            {
                if (!isControlCharacter(c)) {
                    return false;
                }
                if (isWhitespace(c)) {
                    return false;
                }
                return !isNewline(c);
            }
        },

        /** The empty class, no character belongs to it. */
        NONE
        {
            @Override
            boolean includes(int c)
            {
                return false;
            }
        };

        /**
         * @param c the character to test
         * @return true if {@code c} belongs to this class
         */
        abstract boolean includes(int c);

        /** True for the values 0x00 through 0x1F. */
        private static boolean isControlCharacter(int c)
        {
            return 0x00 <= c && c <= 0x1F;
        }

        /** True for line feed and carriage return. */
        private static boolean isNewline(int c)
        {
            switch (c) {
            case 0x0A:
            case 0x0D:
                return true;
            default:
                return false;
            }
        }

        /** True for tab, vertical tab, form feed and space. */
        private static boolean isWhitespace(int c)
        {
            switch (c) {
            case 0x09: // tab
            case 0x0B: // vertical tab
            case 0x0C: // form feed
            case 0x20: // space
                return true;
            default:
                return false;
            }
        }
    }

    /**
     * The number bases handled by {@code readNumeric}. Each constant knows
     * its prefix character, which characters are digits in that base and
     * how a digit is normalized before it is stored.
     */
    private enum Radix
    {
        BINARY
        {
            @Override
            boolean isPrefix(int c)
            {
                return c == 'b' || c == 'B';
            }

            @Override
            boolean isValidDigit(int c)
            {
                return IonTokenConstsX.isBinaryDigit(c);
            }

            @Override
            char normalizeDigit(char c)
            {
                // binary digits are stored as they are
                return c;
            }
        },

        DECIMAL
        {
            @Override
            boolean isPrefix(int c)
            {
                // decimal has no prefix character
                return false;
            }

            @Override
            boolean isValidDigit(int c)
            {
                return IonTokenConstsX.isDigit(c);
            }

            @Override
            char normalizeDigit(char c)
            {
                // decimal digits are stored as they are
                return c;
            }
        },

        HEX
        {
            @Override
            boolean isPrefix(int c)
            {
                return c == 'x' || c == 'X';
            }

            @Override
            boolean isValidDigit(int c)
            {
                return IonTokenConstsX.isHexDigit(c);
            }

            @Override
            char normalizeDigit(char c)
            {
                // hex digits are stored in lower case
                return Character.toLowerCase(c);
            }
        };

        /** @return true if {@code c} is the prefix character of this base */
        abstract boolean isPrefix(int c);

        /** @return true if {@code c} is a digit in this base */
        abstract boolean isValidDigit(int c);

        /** @return the form in which the digit {@code c} is stored */
        abstract char normalizeDigit(char c);

        /** Asserts that {@code c} is the prefix character of this base. */
        void assertPrefix(int c)
        {
            assert isPrefix(c);
        }
    }

    /**
     * Reads a run of digits of the given base into {@code buffer},
     * beginning in the {@code START} state.
     *
     * @param buffer receives the normalized digits
     * @param radix the base of the digits
     * @return the value returned by the three argument form
     * @throws IOException
     */
    private int readNumeric(Appendable buffer, Radix radix) throws IOException
    {
        final NumericState initial = NumericState.START;
        return readNumeric(buffer, radix, initial);
    }

    /**
     * Reads a run of digits of the given base into {@code buffer}. A
     * single underscore is accepted between two digits and is not stored.
     *
     * @param buffer receives the normalized digits
     * @param radix the base of the digits
     * @param startingState the state to begin in
     * @return the first character that is not part of the run; or
     *         {@code '_'} when an underscore was not followed by a digit,
     *         in which case the character after the underscore is pushed
     *         back
     * @throws IOException
     */
    private int readNumeric(Appendable buffer, Radix radix, NumericState startingState) throws IOException
    {
        NumericState state = startingState;

        for (;;)
        {
            final int c = read_char();
            final boolean isDigit = radix.isValidDigit(c);

            switch (state)
            {
                case UNDERSCORE:
                    if (!isDigit)
                    {
                        // the underscore was not a separator after all
                        unread_char(c);
                        return '_';
                    }
                    break;
                case DIGIT:
                    if (!isDigit)
                    {
                        if (c != '_')
                        {
                            return c;
                        }
                        state = NumericState.UNDERSCORE;
                        continue;
                    }
                    break;
                case START:
                    if (!isDigit)
                    {
                        return c;
                    }
                    break;
            }

            // only digits get here, whatever the state was
            buffer.append(radix.normalizeDigit((char) c));
            state = NumericState.DIGIT;
        }
    }

    /** The states of the scanner in {@code readNumeric}. */
    private enum NumericState
    {
        /** nothing has been read yet */
        START,
        /** the last character read was an underscore */
        UNDERSCORE,
        /** the last character read was a digit */
        DIGIT
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy