All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.amazon.ion.impl.IonTokenReader Maven / Gradle / Ivy

There is a newer version: 1.11.9
Show newest version
/*
 * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.amazon.ion.impl;

import static com.amazon.ion.util.IonTextUtils.isDigit;
import static com.amazon.ion.util.IonTextUtils.isOperatorPart;
import static com.amazon.ion.util.IonTextUtils.isWhitespace;
import static com.amazon.ion.util.IonTextUtils.printCodePointAsString;

import com.amazon.ion.Decimal;
import com.amazon.ion.IonException;
import com.amazon.ion.Timestamp;
import com.amazon.ion.UnexpectedEofException;
import com.amazon.ion.impl._Private_IonConstants.HighNibble;
import com.amazon.ion.util.IonTextUtils;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Stack;

/**
 *  This class is responsible for breaking the input stream into Tokens.
 *  It does both the token recognition (aka the scanner) and the state
 *  management needed to use the value underlying some of these tokens.
 */
final class IonTokenReader
{
    // TODO clean up, many of these are unused.
    public static int isPunctuation = 0x0001;
    public static int isKeyword     = 0x0002;
    public static int isTypeName    = 0x0004;
    public static int isConstant    = 0x0008;
    private static int isPosInt     = 0x0010;
    private static int isNegInt     = 0x0020;
    public static int isFloat       = 0x0040;
    public static int isDecimal     = 0x0080;
    public static int isTag         = 0x0100;

    static public enum Type {

        eof                 (isPunctuation,         ""    ),
        tOpenParen          (isPunctuation,         "("        ),
        tCloseParen         (isPunctuation,         ")"        ),
        tOpenSquare         (isPunctuation,         "["        ),
        tCloseSquare        (isPunctuation,         "["        ),
        tOpenCurly          (isPunctuation,         "{"        ),
        tCloseCurly         (isPunctuation,         "}"        ),
        tOpenDoubleCurly    (isPunctuation,         "{{"    ),
        //tCloseDoubleCurly (isPunctuation,         "}}"    ), // only valid at end of blob, not recognized in token read
        tSingleQuote        (isPunctuation,         "'"        ),
        tDoubleQuote        (isPunctuation,         "\""    ),
        //tColon            (isPunctuation,         ":"        ), // filled in during identifier scan with lookahead
        //tDoubleColon      (isPunctuation,         "::"    ), // ditto
        tComma              (isPunctuation,         ","        ),

        kwTrue            ((isConstant + isTag + isKeyword),  "true",           HighNibble.hnBoolean),
        kwFalse           ((isConstant + isTag + isKeyword),  "false",          HighNibble.hnBoolean),
        kwNull            ((isConstant + isTag + isKeyword),  "null",           HighNibble.hnNull),

        kwNullNull        ((isConstant + isTag + isKeyword),  "null.null",      HighNibble.hnNull),
        kwNullInt         ((isConstant + isTag + isKeyword),  "null.int",       HighNibble.hnPosInt),
        kwNullList        ((isConstant + isTag + isKeyword),  "null.list",      HighNibble.hnList),
        kwNullSexp        ((isConstant + isTag + isKeyword),  "null.sexp",      HighNibble.hnSexp),
        kwNullFloat       ((isConstant + isTag + isKeyword),  "null.float",     HighNibble.hnFloat),
        kwNullBlob        ((isConstant + isTag + isKeyword),  "null.blob",      HighNibble.hnBlob),
        kwNullClob        ((isConstant + isTag + isKeyword),  "null.clob",      HighNibble.hnClob),
        kwNullString      ((isConstant + isTag + isKeyword),  "null.string",    HighNibble.hnString),
        kwNullStruct      ((isConstant + isTag + isKeyword),  "null.struct",    HighNibble.hnStruct),  // LN
        kwNullSymbol      ((isConstant + isTag + isKeyword),  "null.symbol",    HighNibble.hnSymbol),
        kwNullBoolean     ((isConstant + isTag + isKeyword),  "null.bool",      HighNibble.hnBoolean),
        kwNullDecimal     ((isConstant + isTag + isKeyword),  "null.decimal",   HighNibble.hnDecimal),
        kwNullTimestamp   ((isConstant + isTag + isKeyword),  "null.timestamp", HighNibble.hnTimestamp),

        kwNan             ((isConstant + isKeyword),          "nan",            HighNibble.hnFloat),
        kwPosInf          ((isConstant + isKeyword),          "+inf",           HighNibble.hnFloat),
        kwNegInf          ((isConstant + isKeyword),          "-inf",           HighNibble.hnFloat),

        constNegInt       ((isConstant + isNegInt),           "cNegInt",        HighNibble.hnNegInt),
        constPosInt       ((isConstant + isPosInt),           "cPosInt",        HighNibble.hnPosInt),
        constFloat        ((isConstant + isFloat),            "cFloat",         HighNibble.hnFloat),
        constDecimal      ((isConstant + isDecimal),          "cDec",           HighNibble.hnDecimal),
        constTime         ((isConstant),                      "cTime",          HighNibble.hnTimestamp),
        constString       ((isConstant + isTag),              "cString",        HighNibble.hnString),
        constSymbol       ((isConstant + isTag),              "cSymbol",        HighNibble.hnSymbol),
        constMemberName   ((isConstant + isTag),              "cMemberName",    HighNibble.hnSymbol),
        constUserTypeDecl ((isConstant + isTag),              "cUserTypeDecl",  HighNibble.hnSymbol),

        none(0);

        private int            flags;
        private String        image;
        private HighNibble    highNibble;

        Type() {}
        Type(int v) {
            this.flags = v;
        }
        Type(int v, String name) {
            flags = v;
            image = name;
        }
        Type(int v, String name, HighNibble ln) {
            flags = v;
            image = name;
            highNibble = ln;
        }

        /**
         * TODO why this class exists at all.  We should store
         * this stuff directly in IonTimestampImpl as a BigDecimal (time) and
         * int (offset, -1==unknown) and avoid constructing more objects.
         * 

* Also, we probably don't need the DateFormats. The parser already * matches against a regex, so we should be able to pull the data * straight from the string to compute the value. */ public static class timeinfo { // TODO remove vestigial class timeinfo static public Timestamp parse(String s) { Timestamp t = null; s = s.trim(); // TODO why is this necessary? try { t = Timestamp.valueOf(s); // TODO should Timestamp just throw an IonException? } catch (IllegalArgumentException e) { throw new IonException(e); } return t; } } // TODO move out of this class into TR. public Type setNumericValue(IonTokenReader tr, String s) { switch (this) { case kwNan: tr.doubleValue = Double.NaN; return this; case kwPosInf: tr.doubleValue = Double.POSITIVE_INFINITY; return this; case kwNegInf: tr.doubleValue = Double.NEGATIVE_INFINITY; return this; case constNegInt: case constPosInt: if (NumberType.NT_HEX.equals(tr.numberType)) { tr.intValue = new BigInteger(s, 16); // In hex case we've discarded the prefix [+-]?0x so // reconstruct the sign if (this == constNegInt) tr.intValue = tr.intValue.negate(); } else { tr.intValue = new BigInteger(s, 10); // Make sure that sign aligns with type. // Note that this allows negative zero. assert (BigInteger.ZERO.equals(tr.intValue) ? true : this == (tr.intValue.signum() < 0 ? constNegInt : constPosInt)); } return this; case constFloat: tr.doubleValue = Double.parseDouble(s); return this; case constDecimal: // BigDecimal parses using e instead of d or D String eFormat = s.replace('d', 'e'); if (eFormat == s) { // No match for 'd' but look for 'D' eFormat = s.replace('D', 'e'); } tr.decimalValue = Decimal.valueOf(eFormat); return this; case constTime: tr.dateValue = timeinfo.parse(s); return this; default: throw new AssertionError("Unknown op for numeric case: " + this); } } public boolean isKeyword() { return ((flags & isKeyword) != 0); } public boolean isConstant() { return ((flags & isConstant) != 0); } public boolean isNumeric() { return ((flags & (isPosInt + isNegInt + isFloat + isDecimal)) != 0); } public String getImage() { return (image == null) ? this.name() : image; } public HighNibble getHighNibble() { return highNibble; } @Override public String toString() { if (this.getImage() != null) return this.getImage(); return super.toString(); } } // // this is a temp reader we use when converting blob's into // blob values // static class LocalReader extends Reader { IonTokenReader _tr; int _sboffset; int _sbavailable; LocalReader(IonTokenReader tr) { _tr = tr; } @Override public void close() throws IOException { _tr = null; return; } @Override public void reset() { _sboffset = 0; _sbavailable = _tr.value.length(); } @Override public int read() throws IOException { int c = -1; if (_sbavailable > 0) { c = _tr.value.charAt(_sboffset++); _sbavailable--; } else { c = _tr.read(); } return c; } @Override public int read(char[] dst, int dstoffset, int len) throws IOException { int needed = len; while (needed-- > 0) { int c = this.read(); if (c < 0) break; dst[dstoffset++] = (char)c; } return len - needed; } } // easy to use number types static enum NumberType { NT_POSINT, NT_NEGINT, NT_HEX, // pos or neg NT_FLOAT, NT_DECIMAL, NT_DECIMAL_NEGATIVE_ZERO } /** * Magic "character" to represent an escape sequence with an empty expansion. * (or, in other words, a non-eof character that should be ignored) */ static final int EMPTY_ESCAPE_SEQUENCE = -2; //-------------------------------------------------------------------- // // this is the parsing context to manage the in list, in struct, // waiting for identifier tag (usertypedecl or member name) vs value // // static public enum Context { NONE, STRING, BLOB, CLOB, EXPRESSION, DATALIST, STRUCT; } public Stack contextStack = new Stack(); public Context context = Context.NONE; public void pushContext(Context newcontext) { contextStack.push(newcontext); context = newcontext; } public Context popContext() { context = contextStack.pop(); return context; } //-------------------------------------------------------------------- // // state that keeps us on track ... // private IonCharacterReader in; private LocalReader localReader; private PushbackReader pushbackReader; public boolean inQuotedContent; public boolean isIncomplete; public boolean isLongString; public boolean quotedIdentifier; public int embeddedSlash; public int endquote; public Type t = Type.none; public Type keyword = Type.none; public StringBuilder value = new StringBuilder(); public String stringValue; public Double doubleValue; public BigInteger intValue; public Timestamp dateValue; public BigDecimal decimalValue; public boolean isNegative; public NumberType numberType; public IonTokenReader(final Reader r) { this.in = new IonCharacterReader( r ); } public long getConsumedAmount() { return in.getConsumedAmount(); } public int getLineNumber() { return in.getLineNumber(); } public int getColumn() { return in.getColumn(); } public String position() { return "line " + this.getLineNumber() + " column " + this.getColumn(); } public String getValueString(boolean is_in_expression) throws IOException { if (this.isIncomplete) { finishScanString(is_in_expression); stringValue = value.toString(); // TODO combine with below? this.inQuotedContent = false; } else if (stringValue == null) { stringValue = value.toString(); } return stringValue; } void resetValue() { isIncomplete = false; stringValue = null; doubleValue = null; intValue = null; dateValue = null; decimalValue = null; isNegative = false; numberType = null; t = null; value.setLength(0); } public PushbackReader getPushbackReader() { if (localReader == null) { localReader = new LocalReader(this); pushbackReader = new PushbackReader(localReader, _Private_Utils.MAX_LOOKAHEAD_UTF16); } localReader.reset(); return pushbackReader ; } /** * Reads the next character using the underlying stream. * @return -1 on end of stream. * @throws IOException */ final int read() throws IOException { final int ch = in.read(); assert ch != '\r'; return ch; } int readIgnoreWhitespace() throws IOException { assert !inQuotedContent; int c; // read through comments - this detects and rejects double slash // and slash star style comments and their contents for (;;) { c = read(); if (c == '/') { // possibly a start of a comment int c2 = read(); if (c2 == '/') { // we have a // comment, scan for the terminating new line while (c2 != '\n' && c2 != -1) { c2 = read(); } c = c2; } else if (c2 == '*') { // we have a /* */ comment scan for closing */ scancomment: while (c2 != -1) { c2 = read(); if (c2 == '*') { c2 = read(); if (c2 == '/') { break scancomment; } unread(c2); // in case this was the '*' we're looking for } } c = read(); } else { // it wasn't a comment start, throw it back unread(c2); } } if (/* inContent ||*/ !isWhitespace(c)) { break; } } return c; } void unread(int c) throws IOException { this.in.unread(c); } // TODO clone the body of next(c) here, later , for perf, we // don't really want TWO methods calls per character, or // mostly we don't want an extra one. (and I doubt that // Java handles the tail optimization) public Type next(boolean is_in_expression) throws IOException { inQuotedContent = false; int c = this.readIgnoreWhitespace(); return next(c, is_in_expression); } private Type next(int c, boolean is_in_expression) throws IOException { int c2; t = Type.none; isIncomplete = false; switch (c) { case -1: return (t = Type.eof); case '{': c2 = read(); if (c2 == '{') { return (t = Type.tOpenDoubleCurly); } unread(c2); return (t = Type.tOpenCurly); case '}': return (t = Type.tCloseCurly); case '[': return (t = Type.tOpenSquare); case ']': return (t = Type.tCloseSquare); case '(': return (t = Type.tOpenParen); case ')': return (t = Type.tCloseParen); case ',': return (t = Type.tComma); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return readNumber(c); case '\"': inQuotedContent = true; this.keyword = Type.none; // anything in quotes isn't a keyword return scanString(c, _Private_IonConstants.lnIsVarLen - 1); case '\'': c2 = read(); if (c2 == '\'') { c2 = read(); if (c2 == '\'') { return scanLongString(); } this.unread(c2); c2 = '\''; // restore c2 for the next unread } this.unread(c2); inQuotedContent = true; return scanIdentifier(c); case '+': c2 = read(); if (c2 == 'i') { c2 = read(); if (c2 == 'n') { c2 = read(); if (c2 == 'f') { return (t = Type.kwPosInf); } this.unread(c2); c2 = 'n'; } this.unread(c2); c2 = 'i'; } this.unread(c2); if (is_in_expression) { return scanOperator(c); } break; // break to error case '-': c2 = read(); if (c2 >= '0' && c2 <= '9') { this.unread(c2); return readNumber(c); } if (c2 == 'i') { c2 = read(); if (c2 == 'n') { c2 = read(); if (c2 == 'f') { return (t = Type.kwNegInf); } this.unread(c2); c2 = 'n'; } this.unread(c2); c2 = 'i'; } this.unread(c2); if (is_in_expression) { return scanOperator(c); } break; // break to error default: if (IonTextUtils.isIdentifierStart(c)) { return scanIdentifier(c); } if (is_in_expression && isOperatorPart(c)) { return scanOperator(c); } } String message = "Unexpected character " + printCodePointAsString(c) + " encountered at line " + this.getLineNumber() + " column " + this.getColumn(); throw new IonException(message); } public Type scanIdentifier(int c) throws IOException { // reset our local value buffer resetValue(); // we don't have an identifier type any longer, just string this.t = Type.constSymbol; // some strings are keywords, most are not // mostly we'll guess that it's not a keyword this.keyword = null; // first we read the identifier content into our value buffer // if the value is quoted, use the escape char loop, otherwise // look for the non-identifier character (and throw it back) if (!readIdentifierContents(c)) { // not quoted (true is quoted) // anything in quotes (even single quotes) is not a keyword // and here we're not in quoted content (that is handled above) // so see if it's also a keyword this.keyword = IonTokenReader.matchKeyword(value, 0, value.length()); if (this.keyword != null) { if (this.keyword == Type.kwNull) { c = this.read(); if (c == '.') { int dot = value.length(); value.append((char)c); c = this.read(); int added = readIdentifierContents(c, IonTokenConstsX.TN_MAX_NAME_LENGTH + 1); // +1 is "enough roap" so if there are extra letters at the end of the keyword we keep at least 1 int kw = IonTokenConstsX.keyword(value, dot+1, dot+added+1); switch (kw) { case IonTokenConstsX.KEYWORD_NULL: case IonTokenConstsX.KEYWORD_BOOL: case IonTokenConstsX.KEYWORD_INT: case IonTokenConstsX.KEYWORD_FLOAT: case IonTokenConstsX.KEYWORD_DECIMAL: case IonTokenConstsX.KEYWORD_TIMESTAMP: case IonTokenConstsX.KEYWORD_SYMBOL: case IonTokenConstsX.KEYWORD_STRING: case IonTokenConstsX.KEYWORD_BLOB: case IonTokenConstsX.KEYWORD_CLOB: case IonTokenConstsX.KEYWORD_LIST: case IonTokenConstsX.KEYWORD_SEXP: case IonTokenConstsX.KEYWORD_STRUCT: this.keyword = setNullType(value, dot + 1, value.length() - dot - 1); break; default: // not a valid type name - so we have to unread back to the dot for (int ii=value.length(); ii>dot; ) { ii--; char uc = value.charAt(ii); unread(uc); } String message = position() + ": Expected Ion type after 'null.' but found: " + value; throw new IonException(message); } } else { unread(c); } } this.t = this.keyword; return this.t; } } // see if we're a user type of a member name c = this.readIgnoreWhitespace(); if (c != ':') { unread(c); } else { c = read(); if (c != ':') { unread(c); this.t = Type.constMemberName; } else { this.t = Type.constUserTypeDecl; } } return this.t; } boolean readIdentifierContents(int c) throws IOException { int quote = c; inQuotedContent = (quote == '\'' || quote == '\"'); if ((quotedIdentifier = inQuotedContent) == true) { for (;;) { c = read(); if (c < 0 || c == quote) break; if (c == '\\') { c = readEscapedCharacter(this.in, false /*not clob*/); } if (c != EMPTY_ESCAPE_SEQUENCE) { value.appendCodePoint(c); } } if (c == -1) { // TODO throw UnexpectedEofException throw new IonException("end encountered before closing quote '\\" + (char)endquote+ "'"); } // c, at this point is a single quote, which we don't append inQuotedContent = false; } else { value.append((char)c); for (;;) { c = read(); if (!IonTextUtils.isIdentifierPart(c)) { break; } value.append((char)c); } unread(c); // we throw back our terminator here } return quotedIdentifier; } int readIdentifierContents(int c, int max_length) throws IOException { assert(c != '\'' && c != '\"'); value.append((char)c); int count = 1; while (count < max_length) { c = read(); if (!IonTextUtils.isIdentifierPart(c)) { unread(c); // we throw back our terminator here break; } value.append((char)c); count++; } return count; } static Type matchKeyword(StringBuilder sb, int pos, int valuelen) throws IOException { Type keyword = null; switch (sb.charAt(pos++)) { // there has to be at least 1 chacter or we wouldn't be here case 'f': if (valuelen == 5 // "f" && sb.charAt(pos++) == 'a' && sb.charAt(pos++) == 'l' && sb.charAt(pos++) == 's' && sb.charAt(pos++) == 'e' ) { keyword = Type.kwFalse; } break; case 'n': if (valuelen == 4 // 'n' && sb.charAt(pos++) == 'u' && sb.charAt(pos++) == 'l' && sb.charAt(pos++) == 'l' ) { keyword = Type.kwNull; } else if (valuelen == 3 // 'n' && sb.charAt(pos++) == 'a' && sb.charAt(pos++) == 'n' ) { keyword = Type.kwNan; } break; case 't': if (valuelen == 4 // "t" && sb.charAt(pos++) == 'r' && sb.charAt(pos++) == 'u' && sb.charAt(pos++) == 'e' ) { keyword = Type.kwTrue; } break; } return keyword; } public Type setNullType(StringBuilder sb, int pos, int valuelen) { switch (valuelen) { case 3: //int if (sb.charAt(pos++) == 'i' && sb.charAt(pos++) == 'n' && sb.charAt(pos++) == 't' ) { return Type.kwNullInt; } break; case 4: //bool, blob, list, null, clob, sexp switch (sb.charAt(pos++)) { case 'b': //bool, blob switch (sb.charAt(pos++)) { case 'o': // bool if (sb.charAt(pos++) == 'o' && sb.charAt(pos++) == 'l' ) { return Type.kwNullBoolean; } break; case 'l': //blob if (sb.charAt(pos++) == 'o' && sb.charAt(pos++) == 'b' ) { return Type.kwNullBlob; } break; } break; case 'l': if (sb.charAt(pos++) == 'i' && sb.charAt(pos++) == 's' && sb.charAt(pos++) == 't' ) { return Type.kwNullList; } break; case 'n': if (sb.charAt(pos++) == 'u' && sb.charAt(pos++) == 'l' && sb.charAt(pos++) == 'l' ) { return Type.kwNullNull; } break; case 'c': //clob if (sb.charAt(pos++) == 'l' && sb.charAt(pos++) == 'o' && sb.charAt(pos++) == 'b' ) { return Type.kwNullClob; } break; case 's': //sexp if (sb.charAt(pos++) == 'e' && sb.charAt(pos++) == 'x' && sb.charAt(pos++) == 'p' ) { return Type.kwNullSexp; } break; } break; case 5: //float if (sb.charAt(pos++) == 'f' && sb.charAt(pos++) == 'l' && sb.charAt(pos++) == 'o' && sb.charAt(pos++) == 'a' && sb.charAt(pos++) == 't' ) { return Type.kwNullFloat; } break; case 6: switch (sb.charAt(pos++)) { case 's': //string //struct //symbol switch(sb.charAt(pos++)) { case 't': if (sb.charAt(pos++) == 'r') { switch (sb.charAt(pos++)) { case 'i': if (sb.charAt(pos++) == 'n' && sb.charAt(pos++) == 'g') { return Type.kwNullString; } break; case 'u': if (sb.charAt(pos++) == 'c' && sb.charAt(pos++) == 't') { return Type.kwNullStruct; } break; } } break; case 'y': if (sb.charAt(pos++) == 'm' && sb.charAt(pos++) == 'b' && sb.charAt(pos++) == 'o' && sb.charAt(pos++) == 'l' ) { return Type.kwNullSymbol; } break; } } break; case 7: //decimal if (sb.charAt(pos++) == 'd' && sb.charAt(pos++) == 'e' && sb.charAt(pos++) == 'c' && sb.charAt(pos++) == 'i' && sb.charAt(pos++) == 'm' && sb.charAt(pos++) == 'a' && sb.charAt(pos++) == 'l' ) { return Type.kwNullDecimal; } break; case 9: //timestamp if (sb.charAt(pos++) == 't' && sb.charAt(pos++) == 'i' && sb.charAt(pos++) == 'm' && sb.charAt(pos++) == 'e' && sb.charAt(pos++) == 's' && sb.charAt(pos++) == 't' && sb.charAt(pos++) == 'a' && sb.charAt(pos++) == 'm' && sb.charAt(pos++) == 'p' ) { return Type.kwNullTimestamp; } break; } String nullimage = sb.toString(); throw new IonException("invalid null value '"+nullimage+"' at " + this.position()); } public Type scanOperator(int c) throws IOException { // reset our local value buffer resetValue(); // we don't have an identifier type any longer, just string this.t = Type.constSymbol; // some strings are keywords, most are not // mostly we'll guess that it's not a keyword this.keyword = null; // first we read the identifier content into our value buffer // if the value is quoted, use the escape char loop, otherwise // look for the non-identifier character (and throw it back) value.append((char)c); for (;;) { c = read(); if (!IonTextUtils.isOperatorPart(c)) { break; } value.append((char)c); } unread(c); // we throw back our terminator here return this.t; } public Type scanString(int c, int maxlookahead) throws IOException { // reset out local value buffer resetValue(); if (c != '\"') { throw new IonException("Programmer error! Only a quote should get you here."); } endquote = '\"'; sizedloop: while (maxlookahead-- > 0) { switch ((c = this.read())) { case -1: break sizedloop; // TODO deoptimize, throw exception case '\"': break sizedloop; case '\n': throw new IonException("unexpected line terminator encountered in quoted string"); case '\\': c = readEscapedCharacter(this.in, false /*not clob*/); // throws UnexpectedEofException on EOF if (c != EMPTY_ESCAPE_SEQUENCE) { value.appendCodePoint(c); } break; default: // c could be part of a surrogate pair so we can't use // appendCodePoint() value.append((char)c); break; } } if (maxlookahead != -1 && c == '\"') { // this is the normal, non-longline case so we're just done closeString(); } else { // we're not at the closing quote, so this is an incomplete // string which will have to be read to the end ... later leaveOpenString(c, false); } return Type.constString; } void leaveOpenString(int c, boolean longstring) { if (c == -1) { throw new UnexpectedEofException(); } this.isIncomplete = true; this.inQuotedContent = true; this.isLongString = longstring; } /** * Scans the rest of the string through {@link #endquote}. *

XXX WARNING XXX * Almost identical logic is found in * {@link IonBinary#appendToLongValue(int, boolean, boolean, PushbackReader)} * * @param is_in_expression * @throws UnexpectedEofException if we hit EOF before the endquote. */ void finishScanString(boolean is_in_expression) throws IOException { assert isIncomplete; assert inQuotedContent; for (;;) { int c = this.read(); if (c == -1) { throw new UnexpectedEofException(); } if (c == endquote) break; // endquote == -1 during long strings if (c == '\\') { c = readEscapedCharacter(this.in, false /*not clob*/); if (c != EMPTY_ESCAPE_SEQUENCE) { value.appendCodePoint(c); } } else if (c == '\'' && isLongString) { // This happens while handling triple-quote field names if (twoMoreSingleQuotes()) { // At the end of a long string. Look for another one. inQuotedContent = false; c = readIgnoreWhitespace(); if (c == '\'' && twoMoreSingleQuotes()) { // Yep, another triple-quote. We've consumed all three // so just keep reading. inQuotedContent = true; } else { // Something else, we are done. unread(c); break; } } else { // Still inside the long string value.append((char)c); } } else if (!isLongString && (c == '\n')) { throw new IonException("unexpected line terminator encountered in quoted string"); } else { value.append((char)c); } } return; } /** * If two single quotes are next on the input, consume them and return * true. Otherwise, leave them on the input and return false. * @return true when there are two pending single quotes. * @throws IOException */ private boolean twoMoreSingleQuotes() throws IOException { int c = read(); if (c == '\'') { int c2 = read(); if (c2 == '\'') { return true; } unread(c2); } unread(c); return false; } void closeString() { this.isIncomplete = false; this.inQuotedContent = false; this.isLongString = false; } public Type scanLongString() throws IOException { // reset out local value buffer resetValue(); endquote = -1; // we're not at the closing quote, so this is an incomplete // string which will have to be read to the end ... later leaveOpenString(EMPTY_ESCAPE_SEQUENCE, true); return Type.constString; } /** * Closes parsing of a triple-quote string, remembering that there may be * more to come. */ void closeLongString() { this.isIncomplete = true; this.inQuotedContent = false; this.isLongString = true; } /** * @return the translated escape sequence. Will not return -1 since EOF is * not legal in this context. May return {@link #EMPTY_ESCAPE_SEQUENCE} to * indicate a zero-length sequence such as BS-NL. * @throws UnexpectedEofException if the EOF is encountered in the middle * of the escape sequence. * */ public static int readEscapedCharacter(PushbackReader r, boolean inClob) throws IOException, UnexpectedEofException { // \\ The backslash character // \0 The character 0 (nul terminator char, for some) // \xhh The character with hexadecimal value 0xhh // \ u hhhh The character with hexadecimal value 0xhhhh // \t The tab character ('\u0009') // \n The newline (line feed) character ('\ u 000A') // \v The vertical tab character ('\u000B') // \r The carriage-return character ('\ u 000D') // \f The form-feed character ('\ u 000C') // \a The alert (bell) character ('\ u 0007') // \" The double quote character // \' The single quote character // \? The question mark character int c2 = 0, c = r.read(); switch (c) { case -1: throw new UnexpectedEofException(); case 't': return '\t'; case 'n': return '\n'; case 'v': return '\u000B'; case 'r': return '\r'; case 'f': return '\f'; case 'b': return '\u0008'; case 'a': return '\u0007'; case '\\': return '\\'; case '\"': return '\"'; case '\'': return '\''; case '/': return '/'; case '?': return '?'; case 'U': // Expecting 8 hex digits if (inClob) { throw new IonException("Unicode escapes \\U not allowed in clob"); } c = readDigit(r, 16, true); if (c < 0) break; // TODO throw UnexpectedEofException c2 = c << 28; c = readDigit(r, 16, true); if (c < 0) break; c2 += c << 24; c = readDigit(r, 16, true); if (c < 0) break; c2 += c << 20; c = readDigit(r, 16, true); if (c < 0) break; c2 += c << 16; // ... fall through... case 'u': // Expecting 4 hex digits if (inClob) { throw new IonException("Unicode escapes \\u not allowed in clob"); } c = readDigit(r, 16, true); if (c < 0) break; c2 += c << 12; c = readDigit(r, 16, true); if (c < 0) break; c2 += c << 8; // ... fall through... case 'x': // Expecting 2 hex digits c = readDigit(r, 16, true); if (c < 0) break; c2 += c << 4; c = readDigit(r, 16, true); if (c < 0) break; return c2 + c; case '0': return 0; case '\n': return EMPTY_ESCAPE_SEQUENCE; default: break; } throw new IonException("invalid escape sequence \"\\" + (char) c + "\" [" + c + "]"); } public static int readDigit(/*EscapedCharacterReader*/PushbackReader r, int radix, boolean isRequired) throws IOException { int c = r.read(); if (c < 0) { r.unread(c); return -1; } int c2 = Character.digit(c, radix); if (c2 < 0) { if (isRequired) { throw new IonException("bad digit in escaped character '"+((char)c)+"'"); } // if it's not a required digit, we just throw it back r.unread(c); return -1; } return c2; } private void checkAndUnreadNumericStopper(int c) throws IOException { // Ignore EOF, so we don't "unread" it - EOF is a terminator if (c != -1) { if ( ! this.isValueTerminatingCharacter(c) ) { final String message = position() + ": Numeric value followed by invalid character " + printCodePointAsString(c); throw new IonException(message); } this.unread(c); } } private final boolean isValueTerminatingCharacter(int c) throws IOException { boolean isTerminator; if (c == '/') { // this is terminating only if it starts a comment of some sort c = this.read(); this.unread(c); // we never "keep" this character isTerminator = (c == '/' || c == '*'); } else { isTerminator = IonTextUtils.isNumericStop(c); } return isTerminator; } public Type readNumber(int c) throws IOException { // clear out our string buffer resetValue(); // process the initial sign character, if its there. boolean explicitPlusSign = false; switch (c) { case '-': value.append((char)c); c = this.read(); t = Type.constNegInt; this.numberType = NumberType.NT_NEGINT; this.isNegative = true; break; case '+': // we eat the plus sign, but remember we saw one explicitPlusSign = true; c = this.read(); t = Type.constPosInt; this.isNegative = false; this.numberType = NumberType.NT_POSINT; break; default: t = Type.constPosInt; this.numberType = NumberType.NT_POSINT; this.isNegative = false; break; } // process the initial digit. Caller has checked that it's a digit. assert isDigit(c, 10); value.append((char)c); boolean isZero = (c == '0'); boolean leadingZero = isZero; // process the next character after the initial digit. switch((c = this.read())) { case 'x': case 'X': if (!isZero) { throw new IonException("badly formed number encountered at " + position()); } this.numberType = NumberType.NT_HEX; // We don't need to append the char because we're going to wipe // the value anyway to accumulate just the hex digits. return scanHexNumber(); case '.': case 'd': case 'D': t = Type.constDecimal; this.numberType = NumberType.NT_DECIMAL; value.append((char)c); break; case 'e': case 'E': t = Type.constFloat; this.numberType = NumberType.NT_FLOAT; value.append((char)c); break; default: if (!isDigit(c, 10)) { checkAndUnreadNumericStopper(c); if (isZero && NumberType.NT_NEGINT.equals(this.numberType)) { t = Type.constPosInt; this.numberType = NumberType.NT_POSINT; } return t; } value.append((char)c); isZero &= (c == '0'); break; } // see if see can continue find leading digits if (NumberType.NT_NEGINT.equals(numberType) || NumberType.NT_POSINT.equals(numberType) ) { // We've now scanned at least two digits. // read in the remaining whole number digits for (;;) { c = this.read(); if (!isDigit(c, 10)) break; value.append((char)c); isZero &= (c == '0'); } // and see what we ran aground on .. switch (c) { case '.': case 'd': case 'D': if (leadingZero) { throw new IonException(position() + ": Invalid leading zero on numeric"); } t = Type.constDecimal; this.numberType = NumberType.NT_DECIMAL; value.append((char)c); break; case 'e': case 'E': if (leadingZero) { throw new IonException(position() + ": Invalid leading zero on numeric"); } t = Type.constFloat; this.numberType = NumberType.NT_FLOAT; value.append((char)c); break; case 'T': // same as '-' it's a timestamp case '-': if (NumberType.NT_POSINT.equals(this.numberType) && !explicitPlusSign) { return scanTimestamp(c); } // otherwise fall through to "all other" processing default: // Hit the end of our integer. Make sure its a valid stopper. checkAndUnreadNumericStopper(c); if (leadingZero && !isZero) { throw new IonException(position() + ": Invalid leading zero on numeric"); } if (isZero && NumberType.NT_NEGINT.equals(this.numberType)) { t = Type.constPosInt; this.numberType = NumberType.NT_POSINT; } return t; } } // now we must have just decimal or float // if it's decimal we ended on a DOT, so get the trailing // (fractional) digits if (NumberType.NT_DECIMAL.equals(this.numberType)) { for (;;) { c = this.read(); if (!isDigit(c, 10)) break; value.append((char)c); } // and see what we ran aground on this time.. switch (c) { case '-': case '+': // We'll handle exponent below. this.unread(c); break; case 'e': case 'E': // this is the only viable option now t = Type.constFloat; this.numberType = NumberType.NT_FLOAT; value.append((char)c); break; case 'd': case 'D': // this is the only viable option now assert t == Type.constDecimal; assert NumberType.NT_DECIMAL.equals(this.numberType); value.append((char)c); break; default: checkAndUnreadNumericStopper(c); return t; } } // now we have the exponent part to read // reading the first character of the exponent, it can be // a sign character or a digit (this is the only place // the sign is valid) and we're for sure a float at this point switch ((c = this.read())) { case '-': value.append((char)c); break; case '+': // we eat the plus sign break; default: if (!isDigit(c, 10)) { // this they said 'e' then they better put something valid after it! throw new IonException("badly formed number encountered at " + position()); } this.unread(c); // we'll re-read this in just a bit, but we had to check break; } // read in the remaining whole number digits and then quit for (;;) { c = this.read(); if (!isDigit(c, 10)) break; value.append((char)c); } checkAndUnreadNumericStopper(c); return t; } Type scanHexNumber() throws IOException { boolean anydigits = false; int c; boolean isZero = true; // Leave the 0x in the buffer since we'll need it to parse the content. // first get clear out our string buffer, we're starting over value.setLength(0); // resetValue() wipes out too much // read in the remaining whole number digits and then quit for (;;) { c = this.read(); if (!isDigit(c, 16)) break; anydigits = true; isZero &= (c == '0'); value.append((char)c); } if (!anydigits) { throw new IonException("badly formed hexadecimal number encountered at " + position()); } checkAndUnreadNumericStopper(c); if (isZero && t == Type.constNegInt) { t = Type.constPosInt; } // as long as we saw any digit, we're good return t; } /** * Scans a timestamp after reading yyyy-. * * We can be a little lenient here since the result will be reparsed and * validated more thoroughly by {@link Timestamp#valueOf(CharSequence)}. * * @param c the last character scanned; must be '-'. * @return {@link Type#constTime} */ Type scanTimestamp(int c) throws IOException { endofdate: for (;;) { // fake for loop to create a label we can jump out of, // because 4 or 5 levels of nested if's is just ugly // at this point we will have read leading digits and exactly 1 dash // in other words, we'll have read the year if (c == 'T') { // yearT is a valid timestamp value value.append((char)c); c = this.read(); // because we'll unread it before we return break endofdate; } if (c != '-') { // not a dash or a T after the year - so this is a bad value throw new IllegalStateException("invalid timestamp, expecting a dash here at " + this.position()); } // append the dash and then read the month field value.append((char)c); // so append it, because we haven't already c = readDigits(2, "month"); if (c == 'T') { // year-monthT is a valid timestamp value value.append((char)c); c = this.read(); // because we'll unread it before we return break endofdate; } if (c != '-') { // if the month isn't followed by a dash or a T it's an invalid month throw new IonException("invalid timestamp, expecting month at " + this.position()); } // append the dash and read the day (or day-of-month) field value.append((char)c); c = readDigits(2, "day of month"); if (c == 'T') { check4timezone: for (;;) { // another fake label/ for=goto // attach the 'T' to the value we're collecting value.append((char)c); // we're going to "watch" how many digits we read in the hours // field. It's 0 that's actually ok, since we can end at the // 'T' we just read int length_before_reading_hours = value.length(); // so read the hours c = readDigits(2, "hours"); if (length_before_reading_hours == value.length()) { // FIXME I don't think there should be a timezone here break check4timezone; } if (c != ':') { throw new IonException("invalid timestamp, expecting hours at " + this.position()); } value.append((char)c); // so read the minutes c = readDigits(2, "minutes"); if (c != ':') { if (c == '-' || c == '+' || c == 'Z') { break check4timezone; } break endofdate; } value.append((char)c); // so read the seconds c = readDigits(2, "seconds"); if (c != '.') { if (c == '-' || c == '+' || c == 'Z') { break check4timezone; } break endofdate; } value.append((char)c); // so read the fractional seconds c = readDigits(32, "fractional seconds"); break check4timezone; }//check4timezone // now check to see if it's a timezone offset we're looking at if (c == '-' || c == '+') { value.append((char)c); // so read the timezone offset c = readDigits(2, "timezone offset"); if (c != ':') break endofdate; value.append((char)c); c = readDigits(2, "timezone offset"); } else if (c == 'Z') { value.append((char)c); c = this.read(); // because we'll unread it before we return } } break endofdate; }//endofdate checkAndUnreadNumericStopper(c); return Type.constTime; } int readDigits(int limit, String field) throws IOException { int c, len = 0; // read in the remaining whole number digits and then quit for (;;) { c = this.read(); if (!isDigit(c, 10)) break; len++; if (len > limit) { throw new IonException("invalid format " + field + " too long at " + this.position()); } value.append((char)c); } return c; } public boolean makeValidNumeric(Type castto) throws IOException { String s = getValueString(false); try { this.t = castto.setNumericValue(this, s); } catch (NumberFormatException e) { throw new IonException(this.position() + ": invalid numeric value " + s, e); } return this.t.isNumeric(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy