com.amazon.ion.impl.IonTokenReader Maven / Gradle / Ivy
Show all versions of ion-java Show documentation
/*
* Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package com.amazon.ion.impl;
import static com.amazon.ion.util.IonTextUtils.isDigit;
import static com.amazon.ion.util.IonTextUtils.isOperatorPart;
import static com.amazon.ion.util.IonTextUtils.isWhitespace;
import static com.amazon.ion.util.IonTextUtils.printCodePointAsString;
import com.amazon.ion.Decimal;
import com.amazon.ion.IonException;
import com.amazon.ion.Timestamp;
import com.amazon.ion.UnexpectedEofException;
import com.amazon.ion.impl._Private_IonConstants.HighNibble;
import com.amazon.ion.util.IonTextUtils;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Stack;
/**
* This class is responsible for breaking the input stream into Tokens.
* It does both the token recognition (aka the scanner) and the state
* management needed to use the value underlying some of these tokens.
*/
final class IonTokenReader
{
// TODO clean up, many of these are unused.
public static int isPunctuation = 0x0001;
public static int isKeyword = 0x0002;
public static int isTypeName = 0x0004;
public static int isConstant = 0x0008;
private static int isPosInt = 0x0010;
private static int isNegInt = 0x0020;
public static int isFloat = 0x0040;
public static int isDecimal = 0x0080;
public static int isTag = 0x0100;
static public enum Type {
eof (isPunctuation, "" ),
tOpenParen (isPunctuation, "(" ),
tCloseParen (isPunctuation, ")" ),
tOpenSquare (isPunctuation, "[" ),
tCloseSquare (isPunctuation, "[" ),
tOpenCurly (isPunctuation, "{" ),
tCloseCurly (isPunctuation, "}" ),
tOpenDoubleCurly (isPunctuation, "{{" ),
//tCloseDoubleCurly (isPunctuation, "}}" ), // only valid at end of blob, not recognized in token read
tSingleQuote (isPunctuation, "'" ),
tDoubleQuote (isPunctuation, "\"" ),
//tColon (isPunctuation, ":" ), // filled in during identifier scan with lookahead
//tDoubleColon (isPunctuation, "::" ), // ditto
tComma (isPunctuation, "," ),
kwTrue ((isConstant + isTag + isKeyword), "true", HighNibble.hnBoolean),
kwFalse ((isConstant + isTag + isKeyword), "false", HighNibble.hnBoolean),
kwNull ((isConstant + isTag + isKeyword), "null", HighNibble.hnNull),
kwNullNull ((isConstant + isTag + isKeyword), "null.null", HighNibble.hnNull),
kwNullInt ((isConstant + isTag + isKeyword), "null.int", HighNibble.hnPosInt),
kwNullList ((isConstant + isTag + isKeyword), "null.list", HighNibble.hnList),
kwNullSexp ((isConstant + isTag + isKeyword), "null.sexp", HighNibble.hnSexp),
kwNullFloat ((isConstant + isTag + isKeyword), "null.float", HighNibble.hnFloat),
kwNullBlob ((isConstant + isTag + isKeyword), "null.blob", HighNibble.hnBlob),
kwNullClob ((isConstant + isTag + isKeyword), "null.clob", HighNibble.hnClob),
kwNullString ((isConstant + isTag + isKeyword), "null.string", HighNibble.hnString),
kwNullStruct ((isConstant + isTag + isKeyword), "null.struct", HighNibble.hnStruct), // LN
kwNullSymbol ((isConstant + isTag + isKeyword), "null.symbol", HighNibble.hnSymbol),
kwNullBoolean ((isConstant + isTag + isKeyword), "null.bool", HighNibble.hnBoolean),
kwNullDecimal ((isConstant + isTag + isKeyword), "null.decimal", HighNibble.hnDecimal),
kwNullTimestamp ((isConstant + isTag + isKeyword), "null.timestamp", HighNibble.hnTimestamp),
kwNan ((isConstant + isKeyword), "nan", HighNibble.hnFloat),
kwPosInf ((isConstant + isKeyword), "+inf", HighNibble.hnFloat),
kwNegInf ((isConstant + isKeyword), "-inf", HighNibble.hnFloat),
constNegInt ((isConstant + isNegInt), "cNegInt", HighNibble.hnNegInt),
constPosInt ((isConstant + isPosInt), "cPosInt", HighNibble.hnPosInt),
constFloat ((isConstant + isFloat), "cFloat", HighNibble.hnFloat),
constDecimal ((isConstant + isDecimal), "cDec", HighNibble.hnDecimal),
constTime ((isConstant), "cTime", HighNibble.hnTimestamp),
constString ((isConstant + isTag), "cString", HighNibble.hnString),
constSymbol ((isConstant + isTag), "cSymbol", HighNibble.hnSymbol),
constMemberName ((isConstant + isTag), "cMemberName", HighNibble.hnSymbol),
constUserTypeDecl ((isConstant + isTag), "cUserTypeDecl", HighNibble.hnSymbol),
none(0);
private int flags;
private String image;
private HighNibble highNibble;
Type() {}
Type(int v) {
this.flags = v;
}
Type(int v, String name) {
flags = v;
image = name;
}
Type(int v, String name, HighNibble ln) {
flags = v;
image = name;
highNibble = ln;
}
/**
* TODO why this class exists at all. We should store
* this stuff directly in IonTimestampImpl as a BigDecimal (time) and
* int (offset, -1==unknown) and avoid constructing more objects.
*
* Also, we probably don't need the DateFormats. The parser already
* matches against a regex, so we should be able to pull the data
* straight from the string to compute the value.
*/
public static class timeinfo { // TODO remove vestigial class timeinfo
static public Timestamp parse(String s) {
Timestamp t = null;
s = s.trim(); // TODO why is this necessary?
try {
t = Timestamp.valueOf(s); // TODO should Timestamp just throw an IonException?
}
catch (IllegalArgumentException e) {
throw new IonException(e);
}
return t;
}
}
// TODO move out of this class into TR.
public Type setNumericValue(IonTokenReader tr, String s) {
switch (this) {
case kwNan:
tr.doubleValue = Double.NaN;
return this;
case kwPosInf:
tr.doubleValue = Double.POSITIVE_INFINITY;
return this;
case kwNegInf:
tr.doubleValue = Double.NEGATIVE_INFINITY;
return this;
case constNegInt:
case constPosInt:
if (NumberType.NT_HEX.equals(tr.numberType)) {
tr.intValue = new BigInteger(s, 16);
// In hex case we've discarded the prefix [+-]?0x so
// reconstruct the sign
if (this == constNegInt) tr.intValue = tr.intValue.negate();
}
else {
tr.intValue = new BigInteger(s, 10);
// Make sure that sign aligns with type.
// Note that this allows negative zero.
assert (BigInteger.ZERO.equals(tr.intValue)
? true
: this == (tr.intValue.signum() < 0 ? constNegInt : constPosInt));
}
return this;
case constFloat:
tr.doubleValue = Double.parseDouble(s);
return this;
case constDecimal:
// BigDecimal parses using e instead of d or D
String eFormat = s.replace('d', 'e');
if (eFormat == s) {
// No match for 'd' but look for 'D'
eFormat = s.replace('D', 'e');
}
tr.decimalValue = Decimal.valueOf(eFormat);
return this;
case constTime:
tr.dateValue = timeinfo.parse(s);
return this;
default:
throw new AssertionError("Unknown op for numeric case: " + this);
}
}
public boolean isKeyword() { return ((flags & isKeyword) != 0); }
public boolean isConstant() { return ((flags & isConstant) != 0); }
public boolean isNumeric() { return ((flags & (isPosInt + isNegInt + isFloat + isDecimal)) != 0); }
public String getImage() { return (image == null) ? this.name() : image; }
public HighNibble getHighNibble() { return highNibble; }
@Override
public String toString() {
if (this.getImage() != null) return this.getImage();
return super.toString();
}
}
//
// this is a temp reader we use when converting blob's into
// blob values
//
static class LocalReader extends Reader {
IonTokenReader _tr;
int _sboffset;
int _sbavailable;
LocalReader(IonTokenReader tr) {
_tr = tr;
}
@Override
public void close() throws IOException {
_tr = null;
return;
}
@Override
public void reset() {
_sboffset = 0;
_sbavailable = _tr.value.length();
}
@Override
public int read() throws IOException {
int c = -1;
if (_sbavailable > 0) {
c = _tr.value.charAt(_sboffset++);
_sbavailable--;
}
else {
c = _tr.read();
}
return c;
}
@Override
public int read(char[] dst, int dstoffset, int len) throws IOException {
int needed = len;
while (needed-- > 0) {
int c = this.read();
if (c < 0) break;
dst[dstoffset++] = (char)c;
}
return len - needed;
}
}
// easy to use number types
static enum NumberType {
NT_POSINT,
NT_NEGINT,
NT_HEX, // pos or neg
NT_FLOAT,
NT_DECIMAL,
NT_DECIMAL_NEGATIVE_ZERO
}
/**
* Magic "character" to represent an escape sequence with an empty expansion.
* (or, in other words, a non-eof character that should be ignored)
*/
static final int EMPTY_ESCAPE_SEQUENCE = -2;
//--------------------------------------------------------------------
//
// this is the parsing context to manage the in list, in struct,
// waiting for identifier tag (usertypedecl or member name) vs value
//
//
static public enum Context {
NONE,
STRING,
BLOB,
CLOB,
EXPRESSION,
DATALIST,
STRUCT;
}
public Stack contextStack = new Stack();
public Context context = Context.NONE;
public void pushContext(Context newcontext) {
contextStack.push(newcontext);
context = newcontext;
}
public Context popContext() {
context = contextStack.pop();
return context;
}
//--------------------------------------------------------------------
//
// state that keeps us on track ...
//
private IonCharacterReader in;
private LocalReader localReader;
private PushbackReader pushbackReader;
public boolean inQuotedContent;
public boolean isIncomplete;
public boolean isLongString;
public boolean quotedIdentifier;
public int embeddedSlash;
public int endquote;
public Type t = Type.none;
public Type keyword = Type.none;
public StringBuilder value = new StringBuilder();
public String stringValue;
public Double doubleValue;
public BigInteger intValue;
public Timestamp dateValue;
public BigDecimal decimalValue;
public boolean isNegative;
public NumberType numberType;
public IonTokenReader(final Reader r) {
this.in = new IonCharacterReader( r );
}
public long getConsumedAmount() {
return in.getConsumedAmount();
}
public int getLineNumber() {
return in.getLineNumber();
}
public int getColumn() {
return in.getColumn();
}
public String position() {
return "line " + this.getLineNumber() + " column " + this.getColumn();
}
public String getValueString(boolean is_in_expression) throws IOException {
if (this.isIncomplete) {
finishScanString(is_in_expression);
stringValue = value.toString(); // TODO combine with below?
this.inQuotedContent = false;
}
else if (stringValue == null) {
stringValue = value.toString();
}
return stringValue;
}
void resetValue() {
isIncomplete = false;
stringValue = null;
doubleValue = null;
intValue = null;
dateValue = null;
decimalValue = null;
isNegative = false;
numberType = null;
t = null;
value.setLength(0);
}
public PushbackReader getPushbackReader() {
if (localReader == null) {
localReader = new LocalReader(this);
pushbackReader =
new PushbackReader(localReader,
_Private_Utils.MAX_LOOKAHEAD_UTF16);
}
localReader.reset();
return pushbackReader ;
}
/**
* Reads the next character using the underlying stream.
* @return -1 on end of stream.
* @throws IOException
*/
final int read() throws IOException {
final int ch = in.read();
assert ch != '\r';
return ch;
}
int readIgnoreWhitespace() throws IOException {
assert !inQuotedContent;
int c;
// read through comments - this detects and rejects double slash
// and slash star style comments and their contents
for (;;) {
c = read();
if (c == '/') { // possibly a start of a comment
int c2 = read();
if (c2 == '/') {
// we have a // comment, scan for the terminating new line
while (c2 != '\n' && c2 != -1) {
c2 = read();
}
c = c2;
}
else if (c2 == '*') {
// we have a /* */ comment scan for closing */
scancomment:
while (c2 != -1) {
c2 = read();
if (c2 == '*') {
c2 = read();
if (c2 == '/') {
break scancomment;
}
unread(c2); // in case this was the '*' we're looking for
}
}
c = read();
}
else {
// it wasn't a comment start, throw it back
unread(c2);
}
}
if (/* inContent ||*/ !isWhitespace(c)) {
break;
}
}
return c;
}
void unread(int c) throws IOException {
this.in.unread(c);
}
// TODO clone the body of next(c) here, later , for perf, we
// don't really want TWO methods calls per character, or
// mostly we don't want an extra one. (and I doubt that
// Java handles the tail optimization)
public Type next(boolean is_in_expression) throws IOException {
inQuotedContent = false;
int c = this.readIgnoreWhitespace();
return next(c, is_in_expression);
}
private Type next(int c, boolean is_in_expression) throws IOException {
int c2;
t = Type.none;
isIncomplete = false;
switch (c) {
case -1:
return (t = Type.eof);
case '{':
c2 = read();
if (c2 == '{') {
return (t = Type.tOpenDoubleCurly);
}
unread(c2);
return (t = Type.tOpenCurly);
case '}':
return (t = Type.tCloseCurly);
case '[':
return (t = Type.tOpenSquare);
case ']':
return (t = Type.tCloseSquare);
case '(':
return (t = Type.tOpenParen);
case ')':
return (t = Type.tCloseParen);
case ',':
return (t = Type.tComma);
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return readNumber(c);
case '\"':
inQuotedContent = true;
this.keyword = Type.none; // anything in quotes isn't a keyword
return scanString(c, _Private_IonConstants.lnIsVarLen - 1);
case '\'':
c2 = read();
if (c2 == '\'') {
c2 = read();
if (c2 == '\'') {
return scanLongString();
}
this.unread(c2);
c2 = '\''; // restore c2 for the next unread
}
this.unread(c2);
inQuotedContent = true;
return scanIdentifier(c);
case '+':
c2 = read();
if (c2 == 'i') {
c2 = read();
if (c2 == 'n') {
c2 = read();
if (c2 == 'f') {
return (t = Type.kwPosInf);
}
this.unread(c2);
c2 = 'n';
}
this.unread(c2);
c2 = 'i';
}
this.unread(c2);
if (is_in_expression) {
return scanOperator(c);
}
break; // break to error
case '-':
c2 = read();
if (c2 >= '0' && c2 <= '9') {
this.unread(c2);
return readNumber(c);
}
if (c2 == 'i') {
c2 = read();
if (c2 == 'n') {
c2 = read();
if (c2 == 'f') {
return (t = Type.kwNegInf);
}
this.unread(c2);
c2 = 'n';
}
this.unread(c2);
c2 = 'i';
}
this.unread(c2);
if (is_in_expression) {
return scanOperator(c);
}
break; // break to error
default:
if (IonTextUtils.isIdentifierStart(c)) {
return scanIdentifier(c);
}
if (is_in_expression && isOperatorPart(c)) {
return scanOperator(c);
}
}
String message =
"Unexpected character " + printCodePointAsString(c) +
" encountered at line " + this.getLineNumber() +
" column " + this.getColumn();
throw new IonException(message);
}
public Type scanIdentifier(int c) throws IOException
{
// reset our local value buffer
resetValue();
// we don't have an identifier type any longer, just string
this.t = Type.constSymbol;
// some strings are keywords, most are not
// mostly we'll guess that it's not a keyword
this.keyword = null;
// first we read the identifier content into our value buffer
// if the value is quoted, use the escape char loop, otherwise
// look for the non-identifier character (and throw it back)
if (!readIdentifierContents(c)) { // not quoted (true is quoted)
// anything in quotes (even single quotes) is not a keyword
// and here we're not in quoted content (that is handled above)
// so see if it's also a keyword
this.keyword = IonTokenReader.matchKeyword(value, 0, value.length());
if (this.keyword != null) {
if (this.keyword == Type.kwNull) {
c = this.read();
if (c == '.') {
int dot = value.length();
value.append((char)c);
c = this.read();
int added = readIdentifierContents(c, IonTokenConstsX.TN_MAX_NAME_LENGTH + 1); // +1 is "enough roap" so if there are extra letters at the end of the keyword we keep at least 1
int kw = IonTokenConstsX.keyword(value, dot+1, dot+added+1);
switch (kw) {
case IonTokenConstsX.KEYWORD_NULL:
case IonTokenConstsX.KEYWORD_BOOL:
case IonTokenConstsX.KEYWORD_INT:
case IonTokenConstsX.KEYWORD_FLOAT:
case IonTokenConstsX.KEYWORD_DECIMAL:
case IonTokenConstsX.KEYWORD_TIMESTAMP:
case IonTokenConstsX.KEYWORD_SYMBOL:
case IonTokenConstsX.KEYWORD_STRING:
case IonTokenConstsX.KEYWORD_BLOB:
case IonTokenConstsX.KEYWORD_CLOB:
case IonTokenConstsX.KEYWORD_LIST:
case IonTokenConstsX.KEYWORD_SEXP:
case IonTokenConstsX.KEYWORD_STRUCT:
this.keyword = setNullType(value, dot + 1, value.length() - dot - 1);
break;
default:
// not a valid type name - so we have to unread back to the dot
for (int ii=value.length(); ii>dot; ) {
ii--;
char uc = value.charAt(ii);
unread(uc);
}
String message =
position()
+ ": Expected Ion type after 'null.' but found: "
+ value;
throw new IonException(message);
}
}
else {
unread(c);
}
}
this.t = this.keyword;
return this.t;
}
}
// see if we're a user type of a member name
c = this.readIgnoreWhitespace();
if (c != ':') {
unread(c);
}
else {
c = read();
if (c != ':') {
unread(c);
this.t = Type.constMemberName;
}
else {
this.t = Type.constUserTypeDecl;
}
}
return this.t;
}
boolean readIdentifierContents(int c) throws IOException {
int quote = c;
inQuotedContent = (quote == '\'' || quote == '\"');
if ((quotedIdentifier = inQuotedContent) == true) {
for (;;) {
c = read();
if (c < 0 || c == quote) break;
if (c == '\\') {
c = readEscapedCharacter(this.in, false /*not clob*/);
}
if (c != EMPTY_ESCAPE_SEQUENCE) {
value.appendCodePoint(c);
}
}
if (c == -1) { // TODO throw UnexpectedEofException
throw new IonException("end encountered before closing quote '\\" + (char)endquote+ "'");
}
// c, at this point is a single quote, which we don't append
inQuotedContent = false;
}
else {
value.append((char)c);
for (;;) {
c = read();
if (!IonTextUtils.isIdentifierPart(c)) {
break;
}
value.append((char)c);
}
unread(c); // we throw back our terminator here
}
return quotedIdentifier;
}
int readIdentifierContents(int c, int max_length) throws IOException {
assert(c != '\'' && c != '\"');
value.append((char)c);
int count = 1;
while (count < max_length) {
c = read();
if (!IonTextUtils.isIdentifierPart(c)) {
unread(c); // we throw back our terminator here
break;
}
value.append((char)c);
count++;
}
return count;
}
static Type matchKeyword(StringBuilder sb, int pos, int valuelen) throws IOException
{
Type keyword = null;
switch (sb.charAt(pos++)) { // there has to be at least 1 chacter or we wouldn't be here
case 'f':
if (valuelen == 5 // "f"
&& sb.charAt(pos++) == 'a'
&& sb.charAt(pos++) == 'l'
&& sb.charAt(pos++) == 's'
&& sb.charAt(pos++) == 'e'
) {
keyword = Type.kwFalse;
}
break;
case 'n':
if (valuelen == 4 // 'n'
&& sb.charAt(pos++) == 'u'
&& sb.charAt(pos++) == 'l'
&& sb.charAt(pos++) == 'l'
) {
keyword = Type.kwNull;
}
else if (valuelen == 3 // 'n'
&& sb.charAt(pos++) == 'a'
&& sb.charAt(pos++) == 'n'
) {
keyword = Type.kwNan;
}
break;
case 't':
if (valuelen == 4 // "t"
&& sb.charAt(pos++) == 'r'
&& sb.charAt(pos++) == 'u'
&& sb.charAt(pos++) == 'e'
) {
keyword = Type.kwTrue;
}
break;
}
return keyword;
}
public Type setNullType(StringBuilder sb, int pos, int valuelen)
{
switch (valuelen) {
case 3:
//int
if (sb.charAt(pos++) == 'i'
&& sb.charAt(pos++) == 'n'
&& sb.charAt(pos++) == 't'
) {
return Type.kwNullInt;
}
break;
case 4:
//bool, blob, list, null, clob, sexp
switch (sb.charAt(pos++)) {
case 'b':
//bool, blob
switch (sb.charAt(pos++)) {
case 'o':
// bool
if (sb.charAt(pos++) == 'o'
&& sb.charAt(pos++) == 'l'
) {
return Type.kwNullBoolean;
}
break;
case 'l':
//blob
if (sb.charAt(pos++) == 'o'
&& sb.charAt(pos++) == 'b'
) {
return Type.kwNullBlob;
}
break;
}
break;
case 'l':
if (sb.charAt(pos++) == 'i'
&& sb.charAt(pos++) == 's'
&& sb.charAt(pos++) == 't'
) {
return Type.kwNullList;
}
break;
case 'n':
if (sb.charAt(pos++) == 'u'
&& sb.charAt(pos++) == 'l'
&& sb.charAt(pos++) == 'l'
) {
return Type.kwNullNull;
}
break;
case 'c':
//clob
if (sb.charAt(pos++) == 'l'
&& sb.charAt(pos++) == 'o'
&& sb.charAt(pos++) == 'b'
) {
return Type.kwNullClob;
}
break;
case 's':
//sexp
if (sb.charAt(pos++) == 'e'
&& sb.charAt(pos++) == 'x'
&& sb.charAt(pos++) == 'p'
) {
return Type.kwNullSexp;
}
break;
}
break;
case 5:
//float
if (sb.charAt(pos++) == 'f'
&& sb.charAt(pos++) == 'l'
&& sb.charAt(pos++) == 'o'
&& sb.charAt(pos++) == 'a'
&& sb.charAt(pos++) == 't'
) {
return Type.kwNullFloat;
}
break;
case 6:
switch (sb.charAt(pos++)) {
case 's':
//string
//struct
//symbol
switch(sb.charAt(pos++)) {
case 't':
if (sb.charAt(pos++) == 'r') {
switch (sb.charAt(pos++)) {
case 'i':
if (sb.charAt(pos++) == 'n' && sb.charAt(pos++) == 'g') {
return Type.kwNullString;
}
break;
case 'u':
if (sb.charAt(pos++) == 'c' && sb.charAt(pos++) == 't') {
return Type.kwNullStruct;
}
break;
}
}
break;
case 'y':
if (sb.charAt(pos++) == 'm'
&& sb.charAt(pos++) == 'b'
&& sb.charAt(pos++) == 'o'
&& sb.charAt(pos++) == 'l'
) {
return Type.kwNullSymbol;
}
break;
}
}
break;
case 7:
//decimal
if (sb.charAt(pos++) == 'd'
&& sb.charAt(pos++) == 'e'
&& sb.charAt(pos++) == 'c'
&& sb.charAt(pos++) == 'i'
&& sb.charAt(pos++) == 'm'
&& sb.charAt(pos++) == 'a'
&& sb.charAt(pos++) == 'l'
) {
return Type.kwNullDecimal;
}
break;
case 9:
//timestamp
if (sb.charAt(pos++) == 't'
&& sb.charAt(pos++) == 'i'
&& sb.charAt(pos++) == 'm'
&& sb.charAt(pos++) == 'e'
&& sb.charAt(pos++) == 's'
&& sb.charAt(pos++) == 't'
&& sb.charAt(pos++) == 'a'
&& sb.charAt(pos++) == 'm'
&& sb.charAt(pos++) == 'p'
) {
return Type.kwNullTimestamp;
}
break;
}
String nullimage = sb.toString();
throw new IonException("invalid null value '"+nullimage+"' at " + this.position());
}
public Type scanOperator(int c) throws IOException
{
// reset our local value buffer
resetValue();
// we don't have an identifier type any longer, just string
this.t = Type.constSymbol;
// some strings are keywords, most are not
// mostly we'll guess that it's not a keyword
this.keyword = null;
// first we read the identifier content into our value buffer
// if the value is quoted, use the escape char loop, otherwise
// look for the non-identifier character (and throw it back)
value.append((char)c);
for (;;) {
c = read();
if (!IonTextUtils.isOperatorPart(c)) {
break;
}
value.append((char)c);
}
unread(c); // we throw back our terminator here
return this.t;
}
public Type scanString(int c, int maxlookahead) throws IOException
{
// reset out local value buffer
resetValue();
if (c != '\"') {
throw new IonException("Programmer error! Only a quote should get you here.");
}
endquote = '\"';
sizedloop:
while (maxlookahead-- > 0) {
switch ((c = this.read())) {
case -1: break sizedloop; // TODO deoptimize, throw exception
case '\"': break sizedloop;
case '\n':
throw new IonException("unexpected line terminator encountered in quoted string");
case '\\':
c = readEscapedCharacter(this.in, false /*not clob*/);
// throws UnexpectedEofException on EOF
if (c != EMPTY_ESCAPE_SEQUENCE) {
value.appendCodePoint(c);
}
break;
default:
// c could be part of a surrogate pair so we can't use
// appendCodePoint()
value.append((char)c);
break;
}
}
if (maxlookahead != -1 && c == '\"') {
// this is the normal, non-longline case so we're just done
closeString();
}
else {
// we're not at the closing quote, so this is an incomplete
// string which will have to be read to the end ... later
leaveOpenString(c, false);
}
return Type.constString;
}
void leaveOpenString(int c, boolean longstring) {
if (c == -1) {
throw new UnexpectedEofException();
}
this.isIncomplete = true;
this.inQuotedContent = true;
this.isLongString = longstring;
}
/**
* Scans the rest of the string through {@link #endquote}.
*
XXX WARNING XXX
* Almost identical logic is found in
* {@link IonBinary#appendToLongValue(int, boolean, boolean, PushbackReader)}
*
* @param is_in_expression
* @throws UnexpectedEofException if we hit EOF before the endquote.
*/
void finishScanString(boolean is_in_expression) throws IOException
{
assert isIncomplete;
assert inQuotedContent;
for (;;)
{
int c = this.read();
if (c == -1) {
throw new UnexpectedEofException();
}
if (c == endquote) break; // endquote == -1 during long strings
if (c == '\\') {
c = readEscapedCharacter(this.in, false /*not clob*/);
if (c != EMPTY_ESCAPE_SEQUENCE) {
value.appendCodePoint(c);
}
}
else if (c == '\'' && isLongString) {
// This happens while handling triple-quote field names
if (twoMoreSingleQuotes()) {
// At the end of a long string. Look for another one.
inQuotedContent = false;
c = readIgnoreWhitespace();
if (c == '\'' && twoMoreSingleQuotes()) {
// Yep, another triple-quote. We've consumed all three
// so just keep reading.
inQuotedContent = true;
}
else {
// Something else, we are done.
unread(c);
break;
}
}
else {
// Still inside the long string
value.append((char)c);
}
}
else if (!isLongString && (c == '\n')) {
throw new IonException("unexpected line terminator encountered in quoted string");
}
else {
value.append((char)c);
}
}
return;
}
/**
* If two single quotes are next on the input, consume them and return
* true. Otherwise, leave them on the input and return false.
* @return true when there are two pending single quotes.
* @throws IOException
*/
private boolean twoMoreSingleQuotes() throws IOException
{
int c = read();
if (c == '\'') {
int c2 = read();
if (c2 == '\'') {
return true;
}
unread(c2);
}
unread(c);
return false;
}
void closeString() {
this.isIncomplete = false;
this.inQuotedContent = false;
this.isLongString = false;
}
public Type scanLongString() throws IOException
{
// reset out local value buffer
resetValue();
endquote = -1;
// we're not at the closing quote, so this is an incomplete
// string which will have to be read to the end ... later
leaveOpenString(EMPTY_ESCAPE_SEQUENCE, true);
return Type.constString;
}
/**
* Closes parsing of a triple-quote string, remembering that there may be
* more to come.
*/
void closeLongString() {
this.isIncomplete = true;
this.inQuotedContent = false;
this.isLongString = true;
}
/**
* @return the translated escape sequence. Will not return -1 since EOF is
* not legal in this context. May return {@link #EMPTY_ESCAPE_SEQUENCE} to
* indicate a zero-length sequence such as BS-NL.
* @throws UnexpectedEofException if the EOF is encountered in the middle
* of the escape sequence.
* */
public static int readEscapedCharacter(PushbackReader r, boolean inClob)
throws IOException, UnexpectedEofException
{
// \\ The backslash character
// \0 The character 0 (nul terminator char, for some)
// \xhh The character with hexadecimal value 0xhh
// \ u hhhh The character with hexadecimal value 0xhhhh
// \t The tab character ('\u0009')
// \n The newline (line feed) character ('\ u 000A')
// \v The vertical tab character ('\u000B')
// \r The carriage-return character ('\ u 000D')
// \f The form-feed character ('\ u 000C')
// \a The alert (bell) character ('\ u 0007')
// \" The double quote character
// \' The single quote character
// \? The question mark character
int c2 = 0, c = r.read();
switch (c) {
case -1:
throw new UnexpectedEofException();
case 't': return '\t';
case 'n': return '\n';
case 'v': return '\u000B';
case 'r': return '\r';
case 'f': return '\f';
case 'b': return '\u0008';
case 'a': return '\u0007';
case '\\': return '\\';
case '\"': return '\"';
case '\'': return '\'';
case '/': return '/';
case '?': return '?';
case 'U':
// Expecting 8 hex digits
if (inClob) {
throw new IonException("Unicode escapes \\U not allowed in clob");
}
c = readDigit(r, 16, true);
if (c < 0) break; // TODO throw UnexpectedEofException
c2 = c << 28;
c = readDigit(r, 16, true);
if (c < 0) break;
c2 += c << 24;
c = readDigit(r, 16, true);
if (c < 0) break;
c2 += c << 20;
c = readDigit(r, 16, true);
if (c < 0) break;
c2 += c << 16;
// ... fall through...
case 'u':
// Expecting 4 hex digits
if (inClob) {
throw new IonException("Unicode escapes \\u not allowed in clob");
}
c = readDigit(r, 16, true);
if (c < 0) break;
c2 += c << 12;
c = readDigit(r, 16, true);
if (c < 0) break;
c2 += c << 8;
// ... fall through...
case 'x':
// Expecting 2 hex digits
c = readDigit(r, 16, true);
if (c < 0) break;
c2 += c << 4;
c = readDigit(r, 16, true);
if (c < 0) break;
return c2 + c;
case '0':
return 0;
case '\n':
return EMPTY_ESCAPE_SEQUENCE;
default:
break;
}
throw new IonException("invalid escape sequence \"\\"
+ (char) c + "\" [" + c + "]");
}
public static int readDigit(/*EscapedCharacterReader*/PushbackReader r, int radix, boolean isRequired) throws IOException {
int c = r.read();
if (c < 0) {
r.unread(c);
return -1;
}
int c2 = Character.digit(c, radix);
if (c2 < 0) {
if (isRequired) {
throw new IonException("bad digit in escaped character '"+((char)c)+"'");
}
// if it's not a required digit, we just throw it back
r.unread(c);
return -1;
}
return c2;
}
private void checkAndUnreadNumericStopper(int c) throws IOException
{
// Ignore EOF, so we don't "unread" it - EOF is a terminator
if (c != -1) {
if ( ! this.isValueTerminatingCharacter(c) ) {
final String message =
position() + ": Numeric value followed by invalid character "
+ printCodePointAsString(c);
throw new IonException(message);
}
this.unread(c);
}
}
private final boolean isValueTerminatingCharacter(int c) throws IOException
{
boolean isTerminator;
if (c == '/') {
// this is terminating only if it starts a comment of some sort
c = this.read();
this.unread(c); // we never "keep" this character
isTerminator = (c == '/' || c == '*');
}
else {
isTerminator = IonTextUtils.isNumericStop(c);
}
return isTerminator;
}
public Type readNumber(int c) throws IOException {
// clear out our string buffer
resetValue();
// process the initial sign character, if its there.
boolean explicitPlusSign = false;
switch (c) {
case '-':
value.append((char)c);
c = this.read();
t = Type.constNegInt;
this.numberType = NumberType.NT_NEGINT;
this.isNegative = true;
break;
case '+':
// we eat the plus sign, but remember we saw one
explicitPlusSign = true;
c = this.read();
t = Type.constPosInt;
this.isNegative = false;
this.numberType = NumberType.NT_POSINT;
break;
default:
t = Type.constPosInt;
this.numberType = NumberType.NT_POSINT;
this.isNegative = false;
break;
}
// process the initial digit. Caller has checked that it's a digit.
assert isDigit(c, 10);
value.append((char)c);
boolean isZero = (c == '0');
boolean leadingZero = isZero;
// process the next character after the initial digit.
switch((c = this.read())) {
case 'x':
case 'X':
if (!isZero) {
throw new IonException("badly formed number encountered at " + position());
}
this.numberType = NumberType.NT_HEX;
// We don't need to append the char because we're going to wipe
// the value anyway to accumulate just the hex digits.
return scanHexNumber();
case '.':
case 'd':
case 'D':
t = Type.constDecimal;
this.numberType = NumberType.NT_DECIMAL;
value.append((char)c);
break;
case 'e':
case 'E':
t = Type.constFloat;
this.numberType = NumberType.NT_FLOAT;
value.append((char)c);
break;
default:
if (!isDigit(c, 10)) {
checkAndUnreadNumericStopper(c);
if (isZero && NumberType.NT_NEGINT.equals(this.numberType)) {
t = Type.constPosInt;
this.numberType = NumberType.NT_POSINT;
}
return t;
}
value.append((char)c);
isZero &= (c == '0');
break;
}
// see if see can continue find leading digits
if (NumberType.NT_NEGINT.equals(numberType)
|| NumberType.NT_POSINT.equals(numberType)
) {
// We've now scanned at least two digits.
// read in the remaining whole number digits
for (;;) {
c = this.read();
if (!isDigit(c, 10)) break;
value.append((char)c);
isZero &= (c == '0');
}
// and see what we ran aground on ..
switch (c) {
case '.':
case 'd':
case 'D':
if (leadingZero) {
throw new IonException(position() + ": Invalid leading zero on numeric");
}
t = Type.constDecimal;
this.numberType = NumberType.NT_DECIMAL;
value.append((char)c);
break;
case 'e':
case 'E':
if (leadingZero) {
throw new IonException(position() + ": Invalid leading zero on numeric");
}
t = Type.constFloat;
this.numberType = NumberType.NT_FLOAT;
value.append((char)c);
break;
case 'T': // same as '-' it's a timestamp
case '-':
if (NumberType.NT_POSINT.equals(this.numberType) && !explicitPlusSign) {
return scanTimestamp(c);
}
// otherwise fall through to "all other" processing
default:
// Hit the end of our integer. Make sure its a valid stopper.
checkAndUnreadNumericStopper(c);
if (leadingZero && !isZero) {
throw new IonException(position() + ": Invalid leading zero on numeric");
}
if (isZero && NumberType.NT_NEGINT.equals(this.numberType)) {
t = Type.constPosInt;
this.numberType = NumberType.NT_POSINT;
}
return t;
}
}
// now we must have just decimal or float
// if it's decimal we ended on a DOT, so get the trailing
// (fractional) digits
if (NumberType.NT_DECIMAL.equals(this.numberType)) {
for (;;) {
c = this.read();
if (!isDigit(c, 10)) break;
value.append((char)c);
}
// and see what we ran aground on this time..
switch (c) {
case '-':
case '+':
// We'll handle exponent below.
this.unread(c);
break;
case 'e':
case 'E':
// this is the only viable option now
t = Type.constFloat;
this.numberType = NumberType.NT_FLOAT;
value.append((char)c);
break;
case 'd':
case 'D':
// this is the only viable option now
assert t == Type.constDecimal;
assert NumberType.NT_DECIMAL.equals(this.numberType);
value.append((char)c);
break;
default:
checkAndUnreadNumericStopper(c);
return t;
}
}
// now we have the exponent part to read
// reading the first character of the exponent, it can be
// a sign character or a digit (this is the only place
// the sign is valid) and we're for sure a float at this point
switch ((c = this.read())) {
case '-':
value.append((char)c);
break;
case '+':
// we eat the plus sign
break;
default:
if (!isDigit(c, 10)) {
// this they said 'e' then they better put something valid after it!
throw new IonException("badly formed number encountered at " + position());
}
this.unread(c);
// we'll re-read this in just a bit, but we had to check
break;
}
// read in the remaining whole number digits and then quit
for (;;) {
c = this.read();
if (!isDigit(c, 10)) break;
value.append((char)c);
}
checkAndUnreadNumericStopper(c);
return t;
}
Type scanHexNumber() throws IOException
{
boolean anydigits = false;
int c;
boolean isZero = true;
// Leave the 0x in the buffer since we'll need it to parse the content.
// first get clear out our string buffer, we're starting over
value.setLength(0); // resetValue() wipes out too much
// read in the remaining whole number digits and then quit
for (;;) {
c = this.read();
if (!isDigit(c, 16)) break;
anydigits = true;
isZero &= (c == '0');
value.append((char)c);
}
if (!anydigits) {
throw new IonException("badly formed hexadecimal number encountered at " + position());
}
checkAndUnreadNumericStopper(c);
if (isZero && t == Type.constNegInt) {
t = Type.constPosInt;
}
// as long as we saw any digit, we're good
return t;
}
/**
* Scans a timestamp after reading yyyy-
.
*
* We can be a little lenient here since the result will be reparsed and
* validated more thoroughly by {@link Timestamp#valueOf(CharSequence)}.
*
* @param c the last character scanned; must be '-'
.
* @return {@link Type#constTime}
*/
Type scanTimestamp(int c) throws IOException {
endofdate:
for (;;) { // fake for loop to create a label we can jump out of,
// because 4 or 5 levels of nested if's is just ugly
// at this point we will have read leading digits and exactly 1 dash
// in other words, we'll have read the year
if (c == 'T') {
// yearT is a valid timestamp value
value.append((char)c);
c = this.read(); // because we'll unread it before we return
break endofdate;
}
if (c != '-') {
// not a dash or a T after the year - so this is a bad value
throw new IllegalStateException("invalid timestamp, expecting a dash here at " + this.position());
}
// append the dash and then read the month field
value.append((char)c); // so append it, because we haven't already
c = readDigits(2, "month");
if (c == 'T') {
// year-monthT is a valid timestamp value
value.append((char)c);
c = this.read(); // because we'll unread it before we return
break endofdate;
}
if (c != '-') {
// if the month isn't followed by a dash or a T it's an invalid month
throw new IonException("invalid timestamp, expecting month at " + this.position());
}
// append the dash and read the day (or day-of-month) field
value.append((char)c);
c = readDigits(2, "day of month");
if (c == 'T') {
check4timezone:
for (;;) { // another fake label/ for=goto
// attach the 'T' to the value we're collecting
value.append((char)c);
// we're going to "watch" how many digits we read in the hours
// field. It's 0 that's actually ok, since we can end at the
// 'T' we just read
int length_before_reading_hours = value.length();
// so read the hours
c = readDigits(2, "hours");
if (length_before_reading_hours == value.length()) {
// FIXME I don't think there should be a timezone here
break check4timezone;
}
if (c != ':') {
throw new IonException("invalid timestamp, expecting hours at " + this.position());
}
value.append((char)c);
// so read the minutes
c = readDigits(2, "minutes");
if (c != ':') {
if (c == '-' || c == '+' || c == 'Z') {
break check4timezone;
}
break endofdate;
}
value.append((char)c);
// so read the seconds
c = readDigits(2, "seconds");
if (c != '.') {
if (c == '-' || c == '+' || c == 'Z') {
break check4timezone;
}
break endofdate;
}
value.append((char)c);
// so read the fractional seconds
c = readDigits(32, "fractional seconds");
break check4timezone;
}//check4timezone
// now check to see if it's a timezone offset we're looking at
if (c == '-' || c == '+') {
value.append((char)c);
// so read the timezone offset
c = readDigits(2, "timezone offset");
if (c != ':') break endofdate;
value.append((char)c);
c = readDigits(2, "timezone offset");
}
else if (c == 'Z') {
value.append((char)c);
c = this.read(); // because we'll unread it before we return
}
}
break endofdate;
}//endofdate
checkAndUnreadNumericStopper(c);
return Type.constTime;
}
int readDigits(int limit, String field) throws IOException {
int c, len = 0;
// read in the remaining whole number digits and then quit
for (;;) {
c = this.read();
if (!isDigit(c, 10)) break;
len++;
if (len > limit) {
throw new IonException("invalid format " + field + " too long at " + this.position());
}
value.append((char)c);
}
return c;
}
public boolean makeValidNumeric(Type castto) throws IOException {
String s = getValueString(false);
try
{
this.t = castto.setNumericValue(this, s);
}
catch (NumberFormatException e)
{
throw new IonException(this.position() + ": invalid numeric value " + s, e);
}
return this.t.isNumeric();
}
}