net.sf.saxon.ma.json.JsonParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of Saxon-HE Show documentation
Show all versions of Saxon-HE Show documentation
The XSLT and XQuery Processor
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.ma.json;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.functions.SystemFunction;
import net.sf.saxon.om.Function;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.NameChecker;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.serialize.charcode.UTF16CharacterSet;
import net.sf.saxon.str.StringView;
import net.sf.saxon.trans.Err;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.transpile.CSharp;
import net.sf.saxon.transpile.CSharpSimpleEnum;
import net.sf.saxon.type.SpecificFunctionType;
import net.sf.saxon.type.StringToDouble;
import net.sf.saxon.value.*;
import java.util.Map;
/**
* Parser for JSON, which notifies parsing events to a JsonHandler
*/
public class JsonParser {
// Option flags. Each constant is a distinct bit; they are OR-ed together into the
// "flags" integer that is passed to the parsing methods of this class.
// Set by getFlags() when the "escape" option is true.
public static final int ESCAPE = 1;
// Set by getFlags() together with LIBERAL when the "liberal" option is true.
public static final int ALLOW_ANY_TOP_LEVEL = 2;
// Relaxed parsing: unquoted property names, trailing commas, unknown escapes and
// non-standard numeric literals are tolerated when this bit is set.
public static final int LIBERAL = 4;
// Set by getFlags() when the "validate" option is true (and validation is permitted).
public static final int VALIDATE = 8;
// When set, parseConstruct() writes each token it processes to System.err.
public static final int DEBUG = 16;
// Duplicate-key policies, set by getFlags() from the "duplicates" option
// ("retain", "use-last", "use-first" and "reject" respectively).
public static final int DUPLICATES_RETAINED = 32;
public static final int DUPLICATES_LAST = 64;
public static final int DUPLICATES_FIRST = 128;
public static final int DUPLICATES_REJECTED = 256;
// Mask covering all four duplicate-key policy bits.
public static final int DUPLICATES_SPECIFIED = DUPLICATES_FIRST | DUPLICATES_LAST | DUPLICATES_RETAINED | DUPLICATES_REJECTED;
// Maximum depth of nested objects/arrays; exceeding it is reported as an error by parseConstruct().
public static final int NESTING_LIMIT = 10000;
// Error codes attached to the XPathExceptions thrown by this class.
private static final String ERR_GRAMMAR = "FOJS0001";
private static final String ERR_DUPLICATE = "FOJS0003";
private static final String ERR_SCHEMA = "FOJS0004";
private static final String ERR_OPTIONS = "FOJS0005";
// Deliberately the same code as ERR_GRAMMAR.
private static final String ERR_LIMITS = "FOJS0001"; // No specific code in spec
// Optional user-supplied function used to convert numeric literals (see setNumberParser());
// when null, numeric literals are converted to xs:double.
private Function numberParser = null;
// Current depth of nested objects/arrays; incremented and decremented in parseConstruct().
private int nesting;
/**
 * Construct a JSON parser whose nesting depth counter starts at zero.
 */
public JsonParser() {
    this.nesting = 0;
}
/**
 * Parse a complete JSON text, reporting the parsing events to the supplied handler.
 * <p>The input must contain exactly one JSON construct; anything other than
 * whitespace after that construct is reported as an error.</p>
 *
 * @param input   the JSON text to be parsed
 * @param flags   parsing options, as a bit-significant integer built from the
 *                constants of this class (see {@link #getFlags})
 * @param handler event handler to which parsing events are notified
 * @param context XPath evaluation context
 * @throws XPathException if the input is empty or is not syntactically valid JSON
 */
public void parse(String input, int flags, JsonHandler handler, XPathContext context) throws XPathException {
    if (input.isEmpty()) {
        invalidJSON("An empty string is not valid JSON", ERR_GRAMMAR, 1);
    }
    final JsonTokenizer tokenizer = new JsonTokenizer(input);
    // Position the tokenizer on the first token before descending into the grammar
    tokenizer.next();
    try {
        parseConstruct(handler, tokenizer, flags, context);
    } catch (IllegalStateException ise) {
        // e.g. unmatched surrogate pairs
        invalidJSON(ise.getMessage(), ERR_GRAMMAR, tokenizer.lineNumber);
    }
    final JsonToken trailing = tokenizer.next();
    if (trailing != JsonToken.EOF) {
        invalidJSON("Unexpected token beyond end of JSON input", ERR_GRAMMAR, tokenizer.lineNumber);
    }
}
/**
 * Extract the requested JSON parsing options as a set of flags in a bit-significant integer.
 * <p>Options that are absent from the map are treated as not requested; in particular a
 * missing "validate" entry is treated as {@code false} rather than causing a failure.</p>
 *
 * @param options       the supplied options map (option name to option value)
 * @param allowValidate true if the validate option is permitted; when false the
 *                      "validate" entry is ignored
 * @param isSchemaAware true if the processor is schema-aware (only relevant when allowValidate=true)
 * @return the options as a set of flags
 * @throws XPathException if any options are invalid or mutually inconsistent
 */
public static int getFlags(Map options, boolean allowValidate, boolean isSchemaAware) throws XPathException {
    int flags = 0;
    BooleanValue debug = (BooleanValue) options.get("debug");
    if (debug != null && debug.getBooleanValue()) {
        flags |= DEBUG;
    }
    BooleanValue escape = (BooleanValue) options.get("escape");
    if (escape != null && escape.getBooleanValue()) {
        flags |= ESCAPE;
        if (options.get("fallback") != null) {
            throw new XPathException("Cannot specify a fallback function when escape=true", ERR_OPTIONS);
        }
    }
    BooleanValue liberal = (BooleanValue) options.get("liberal");
    if (liberal != null && liberal.getBooleanValue()) {
        flags |= LIBERAL;
        flags |= ALLOW_ANY_TOP_LEVEL;
    }
    boolean validate = false;
    if (allowValidate) {
        // Guard against an absent option, consistently with the other boolean options
        BooleanValue validateOption = (BooleanValue) options.get("validate");
        validate = validateOption != null && validateOption.getBooleanValue();
        if (validate) {
            if (!isSchemaAware) {
                error("Requiring validation on non-schema-aware processor", ERR_SCHEMA);
            }
            flags |= VALIDATE;
        }
    }
    if (options.containsKey("duplicates")) {
        String duplicates = ((StringValue) options.get("duplicates")).getStringValue();
        switch (duplicates) {
            case "reject":
                flags |= DUPLICATES_REJECTED;
                break;
            case "use-last":
                flags |= DUPLICATES_LAST;
                break;
            case "use-first":
                flags |= DUPLICATES_FIRST;
                break;
            case "retain":
                flags |= DUPLICATES_RETAINED;
                break;
            default:
                error("Invalid value for 'duplicates' option", ERR_OPTIONS);
                break;
        }
        if (validate && "retain".equals(duplicates)) {
            error("The options validate:true and duplicates:retain cannot be used together", ERR_OPTIONS);
        }
    }
    return flags;
}
/**
 * Parse a JSON construct (top-level or nested), dispatching on the tokenizer's current token.
 * <p>The nesting depth counter is restored even when parsing of a nested object or array
 * fails, so that a failed parse does not leave this parser with a stale depth.</p>
 *
 * @param handler   the handler to generate the result
 * @param tokenizer the tokenizer, positioned at the first token of the construct to be read
 * @param flags     parsing options
 * @param context   XPath evaluation context
 * @throws net.sf.saxon.trans.XPathException if a dynamic error occurs (for example, invalid JSON input,
 *                                           or nesting deeper than {@link #NESTING_LIMIT})
 */
private void parseConstruct(JsonHandler handler, JsonTokenizer tokenizer, int flags, XPathContext context) throws XPathException {
    boolean debug = (flags & DEBUG) != 0;
    if (debug) {
        System.err.println("token:" + tokenizer.currentToken + " :" + tokenizer.currentTokenValue);
    }
    if (nesting > NESTING_LIMIT) {
        // Needed for C#, because we can't rely on catching StackOverflow
        invalidJSON("Objects are too deeply nested", ERR_LIMITS, tokenizer.lineNumber);
    }
    JsonToken tok = tokenizer.currentToken;
    switch (tok) {
        case LCURLY:
            nesting++;
            try {
                parseObject(handler, tokenizer, flags, context);
            } finally {
                // Always undo the increment, including when an exception propagates
                nesting--;
            }
            break;
        case LSQB:
            nesting++;
            try {
                parseArray(handler, tokenizer, flags, context);
            } finally {
                nesting--;
            }
            break;
        case NUMERIC_LITERAL:
            String lexical = tokenizer.currentTokenValue.toString();
            AtomicValue d = parseNumericLiteral(lexical, flags, tokenizer.lineNumber, context);
            handler.writeNumeric(lexical, d);
            break;
        case TRUE:
            handler.writeBoolean(true);
            break;
        case FALSE:
            handler.writeBoolean(false);
            break;
        case NULL:
            handler.writeNull();
            break;
        case STRING_LITERAL:
            String literal = tokenizer.currentTokenValue.toString();
            handler.writeString(unescape(literal, flags, ERR_GRAMMAR, tokenizer.lineNumber));
            break;
        default:
            invalidJSON("Unexpected symbol: " + tokenizer.currentTokenValue, ERR_GRAMMAR, tokenizer.lineNumber);
            break;
    }
}
/**
 * Parse a JSON object (or map), i.e. construct delimited by curly braces.
 * <p>Each entry is read as a key, a colon and a value. In liberal mode an unquoted
 * name is accepted as a key and a trailing comma before the closing brace is tolerated.
 * Duplicate keys are handled according to the DUPLICATES_* bits in {@code flags}.</p>
 *
 * @param handler   the handler to generate the result
 * @param tokenizer the tokenizer, positioned at the object to be read
 * @param flags     parsing options as a set of flags
 * @param context   XPath evaluation context
 * @throws net.sf.saxon.trans.XPathException if a dynamic error occurs (such as invalid JSON input)
 */
private void parseObject(JsonHandler handler, JsonTokenizer tokenizer, int flags, XPathContext context) throws XPathException {
    boolean liberal = (flags & LIBERAL) != 0;
    handler.startMap();
    JsonToken tok = tokenizer.next();
    while (tok != JsonToken.RCURLY) {
        if (tok != JsonToken.STRING_LITERAL && !(tok == JsonToken.UNQUOTED_STRING && liberal)) {
            // The closing parenthesis belongs to the message, not to the token value
            invalidJSON("Property name must be a string literal (found "
                            + showToken(tok, tokenizer.currentTokenValue.toString()) + ")",
                    ERR_GRAMMAR, tokenizer.lineNumber);
        }
        String key = tokenizer.currentTokenValue.toString();
        key = unescape(key, flags, ERR_GRAMMAR, tokenizer.lineNumber);
        String reEscaped = handler.reEscape(key);
        tok = tokenizer.next();
        if (tok != JsonToken.COLON) {
            invalidJSON("Missing colon after \"" + Err.wrap(key) + "\"", ERR_GRAMMAR, tokenizer.lineNumber);
        }
        // Advance to the first token of the value
        tokenizer.next();
        boolean duplicate = handler.setKey(key, reEscaped);
        if (duplicate && ((flags & DUPLICATES_REJECTED) != 0)) {
            invalidJSON("Duplicate key value \"" + Err.wrap(key) + "\"", ERR_DUPLICATE, tokenizer.lineNumber);
        }
        try {
            if (!duplicate || ((flags & (DUPLICATES_LAST | DUPLICATES_RETAINED)) != 0)) {
                parseConstruct(handler, tokenizer, flags, context);
            } else {
                // use-first: parse the duplicate value (to consume and check it) but discard the result
                JsonHandler h2 = new JsonHandler();
                h2.setContext(context);
                parseConstruct(h2, tokenizer, flags, context);
            }
        } catch (StackOverflowError e) {
            invalidJSON("Objects are too deeply nested", ERR_LIMITS, tokenizer.lineNumber);
        }
        tok = tokenizer.next();
        if (tok == JsonToken.COMMA) {
            tok = tokenizer.next();
            if (tok == JsonToken.RCURLY) {
                if (liberal) {
                    break; // tolerate the trailing comma
                } else {
                    invalidJSON("Trailing comma after entry in object", ERR_GRAMMAR, tokenizer.lineNumber);
                }
            }
        } else if (tok == JsonToken.RCURLY) {
            break;
        } else {
            invalidJSON("Unexpected token after value of \"" + Err.wrap(key) + "\" property", ERR_GRAMMAR, tokenizer.lineNumber);
        }
    }
    handler.endMap();
}
/**
 * Parse a JSON array, i.e. a construct delimited by square brackets.
 * <p>Members are separated by commas; in liberal mode a comma immediately before the
 * closing bracket is tolerated, otherwise it is reported as an error.</p>
 *
 * @param handler   the handler to generate the result
 * @param tokenizer the tokenizer, positioned at the opening bracket of the array
 * @param flags     parsing options
 * @param context   XPath evaluation context
 * @throws net.sf.saxon.trans.XPathException if a dynamic error occurs (such as invalid JSON input)
 */
private void parseArray(JsonHandler handler, JsonTokenizer tokenizer, int flags, XPathContext context) throws XPathException {
    final boolean lenient = (flags & LIBERAL) != 0;
    handler.startArray();
    if (tokenizer.next() == JsonToken.RSQB) {
        // Empty array: nothing further to read
        handler.endArray();
        return;
    }
    boolean moreMembers = true;
    while (moreMembers) {
        try {
            parseConstruct(handler, tokenizer, flags, context);
        } catch (StackOverflowError overflow) {
            invalidJSON("Arrays are too deeply nested", ERR_LIMITS, tokenizer.lineNumber);
        }
        final JsonToken separator = tokenizer.next();
        if (separator == JsonToken.RSQB) {
            moreMembers = false;
        } else if (separator != JsonToken.COMMA) {
            invalidJSON("Unexpected token (" + showToken(separator, tokenizer.currentTokenValue.toString()) +
                    ") after entry in array", ERR_GRAMMAR, tokenizer.lineNumber);
        } else if (tokenizer.next() == JsonToken.RSQB) {
            // A comma directly followed by the closing bracket
            if (!lenient) {
                invalidJSON("Trailing comma after entry in array", ERR_GRAMMAR, tokenizer.lineNumber);
            }
            moreMembers = false; // tolerate the trailing comma
        }
    }
    handler.endArray();
}
/**
 * Convert the lexical form of a JSON number to an atomic value.
 * <p>Unless liberal parsing is requested, the literal is first checked against the strict
 * JSON number rules (no leading plus sign, no redundant leading zeroes, no empty integer
 * or fractional part). The conversion itself is delegated to the number-parser function
 * if one has been set, and otherwise produces an xs:double.</p>
 *
 * @param token      the numeric literal to be parsed and converted
 * @param flags      parsing options
 * @param lineNumber line number used in error messages
 * @param context    XPath evaluation context, used when calling the number-parser function
 * @return the result of parsing and conversion to XDM
 * @throws net.sf.saxon.trans.XPathException if a dynamic error occurs (such as invalid JSON input)
 */
private AtomicValue parseNumericLiteral(String token, int flags, int lineNumber, XPathContext context) throws XPathException {
    final boolean strict = (flags & LIBERAL) == 0;
    try {
        if (strict) {
            // extra checks on the number disabled by choosing spec="liberal"
            if (token.startsWith("+")) {
                invalidJSON("Leading + sign not allowed: " + token, ERR_GRAMMAR, lineNumber);
            } else {
                final String unsigned = token.startsWith("-") ? token.substring(1) : token;
                final boolean zeroPrefixAllowed = unsigned.equals("0")
                        || unsigned.startsWith("0.")
                        || unsigned.startsWith("0e")
                        || unsigned.startsWith("0E");
                if (unsigned.startsWith("0") && !zeroPrefixAllowed) {
                    invalidJSON("Redundant leading zeroes not allowed: " + token, ERR_GRAMMAR, lineNumber);
                }
                if (unsigned.endsWith(".") || unsigned.contains(".e") || unsigned.contains(".E")) {
                    invalidJSON("Empty fractional part not allowed", ERR_GRAMMAR, lineNumber);
                }
                if (unsigned.startsWith(".")) {
                    invalidJSON("Empty integer part not allowed", ERR_GRAMMAR, lineNumber);
                }
            }
        }
        if (numberParser == null) {
            return new DoubleValue(StringToDouble.getInstance().stringToNumber(StringView.tidy(token)));
        }
        // Delegate the conversion to the user-supplied function
        final Sequence[] arguments = new Sequence[]{new StringValue(token)};
        final Sequence converted = SystemFunction.dynamicCall(numberParser, context, arguments).head();
        return (AtomicValue) converted.head();
    } catch (NumberFormatException nfe) {
        invalidJSON("Invalid numeric literal: " + nfe.getMessage(), ERR_GRAMMAR, lineNumber);
        return DoubleValue.NaN;
    }
}
/**
 * Unescape a JSON string literal, replacing backslash escape sequences by the
 * characters they represent.
 * <p>A {@code \\u} escape is only recognised when it is followed by exactly four ASCII
 * hexadecimal digits; a leading sign or any other character makes it invalid. In liberal
 * mode an invalid {@code \\u} is copied through as the two characters backslash and 'u',
 * and an unknown escape {@code \\x} yields the character x; otherwise both are errors.</p>
 *
 * @param literal    the string literal to be processed (without its enclosing quotes)
 * @param flags      parsing options
 * @param errorCode  error code to use in any exception that is thrown
 * @param lineNumber the line number (not currently included in the error messages)
 * @return the unescaped string; the input itself if it contains no backslash
 * @throws net.sf.saxon.trans.XPathException if the literal contains an invalid escape sequence
 */
public static String unescape(String literal, int flags, String errorCode, int lineNumber) throws XPathException {
    if (literal.indexOf('\\') < 0) {
        return literal;
    }
    boolean liberal = (flags & LIBERAL) != 0;
    StringBuilder buffer = new StringBuilder(literal.length());
    for (int i = 0; i < literal.length(); i++) {
        char c = literal.charAt(i);
        if (c != '\\') {
            buffer.append(c);
            continue;
        }
        // Step past the backslash; it must not be the last character
        if (i++ == literal.length() - 1) {
            throw new XPathException("Invalid JSON escape: String " + Err.wrap(literal) + " ends in backslash", errorCode);
        }
        char next = literal.charAt(i);
        switch (next) {
            case '"':
                buffer.append('"');
                break;
            case '\\':
                buffer.append('\\');
                break;
            case '/':
                buffer.append('/');
                break;
            case 'b':
                buffer.append('\b');
                break;
            case 'f':
                buffer.append('\f');
                break;
            case 'n':
                buffer.append('\n');
                break;
            case 'r':
                buffer.append('\r');
                break;
            case 't':
                buffer.append('\t');
                break;
            case 'u':
                int code = parseHex4(literal, i + 1);
                if (code >= 0) {
                    buffer.append((char) code);
                    i += 4;
                } else if (liberal) {
                    buffer.append("\\u");
                } else {
                    throw new XPathException("Invalid JSON escape: \\u must be followed by four hex characters", errorCode);
                }
                break;
            default:
                if (liberal) {
                    buffer.append(next);
                } else {
                    String xx = next < 256 ? next + "" : "x" + Integer.toHexString(next);
                    throw new XPathException("Unknown escape sequence \\" + xx, errorCode);
                }
                break;
        }
    }
    return buffer.toString();
}

/**
 * Read exactly four ASCII hexadecimal digits from a string.
 *
 * @param s     the string to read from
 * @param start the index of the first of the four digits
 * @return the value of the four digits (0 to 0xFFFF), or -1 if fewer than four
 * characters remain or any of them is not one of 0-9, a-f, A-F
 */
private static int parseHex4(String s, int start) {
    if (start + 4 > s.length()) {
        return -1;
    }
    int value = 0;
    for (int k = start; k < start + 4; k++) {
        char h = s.charAt(k);
        int digit;
        if (h >= '0' && h <= '9') {
            digit = h - '0';
        } else if (h >= 'a' && h <= 'f') {
            digit = h - 'a' + 10;
        } else if (h >= 'A' && h <= 'F') {
            digit = h - 'A' + 10;
        } else {
            return -1;
        }
        value = (value << 4) | digit;
    }
    return value;
}
/**
* Throw an error
*
* @param message the error message
* @param code the error code to be used
* @throws net.sf.saxon.trans.XPathException always
*/
private static void error(String message, String code)
throws XPathException {
throw new XPathException(message, code);
}
/**
 * Report invalid JSON input by throwing an exception whose message is prefixed
 * with the line number; this method never returns normally.
 *
 * @param message    description of what is wrong with the input
 * @param code       the error code to be used
 * @param lineNumber the line number to include in the message
 * @throws net.sf.saxon.trans.XPathException always
 */
private static void invalidJSON(String message, String code, int lineNumber) throws XPathException {
    final String located = "Invalid JSON input on line " + lineNumber + ": " + message;
    error(located, code);
}
/**
 * The kinds of token delivered by the tokenizer:
 * LSQB and RSQB are the square brackets, LCURLY and RCURLY the curly braces,
 * COLON and COMMA the punctuation characters; STRING_LITERAL is a double-quoted string
 * and NUMERIC_LITERAL a run of number characters; TRUE, FALSE and NULL are the names
 * "true", "false" and "null"; UNQUOTED_STRING is any other unquoted name; EOF marks
 * the end of the input.
 */
@CSharpSimpleEnum
public enum JsonToken {
LSQB, RSQB, LCURLY, RCURLY, STRING_LITERAL, NUMERIC_LITERAL, TRUE,
FALSE, NULL, COLON, COMMA, UNQUOTED_STRING, EOF
}
/**
 * Inner class to do the tokenization. The tokenizer scans the input string from left
 * to right; after each call on {@link #next()} the fields {@code currentToken},
 * {@code currentTokenValue} (for string, numeric and unquoted tokens) and
 * {@code lineNumber} describe the token just read.
 */
private static class JsonTokenizer {
    /** The complete JSON text being tokenized */
    public final String input;
    /** Index in {@code input} of the next character to be read */
    public int position;
    /** One-based line number of the current position, used in error messages */
    public int lineNumber = 1;
    /** The token most recently returned by {@link #next()} */
    public JsonToken currentToken;
    /**
     * Text of the most recent string, numeric or unquoted token. It is not reset for
     * punctuation tokens. For string literals it holds the content between the quotes,
     * with escape sequences still in their escaped form.
     */
    public StringBuilder currentTokenValue = new StringBuilder(64);

    /**
     * Create a tokenizer over the supplied input, skipping a leading byte order mark.
     *
     * @param input the JSON text
     */
    JsonTokenizer(String input) {
        this.input = input;
        this.position = 0;
        // Ignore a leading BOM
        if (!input.isEmpty() && input.charAt(0) == 0xFEFF) {
            position++;
        }
    }

    /**
     * Read the next token and make it the current token.
     *
     * @return the token that was read
     * @throws XPathException if the input cannot be tokenized
     */
    public JsonToken next() throws XPathException {
        currentToken = readToken();
        return currentToken;
    }

    /**
     * Skip whitespace (counting lines as it goes) and read one token.
     *
     * @return the token read, or EOF if only whitespace remains
     * @throws XPathException for an unclosed string, an unescaped control character in a
     *                        string, a malformed unicode escape, or an unexpected character
     */
    private JsonToken readToken() throws XPathException {
        if (position >= input.length()) {
            return JsonToken.EOF;
        }
        boolean breakLoop = false;
        do {
            char c = input.charAt(position);
            switch (c) {
                case '\n':
                case '\r':
                    // Count CR, LF and CRLF each as one line ending: an LF is not counted
                    // when it immediately follows a CR that has already been counted
                    if (!(c == '\n' && position > 0 && input.charAt(position - 1) == '\r')) {
                        lineNumber++;
                    }
                    // drop through
                    CSharp.emitCode("goto case ' ';");
                case ' ':
                case '\t':
                    if (++position >= input.length()) {
                        return JsonToken.EOF;
                    }
                    break;
                default:
                    breakLoop = true;
                    break;
            }
        } while (!breakLoop);
        char ch = input.charAt(position++);
        switch (ch) {
            case '[':
                return JsonToken.LSQB;
            case '{':
                return JsonToken.LCURLY;
            case ']':
                return JsonToken.RSQB;
            case '}':
                return JsonToken.RCURLY;
            case '"':
                currentTokenValue.setLength(0);
                // True when the previous character was an unescaped backslash
                boolean afterBackslash = false;
                while (true) {
                    if (position >= input.length()) {
                        invalidJSON("Unclosed quotes in string literal", ERR_GRAMMAR, lineNumber);
                    }
                    char c = input.charAt(position++);
                    if (c < 32) {
                        invalidJSON("Unescaped control character (x" + Integer.toHexString(c) + ")", ERR_GRAMMAR, lineNumber);
                    }
                    if (afterBackslash && c == 'u') {
                        // Check the escape here; the value is only computed later, when unescaping
                        try {
                            String hex = input.substring(position, position + 4);
                            Integer.parseInt(hex, 16);
                        } catch (Exception e) {
                            invalidJSON("\\u must be followed by four hex characters", ERR_GRAMMAR, lineNumber);
                        }
                    }
                    if (c == '"' && !afterBackslash) {
                        break;
                    } else {
                        currentTokenValue.append(c);
                        afterBackslash = c == '\\' && !afterBackslash;
                    }
                }
                return JsonToken.STRING_LITERAL;
            case ':':
                return JsonToken.COLON;
            case ',':
                return JsonToken.COMMA;
            case '-':
            case '+': // for liberal parsing
            case '.': // for liberal parsing
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                // Collect every character that can appear in a number; the lexical form
                // is not checked here
                currentTokenValue.setLength(0);
                currentTokenValue.append(ch);
                if (position < input.length()) { // We could be in ECMA mode when there is a single digit
                    while (true) {
                        char c = input.charAt(position);
                        if ((c >= '0' && c <= '9') || c == '-' || c == '+' || c == '.' || c == 'e' || c == 'E') {
                            currentTokenValue.append(c);
                            if (++position >= input.length()) {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                }
                return JsonToken.NUMERIC_LITERAL;
            default: {
                // Allow unquoted strings in liberal mode
                if (NameChecker.isNCNameChar(ch)) {
                    currentTokenValue.setLength(0);
                    currentTokenValue.append(ch);
                    while (position < input.length()) {
                        char c = input.charAt(position);
                        if (NameChecker.isNCNameChar(c)) {
                            currentTokenValue.append(c);
                            position++;
                        } else {
                            break;
                        }
                    }
                    String val = currentTokenValue.toString();
                    switch (val) {
                        case "true":
                            return JsonToken.TRUE;
                        case "false":
                            return JsonToken.FALSE;
                        case "null":
                            return JsonToken.NULL;
                        default:
                            return JsonToken.UNQUOTED_STRING;
                    }
                } else {
                    // Step back so that the reported position is that of the offending character
                    char c = input.charAt(--position);
                    String s = UTF16CharacterSet.isSurrogate(c) ? "" : " '" + c + "'";
                    invalidJSON("Unexpected character" + s + " (\\u" +
                            Integer.toHexString(c) + ") at position " + position, ERR_GRAMMAR, lineNumber);
                    return JsonToken.EOF;
                }
            }
        }
    }
}
/**
 * Produce a displayable representation of a token, for use in error messages.
 *
 * @param token             the kind of token
 * @param currentTokenValue the text of the token, used only for string and numeric literals
 * @return the punctuation character or keyword for fixed tokens; the token value labelled
 * as a string or number for literals; the empty string for EOF; and otherwise the
 * token name enclosed in angle brackets
 */
public static String showToken(JsonToken token, String currentTokenValue) {
    final String display;
    switch (token) {
        // Punctuation
        case LSQB:
            display = "[";
            break;
        case RSQB:
            display = "]";
            break;
        case LCURLY:
            display = "{";
            break;
        case RCURLY:
            display = "}";
            break;
        case COLON:
            display = ":";
            break;
        case COMMA:
            display = ",";
            break;
        // Literals carrying a value
        case STRING_LITERAL:
            display = "string (\"" + currentTokenValue + "\")";
            break;
        case NUMERIC_LITERAL:
            display = "number (" + currentTokenValue + ")";
            break;
        // Keywords
        case TRUE:
            display = "true";
            break;
        case FALSE:
            display = "false";
            break;
        case NULL:
            display = "null";
            break;
        case EOF:
            display = "";
            break;
        default:
            display = "<" + token + ">";
            break;
    }
    return display;
}
/**
 * Set the function used to convert JSON numeric literals, taken from the
 * "number-parser" entry of the options map. If the option is absent this method
 * does nothing. The function is only installed once it has passed all checks, so a
 * rejected value leaves any previously set number parser unchanged.
 *
 * @param options the supplied options map (option name to option value)
 * @param context XPath evaluation context, used to obtain the type hierarchy
 * @throws XPathException if the option value is not a function, does not have arity 1,
 *                        or does not match the required function type
 */
public void setNumberParser(Map options, XPathContext context) throws XPathException {
    // Explicit cast: the map is declared without type arguments in this signature
    Sequence val = (Sequence) options.get("number-parser");
    if (val == null) {
        return;
    }
    Item fn = val.head();
    if (!(fn instanceof Function)) {
        throw new XPathException("Value of option 'number-parser' is not a function", ERR_OPTIONS);
    }
    Function candidate = (Function) fn;
    if (candidate.getArity() != 1) {
        throw new XPathException("Number-parser function must have arity=1", ERR_OPTIONS);
    }
    SpecificFunctionType required = new SpecificFunctionType(
            new SequenceType[]{SequenceType.SINGLE_STRING}, SequenceType.SINGLE_ATOMIC);
    if (!required.matches(candidate, context.getConfiguration().getTypeHierarchy())) {
        throw new XPathException("Number-parser function does not match the required type", ERR_OPTIONS);
    }
    numberParser = candidate;
}
}
// Copyright (c) 2018-2022 Saxonica Limited