com.google.json.JsonSanitizer Maven / Gradle / Ivy
Show all versions of json-sanitizer Show documentation
// Copyright (C) 2012 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.json;
import java.math.BigInteger;
/**
* Given JSON-like content, converts it to valid JSON.
* This can be attached at either end of a data-pipeline to help satisfy
* Postel's principle:
*
* be conservative in what you do, be liberal in what you accept from others
*
*
* Applied to JSON-like content from others, it will produce well-formed JSON
* that should satisfy any parser you use.
*
* Applied to your output before you send, it will coerce minor mistakes in
* encoding and make it easier to embed your JSON in HTML and XML.
*
*
Input
* The sanitizer takes JSON like content, and interprets it as JS eval would.
* Specifically, it deals with these non-standard constructs.
*
* - {@code '...'} Single quoted strings are converted to JSON strings.
*
- {@code \xAB} Hex escapes are converted to JSON unicode escapes.
*
- {@code \012} Octal escapes are converted to JSON unicode escapes.
*
- {@code 0xAB} Hex integer literals are converted to JSON decimal numbers.
*
- {@code 012} Octal integer literals are converted to JSON decimal numbers.
*
- {@code +.5} Decimal numbers are coerced to JSON's stricter format.
*
- {@code [0,,2]} Elisions in arrays are filled with {@code null}.
*
- {@code [1,2,3,]} Trailing commas are removed.
*
{foo:"bar"}
Unquoted property names are quoted.
* //comments
JS style line and block comments are removed.
* (...)
Grouping parentheses are removed.
*
*
* The sanitizer fixes missing punctuation, end quotes, and mismatched or
* missing close brackets. If an input contains only white-space then the valid
* JSON string {@code null} is substituted.
*
* Output
* The output is well-formed JSON as defined by
* RFC 4627.
* The output satisfies three additional properties:
*
* - The output will not contain the substring (case-insensitively)
* {@code "The output will not contain the substring {@code "]]>"} so can be
* embedded inside an XML CDATA section without further encoding.
* - The output is a valid Javascript expression, so can be parsed by
* Javascript's
eval
builtin (after being wrapped in parentheses)
* or by JSON.parse
.
* Specifically, the output will not contain any string literals with embedded
* JS newlines (U+2028 Paragraph separator or U+2029 Line separator).
* - The output contains only valid Unicode scalar values
* (no isolated UTF-16 surrogates) that are
* allowed in XML unescaped.
*
*
* Security
* Since the output is well-formed JSON, passing it to eval
will
* have no side-effects and no free variables, so is neither a code-injection
* vector, nor a vector for exfiltration of secrets.
*
* This library only ensures that the JSON string → Javascript object
* phase has no side effects and resolves no free variables, and cannot control
* how other client side code later interprets the resulting Javascript object.
* So if client-side code takes a part of the parsed data that is controlled by
* an attacker and passes it back through a powerful interpreter like
* {@code eval} or {@code innerHTML} then that client-side code might suffer
* unintended side-effects.
*
*
Efficiency
* The sanitize method will return the input string without allocating a new
* buffer when the input is already valid JSON that satisfies the properties
* above. Thus, if used on input that is usually well formed, it has minimal
* memory overhead.
* The sanitize method takes O(n) time where n is the length in UTF-16
* code-units.
*/
public final class JsonSanitizer {
/** The default for the maximumNestingDepth constructor parameter. */
public static final int DEFAULT_NESTING_DEPTH = 64;
/** The maximum value for the maximumNestingDepth constructor parameter. */
public static final int MAXIMUM_NESTING_DEPTH = 4096;
/**
* Given JSON-like content, produces a string of JSON that is safe to embed,
* safe to pass to JavaScript's {@code eval} operator.
*
* @param jsonish JSON-like content.
* @return embeddable JSON
*/
public static String sanitize(String jsonish) {
return sanitize(jsonish, DEFAULT_NESTING_DEPTH);
}
/**
* Same as {@link JsonSanitizer#sanitize(String)}, but allows to set a custom
* maximum nesting depth.
*
* @param jsonish JSON-like content.
* @param maximumNestingDepth maximum nesting depth.
* @return embeddable JSON
*/
public static String sanitize(String jsonish, int maximumNestingDepth) {
JsonSanitizer s = new JsonSanitizer(jsonish, maximumNestingDepth);
s.sanitize();
return s.toString();
}
/**
* Describes where we are in a state machine that consists of transitions on
* complete values, colons, commas, and brackets.
*/
private enum State {
/**
* Immediately after '[' and
* {@link #BEFORE_ELEMENT before the first element}.
*/
START_ARRAY,
/** Before a JSON value in an array or at the top level. */
BEFORE_ELEMENT,
/**
* After a JSON value in an array or at the top level, and before any
* following comma or close bracket.
*/
AFTER_ELEMENT,
/** Immediately after '{' and {@link #BEFORE_KEY before the first key}. */
START_MAP,
/** Before a key in a key-value map. */
BEFORE_KEY,
/** After a key in a key-value map but before the required colon. */
AFTER_KEY,
/** Before a value in a key-value map. */
BEFORE_VALUE,
/**
* After a value in a key-value map but before any following comma or
* close bracket.
*/
AFTER_VALUE,
;
}
/**
* The maximum nesting depth. According to RFC4627 it is implementation-specific.
*/
private final int maximumNestingDepth;
private final String jsonish;
/**
* The number of brackets that have been entered and not subsequently exited.
* Also, the length of the used prefix of {@link #isMap}.
*/
private int bracketDepth;
/**
* {@code isMap[i]} when {@code 0 <= i && i < bracketDepth} is true iff
* the i-th open bracket was a '{'
, not a '['
.
*/
private boolean[] isMap;
/**
* If non-null, then contains the sanitized form of
* {@code jsonish.substring(0, cleaned)}.
* If {@code null}, then no unclean constructs have been found in
* {@code jsonish} yet.
*/
private StringBuilder sanitizedJson;
/**
* The length of the prefix of {@link #jsonish} that has been written onto
* {@link #sanitizedJson}.
*/
private int cleaned;
private static final boolean SUPER_VERBOSE_AND_SLOW_LOGGING = false;
JsonSanitizer(String jsonish) {
this(jsonish, DEFAULT_NESTING_DEPTH);
}
JsonSanitizer(String jsonish, int maximumNestingDepth) {
this.maximumNestingDepth = Math.min(Math.max(1, maximumNestingDepth),MAXIMUM_NESTING_DEPTH);
if (SUPER_VERBOSE_AND_SLOW_LOGGING) {
System.err.println("\n" + jsonish + "\n========");
}
this.jsonish = jsonish != null ? jsonish : "null";
}
int getMaximumNestingDepth() {
return this.maximumNestingDepth;
}
void sanitize() {
// Return to consistent state.
bracketDepth = cleaned = 0;
sanitizedJson = null;
State state = State.START_ARRAY;
int n = jsonish.length();
// Walk over each token and either validate it, by just advancing i and
// computing the next state, or manipulate cleaned&sanitizedJson so that
// sanitizedJson contains the sanitized equivalent of
// jsonish.substring(0, cleaned).
token_loop:
for (int i = 0; i < n; ++i) {
try {
char ch = jsonish.charAt(i);
if (SUPER_VERBOSE_AND_SLOW_LOGGING) {
String sanitizedJsonStr =
(sanitizedJson == null ? "" : sanitizedJson)
+ jsonish.substring(cleaned, i);
System.err.println("i=" + i + ", ch=" + ch + ", state=" + state
+ ", sanitized=" + sanitizedJsonStr);
}
switch (ch) {
case '\t': case '\n': case '\r': case ' ':
break;
case '"': case '\'':
state = requireValueState(i, state, true);
int strEnd = endOfQuotedString(jsonish, i);
sanitizeString(i, strEnd);
i = strEnd - 1;
break;
case '(': case ')':
// Often JSON-like content which is meant for use by eval is
// wrapped in parentheses so that the JS parser treats contained
// curly brackets as part of an object constructor instead of a
// block statement.
// We elide these grouping parentheses to ensure valid JSON.
elide(i, i + 1);
break;
case '{': case '[':
state = requireValueState(i, state, false);
if (isMap == null) {
isMap = new boolean[maximumNestingDepth];
}
boolean map = ch == '{';
isMap[bracketDepth] = map;
++bracketDepth;
state = map ? State.START_MAP : State.START_ARRAY;
break;
case '}': case ']':
if (bracketDepth == 0) {
elide(i, jsonish.length());
break token_loop;
}
// Strip trailing comma to convert {"a":0,} -> {"a":0}
// and [1,2,3,] -> [1,2,3,]
switch (state) {
case BEFORE_VALUE:
insert(i, "null");
break;
case BEFORE_ELEMENT: case BEFORE_KEY:
elideTrailingComma(i);
break;
case AFTER_KEY:
insert(i, ":null");
break;
case START_MAP: case START_ARRAY:
case AFTER_ELEMENT: case AFTER_VALUE: break;
}
--bracketDepth;
char closeBracket = isMap[bracketDepth] ? '}' : ']';
if (ch != closeBracket) {
replace(i, i + 1, closeBracket);
}
state = bracketDepth == 0 || !isMap[bracketDepth - 1]
? State.AFTER_ELEMENT : State.AFTER_VALUE;
break;
case ',':
if (bracketDepth == 0) { throw UNBRACKETED_COMMA; }
// Convert comma elisions like [1,,3] to [1,null,3].
// [1,,3] in JS is an array that has no element at index 1
// according to the "in" operator so accessing index 1 will
// yield the special value "undefined" which is equivalent to
// JS's "null" value according to "==".
switch (state) {
// Normal
case AFTER_ELEMENT:
state = State.BEFORE_ELEMENT;
break;
case AFTER_VALUE:
state = State.BEFORE_KEY;
break;
// Array elision.
case START_ARRAY: case BEFORE_ELEMENT:
insert(i, "null");
state = State.BEFORE_ELEMENT;
break;
// Ignore
case START_MAP: case BEFORE_KEY:
case AFTER_KEY:
elide(i, i + 1);
break;
// Supply missing value.
case BEFORE_VALUE:
insert(i, "null");
state = State.BEFORE_KEY;
break;
}
break;
case ':':
if (state == State.AFTER_KEY) {
state = State.BEFORE_VALUE;
} else {
elide(i, i + 1);
}
break;
case '/':
// Skip over JS-style comments since people like inserting them into
// data files and getting huffy with Crockford when he says no to
// versioning JSON to allow ignorable tokens.
int end = i + 1;
if (i + 1 < n) {
switch (jsonish.charAt(i + 1)) {
case '/':
end = n; // Worst case.
for (int j = i + 2; j < n; ++j) {
char cch = jsonish.charAt(j);
if (cch == '\n' || cch == '\r'
|| cch == '\u2028' || cch == '\u2029') {
end = j + 1;
break;
}
}
break;
case '*':
end = n;
if (i + 3 < n) {
for (int j = i + 2;
(j = jsonish.indexOf('/', j + 1)) >= 0;) {
if (jsonish.charAt(j - 1) == '*') {
end = j + 1;
break;
}
}
}
break;
}
}
elide(i, end);
i = end - 1;
break;
default:
// Three kinds of other values can occur.
// 1. Numbers
// 2. Keyword values ("false", "null", "true")
// 3. Unquoted JS property names as in the JS expression
// ({ foo: "bar"})
// which is equivalent to the JSON
// { "foo": "bar" }
// 4. Cruft tokens like BOMs.
// Look for a run of '.', [0-9], [a-zA-Z_$], [+-] which subsumes
// all the above without including any JSON special characters
// outside keyword and number.
int runEnd;
for (runEnd = i; runEnd < n; ++runEnd) {
char tch = jsonish.charAt(runEnd);
if (('a' <= tch && tch <= 'z') || ('0' <= tch && tch <= '9')
|| tch == '+' || tch == '-' || tch == '.'
|| ('A' <= tch && tch <= 'Z') || tch == '_' || tch == '$') {
continue;
}
break;
}
if (runEnd == i) {
elide(i, i + 1);
break;
}
state = requireValueState(i, state, true);
boolean isNumber = ('0' <= ch && ch <= '9')
|| ch == '.' || ch == '+' || ch == '-';
boolean isKeyword = !isNumber && isKeyword(i, runEnd);
if (!(isNumber || isKeyword)) {
// We're going to have to quote the output. Further expand to
// include more of an unquoted token in a string.
for (; runEnd < n; ++runEnd) {
if (isJsonSpecialChar(runEnd)) {
break;
}
}
if (runEnd < n && jsonish.charAt(runEnd) == '"') {
++runEnd;
}
}
if (state == State.AFTER_KEY) {
// We need to quote whatever we have since it is used as a
// property name in a map and only quoted strings can be used that
// way in JSON.
insert(i, '"');
if (isNumber) {
// By JS rules,
// { .5e-1: "bar" }
// is the same as
// { "0.05": "bar" }
// because a number literal is converted to its string form
// before being used as a property name.
canonicalizeNumber(i, runEnd);
// We intentionally ignore the return value of canonicalize.
// Uncanonicalizable numbers just get put straight through as
// string values.
insert(runEnd, '"');
} else {
sanitizeString(i, runEnd);
}
} else {
if (isNumber) {
// Convert hex and octal constants to decimal and ensure that
// integer and fraction portions are not empty.
normalizeNumber(i, runEnd);
} else if (!isKeyword) {
// Treat as an unquoted string literal.
insert(i, '"');
sanitizeString(i, runEnd);
}
}
i = runEnd - 1;
}
} catch (@SuppressWarnings("unused") UnbracketedComma e) {
elide(i, jsonish.length());
break;
}
}
if (state == State.START_ARRAY && bracketDepth == 0) {
// No tokens. Only whitespace
insert(n, "null");
state = State.AFTER_ELEMENT;
}
if (SUPER_VERBOSE_AND_SLOW_LOGGING) {
System.err.println(
"state=" + state + ", sanitizedJson=" + sanitizedJson
+ ", cleaned=" + cleaned + ", bracketDepth=" + bracketDepth);
}
if ((sanitizedJson != null && sanitizedJson.length() != 0)
|| cleaned != 0 || bracketDepth != 0) {
if (sanitizedJson == null) {
sanitizedJson = new StringBuilder(n + bracketDepth);
}
sanitizedJson.append(jsonish, cleaned, n);
cleaned = n;
switch (state) {
case BEFORE_ELEMENT: case BEFORE_KEY:
elideTrailingComma(n);
break;
case AFTER_KEY:
sanitizedJson.append(":null");
break;
case BEFORE_VALUE:
sanitizedJson.append("null");
break;
default: break;
}
// Insert brackets to close unclosed content.
while (bracketDepth != 0) {
sanitizedJson.append(isMap[--bracketDepth] ? '}' : ']');
}
}
}
/**
* Ensures that the output corresponding to {@code jsonish[start:end]} is a
* valid JSON string that has the same meaning when parsed by Javascript
* {@code eval}.
*
* - Making sure that it is fully quoted with double-quotes.
*
- Escaping any Javascript newlines : CR, LF, U+2028, U+2029
*
- Escaping HTML special characters to allow it to be safely embedded
* in HTML {@code and ]]> in string
// literals so that the output can be embedded in HTML script elements
// and in XML CDATA sections without affecting the parser state.
// References:
// https://www.w3.org/TR/html53/semantics-scripting.html#restrictions-for-contents-of-script-elements
// https://www.w3.org/TR/html53/syntax.html#script-data-escaped-state
// https://www.w3.org/TR/html53/syntax.html#script-data-double-escaped-state
// https://www.w3.org/TR/xml/#sec-cdata-sect
case '<': {
// Disallow , which lets the HTML parser switch out of the "script
// data escaped" or "script data double escaped" state.
if ((i - 2) >= start) {
int lb = i - 1;
int cm1AndDelta = unescapedCharRev(jsonish, lb);
char cm1 = (char) cm1AndDelta;
lb -= cm1AndDelta >>> 16;
int cm2AndDelta = unescapedCharRev(jsonish, lb);
char cm2 = (char) cm2AndDelta;
if ('-' == cm2 && '-' == cm1) {
replace(i, i + 1, "\\u003e"); // Escaped >
}
}
break;
case ']':
if (i + 2 < end) {
int la = i + 1;
long c1AndDelta = unescapedChar(jsonish, la);
char c1 = (char) c1AndDelta;
la += c1AndDelta >>> 16;
long c2AndEnd = unescapedChar(jsonish, la);
char c2 = (char) c2AndEnd;
if (']' == c1 && '>' == c2) {
replace(i, i + 1, "\\u005d");
}
}
break;
// Normalize escape sequences.
case '\\':
if (i + 1 == end) {
elide(i, i + 1);
break;
}
char sch = jsonish.charAt(i + 1);
switch (sch) {
case 'b': case 'f': case 'n': case 'r': case 't': case '\\':
case '/': case '"':
++i;
break;
case 'v': // Recognized by JS but not by JSON.
replace(i, i + 2, "\\u0008");
++i;
break;
case 'x':
if (i + 4 < end && isHexAt(i+2) && isHexAt(i+3)) {
replace(i, i + 2, "\\u00"); // \xab -> \u00ab
i += 3;
break;
}
elide(i, i + 1);
break;
case 'u':
if (i + 6 < end && isHexAt(i + 2) && isHexAt(i + 3)
&& isHexAt(i + 4) && isHexAt(i + 5)) {
i += 5;
break;
}
elide(i, i + 1);
break;
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7': {
int octalStart = i + 1;
int octalEnd = octalStart;
++octalEnd;
if (octalEnd < end && isOctAt(octalEnd)) {
++octalEnd;
if (sch <= '3' && octalEnd < end && isOctAt(octalEnd)) {
++octalEnd;
}
}
int value = 0;
for (int j = octalStart; j < octalEnd; ++j) {
char digit = jsonish.charAt(j);
value = (value << 3) | (digit - '0');
}
replace(octalStart, octalEnd, "u00");
appendHex(value, 2);
i = octalEnd - 1;
break;
}
default:
// Literal char that is recognized by JS but not by JSON.
// "\-" is valid JS but not valid JSON.
elide(i, i + 1);
break;
}
break;
default:
// Escape all control code-points and isolated surrogates which are
// not embeddable in XML.
// http://www.w3.org/TR/xml/#charsets says
// Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
// | [#x10000-#x10FFFF]
if (ch < 0x20) {
// Proceed to hex-escape below since control characters are
// disallowed by ECMA-404 which governs JavaScript's `JSON.parse`.
// Common ones like CR, LF, and TAB are given short escape sequences above.
} else if (ch < 0xd800) { // Not a surrogate.
continue;
} else if (ch < 0xe000) { // A surrogate
if (Character.isHighSurrogate(ch) && i+1 < end
&& Character.isLowSurrogate(jsonish.charAt(i+1))) {
++i; // Skip over low surrogate since we have already vetted it.
continue;
}
} else if (ch <= 0xfffd) { // Not one of the 0xff.. controls.
continue;
}
replace(i, i + 1, "\\u");
for (int j = 4; --j >= 0;) {
sanitizedJson.append(HEX_DIGITS[(ch >>> (j << 2)) & 0xf]);
}
break;
}
}
if (!closed) { insert(end, '"'); }
}
private State requireValueState(int pos, State state, boolean canBeKey)
throws UnbracketedComma {
switch (state) {
case START_MAP: case BEFORE_KEY:
if (canBeKey) {
return State.AFTER_KEY;
} else {
insert(pos, "\"\":");
return State.AFTER_VALUE;
}
case AFTER_KEY:
insert(pos, ':');
return State.AFTER_VALUE;
case BEFORE_VALUE:
return State.AFTER_VALUE;
case AFTER_VALUE:
if (canBeKey) {
insert(pos, ',');
return State.AFTER_KEY;
} else {
insert(pos, ",\"\":");
return State.AFTER_VALUE;
}
case START_ARRAY: case BEFORE_ELEMENT:
return State.AFTER_ELEMENT;
case AFTER_ELEMENT:
if (bracketDepth == 0) { throw UNBRACKETED_COMMA; }
insert(pos, ',');
return State.AFTER_ELEMENT;
}
throw new AssertionError();
}
private void insert(int pos, char ch) {
replace(pos, pos, ch);
}
private void insert(int pos, String s) {
replace(pos, pos, s);
}
private void elide(int start, int end) {
if (sanitizedJson == null) {
sanitizedJson = new StringBuilder(jsonish.length() + 16);
}
sanitizedJson.append(jsonish, cleaned, start);
cleaned = end;
}
private void replace(int start, int end, char ch) {
elide(start, end);
sanitizedJson.append(ch);
}
private void replace(int start, int end, String s) {
elide(start, end);
sanitizedJson.append(s);
}
/**
* The position past the last character within the quotes of the quoted
* string starting at {@code s.charAt(start)}. Does not assume that the
* quoted string is properly closed.
*/
private static int endOfQuotedString(String s, int start) {
char quote = s.charAt(start);
for (int i = start; (i = s.indexOf(quote, i + 1)) >= 0;) {
// If there are an even number of preceding backslashes then this is
// the end of the string.
int slashRunStart = i;
while (slashRunStart > start && s.charAt(slashRunStart - 1) == '\\') {
--slashRunStart;
}
if (((i - slashRunStart) & 1) == 0) {
return i + 1;
}
}
return s.length();
}
private void elideTrailingComma(int closeBracketPos) {
// The content before closeBracketPos is stored in two places.
// 1. sanitizedJson
// 2. jsonish.substring(cleaned, closeBracketPos)
// We walk over whitespace characters in both right-to-left looking for a
// comma.
for (int i = closeBracketPos; --i >= cleaned;) {
switch (jsonish.charAt(i)) {
case '\t': case '\n': case '\r': case ' ':
continue;
case ',':
elide(i, i+1);
return;
default: throw new AssertionError("" + jsonish.charAt(i));
}
}
assert sanitizedJson != null;
for (int i = sanitizedJson.length(); --i >= 0;) {
switch (sanitizedJson.charAt(i)) {
case '\t': case '\n': case '\r': case ' ':
continue;
case ',':
sanitizedJson.setLength(i);
return;
default: throw new AssertionError("" + sanitizedJson.charAt(i));
}
}
throw new AssertionError(
"Trailing comma not found in " + jsonish + " or " + sanitizedJson);
}
/**
* Ensures that the given run of characters is a valid JSON number. This is
* less aggressive than {@link #canonicalizeNumber} since it can be called
* on inputs that are valid JSON so is on the fast path.
*
* JS numbers differ from JSON numbers in several ways:
* - They can have '+' as a sign prefix: +1
*
- They allow a 0x... hexadecimal form: 0xA4
*
- They allow a 0... octal form: 012
*
- The integer part can be empty: .5
*
- The fraction part can be empty: 1.
*
*/
private void normalizeNumber(int start, int end) {
int pos = start;
// Sign
if (pos < end) {
switch (jsonish.charAt(pos)) {
case '+':
elide(pos, pos + 1);
++pos;
break;
case '-':
++pos;
break;
default:
break;
}
}
// Integer part
int intEnd = endOfDigitRun(pos, end);
if (pos == intEnd) { // No empty integer parts allowed in JSON.
insert(pos, '0');
} else if ('0' == jsonish.charAt(pos)) {
boolean reencoded = false;
int maxDigVal = 0; // The value of the max digit
int probableBase = 10; // The base suggested by the prefix
int firstDigitIndex = -1;
if (intEnd - pos == 1 && intEnd < end
&& 'x' == (jsonish.charAt(intEnd) | 32)) { // Recode hex.
probableBase = 16;
firstDigitIndex = intEnd + 1;
for (intEnd = intEnd + 1; intEnd < end; ++intEnd) {
char ch = jsonish.charAt(intEnd);
int digVal;
if ('0' <= ch && ch <= '9') {
digVal = ch - '0';
} else {
ch |= 32;
if ('a' <= ch && ch <= 'f') {
digVal = ch - ('a' - 10);
} else {
break;
}
}
maxDigVal = Math.max(digVal, maxDigVal);
}
reencoded = true;
} else if (intEnd - pos > 1) { // Recode octal.
probableBase = 8;
firstDigitIndex = pos;
for (int i = pos; i < intEnd; ++i) {
int digVal = jsonish.charAt(i) - '0';
if (digVal < 0) {
break;
}
maxDigVal = Math.max(digVal, maxDigVal);
}
reencoded = true;
}
if (reencoded) {
// Avoid multiple signs.
// Putting out the underflowed value is the least bad option.
elide(pos, intEnd);
String digits = jsonish.substring(firstDigitIndex, intEnd);
int nDigits = digits.length();
int base = probableBase > maxDigVal ? probableBase : maxDigVal > 10 ? 16 : 10;
if (nDigits == 0) {
sanitizedJson.append('0');
} else if (DIGITS_BY_BASE_THAT_FIT_IN_63B[base] >= nDigits) {
long value = Long.parseLong(digits, base);
sanitizedJson.append(value);
} else {
// If there are lots of digits, we need to reencode using a BigInteger
BigInteger value = new BigInteger(digits, base);
sanitizedJson.append(value);
}
}
}
pos = intEnd;
// Optional fraction.
if (pos < end && jsonish.charAt(pos) == '.') {
++pos;
int fractionEnd = endOfDigitRun(pos, end);
if (fractionEnd == pos) {
insert(pos, '0');
}
// JS eval will discard digits after 24(?) but will not treat them as a
// syntax error, and JSON allows arbitrary length fractions.
pos = fractionEnd;
}
// Optional exponent.
if (pos < end && 'e' == (jsonish.charAt(pos) | 32)) {
++pos;
if (pos < end) {
switch (jsonish.charAt(pos)) {
// JSON allows explicit + in exponent but not for number as a whole.
case '+': case '-': ++pos; break;
default: break;
}
}
// JSON allows leading zeros on exponent part.
int expEnd = endOfDigitRun(pos, end);
if (expEnd == pos) {
insert(pos, '0');
}
pos = expEnd;
}
if (pos != end) {
elide(pos, end);
}
}
/**
* Converts a run of characters that form a JS number to its canonical form
* which happens to also be a valid JSON number.
* @return true when the number could be canonicalized.
*/
private boolean canonicalizeNumber(int start, int end) {
elide(start, start);
int sanStart = sanitizedJson.length();
normalizeNumber(start, end);
// Ensure that the number is on the output buffer. Since this method is
// only called when we are quoting a number that appears where a property
// name is expected, we can force the sanitized form to contain it without
// affecting the fast-track for already valid inputs.
elide(end, end);
int sanEnd = sanitizedJson.length();
return canonicalizeNumber(sanitizedJson, sanStart, sanEnd);
}
/**
* @param sanStart the start (inclusive) of the number on sanitizedJson.
* @param sanEnd the end (exclusive) of the number on sanitizedJson.
* @return true when the number could be canonicalized.
*/
private static boolean canonicalizeNumber(
StringBuilder sanitizedJson, int sanStart, int sanEnd) {
// Now we perform several steps.
// 1. Convert from scientific notation to regular or vice-versa based on
// normalized exponent.
// 2. Remove trailing zeroes from the fraction and truncate it to 24 digits.
// 3. Elide the fraction entirely if it is ".0".
// 4. Convert any 'E' that separates the exponent to lower-case.
// 5. Elide any minus sign on a zero value.
// to convert the number to its canonical JS string form.
// Figure out where the parts of the number start and end.
int intStart, intEnd, fractionStart, fractionEnd, expStart, expEnd;
intStart = sanStart + (sanitizedJson.charAt(sanStart) == '-' ? 1 : 0);
for (intEnd = intStart; intEnd < sanEnd; ++intEnd) {
char ch = sanitizedJson.charAt(intEnd);
if (!('0' <= ch && ch <= '9')) { break; }
}
if (intEnd == sanEnd || '.' != sanitizedJson.charAt(intEnd)) {
fractionStart = fractionEnd = intEnd;
} else {
fractionStart = intEnd + 1;
for (fractionEnd = fractionStart; fractionEnd < sanEnd; ++fractionEnd) {
char ch = sanitizedJson.charAt(fractionEnd);
if (!('0' <= ch && ch <= '9')) { break; }
}
}
if (fractionEnd == sanEnd) {
expStart = expEnd = sanEnd;
} else {
assert 'e' == (sanitizedJson.charAt(fractionEnd) | 32);
expStart = fractionEnd + 1;
if (sanitizedJson.charAt(expStart) == '+') { ++expStart; }
expEnd = sanEnd;
}
assert
intStart <= intEnd
&& intEnd <= fractionStart
&& fractionStart <= fractionEnd
&& fractionEnd <= expStart
&& expStart <= expEnd;
int exp;
if (expEnd == expStart) {
exp = 0;
} else {
try {
exp = Integer.parseInt(sanitizedJson.substring(expStart, expEnd), 10);
} catch (@SuppressWarnings("unused") NumberFormatException ex) {
// The exponent is out of the range of representable ints.
// JSON does not place limits on the range of representable numbers but
// nor does it allow bare numbers as keys.
return false;
}
}
// Numbered Comments below come from the EcmaScript 5 language specification
// section 9.8.1 : ToString Applied to the Number Type
// http://es5.github.com/#x9.8.1
// 5. let n, k, and s be integers such that k >= 1, 10k-1 <= s < 10k, the
// Number value for s * 10n-k is m, and k is as small as possible.
// Note that k is the number of digits in the decimal representation of s,
// that s is not divisible by 10, and that the least significant digit of s
// is not necessarily uniquely determined by these criteria.
int n = exp; // Exponent
// s, the string of decimal digits in the representation of m are stored in
// sanitizedJson.substring(intStart).
// k, the number of digits in s is computed later.
// Leave only the number representation on the output buffer after intStart.
// This leaves any sign on the digit per
// 3. If m is less than zero, return the String concatenation of the
// String "-" and ToString(-m).
boolean sawDecimal = false;
boolean zero = true;
int digitOutPos = intStart;
for (int i = intStart, nZeroesPending = 0; i < fractionEnd; ++i) {
char ch = sanitizedJson.charAt(i);
if (ch == '.') {
sawDecimal = true;
if (zero) { nZeroesPending = 0; }
continue;
}
char digit = ch;
if ((!zero || digit != '0') && !sawDecimal) { ++n; }
if (digit == '0') {
// Keep track of runs of zeros so that we can take them into account
// if we later see a non-zero digit.
++nZeroesPending;
} else {
if (zero) { // First non-zero digit.
// Discard runs of zeroes at the front of the integer part, but
// any after the decimal point factor into the exponent, n.
if (sawDecimal) {
n -= nZeroesPending;
}
nZeroesPending = 0;
}
zero = false;
while (nZeroesPending != 0 || digit != 0) {
char vdigit;
if (nZeroesPending == 0) {
vdigit = digit;
digit = (char) 0;
} else {
vdigit = '0';
--nZeroesPending;
}
// TODO: limit s to 21 digits?
sanitizedJson.setCharAt(digitOutPos++, vdigit);
}
}
}
sanitizedJson.setLength(digitOutPos);
// Number of digits in decimal representation of s.
int k = digitOutPos - intStart;
// Now we have computed n, k, and s as defined above. Time to add decimal
// points, exponents, and leading zeroes per the rest of the JS number
// formatting specification.
if (zero) { // There are no non-zero decimal digits.
// 2. If m is +0 or -0, return the String "0".
sanitizedJson.setLength(sanStart); // Elide any sign.
sanitizedJson.append('0');
return true;
}
// 6. If k <= n <= 21, return the String consisting of the k digits of the
// decimal representation of s (in order, with no leading zeroes),
// followed by n-k occurrences of the character '0'.
if (k <= n && n <= 21) {
for (int i = k; i < n; ++i) {
sanitizedJson.append('0');
}
// 7. If 0 < n <= 21, return the String consisting of the most significant n
// digits of the decimal representation of s, followed by a decimal point
// '.', followed by the remaining k-n digits of the decimal representation
// of s.
} else if (0 < n && n <= 21) {
sanitizedJson.insert(intStart + n, '.');
// 8. If -6 < n <= 0, return the String consisting of the character '0',
// followed by a decimal point '.', followed by -n occurrences of the
// character '0', followed by the k digits of the decimal representation of
// s.
} else if (-6 < n && n <= 0) {
sanitizedJson.insert(intStart, "0.000000".substring(0, 2 - n));
} else {
// 9. Otherwise, if k = 1, return the String consisting of the single
// digit of s, followed by lowercase character 'e', followed by a plus
// sign '+' or minus sign '-' according to whether n-1 is positive or
// negative, followed by the decimal representation of the integer
// abs(n-1) (with no leading zeros).
if (k == 1) {
// Sole digit already on sanitizedJson.
// 10. Return the String consisting of the most significant digit of the
// decimal representation of s, followed by a decimal point '.', followed
// by the remaining k-1 digits of the decimal representation of s,
// followed by the lowercase character 'e', followed by a plus sign '+'
// or minus sign '-' according to whether n-1 is positive or negative,
// followed by the decimal representation of the integer abs(n-1) (with
// no leading zeros).
} else {
sanitizedJson.insert(intStart + 1, '.');
}
int nLess1 = n-1;
sanitizedJson.append('e').append(nLess1 < 0 ? '-' : '+')
.append(Math.abs(nLess1));
}
return true;
}
private boolean isKeyword(int start, int end) {
int n = end - start;
if (n == 5) {
return "false".regionMatches(0, jsonish, start, n);
} else if (n == 4) {
return "null".regionMatches(0, jsonish, start, n)
|| "true".regionMatches(0, jsonish, start, n);
}
return false;
}
private boolean isOctAt(int i) {
return isOct(jsonish.charAt(i));
}
private static boolean isOct(char ch) {
return '0' <= ch && ch <= '7';
}
private boolean isHexAt(int i) {
return isHex(jsonish.charAt(i));
}
private static boolean isHex(char ch) {
if ('0' <= ch && ch <= '9') { return true; }
int lch = ch | 32;
return 'a' <= lch && lch <= 'f';
}
private static int hexVal(char ch) {
int lch = ch | 32;
return lch - (lch <= '9' ? '0' : 'a' - 10);
}
private boolean isJsonSpecialChar(int i) {
char ch = jsonish.charAt(i);
if (ch <= ' ') { return true; }
switch (ch) {
case '"':
case ',': case ':':
case '[': case ']':
case '{': case '}':
return true;
default:
return false;
}
}
private void appendHex(int n, int nDigits) {
for (int quadsToShift = nDigits; --quadsToShift >= 0;) {
int dig = (n >>> (4 * quadsToShift)) & 0xf;
sanitizedJson.append((char) (dig + (dig < 10 ? '0' : (char) ('a' - 10))));
}
}
/** Indicates that a comma was seen at the top level. */
private static final class UnbracketedComma extends Exception {
private static final long serialVersionUID = 783239978717247850L;
// No members. Used for nominal type.
}
private int endOfDigitRun(int start, int limit) {
for (int end = start; end < limit; ++end) {
char ch = jsonish.charAt(end);
if (!('0' <= ch && ch <= '9')) { return end; }
}
return limit;
}
/**
* An exception instance that can be thrown without the cost of unwinding the
* stack.
*/
@SuppressWarnings("synthetic-access")
private static final UnbracketedComma UNBRACKETED_COMMA
= new UnbracketedComma();
static {
UNBRACKETED_COMMA.setStackTrace(new StackTraceElement[0]);
}
CharSequence toCharSequence() {
return sanitizedJson != null ? sanitizedJson : jsonish;
}
@Override
public String toString() {
return sanitizedJson != null ? sanitizedJson.toString() : jsonish;
}
private static final char[] HEX_DIGITS = new char[] {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
};
private static final int[] DIGITS_BY_BASE_THAT_FIT_IN_63B = {
-1, -1, 63, 39, 31, 27, 24, 22,
21 /* base 8 */, 19, 18 /* base 10 */, 18, 17, 17, 16, 16,
15 /* base 16 */,
};
/**
* Looks for a literal character or escape sequence at left in s.
* Packs the char into the lowest 16 bytes of the output and the
* count of characters in the sequence in the remaining bits
*/
private static int unescapedChar(String s, int left) {
int n = s.length();
if (left >= n) {
return 0;
}
char c = s.charAt(left);
if (c == '\\') {
if (left + 1 == n) {
return 0x10000;
}
char nc = s.charAt(left + 1);
switch (nc) {
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7': {
int octalStart = left + 1;
int octalEnd = octalStart;
++octalEnd;
if (octalEnd < n && isOct(s.charAt(octalEnd))) {
++octalEnd;
if (nc <= '3' && octalEnd < n && isOct(s.charAt(octalEnd))) {
++octalEnd;
}
}
int value = 0;
for (int j = octalStart; j < octalEnd; ++j) {
char digit = s.charAt(j);
value = (value << 3) | (digit - '0');
}
return ((octalEnd - left) << 16) | value;
}
case 'x':
if (left + 3 < n) {
char d0 = s.charAt(left + 2);
char d1 = s.charAt(left + 3);
if (isHex(d0) && isHex(d1)) {
return 0x4000 | (hexVal(d0) << 4) | hexVal(d1);
}
}
break;
case 'u':
if (left + 5 < n) {
char d0 = s.charAt(left + 2);
char d1 = s.charAt(left + 3);
char d2 = s.charAt(left + 4);
char d3 = s.charAt(left + 5);
if (isHex(d0) && isHex(d1) && isHex(d2) && isHex(d3)) {
return 0x6000 |
(hexVal(d0) << 12) | (hexVal(d1) << 8) | (hexVal(d2) << 4) | hexVal(d3);
}
}
break;
case 'b': return (0x20000 | '\b');
case 'f': return (0x20000 | '\f');
case 'n': return 0x2000A;
case 'r': return 0x2000D;
case 't': return 0x20009;
case 'v': return 0x20008;
default: break;
}
return (0x20000) | nc;
} else {
return 0x10000 | c;
}
}
private static int unescapedCharRev(String s, int rightIncl) {
if (rightIncl < 0) {
return 0;
}
// \?
// \01
// \012
// \x00
// \u0000
for (int i = 1; i < 6; ++i) {
int left = rightIncl - i;
if (left < 0) { break; }
if (s.charAt(left) == '\\') {
// If there are an odd number of '\\' then decode.
int n = 1;
while (left - n >= 0 && s.charAt(left - n) == '\\') {
++n;
}
if ((n & 1) == 0) {
int unescaped = unescapedChar(s, left);
if ((unescaped >>> 16) == i) {
return unescaped;
}
}
break;
}
}
return 0x10000 | s.charAt(rightIncl);
}
}