
// com.schemaapp.core.util.JsonSanitizer Maven / Gradle / Ivy
// Copyright (C) 2012 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
package com.schemaapp.core.util;
import java.math.BigInteger;
/**
* Given JSON-like content, converts it to valid JSON. This can be attached at
* either end of a data-pipeline to help satisfy Postel's principle:
* be conservative in what you do, be liberal in what you accept
* from others
*
* Applied to JSON-like content from others, it will produce well-formed JSON
* that should satisfy any parser you use.
*
* Applied to your output before you send, it will coerce minor mistakes in
* encoding and make it easier to embed your JSON in HTML and XML.
*
*
* <h2>Input</h2>
* The sanitizer takes JSON-like content, and interprets it as JavaScript's
* {@code eval} would. Specifically, it deals with these non-standard constructs.
*
* - {@code '...'} Single quoted strings are converted to JSON strings.
* - {@code \xAB} Hex escapes are converted to JSON unicode escapes.
* - {@code \012} Octal escapes are converted to JSON unicode escapes.
* - {@code 0xAB} Hex integer literals are converted to JSON decimal numbers.
* - {@code 012} Octal integer literals are converted to JSON decimal numbers.
* - {@code +.5} Decimal numbers are coerced to JSON's stricter format.
* - {@code [0,,2]} Elisions in arrays are filled with {@code null}.
* - {@code [1,2,3,]} Trailing commas are removed.
* - {@code {foo:"bar"}} Unquoted property names are quoted.
* - {@code //comments} JS style line and block comments are removed.
* - {@code (...)} Grouping parentheses are removed.
*
*
* The sanitizer fixes missing punctuation, end quotes, and mismatched or
* missing close brackets. If an input contains only white-space then the valid
* JSON string {@code null} is substituted.
*
* <h2>Output</h2>
* The output is well-formed JSON as defined by
* RFC 4627. The output
* satisfies four additional properties:
*
* - The output will not contain the substring (case-insensitively)
* {@code "</script"} so can be embedded inside an HTML script element
* without further encoding.
* - The output will not contain the substring {@code "]]>"}
* so can be embedded inside an XML CDATA section without further encoding.
* - The output is a valid Javascript expression, so can be parsed by
* Javascript's {@code eval} builtin (after being wrapped in parentheses)
* or by {@code JSON.parse}. Specifically, the output will not contain any
* string literals with embedded JS newlines (U+2028 Line separator or
* U+2029 Paragraph separator).
* - The output contains only valid Unicode scalar values (no isolated UTF-16
* surrogates) that are allowed in
* XML unescaped.
*
*
* <h2>Security</h2>
* Since the output is well-formed JSON, passing it to {@code eval}
* will have no side-effects and no free variables, so is
* neither a code-injection vector, nor a vector for exfiltration of secrets.
*
*
* This library only ensures that the JSON string → Javascript object phase
* has no side effects and resolves no free variables, and cannot control how
* other client side code later interprets the resulting Javascript object. So
* if client-side code takes a part of the parsed data that is controlled by an
* attacker and passes it back through a powerful interpreter like {@code eval}
* or {@code innerHTML} then that client-side code might suffer unintended
* side-effects.
*
*
* <h2>Efficiency</h2>
* The sanitize method will return the input string without
* allocating a new buffer when the input is already valid JSON that satisfies
* the properties above. Thus, if used on input that is usually well formed, it
* has minimal memory overhead.
*
* The sanitize method takes O(n) time where n is the length in UTF-16
* code-units.
*/
public final class JsonSanitizer {
/** Nesting limit applied when the caller does not supply one. */
public static final int DEFAULT_NESTING_DEPTH = 64;
/** Largest nesting limit a caller can configure; the two-argument constructor clamps to this value. */
public static final int MAXIMUM_NESTING_DEPTH = 4096;
/** Effective nesting limit for this instance; also the length of {@link #isMap} once it is allocated. */
private final int maximumNestingDepth;
/** The input text being sanitized; never {@code null} (the constructor substitutes the text "null"). */
private final String jsonish;
/** Number of brackets currently open during a scan. */
private int bracketDepth;
/** {@code isMap[d]} is true when the bracket opened at depth {@code d} was a curly brace; allocated lazily by the scan. */
private boolean[] isMap;
/**
* Output buffer; reset to {@code null} at the start of each scan.
* NOTE(review): presumably created by the edit helpers on the first change - confirm there.
*/
private StringBuilder sanitizedJson;
/** Index into {@link #jsonish} of the first character not yet copied to {@link #sanitizedJson}. */
private int cleaned;
/** Instance thrown when a comma is found outside any bracket; initialised outside this view. */
private static final UnbracketedComma UNBRACKETED_COMMA;
// NOTE(review): the two tables below are initialised and used outside the
// visible part of the file; their names suggest number/escape formatting.
private static final char[] HEX_DIGITS;
private static final int[] DIGITS_BY_BASE_THAT_FIT_IN_63B;
/**
* Given JSON-like content, produces a string of JSON that is safe to embed,
* safe to pass to JavaScript's {@code eval} operator. Nesting is limited to
* {@link #DEFAULT_NESTING_DEPTH} levels.
*
* @param jsonish JSON-like content.
* @return embeddable JSON
*/
public static String sanitize(final String jsonish) {
    // Use the named constant rather than repeating its value as a literal.
    return sanitize(jsonish, DEFAULT_NESTING_DEPTH);
}
/**
* Sanitizes JSON-like content with a caller-supplied nesting limit. A
* sanitizer is built for the input, run once, and its string form returned.
*
* @param jsonish JSON-like content.
* @param maximumNestingDepth maximum nesting depth; the constructor clamps it
*        to the range 1..4096.
* @return the string form of the sanitizer after the scan has run
*/
public static String sanitize(final String jsonish, final int maximumNestingDepth) {
    final JsonSanitizer sanitizer = new JsonSanitizer(jsonish, maximumNestingDepth);
    sanitizer.sanitize();
    return sanitizer.toString();
}
/**
* Creates a sanitizer for the given content that limits nesting to
* {@link #DEFAULT_NESTING_DEPTH} levels. Construction does not scan the
* input; it only delegates to the two-argument constructor.
*
* @param jsonish JSON-like content; {@code null} is treated as the text
*        {@code "null"} by the delegated constructor.
*/
JsonSanitizer(final String jsonish) {
    this(jsonish, DEFAULT_NESTING_DEPTH);
}
/**
* Creates a sanitizer for the given content with a custom maximum nesting
* depth. Construction only stores its arguments; the input is not scanned.
*
* @param jsonish JSON-like content; {@code null} is replaced by the text
*        {@code "null"}.
* @param maximumNestingDepth requested maximum nesting depth. Values below 1
*        are raised to 1 and values above {@link #MAXIMUM_NESTING_DEPTH} are
*        lowered to it.
*/
public JsonSanitizer(final String jsonish, final int maximumNestingDepth) {
    // Clamp into [1, MAXIMUM_NESTING_DEPTH] so the bracket stack always has a sane size.
    this.maximumNestingDepth = Math.min(Math.max(1, maximumNestingDepth), MAXIMUM_NESTING_DEPTH);
    this.jsonish = (jsonish != null) ? jsonish : "null";
}
/**
* Reports the nesting limit this sanitizer was constructed with, after
* clamping by the constructor.
*
* @return the effective maximum nesting depth
*/
public int getMaximumNestingDepth() {
    return maximumNestingDepth;
}
/**
* Makes one left-to-right pass over {@code jsonish}, tracking a parser
* {@code State} and the stack of open brackets, and calls the edit helpers
* ({@code insert}, {@code elide}, {@code replace}, ...) wherever the input
* deviates from the JSON this class aims to emit.
*
* Nothing is returned: the outcome is left in {@code sanitizedJson} and
* {@code cleaned}. The static {@code sanitize} entry point calls this method
* and then {@code toString()}.
*
* NOTE(review): the store into {@code isMap} is not bounds-checked, so input
* nested deeper than {@code maximumNestingDepth} looks like it raises
* {@code ArrayIndexOutOfBoundsException} - confirm that this is the intended
* way of signalling the limit.
*/
void sanitize() {
// Reset per-run state: both counters start at 0 and there is no output buffer yet.
final int n2 = 0;
this.cleaned = n2;
this.bracketDepth = n2;
this.sanitizedJson = null;
// START_ARRAY doubles as the "no value seen yet" state; see the check after the loop.
State state = State.START_ARRAY;
final int n = this.jsonish.length();
// Labeled so the stray-close-bracket case can leave the scan from inside the switch.
Label_1192:
for (int i = 0; i < n; ++i) {
try {
final char ch = this.jsonish.charAt(i);
switch (ch) {
// Whitespace is left exactly as it is.
case '\t':
case '\n':
case '\r':
case ' ': {
break;
}
// Quoted string, in either quote style.
case '\"':
case '\'': {
// NOTE(review): the third argument is true here and for bare words, false for
// brackets - presumably "this token may serve as an object key"; confirm in
// requireValueState.
state = this.requireValueState(i, state, true);
final int strEnd = endOfQuotedString(this.jsonish, i);
this.sanitizeString(i, strEnd);
// Step back one so the loop's ++i resumes at strEnd.
i = strEnd - 1;
break;
}
// Grouping parentheses are dropped from the output.
case '(':
case ')': {
this.elide(i, i + 1);
break;
}
// Opening bracket: push its kind onto the isMap stack.
case '[':
case '{': {
// The returned state is ignored because state is reassigned just below.
this.requireValueState(i, state, false);
if (this.isMap == null) {
// Allocated on first use, so bracket-free input never pays for it.
this.isMap = new boolean[this.maximumNestingDepth];
}
final boolean map = ch == '{';
this.isMap[this.bracketDepth] = map;
++this.bracketDepth;
state = (map ? State.START_MAP : State.START_ARRAY);
break;
}
// Closing bracket: patch up whatever is dangling, then pop.
case ']':
case '}': {
if (this.bracketDepth == 0) {
// Nothing is open, so this close bracket and everything after it is
// dropped and the scan stops.
this.elide(i, this.jsonish.length());
break Label_1192;
}
switch (state) {
case BEFORE_VALUE: {
// A colon was seen but no value followed: supply null.
this.insert(i, "null");
break;
}
case BEFORE_ELEMENT:
case BEFORE_KEY: {
// A comma was seen but nothing followed it.
this.elideTrailingComma(i);
break;
}
case AFTER_KEY: {
// A key with neither colon nor value.
this.insert(i, ":null");
break;
}
default:
break;
}
--this.bracketDepth;
// Emit the closer that matches the recorded opener, whatever the input used.
final char closeBracket = this.isMap[this.bracketDepth] ? '}' : ']';
if (ch != closeBracket) {
this.replace(i, i + 1, closeBracket);
}
// The closed bracket counts as an array element when we are back at top level or
// inside an array; otherwise it was the value of an object property.
state = ((this.bracketDepth == 0 || !this.isMap[this.bracketDepth - 1]) ? State.AFTER_ELEMENT : State.AFTER_VALUE);
break;
}
case ',': {
if (this.bracketDepth == 0) {
// A comma outside any bracket: handled by the catch block at the bottom of the
// loop, which drops the rest of the input.
throw JsonSanitizer.UNBRACKETED_COMMA;
}
switch (state) {
case AFTER_ELEMENT: {
state = State.BEFORE_ELEMENT;
break;
}
case AFTER_VALUE: {
state = State.BEFORE_KEY;
break;
}
case BEFORE_ELEMENT:
case START_ARRAY: {
// Array elision such as [0,,2]: fill the hole with null.
this.insert(i, "null");
state = State.BEFORE_ELEMENT;
break;
}
case BEFORE_KEY:
case AFTER_KEY:
case START_MAP: {
// A comma where an object cannot take one: drop it, state unchanged.
this.elide(i, i + 1);
break;
}
case BEFORE_VALUE: {
// "key:," - supply the missing value, then expect the next key.
this.insert(i, "null");
state = State.BEFORE_KEY;
break;
}
}
break;
}
case ':': {
// A colon is only kept directly after a key; anywhere else it is dropped.
if (state == State.AFTER_KEY) {
state = State.BEFORE_VALUE;
break;
}
this.elide(i, i + 1);
break;
}
case '/': {
// Default: a lone slash, removed on its own.
int end = i + 1;
if (i + 1 < n) {
switch (this.jsonish.charAt(i + 1)) {
case '/': {
// Line comment: runs to the end of input, or through the first line
// terminator (CR, LF, U+2028, U+2029), which is removed with it.
end = n;
for (int j = i + 2; j < n; ++j) {
final char cch = this.jsonish.charAt(j);
if (cch == '\n' || cch == '\r' || cch == '\u2028' || cch == '\u2029') {
end = j + 1;
break;
}
}
break;
}
case '*': {
// Block comment: runs to the end of input unless a closing star-slash is found.
end = n;
if (i + 3 < n) {
int j = i + 2;
// 47 is the code of '/'. The search starts at i + 3, so the star that
// opens the comment cannot also be taken as the one that closes it.
while ((j = this.jsonish.indexOf(47, j + 1)) >= 0) {
if (this.jsonish.charAt(j - 1) == '*') {
end = j + 1;
break;
}
}
break;
}
break;
}
default:
break;
}
}
this.elide(i, end);
// Step back one so the loop's ++i resumes at end.
i = end - 1;
break;
}
default: {
// Bare token: take the longest run of letters, digits, '+', '-', '.', '_' and '$'.
int runEnd;
for (runEnd = i; runEnd < n; ++runEnd) {
final char tch = this.jsonish.charAt(runEnd);
if (('a' > tch || tch > 'z') && ('0' > tch || tch > '9') && tch != '+' && tch != '-' && tch != '.' && ('A' > tch || tch > 'Z') && tch != '_' && tch != '$') {
break;
}
}
if (runEnd == i) {
// ch is not a token character at all: drop it.
this.elide(i, i + 1);
break;
}
state = this.requireValueState(i, state, true);
// Number-ness is decided from the first character only.
final boolean isNumber = ('0' <= ch && ch <= '9') || ch == '.' || ch == '+' || ch == '-';
final boolean isKeyword = !isNumber && this.isKeyword(i, runEnd);
if (!isNumber && !isKeyword) {
// An unquoted word: widen the run up to the next character isJsonSpecialChar
// reports, and also take a double quote that directly follows.
// NOTE(review): presumably so a stray closing quote ends up inside the string
// handed to sanitizeString - confirm.
while (runEnd < n && !this.isJsonSpecialChar(runEnd)) {
++runEnd;
}
if (runEnd < n && this.jsonish.charAt(runEnd) == '\"') {
++runEnd;
}
}
if (state == State.AFTER_KEY) {
// requireValueState decided this token is an object key, so it has to be quoted.
this.insert(i, '\"');
if (isNumber) {
this.canonicalizeNumber(i, runEnd);
this.insert(runEnd, '\"');
}
else {
// NOTE(review): no closing quote is inserted on this path; presumably
// sanitizeString supplies it - confirm.
this.sanitizeString(i, runEnd);
}
}
else if (isNumber) {
this.normalizeNumber(i, runEnd);
}
else if (!isKeyword) {
// A bare word used as a value becomes a string; keywords are left untouched.
this.insert(i, '\"');
this.sanitizeString(i, runEnd);
}
// Step back one so the loop's ++i resumes at runEnd.
i = runEnd - 1;
break;
}
}
}
catch (UnbracketedComma e) {
// Raised by the ',' case at bracket depth 0: drop the comma and the rest of the
// input, then stop scanning.
this.elide(i, this.jsonish.length());
break;
}
}
// No value was seen at top level (for example whitespace-only input): emit null.
if (state == State.START_ARRAY && this.bracketDepth == 0) {
this.insert(n, "null");
state = State.AFTER_ELEMENT;
}
// The tail below runs only if output was produced, input was consumed into the
// buffer, or brackets are still open; otherwise sanitizedJson is left as it is.
if ((this.sanitizedJson != null && this.sanitizedJson.length() != 0) || this.cleaned != 0 || this.bracketDepth != 0) {
if (this.sanitizedJson == null) {
// Room for the input plus one closer per still-open bracket.
this.sanitizedJson = new StringBuilder(n + this.bracketDepth);
}
// Copy the part of the input that has not been copied yet.
this.sanitizedJson.append(this.jsonish, this.cleaned, n);
this.cleaned = n;
// Apply the same dangling-state fixes as for a close bracket, at end of input.
switch (state) {
case BEFORE_ELEMENT:
case BEFORE_KEY: {
this.elideTrailingComma(n);
break;
}
case AFTER_KEY: {
this.sanitizedJson.append(":null");
break;
}
case BEFORE_VALUE: {
this.sanitizedJson.append("null");
break;
}
default:
break;
}
// Close every bracket still open, innermost first (125 is '}', 93 is ']').
while (this.bracketDepth != 0) {
final StringBuilder sanitizedJson = this.sanitizedJson;
final boolean[] isMap = this.isMap;
final int bracketDepth = this.bracketDepth - 1;
this.bracketDepth = bracketDepth;
sanitizedJson.append((char)(isMap[bracketDepth] ? 125 : 93));
}
}
}
/**
* Ensures that the output corresponding to {@code jsonish[start:end]} is a
* valid JSON string that has the same meaning when parsed by Javascript
* {@code eval}.
*
* - Making sure that it is fully quoted with double-quotes.
*
- Escaping any Javascript newlines : CR, LF, U+2028, U+2029
*
- Escaping HTML special characters to allow it to be safely embedded
* in HTML {@code