All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.schemaapp.core.util.JsonSanitizer Maven / Gradle / Ivy

// Copyright (C) 2012 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
package com.schemaapp.core.util;
import java.math.BigInteger;

/**
 * Given JSON-like content, converts it to valid JSON. This can be attached at
 * either end of a data-pipeline to help satisfy Postel's principle:
 * 
"Be conservative in what you do, be liberal in what you accept from others."
*

Applied to JSON-like content from others, it will produce well-formed JSON that should satisfy any parser you use.

Applied to your output before you send, it will coerce minor mistakes in encoding and make it easier to embed your JSON in HTML and XML.

Input

The sanitizer takes JSON-like content and interprets it as JavaScript's {@code eval} would. Specifically, it deals with these non-standard constructs:
    *
  • {@code '...'} Single quoted strings are converted to JSON strings.
  • {@code \xAB} Hex escapes are converted to JSON unicode escapes.
  • {@code \012} Octal escapes are converted to JSON unicode escapes.
  • {@code 0xAB} Hex integer literals are converted to JSON decimal numbers.
  • {@code 012} Octal integer literals are converted to JSON decimal numbers.
  • {@code +.5} Decimal numbers are coerced to JSON's stricter format.
  • {@code [0,,2]} Elisions in arrays are filled with {@code null}.
  • {@code [1,2,3,]} Trailing commas are removed.
  • {@code {foo:"bar"}} Unquoted property names are quoted.
  • {@code //comments} JS style line and block comments are removed.
  • {@code (...)} Grouping parentheses are removed.
The sanitizer fixes missing punctuation, end quotes, and mismatched or missing close brackets. If an input contains only white-space then the valid JSON string {@code null} is substituted.

Output

The output is well-formed JSON as defined by RFC 4627. The output satisfies four additional properties:
    *
  1. The output will not contain the substring (case-insensitively) {@code "</script"} so can be embedded inside an HTML script element without further encoding.
  2. The output will not contain the substring {@code "]]>"} so can be embedded inside an XML CDATA section without further encoding.
  3. The output is a valid Javascript expression, so can be parsed by Javascript's {@code eval} builtin (after being wrapped in parentheses) or by {@code JSON.parse}. Specifically, the output will not contain any string literals with embedded JS newlines (U+2028 Line separator or U+2029 Paragraph separator).
  4. The output contains only valid Unicode scalar values (no isolated UTF-16 surrogates) that are allowed in XML unescaped.
* *

Security

Since the output is well-formed JSON, passing it to {@code eval} will have no side-effects and no free variables, so is neither a code-injection vector, nor a vector for exfiltration of secrets.

This library only ensures that the JSON string → Javascript object phase has no side effects and resolves no free variables, and cannot control how other client side code later interprets the resulting Javascript object. So if client-side code takes a part of the parsed data that is controlled by an attacker and passes it back through a powerful interpreter like {@code eval} or {@code innerHTML} then that client-side code might suffer unintended side-effects.

Efficiency

The {@code sanitize} method will return the input string without allocating a new buffer when the input is already valid JSON that satisfies the properties above. Thus, if used on input that is usually well formed, it has minimal memory overhead.

* The sanitize method takes O(n) time where n is the length in UTF-16
* code-units.
*/
public final class JsonSanitizer {
  /** Nesting depth used when the caller does not supply one. */
  public static final int DEFAULT_NESTING_DEPTH = 64;

  /** Largest nesting depth a caller can request; larger requests are clamped to this. */
  public static final int MAXIMUM_NESTING_DEPTH = 4096;

  /** Effective nesting limit; also the size of the {@link #isMap} stack. */
  private final int maximumNestingDepth;

  /** The JSON-like input. Never null: a null input is replaced by the text {@code "null"}. */
  private final String jsonish;

  /** Number of brackets that are currently open. */
  private int bracketDepth;

  /** {@code isMap[d]} is true when the bracket opened at depth {@code d} was a curly brace. */
  private boolean[] isMap;

  /** Output buffer; reset to null at the start of every {@link #sanitize()} run. */
  private StringBuilder sanitizedJson;

  /** Index into {@link #jsonish} up to which the input has been copied to the output buffer. */
  private int cleaned;

  // The three constants below are assigned outside the portion of the class shown here.
  private static final UnbracketedComma UNBRACKETED_COMMA;
  private static final char[] HEX_DIGITS;
  private static final int[] DIGITS_BY_BASE_THAT_FIT_IN_63B;

  /**
   * Given JSON-like content, produces a string of JSON that is safe to embed,
   * safe to pass to JavaScript's {@code eval} operator. Uses
   * {@link #DEFAULT_NESTING_DEPTH} as the nesting limit.
   *
   * @param jsonish JSON-like content.
   * @return embeddable JSON
   */
  public static String sanitize(final String jsonish) {
    return sanitize(jsonish, DEFAULT_NESTING_DEPTH);
  }

  /**
   * Same as {@link JsonSanitizer#sanitize(String)}, but allows to set a custom
   * maximum nesting depth.
   *
   * @param jsonish JSON-like content.
   * @param maximumNestingDepth maximum nesting depth.
   * @return embeddable JSON
   */
  public static String sanitize(final String jsonish, final int maximumNestingDepth) {
    final JsonSanitizer sanitizer = new JsonSanitizer(jsonish, maximumNestingDepth);
    sanitizer.sanitize();
    return sanitizer.toString();
  }

  /**
   * Creates a sanitizer over the given input with the default nesting limit.
   *
   * @param jsonish JSON-like content.
   */
  JsonSanitizer(final String jsonish) {
    this(jsonish, DEFAULT_NESTING_DEPTH);
  }

  /**
   * Creates a sanitizer over the given input with a custom nesting limit.
   *
   * @param jsonish JSON-like content; {@code null} is treated as the text {@code "null"}.
   * @param maximumNestingDepth maximum nesting depth; values below 1 are raised
   *     to 1 and values above {@link #MAXIMUM_NESTING_DEPTH} are lowered to it.
   */
  public JsonSanitizer(final String jsonish, final int maximumNestingDepth) {
    final int atLeastOne = Math.max(1, maximumNestingDepth);
    this.maximumNestingDepth = Math.min(atLeastOne, MAXIMUM_NESTING_DEPTH);
    if (jsonish != null) {
      this.jsonish = jsonish;
    } else {
      this.jsonish = "null";
    }
  }

  /**
   * @return the nesting limit this instance was created with, after clamping.
   */
  public int getMaximumNestingDepth() {
    return this.maximumNestingDepth;
  }

  /**
   * Makes a single left-to-right pass over the input, tracking a small state
   * machine (what kind of token is expected next) plus a stack of open
   * brackets, and records the edits needed to turn the input into JSON.
   * The static {@code sanitize} methods read the result back via
   * {@code toString()}.
   */
  void sanitize() {
    // Start every run from a clean slate.
    this.bracketDepth = 0;
    this.cleaned = 0;
    this.sanitizedJson = null;

    State state = State.START_ARRAY;
    final int length = this.jsonish.length();

    scan:
    for (int pos = 0; pos < length; ++pos) {
      try {
        final char ch = this.jsonish.charAt(pos);
        switch (ch) {
          case '\t':
          case '\n':
          case '\r':
          case ' ':
            // Whitespace is passed over untouched.
            break;

          case '"':
          case '\'': {
            state = requireValueState(pos, state, true);
            final int stringEnd = endOfQuotedString(this.jsonish, pos);
            sanitizeString(pos, stringEnd);
            // The loop increment moves us to stringEnd.
            pos = stringEnd - 1;
            break;
          }

          case '(':
          case ')':
            // Grouping parentheses are dropped.
            elide(pos, pos + 1);
            break;

          case '[':
          case '{': {
            requireValueState(pos, state, false);
            if (this.isMap == null) {
              this.isMap = new boolean[this.maximumNestingDepth];
            }
            final boolean opensMap = ch == '{';
            // NOTE(review): no bounds check here, so nesting deeper than
            // maximumNestingDepth looks like it raises
            // ArrayIndexOutOfBoundsException unless a helper outside this
            // view prevents it - confirm against callers.
            this.isMap[this.bracketDepth] = opensMap;
            ++this.bracketDepth;
            if (opensMap) {
              state = State.START_MAP;
            } else {
              state = State.START_ARRAY;
            }
            break;
          }

          case ']':
          case '}': {
            if (this.bracketDepth == 0) {
              // A close bracket with nothing open: drop the rest of the input.
              elide(pos, length);
              break scan;
            }
            // Patch up whatever was left dangling before the close bracket.
            switch (state) {
              case BEFORE_VALUE:
                insert(pos, "null");
                break;
              case BEFORE_ELEMENT:
              case BEFORE_KEY:
                elideTrailingComma(pos);
                break;
              case AFTER_KEY:
                insert(pos, ":null");
                break;
              default:
                break;
            }
            --this.bracketDepth;
            // Close with the bracket kind that matches the opener.
            final char expectedClose = this.isMap[this.bracketDepth] ? '}' : ']';
            if (ch != expectedClose) {
              replace(pos, pos + 1, expectedClose);
            }
            final boolean parentIsMap =
                this.bracketDepth != 0 && this.isMap[this.bracketDepth - 1];
            if (parentIsMap) {
              state = State.AFTER_VALUE;
            } else {
              state = State.AFTER_ELEMENT;
            }
            break;
          }

          case ',': {
            if (this.bracketDepth == 0) {
              // Handled by the catch below, which discards the rest of the input.
              throw JsonSanitizer.UNBRACKETED_COMMA;
            }
            switch (state) {
              case AFTER_ELEMENT:
                state = State.BEFORE_ELEMENT;
                break;
              case AFTER_VALUE:
                state = State.BEFORE_KEY;
                break;
              case BEFORE_ELEMENT:
              case START_ARRAY:
                // Array elision such as [0,,2]: fill the hole.
                insert(pos, "null");
                state = State.BEFORE_ELEMENT;
                break;
              case BEFORE_KEY:
              case AFTER_KEY:
              case START_MAP:
                elide(pos, pos + 1);
                break;
              case BEFORE_VALUE:
                insert(pos, "null");
                state = State.BEFORE_KEY;
                break;
            }
            break;
          }

          case ':':
            if (state == State.AFTER_KEY) {
              state = State.BEFORE_VALUE;
            } else {
              // A colon anywhere but after a key is dropped.
              elide(pos, pos + 1);
            }
            break;

          case '/': {
            // By default only the slash itself is removed.
            int end = pos + 1;
            if (end < length) {
              final char next = this.jsonish.charAt(end);
              if (next == '/') {
                // Line comment: runs through the next JS line terminator
                // (LF, CR, U+2028 or U+2029) or to the end of input.
                end = length;
                for (int k = pos + 2; k < length; ++k) {
                  final char c = this.jsonish.charAt(k);
                  if (c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029') {
                    end = k + 1;
                    break;
                  }
                }
              } else if (next == '*') {
                // Block comment: runs through the first "*/" or to the end of input.
                end = length;
                if (pos + 3 < length) {
                  int slash = pos + 2;
                  while ((slash = this.jsonish.indexOf('/', slash + 1)) >= 0) {
                    if (this.jsonish.charAt(slash - 1) == '*') {
                      end = slash + 1;
                      break;
                    }
                  }
                }
              }
            }
            elide(pos, end);
            pos = end - 1;
            break;
          }

          default: {
            // Measure the run of [A-Za-z0-9+\-._$] characters starting here.
            int runEnd = pos;
            while (runEnd < length) {
              final char c = this.jsonish.charAt(runEnd);
              final boolean tokenChar =
                  ('a' <= c && c <= 'z')
                      || ('0' <= c && c <= '9')
                      || c == '+'
                      || c == '-'
                      || c == '.'
                      || ('A' <= c && c <= 'Z')
                      || c == '_'
                      || c == '$';
              if (!tokenChar) {
                break;
              }
              ++runEnd;
            }

            if (runEnd == pos) {
              // Not a token character at all: drop it.
              elide(pos, pos + 1);
              break;
            }

            state = requireValueState(pos, state, true);

            final boolean isNumber =
                ch == '.' || ch == '+' || ch == '-' || ('0' <= ch && ch <= '9');
            final boolean isKeyword = !isNumber && isKeyword(pos, runEnd);

            if (!isNumber && !isKeyword) {
              // A bare word: extend it up to the next JSON special character
              // and swallow a double quote that immediately follows.
              while (runEnd < length && !isJsonSpecialChar(runEnd)) {
                ++runEnd;
              }
              if (runEnd < length && this.jsonish.charAt(runEnd) == '"') {
                ++runEnd;
              }
            }

            if (state == State.AFTER_KEY) {
              // The token is a property name, so it must end up quoted.
              insert(pos, '"');
              if (isNumber) {
                canonicalizeNumber(pos, runEnd);
                insert(runEnd, '"');
              } else {
                sanitizeString(pos, runEnd);
              }
            } else if (isNumber) {
              normalizeNumber(pos, runEnd);
            } else if (!isKeyword) {
              insert(pos, '"');
              sanitizeString(pos, runEnd);
            }
            pos = runEnd - 1;
            break;
          }
        }
      } catch (UnbracketedComma e) {
        // A comma outside any bracket ends the value: discard the remainder.
        elide(pos, this.jsonish.length());
        break;
      }
    }

    // Input with no value at all becomes the JSON literal null.
    if (this.bracketDepth == 0 && state == State.START_ARRAY) {
      insert(length, "null");
      state = State.AFTER_ELEMENT;
    }

    final boolean hasBufferedOutput =
        this.sanitizedJson != null && this.sanitizedJson.length() != 0;
    if (hasBufferedOutput || this.cleaned != 0 || this.bracketDepth != 0) {
      if (this.sanitizedJson == null) {
        this.sanitizedJson = new StringBuilder(length + this.bracketDepth);
      }
      // Copy over the not-yet-copied tail of the input.
      this.sanitizedJson.append(this.jsonish, this.cleaned, length);
      this.cleaned = length;

      // Finish whatever construct the input stopped in the middle of.
      switch (state) {
        case BEFORE_ELEMENT:
        case BEFORE_KEY:
          elideTrailingComma(length);
          break;
        case AFTER_KEY:
          this.sanitizedJson.append(":null");
          break;
        case BEFORE_VALUE:
          this.sanitizedJson.append("null");
          break;
        default:
          break;
      }

      // Close every bracket that is still open, innermost first.
      while (this.bracketDepth != 0) {
        --this.bracketDepth;
        this.sanitizedJson.append(this.isMap[this.bracketDepth] ? '}' : ']');
      }
    }
  }

  /**
   * Ensures that the output corresponding to {@code jsonish[start:end]} is a
   * valid JSON string that has the same meaning when parsed by Javascript
   * {@code eval}.
   *

    *
  • Making sure that it is fully quoted with double-quotes. *
  • Escaping any Javascript newlines : CR, LF, U+2028, U+2029 *
  • Escaping HTML special characters to allow it to be safely embedded * in HTML {@code