All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.owasp.encoder.HTMLEncoder Maven / Gradle / Ivy

The newest version!
// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;

import java.nio.CharBuffer;
import java.nio.charset.CoderResult;

/**
 * 

* HTMLEncoder -- an encoder for HTML contexts. Currently most HTML-based * contexts are properly handled by {@link XMLEncoder}. The remaining * HTML-specific context of "unquoted attributes" could not be added to the * XMLEncoder without slowing it down. This class implements that remaining * context: unquoted attribute values.

* *

* Note: because this context is likely small strings, and hopefully rarely * used, no effort was put into optimizing this encoder.

* * @author Jeff Ichnowski */ class HTMLEncoder extends Encoder { /** * Number of characters in the encoding prefix and suffix when using decimal * numeric encodings of the form "&#...;". */ private static final int ENCODE_AFFIX_CHAR_COUNT = 3; /** * Encoding for '\t'. */ private static final char[] TAB = " ".toCharArray(); /** * Encoding for '&'. */ private static final char[] AMP = "&".toCharArray(); /** * Encoding for '<'. */ private static final char[] LT = "<".toCharArray(); /** * Encoding for '>'. */ private static final char[] GT = ">".toCharArray(); // The large table-switch implementation used here is fast to // implement but slower at runtime than tuned-for-expected-input // encoders that use selective if/else's. Look at the results of // BenchmarkTest to see the difference. See note in javadoc as to // reasoning. // On Core i7 (Sandybridge) // Baseline is 371.401009 ns/op // Benchmarked Encode.forXml: 324.219992 ns/op (-12.70% on baseline) // Benchmarked Encode.forHtmlUnquotedAttribute: 821.583263 ns/op (+121.21% on baseline) @Override int maxEncodedLength(int n) { // if everything is line separators and paragraph separators then // we get "⁛" return n * (ENCODE_AFFIX_CHAR_COUNT + 4); } @Override int firstEncodedOffset(String input, int off, int len) { final int n = off + len; for (int i = off; i < n; ++i) { final char ch = input.charAt(i); switch (ch) { case '\t': case '\r': case '\f': case '\n': case ' ': case Unicode.NEL: case '\"': case '\'': case '/': case '=': case '`': case '&': case '<': case '>': return i; case '!': case '#': case '$': case '%': case '(': case ')': case '*': case '+': case ',': case '-': case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case ':': case ';': case '?': case '@': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '[': case '\\': case ']': case '^': case '_': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '{': case '|': case '}': case '~': break; // valid default: if (Character.isHighSurrogate(ch)) { if (i + 1 < n) { if (Character.isLowSurrogate(input.charAt(i + 1))) { int cp = Character.toCodePoint(ch, input.charAt(i + 1)); if (Unicode.isNonCharacter(cp)) { return i; } else { ++i; } break; } } else { return i; } } if (ch <= Unicode.MAX_C1_CTRL_CHAR || Character.MIN_SURROGATE <= ch && ch <= Character.MAX_SURROGATE || ch > '\ufffd' || ('\ufdd0' <= ch && ch <= '\ufdef') || ch == Unicode.LINE_SEPARATOR || ch == Unicode.PARAGRAPH_SEPARATOR) { return i; } } } return n; } /** * Appends a source array verbatim to the output array. Caller must insure * there is enough space in the array for the output. * * @param src the characters to copy * @param out the output buffer * @param j the offset where to write in the output buffer * @return {@code j + src.length} */ static int append(char[] src, char[] out, int j) { System.arraycopy(src, 0, out, j, src.length); return j + src.length; } /** * Appends the numerically encoded version of {@code codePoint} to the * output buffer. Caller must insure there is enough space for the output. * * @param codePoint the character to encode * @param out the output buffer * @param j the offset where to write in the output buffer * @return {@code j} + the encoded length. */ static int encode(int codePoint, char[] out, int j) { out[j++] = '&'; out[j++] = '#'; if (codePoint >= 1000) { out[j++] = (char) (codePoint / 1000 % 10 + '0'); } if (codePoint >= 100) { out[j++] = (char) (codePoint / 100 % 10 + '0'); } if (codePoint >= 10) { out[j++] = (char) (codePoint / 10 % 10 + '0'); } out[j++] = (char) (codePoint % 10 + '0'); out[j++] = ';'; return j; } //CSOFF: MethodLength @Override CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) { final char[] in = input.array(); final char[] out = output.array(); int i = input.arrayOffset() + input.position(); final int n = input.arrayOffset() + input.limit(); int j = output.arrayOffset() + output.position(); final int m = output.arrayOffset() + output.limit(); charLoop: for (; i < n; ++i) { final char ch = in[i]; // gigantic switch, hopefully compiled to a tableswitch. // this approach appears to be slower than the if/else // approach used in the other encoders. Perhaps an artifact // of the CPU's branch predictor, or possible additional // overhead of range checking, or having the entire table // available to the cache. If time allows, it would // interesting to find out. switch (ch) { case '\t': if (j + TAB.length > m) { return overflow(input, i, output, j); } j = append(TAB, out, j); break; case '\r': case '\n': case '\f': case ' ': case '\"': case '\'': case '/': case '=': case '`': if (ENCODE_AFFIX_CHAR_COUNT + 2 + j > m) { return overflow(input, i, output, j); } j = encode(ch, out, j); break; case Unicode.NEL: if (ENCODE_AFFIX_CHAR_COUNT + 3 + j > m) { return overflow(input, i, output, j); } j = encode(ch, out, j); break; case '&': if (j + AMP.length > m) { return overflow(input, i, output, j); } j = append(AMP, out, j); break; case '<': if (j + LT.length > m) { return overflow(input, i, output, j); } j = append(LT, out, j); break; case '>': if (j + GT.length > m) { return overflow(input, i, output, j); } j = append(GT, out, j); break; case '!': case '#': case '$': case '%': case '(': case ')': case '*': case '+': case ',': case '-': case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case ':': case ';': case '?': case '@': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '[': case '\\': case ']': case '^': case '_': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '{': case '|': case '}': case '~': if (j >= m) { return overflow(input, i, output, j); } out[j++] = ch; break; default: if (Character.isHighSurrogate(ch)) { if (i + 1 < n) { if (Character.isLowSurrogate(in[i + 1])) { int cp = Character.toCodePoint(ch, in[i + 1]); if (Unicode.isNonCharacter(cp)) { if (j >= m) { return overflow(input, i, output, j); } out[j++] = '-'; ++i; } else { if (j + 1 >= m) { return overflow(input, i, output, j); } out[j++] = ch; out[j++] = in[++i]; } break; } } else if (!endOfInput) { break charLoop; } } if (j >= m) { return overflow(input, i, output, j); } if (ch <= Unicode.MAX_C1_CTRL_CHAR || Character.MIN_SURROGATE <= ch && ch <= Character.MAX_SURROGATE || ch > '\ufffd' || ('\ufdd0' <= ch && ch <= '\ufdef')) { // invalid out[j++] = '-'; } else if (ch == Unicode.LINE_SEPARATOR || ch == Unicode.PARAGRAPH_SEPARATOR) { if (ENCODE_AFFIX_CHAR_COUNT + 4 + j > m) { return overflow(input, i, output, j); } j = encode(ch, out, j); } else { out[j++] = ch; } } } return underflow(input, i, output, j); } //CSON: MethodLength }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy