// org.owasp.encoder.Encode Maven / Gradle / Ivy
// Show all versions of aem-sdk-api Show documentation
// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// * Redistributions of source code must retain the above
// copyright notice, this list of conditions and the following
// disclaimer.
//
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials
// provided with the distribution.
//
// * Neither the name of the OWASP nor the names of its
// contributors may be used to endorse or promote products
// derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;
import java.io.IOException;
import java.io.Writer;
import java.nio.CharBuffer;
import java.nio.charset.CoderResult;
/**
* Encode -- fluent interface for contextual encoding. Example usage in a JSP:
*
*
* <input value="<%=Encode.forHtml(value)%>" />
*
*
* There are two versions of each contextual encoding method. The first
* takes a {@code String} argument and returns the encoded version as a
* {@code String}. The second version writes the encoded version directly
* to a {@code Writer}.
*
* Please make sure to read and understand the context that the method encodes
* for. Encoding for the incorrect context will likely lead to exposing a
* cross-site scripting vulnerability.
*
* @author Jeff Ichnowski
*/
public final class Encode {
/** No instances. */
private Encode() {
    // Static utility class: the private constructor prevents instantiation.
}
/**
* Encodes for (X)HTML text content and text attributes. Since
* this method encodes for both contexts, it may be slightly less
* efficient to use this method over the methods targeted towards
* the specific contexts ({@link #forHtmlAttribute(String)} and
* {@link #forHtmlContent(String)}. In general this method should
* be preferred unless you are really concerned with saving a few
* bytes or are writing a framework that utilizes this
* package.
*
* Example JSP Usage
*
* <div><%=Encode.forHtml(unsafeData)%></div>
*
* <input value="<%=Encode.forHtml(unsafeData)%>" />
*
*
*
* Encoding Table
*
*
* Input
* Result
*
*
*
*
* {@code &}
* {@code &amp;}
*
*
* {@code <}
* {@code &lt;}
*
*
* {@code >}
* {@code &gt;}
*
*
* {@code "}
* {@code &#34;}
*
*
* {@code '}
* {@code &#39;}
*
*
*
*
* Additional Notes
*
* - The encoding of the greater-than sign ({@code >}) is not
* strictly required, but is included for maximum
* compatibility.
*
* - Numeric encoding is used for double-quote character ({@code
* "}) as it is shorter than the also valid {@code &quot;}.
*
* - Carriage return (U+0D), line-feed (U+0A), horizontal tab
* (U+09) and space (U+20) are valid in quoted attributes and in
* block in an unescaped form.
*
* - Surrogate pairs are passed through only if valid.
*
* - Characters that are not valid according
* to the XML specification are replaced by a space character
* as they could lead to parsing errors. In particular only {@code #x9
* | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
* [#x10000-#x10FFFF]} are considered valid.
*
*
* @param input the data to encode
* @return the data encoded for html.
*/
public static String forHtml(final String input) {
    // Delegates to the XML encoder entry point.
    final String encoded = forXml(input);
    return encoded;
}
/**
* See {@link #forHtml(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forHtml(final Writer out, final String input)
        throws IOException {
    // Delegates to forXml(Writer, String), which writes the encoded text to the writer.
    forXml(out, input);
}
/**
* This method encodes for HTML text content. It does not escape
* quotation characters and is thus unsafe for use with
* HTML attributes. Use either forHtml or forHtmlAttribute for those
* methods.
*
* Example JSP Usage
*
* <div><%=Encode.forHtmlContent(unsafeData)%></div>
*
*
* Encoding Table
*
*
* Input
* Result
*
*
*
*
* {@code &}
* {@code &amp;}
*
*
* {@code <}
* {@code &lt;}
*
*
* {@code >}
* {@code &gt;}
*
*
*
*
* Additional Notes
*
* - Single-quote character ({@code '}) and double-quote
* character ({@code "}) do not require encoding in HTML
* blocks, unlike other HTML contexts.
*
* - The encoding of the greater-than sign ({@code >}) is not
* strictly required, but is included for maximum
* compatibility.
*
* - Carriage return (U+0D), line-feed (U+0A), horizontal tab
* (U+09) and space (U+20) are valid in quoted attributes and in
* block in an unescaped form.
*
* - Surrogate pairs are passed through only if valid.
*
* - Characters that are not valid according
* to the XML specification are replaced by a space character
* as they could lead to parsing errors. In particular only {@code #x9
* | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
* [#x10000-#x10FFFF]} are considered valid.
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forHtmlContent(final String input) {
    // Delegates to the XML content encoder entry point.
    final String encoded = forXmlContent(input);
    return encoded;
}
/**
* See {@link #forHtmlContent(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forHtmlContent(final Writer out, final String input) throws IOException {
    // Delegates to forXmlContent(Writer, String).
    forXmlContent(out, input);
}
/**
* This method encodes for HTML text attributes.
*
* Example JSP Usage
*
* <div><%=Encode.forHtmlAttribute(unsafeData)%></div>
*
*
*
* Encoding Table
*
*
* Input
* Result
*
*
*
*
* {@code &}
* {@code &amp;}
*
*
* {@code <}
* {@code &lt;}
*
*
* {@code "}
* {@code &#34;}
*
*
* {@code '}
* {@code &#39;}
*
*
*
*
* Additional Notes
*
* - Both the single-quote character ({@code '}) and the
* double-quote character ({@code "}) are encoded so this is safe
* for HTML attributes with either enclosing character.
*
* - The encoding of the greater-than sign ({@code >}) is not
* required for attributes.
*
* - Numeric encoding is used for double-quote character ({@code
* "}) as it is shorter than the also valid {@code &quot;}.
*
* - Carriage return (U+0D), line-feed (U+0A), horizontal tab
* (U+09) and space (U+20) are valid in quoted attributes and in
* block in an unescaped form.
*
* - Surrogate pairs are passed through only if valid.
*
* - Characters that are not valid according
* to the XML specification are replaced by a space character
* as they could lead to parsing errors. In particular only {@code #x9
* | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
* [#x10000-#x10FFFF]} are considered valid.
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forHtmlAttribute(final String input) {
    // Delegates to the XML attribute encoder entry point.
    final String encoded = forXmlAttribute(input);
    return encoded;
}
/**
* See {@link #forHtmlAttribute(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forHtmlAttribute(final Writer out, final String input) throws IOException {
    // Delegates to forXmlAttribute(Writer, String).
    forXmlAttribute(out, input);
}
/**
* Encodes for unquoted HTML attribute values. {@link
* #forHtml(String)} or {@link #forHtmlAttribute(String)} should
* usually be preferred over this method as quoted attributes are
* XHTML compliant.
*
* When using this method, the caller is not required to
* provide quotes around the attribute (since it is encoded for
* such context). The caller should make sure that the attribute
* value does not abut unsafe characters--and thus should usually
* err on the side of including a space character after the
* value.
*
* Use of this method is discouraged as quoted attributes are
* generally more compatible and safer. Also note, that no
* attempt has been made to optimize this encoding, though it is
* still probably faster than other encoding libraries.
*
* Example JSP Usage
*
* <input value=<%=Encode.forHtmlUnquotedAttribute(input)%> >
*
*
*
* Encoding Table
*
*
* Input
* Result
*
*
*
*
* {@code U+0009} (horizontal tab)
* {@code &#9;}
*
* {@code U+000A} (line feed)
* {@code &#10;}
*
* {@code U+000C} (form feed)
* {@code &#12;}
*
* {@code U+000D} (carriage return)
* {@code &#13;}
*
* {@code U+0020} (space)
* {@code &#32;}
*
* {@code &}
* {@code &amp;}
*
* {@code <}
* {@code &lt;}
*
* {@code >}
* {@code &gt;}
*
* {@code "}
* {@code &#34;}
*
* {@code '}
* {@code &#39;}
*
* {@code /}
* {@code &#47;}
*
* {@code =}
* {@code &#61;}
*
* {@code `}
* {@code &#96;}
*
* {@code U+0085} (next line)
* {@code &#133;}
*
* {@code U+2028} (line separator)
* {@code &#8232;}
*
* {@code U+2029} (paragraph separator)
* {@code &#8233;}
*
*
*
* Additional Notes
*
* - The following characters are not encoded:
* {@code 0-9, a-z, A-Z}, {@code !}, {@code
* #}, {@code $}, {@code %},
* {@code (}, {@code )}, {@code
* *}, {@code +}, {@code ,},
* {@code -}, {@code .}, {@code
* [}, {@code \}, {@code ]},
* {@code ^}, {@code _}, {@code
* }}.
*
* - Surrogate pairs are passed through only if valid. Invalid
* surrogate pairs are replaced by a hyphen (-).
*
* - Characters in the C0 and C1 control blocks and not
* otherwise listed above are considered invalid and replaced by a
* hyphen (-) character.
*
* - Unicode "non-characters" are replaced by hyphens (-).
*
*
* @param input the attribute value to be encoded.
* @return the attribute value encoded for unquoted attribute
* context.
*/
public static String forHtmlUnquotedAttribute(final String input) {
    // Runs the shared encode routine with the unquoted-attribute encoder.
    final Encoder encoder = Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forHtmlUnquotedAttribute(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forHtmlUnquotedAttribute(final Writer out, final String input) throws IOException {
    // Writer variant: passes the unquoted-attribute encoder to the shared encode routine.
    final Encoder encoder = Encoders.HTML_UNQUOTED_ATTRIBUTE_ENCODER;
    encode(encoder, out, input);
}
// HTML comment encoding is not currently supported because
// of the number of vendor-specific sequences that would need
// to be handled (e.g. "<!--[if IE]-->").
// public static String forHtmlComment(String input) {
// // only alphanumeric and space, everything else becomes a space
//
// // HTML comment context needs to avoid browser extensions
// // such as "<!--[if IE]-->"
// throw new UnsupportedOperationException();
// }
/**
* Encodes for CSS strings. The context must be surrounded by quotation
* characters. It is safe for use in both style blocks and attributes in
* HTML.
*
* Example JSP Usage
*
* <div style="background: url('<%=Encode.forCssString(...)%>');">
*
* <style type="text/css">
* background: url('<%=Encode.forCssString(...)%>');
* </style>
*
*
* Encoding Notes
*
*
* - The following characters are encoded using hexadecimal
* encodings: {@code U+0000} - {@code U+001f},
* {@code "},
* {@code '},
* {@code \},
* {@code <},
* {@code &},
* {@code (},
* {@code )},
* {@code /},
* {@code >},
* {@code U+007f},
* line separator ({@code U+2028}),
* paragraph separator ({@code U+2029}).
*
* - Any character requiring encoding is encoded as {@code \xxx}
* where {@code xxx} is the shortest hexadecimal representation of
* its Unicode code point (after decoding surrogate pairs if
* necessary). This encoding is never zero padded. Thus, for
* example, the tab character is encoded as {@code \9}, not {@code
* \0009}.
*
* - The encoder looks ahead 1 character in the input and
* appends a space to an encoding to avoid the next character
* becoming part of the hexadecimal encoded sequence. Thus
* “{@code '1}” is encoded as “{@code \27
* 1}”, and not as “{@code \271}”. If a space
* is not necessary, it is not included, thus “{@code
* 'x}” is encoded as “{@code \27x}”, and not as
* “{@code \27 x}”.
*
* - Surrogate pairs are passed through only if valid. Invalid
* surrogate pairs are replaced by an underscore (_).
*
* - Unicode "non-characters" are replaced by underscores (_).
*
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forCssString(final String input) {
    // Original author's note: need to watch out for CSS expressions.
    final Encoder encoder = Encoders.CSS_STRING_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forCssString(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forCssString(final Writer out, final String input) throws IOException {
    // Writer variant: passes the CSS string encoder to the shared encode routine.
    final Encoder encoder = Encoders.CSS_STRING_ENCODER;
    encode(encoder, out, input);
}
/**
* Encodes for CSS URL contexts. The context must be surrounded by {@code "url("}
* and {@code ")"}. It is safe for use in both style blocks and attributes in HTML.
* Note: this does not do any checking on the quality or safety of the URL
* itself. The caller should ensure that the URL is safe for embedding
* (e.g. input validation) by other means.
*
* Example JSP Usage
*
* <div style="background:url(<%=Encode.forCssUrl(...)%>);">
*
* <style type="text/css">
* background: url(<%=Encode.forCssUrl(...)%>);
* </style>
*
* Encoding Notes
*
*
* - The following characters are encoded using hexadecimal
* encodings: {@code U+0000} - {@code U+001f},
* {@code "},
* {@code '},
* {@code \},
* {@code <},
* {@code &},
* {@code /},
* {@code >},
* {@code U+007f},
* line separator ({@code U+2028}),
* paragraph separator ({@code U+2029}).
*
* - Any character requiring encoding is encoded as {@code \xxx}
* where {@code xxx} is the shortest hexadecimal representation of
* its Unicode code point (after decoding surrogate pairs if
* necessary). This encoding is never zero padded. Thus, for
* example, the tab character is encoded as {@code \9}, not {@code
* \0009}.
*
* - The encoder looks ahead 1 character in the input and
* appends a space to an encoding to avoid the next character
* becoming part of the hexadecimal encoded sequence. Thus
* “{@code '1}” is encoded as “{@code \27
* 1}”, and not as “{@code \271}”. If a space
* is not necessary, it is not included, thus “{@code
* 'x}” is encoded as “{@code \27x}”, and not as
* “{@code \27 x}”.
*
* - Surrogate pairs are passed through only if valid. Invalid
* surrogate pairs are replaced by an underscore (_).
*
* - Unicode "non-characters" are replaced by underscores (_).
*
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forCssUrl(final String input) {
    // Runs the shared encode routine with the CSS URL encoder.
    final Encoder encoder = Encoders.CSS_URL_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forCssUrl(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forCssUrl(final Writer out, final String input) throws IOException {
    // Writer variant: passes the CSS URL encoder to the shared encode routine.
    final Encoder encoder = Encoders.CSS_URL_ENCODER;
    encode(encoder, out, input);
}
/**
* Performs percent-encoding of a URL according to RFC 3986. The provided
* URL is assumed to be a valid URL. This method does not do any checking on
* the quality or safety of the URL itself. In many applications it may
* be better to use {@link java.net.URI} instead. Note: this is a
* particularly dangerous context to put untrusted content in, as for
* example a "javascript:" URL provided by a malicious user would be
* "properly" escaped, and still execute.
*
* Encoding Table
* The following characters are not encoded:
*
* U+20: ! # $ & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; = ?
* U+40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] _
* U+60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~
*
*
* Encoding Notes
*
*
* - The single-quote character({@code '}) is not encoded.
*
* - This encoding is not intended to be used standalone. The
* output should be encoded to the target context. For example:
* {@code ...}.
* (Note, the single-quote character ({@code '}) is not
* encoded.)
*
* - URL encoding is an encoding for bytes, not unicode. The
* input string is thus first encoded as a sequence of UTF-8
* byte. The bytes are then encoded as {@code %xx} where {@code
* xx} is the two-digit hexadecimal representation of the
* byte. (The implementation does this as one step for
* performance.)
*
* - Surrogate pairs are first decoded to a Unicode code point
* before encoding as UTF-8.
*
* - Invalid characters (e.g. partial or invalid surrogate
* pairs), are replaced with a hyphen ({@code -}) character.
*
*
*
* @param input the input to encode
* @return the encoded result
*
* @deprecated There is never a need to encode a complete URI with this form of encoding.
*/
@Deprecated
public static String forUri(final String input) {
    // Runs the shared encode routine with the whole-URI encoder.
    final Encoder encoder = Encoders.URI_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forUri(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*
* @deprecated There is never a need to encode a complete URI with this form of encoding.
*/
@Deprecated
public static void forUri(final Writer out, final String input) throws IOException {
    // Writer variant: passes the whole-URI encoder to the shared encode routine.
    final Encoder encoder = Encoders.URI_ENCODER;
    encode(encoder, out, input);
}
/**
* Performs percent-encoding for a component of a URI, such as a query
* parameter name or value, path or query-string. In particular this
* method ensures that special characters in the component do not get
* interpreted as part of another component.
*
*
* <a href="http://www.owasp.org/<%=Encode.forUriComponent(...)%>?query#fragment">
*
* <a href="/search?value=<%=Encode.forUriComponent(...)%>&order=1#top">
*
*
* Encoding Table
* The following characters are not encoded:
*
* U+20: - . 0 1 2 3 4 5 6 7 8 9
* U+40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _
* U+60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~
*
*
* Encoding Notes
*
*
* - Unlike {@link #forUri(String)} this method is safe to be
* used in most containing contexts, including: HTML/XML, CSS,
* and JavaScript contexts.
*
* - URL encoding is an encoding for bytes, not unicode. The
* input string is thus first encoded as a sequence of UTF-8
* byte. The bytes are then encoded as {@code %xx} where {@code
* xx} is the two-digit hexadecimal representation of the
* byte. (The implementation does this as one step for
* performance.)
*
* - Surrogate pairs are first decoded to a Unicode code point
* before encoding as UTF-8.
*
* - Invalid characters (e.g. partial or invalid surrogate
* pairs), are replaced with a hyphen ({@code -}) character.
*
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forUriComponent(final String input) {
    // Runs the shared encode routine with the URI component encoder.
    final Encoder encoder = Encoders.URI_COMPONENT_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forUriComponent(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forUriComponent(final Writer out, final String input) throws IOException {
    // Writer variant: passes the URI component encoder to the shared encode routine.
    final Encoder encoder = Encoders.URI_COMPONENT_ENCODER;
    encode(encoder, out, input);
}
/**
* Encoder for XML and XHTML. See {@link #forHtml(String)} for a
* description of the encoding and context.
*
* @see #forHtml(String)
* @param input the input to encode
* @return the encoded result
*/
public static String forXml(final String input) {
    // Runs the shared encode routine with the general XML encoder.
    final Encoder encoder = Encoders.XML_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forXml(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forXml(final Writer out, final String input) throws IOException {
    // Writer variant: passes the general XML encoder to the shared encode routine.
    final Encoder encoder = Encoders.XML_ENCODER;
    encode(encoder, out, input);
}
/**
* Encoder for XML and XHTML text content. See {@link
* #forHtmlContent(String)} for description of encoding and
* context.
*
* @see #forHtmlContent(String)
* @param input the input to encode
* @return the encoded result
*/
public static String forXmlContent(final String input) {
    // Runs the shared encode routine with the XML content encoder.
    final Encoder encoder = Encoders.XML_CONTENT_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forXmlContent(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forXmlContent(final Writer out, final String input) throws IOException {
    // Writer variant: passes the XML content encoder to the shared encode routine.
    final Encoder encoder = Encoders.XML_CONTENT_ENCODER;
    encode(encoder, out, input);
}
/**
* Encoder for XML and XHTML attribute content. See {@link
* #forHtmlAttribute(String)} for description of encoding and
* context.
*
* @see #forHtmlAttribute(String)
* @param input the input to encode
* @return the encoded result
*/
public static String forXmlAttribute(final String input) {
    // Runs the shared encode routine with the XML attribute encoder.
    final Encoder encoder = Encoders.XML_ATTRIBUTE_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forXmlAttribute(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forXmlAttribute(final Writer out, final String input) throws IOException {
    // Writer variant: passes the XML attribute encoder to the shared encode routine.
    final Encoder encoder = Encoders.XML_ATTRIBUTE_ENCODER;
    encode(encoder, out, input);
}
/**
* Encoder for XML comments. NOT FOR USE WITH
* (X)HTML CONTEXTS. (X)HTML comments may be interpreted by
* browsers as something other than a comment, typically in vendor
* specific extensions (e.g. {@code <!--[if IE]>}).
* For (X)HTML it is recommended that unsafe content never be included
* in a comment.
*
* The caller must provide the comment start and end sequences.
*
* This method replaces all invalid XML characters with spaces,
* and replaces the "--" sequence (which is invalid in XML comments)
* with "-~" (hyphen-tilde). This encoding behavior may change
* in future releases. If the comments need to be decoded, the
* caller will need to come up with their own encode/decode system.
*
*
* out.println("<?xml version='1.0'?>");
* out.println("<data>");
* out.println("<!-- "+Encode.forXmlComment(comment)+" -->");
* out.println("</data>");
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forXmlComment(final String input) {
    // Runs the shared encode routine with the XML comment encoder.
    final Encoder encoder = Encoders.XML_COMMENT_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forXmlComment(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forXmlComment(final Writer out, final String input) throws IOException {
    // Writer variant: passes the XML comment encoder to the shared encode routine.
    final Encoder encoder = Encoders.XML_COMMENT_ENCODER;
    encode(encoder, out, input);
}
/**
* Encodes data for an XML CDATA section. On the chance that the input
* contains a terminating {@code "]]>"}, it will be replaced by
* {@code "]]>]]<![CDATA[>"}.
* As with all XML contexts, characters that are invalid according to the
* XML specification will be replaced by a space character. Caller must
* provide the CDATA section boundaries.
*
*
* <xml-data><![CDATA[<%=Encode.forCDATA(...)%>]]></xml-data>
*
*
* @param input the input to encode
* @return the encoded result
*/
public static String forCDATA(final String input) {
    // Runs the shared encode routine with the CDATA encoder.
    final Encoder encoder = Encoders.CDATA_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forCDATA(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forCDATA(final Writer out, final String input) throws IOException {
    // Writer variant: passes the CDATA encoder to the shared encode routine.
    final Encoder encoder = Encoders.CDATA_ENCODER;
    encode(encoder, out, input);
}
/**
* Encodes for a Java string. This method will use "\b", "\t", "\r", "\f",
* "\n", "\"", "\'", "\\", octal and unicode escapes. Valid surrogate
* pairing is not checked. The caller must provide the enclosing quotation
* characters. This method is useful for when writing code generators and
* outputting debug messages.
*
*
* out.println("public class Hello {");
* out.println(" public static void main(String[] args) {");
* out.println(" System.out.println(\"" + Encode.forJava(message) + "\");");
* out.println(" }");
* out.println("}");
*
*
* @param input the input to encode
* @return the input encoded for java strings.
*/
public static String forJava(final String input) {
    // Runs the shared encode routine with the Java string-literal encoder.
    final Encoder encoder = Encoders.JAVA_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forJava(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forJava(final Writer out, final String input) throws IOException {
    // Writer variant: passes the Java string-literal encoder to the shared encode routine.
    final Encoder encoder = Encoders.JAVA_ENCODER;
    encode(encoder, out, input);
}
/**
* Encodes for a JavaScript string. It is safe for use in HTML
* script attributes (such as {@code onclick}), script
* blocks, JSON files, and JavaScript source. The caller MUST
* provide the surrounding quotation characters for the string.
* Since this performs additional encoding so it can work in all
* of the JavaScript contexts listed, it may be slightly less
* efficient than using one of the methods targeted to a specific
* JavaScript context ({@link #forJavaScriptAttribute(String)},
* {@link #forJavaScriptBlock}, {@link #forJavaScriptSource}).
* Unless you are interested in saving a few bytes of output or
* are writing a framework on top of this library, it is recommended
* that you use this method over the others.
*
* Example JSP Usage:
*
* <button onclick="alert('<%=Encode.forJavaScript(data)%>');">
* <script type="text/javascript">
* var data = "<%=Encode.forJavaScript(data)%>";
* </script>
*
*
*
* Encoding Description
*
*
* Input Character
* Encoded Result
* Notes
*
*
*
*
* U+0008 BS
* \b
* Backspace character
*
*
* U+0009 HT
* \t
* Horizontal tab character
*
*
* U+000A LF
* \n
* Line feed character
*
*
* U+000C FF
* \f
* Form feed character
*
*
* U+000D CR
* \r
* Carriage return character
*
*
* U+0022 "
* \x22
* The encoding {@code \"} is not used here because
* it is not safe for use in HTML attributes. (In HTML
* attributes, it would also be correct to use
* "{@code \&quot;}".)
*
*
* U+0026 &
* \x26
* Ampersand character
*
*
* U+0027 '
* \x27
* The encoding {@code \'} is not used here because
* it is not safe for use in HTML attributes. (In HTML
* attributes, it would also be correct to use
* "{@code \&#39;}".)
*
*
* U+002F /
* \/
* This encoding is used to avoid an input sequence
* "</" from prematurely terminating a </script>
* block.
*
*
* U+005C \
* \\
*
*
*
* U+0000 to U+001F
* \x##
* Hexadecimal encoding is used for characters in this
* range that were not already mentioned in above.
*
*
*
*
* @param input the input string to encode
* @return the input encoded for JavaScript
* @see #forJavaScriptAttribute(String)
* @see #forJavaScriptBlock(String)
*/
public static String forJavaScript(final String input) {
    // Runs the shared encode routine with the general-purpose JavaScript encoder.
    final Encoder encoder = Encoders.JAVASCRIPT_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forJavaScript(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forJavaScript(final Writer out, final String input) throws IOException {
    // Writer variant: passes the general-purpose JavaScript encoder to the shared encode routine.
    final Encoder encoder = Encoders.JAVASCRIPT_ENCODER;
    encode(encoder, out, input);
}
/**
* This method encodes for JavaScript strings contained within
* HTML script attributes (such as {@code onclick}). It is
* NOT safe for use in script blocks. The caller MUST provide the
* surrounding quotation characters. This method performs the
* same encode as {@link #forJavaScript(String)} with the
* exception that {@code /} is not escaped.
*
* Unless you are interested in saving a few bytes of
* output or are writing a framework on top of this library, it is
* recommended that you use {@link #forJavaScript(String)} over this
* method.
*
* Example JSP Usage:
*
* <button onclick="alert('<%=Encode.forJavaScriptAttribute(data)%>');">
*
*
* @param input the input string to encode
* @return the input encoded for JavaScript
* @see #forJavaScript(String)
* @see #forJavaScriptBlock(String)
*/
public static String forJavaScriptAttribute(final String input) {
    // Runs the shared encode routine with the JavaScript attribute encoder.
    final Encoder encoder = Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forJavaScriptAttribute(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forJavaScriptAttribute(final Writer out, final String input) throws IOException {
    // Writer variant: passes the JavaScript attribute encoder to the shared encode routine.
    final Encoder encoder = Encoders.JAVASCRIPT_ATTRIBUTE_ENCODER;
    encode(encoder, out, input);
}
/**
* This method encodes for JavaScript strings contained within
* HTML script blocks. It is NOT safe for use in script
* attributes (such as {@code onclick}). The caller must
* provide the surrounding quotation characters. This method
* performs the same encode as {@link #forJavaScript(String)} with
* the exception that {@code "} and {@code '} are
* encoded as {@code \"} and {@code \'}
* respectively.
*
* Unless you are interested in saving a few bytes of
* output or are writing a framework on top of this library, it is
* recommended that you use {@link #forJavaScript(String)} over this
* method.
*
* Example JSP Usage:
*
* <script type="text/javascript">
* var data = "<%=Encode.forJavaScriptBlock(data)%>";
* </script>
*
*
* @param input the input string to encode
* @return the input encoded for JavaScript
* @see #forJavaScript(String)
* @see #forJavaScriptAttribute(String)
*/
public static String forJavaScriptBlock(final String input) {
    // Runs the shared encode routine with the JavaScript block encoder.
    final Encoder encoder = Encoders.JAVASCRIPT_BLOCK_ENCODER;
    return encode(encoder, input);
}
/**
* See {@link #forJavaScriptBlock(String)} for description of encoding. This
* version writes directly to a Writer without an intervening string.
*
* @param out where to write encoded output
* @param input the input string to encode
* @throws IOException if thrown by writer
*/
public static void forJavaScriptBlock(final Writer out, final String input) throws IOException {
    // Writer variant: passes the JavaScript block encoder to the shared encode routine.
    final Encoder encoder = Encoders.JAVASCRIPT_BLOCK_ENCODER;
    encode(encoder, out, input);
}
/**
* This method encodes for JavaScript strings contained within
* a JavaScript or JSON file. This method is NOT safe for
* use in ANY context embedded in HTML. The caller must
* provide the surrounding quotation characters. This method
* performs the same encode as {@link #forJavaScript(String)} with
* the exception that {@code /} and {@code &} are not
* escaped and {@code "} and {@code '} are encoded as
* {@code \"} and {@code \'} respectively.
*
* Unless you are interested in saving a few bytes of
* output or are writing a framework on top of this library, it is
* recommended that you use {@link #forJavaScript(String)} over this
* method.
*
* Example JSP Usage:
* This example is serving up JavaScript source directly:
*
* <%@page contentType="text/javascript; charset=UTF-8"%>
* var data = "<%=Encode.forJavaScriptSource(data)%>";
*
*
* This example is serving up JSON data (users of this use-case
* are encouraged to read up on "JSON Hijacking"):
*
* <%@page contentType="application/json; charset=UTF-8"%>
* <% myapp.jsonHijackingPreventionMeasure(); %>
* {"data":"<%=Encode.forJavaScriptSource(data)%>"}
*
*
* @param input the input string to encode
* @return the input encoded for JavaScript
* @see #forJavaScript(String)
* @see #forJavaScriptAttribute(String)
* @see #forJavaScriptBlock(String)
*/
public static String forJavaScriptSource(String input) {
return encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, input);
}
/**
 * Writer-based variant of {@link #forJavaScriptSource(String)}; see that
 * method for a description of the encoding. The encoded text is written
 * to {@code out} directly instead of being returned as a String.
 *
 * @param out where to write encoded output
 * @param input the input string to encode
 * @throws IOException if thrown by writer
 */
public static void forJavaScriptSource(Writer out, String input) throws IOException {
    encode(Encoders.JAVASCRIPT_SOURCE_ENCODER, out, input);
}
// Additional?
// MySQL
// PostgreSQL
// Oracle
// ...
/**
 * Shared String-returning encode routine behind the public methods.
 * A {@code null} input is treated as the literal text {@code "null"}.
 * The encoder is first asked for the offset of the first character that
 * needs encoding; when that offset equals the string length nothing
 * needs encoding and the string itself is returned. Otherwise a
 * {@link Buffer} encodes the input starting from that offset.
 *
 * @param encoder the encoder to use
 * @param str the string to encode
 * @return the input string encoded with the provided encoder.
 */
static String encode(Encoder encoder, String str) {
    // Mirror String.valueOf(...): a null reference becomes "null".
    final String text = (str != null) ? str : "null";

    // Fast path probe: locate the first character that needs encoding.
    final int length = text.length();
    final int firstEncoded = encoder.firstEncodedOffset(text, 0, length);

    if (firstEncoded != length) {
        // Something needs encoding; a new Buffer is created for this call
        // and picks up at the reported offset.
        // NOTE(review): the original comment required that encoder
        // implementations never call this method internally -- confirm
        // whether that restriction still applies.
        return new Buffer().encode(encoder, text, firstEncoded);
    }

    // Nothing to encode: hand back the same string instance.
    return text;
}
/**
 * Shared Writer-based encode routine behind the public methods.
 * A {@code null} input is treated as the literal text {@code "null"}.
 * The encoder is first asked for the offset of the first character that
 * needs encoding; when that offset equals the string length the string
 * is written to the writer as-is. Otherwise a {@link Buffer} encodes the
 * input starting from that offset and writes the result to the writer,
 * which avoids building an intermediate String.
 *
 * @param encoder the encoder to use
 * @param out the writer for the encoded output
 * @param str the string to encode
 * @throws IOException if thrown by the writer
 */
static void encode(Encoder encoder, Writer out, String str) throws IOException {
    // Mirror String.valueOf(...): a null reference becomes "null".
    final String text = (str == null) ? "null" : str;

    // Fast path probe: locate the first character that needs encoding.
    final int length = text.length();
    final int firstEncoded = encoder.firstEncodedOffset(text, 0, length);

    if (firstEncoded == length) {
        // Nothing to encode: pass the text straight through to the writer.
        out.write(text);
    } else {
        // Something needs encoding; a new Buffer is created for this call
        // and picks up at the reported offset.
        // NOTE(review): the original comment required that encoder
        // implementations never call this method internally -- confirm
        // whether that restriction still applies.
        new Buffer().encode(encoder, out, text, firstEncoded);
    }
}
/**
 * Working storage for the encode routines: one fixed-size input buffer,
 * one fixed-size output buffer, and the two loops that drive an
 * {@link Encoder} over them.
 */
static class Buffer {
    /**
     * Capacity, in chars, of the input buffer that receives a copy of
     * the string's characters before they are handed to the encoder.
     */
    static final int INPUT_BUFFER_SIZE = 1024;

    /**
     * Capacity, in chars, of the output buffer that collects encoded
     * characters; twice the input capacity.
     */
    static final int OUTPUT_BUFFER_SIZE = INPUT_BUFFER_SIZE * 2;

    /**
     * Array-backed buffer of INPUT_BUFFER_SIZE chars holding the
     * characters that are waiting to be encoded.
     */
    final CharBuffer _input = CharBuffer.allocate(INPUT_BUFFER_SIZE);

    /**
     * Array-backed buffer of OUTPUT_BUFFER_SIZE chars holding the
     * encoded output.
     */
    final CharBuffer _output = CharBuffer.allocate(OUTPUT_BUFFER_SIZE);

    /**
     * String-producing encode routine. When the un-encoded prefix fits
     * in the output buffer and the rest of the input fits in the input
     * buffer, the two fixed buffers are used; should the output buffer
     * then overflow, a temporary buffer sized for the worst case takes
     * over. Inputs too large for the fixed buffers are encoded in a
     * single temporary buffer instead.
     *
     * @param encoder the encoder to use
     * @param str the string to encode
     * @param j the offset in {@code str} to start encoding
     * @return the encoded result
     */
    String encode(Encoder encoder, String str, int j) {
        final int length = str.length();
        final int pending = length - j;

        if (pending > INPUT_BUFFER_SIZE || j > OUTPUT_BUFFER_SIZE) {
            // Too big for the fixed buffers.
            return encodeOversized(encoder, str, j);
        }

        // Prefix that needs no encoding goes straight to the output
        // array; everything from j onward is staged in the input array.
        final char[] outChars = _output.array();
        str.getChars(0, j, outChars, 0);
        str.getChars(j, length, _input.array(), 0);

        // Input window is [0, pending); output continues after the prefix.
        _input.clear();
        _input.limit(pending);
        _output.clear();
        _output.position(j);

        CoderResult result = encoder.encodeArrays(_input, _output, true);
        if (!result.isUnderflow()) {
            // Treated as an overflow of the fixed output buffer. Switch to
            // a temporary buffer holding what was produced so far plus the
            // worst-case encoding of the input that is still unread, so a
            // second overflow is not expected.
            final int produced = _output.position();
            final CharBuffer spill = CharBuffer.allocate(
                produced + encoder.maxEncodedLength(_input.remaining()));
            spill.put(outChars, 0, produced);

            result = encoder.encodeArrays(_input, spill, true);
            if (result.isOverflow()) {
                throw new AssertionError("unexpected result from encoder");
            }
            return new String(spill.array(), 0, spill.position());
        }

        return new String(outChars, 0, _output.position());
    }

    /**
     * Encodes an input that does not fit the fixed buffers, using one
     * temporary buffer for both reading and writing.
     *
     * @param encoder the encoder to use
     * @param str the string to encode
     * @param j the offset in {@code str} to start encoding
     * @return the encoded result
     */
    private String encodeOversized(Encoder encoder, String str, int j) {
        final int length = str.length();
        final int pending = length - j;

        // Room for the untouched prefix plus the worst-case encoding of
        // the rest.
        final int capacity = j + encoder.maxEncodedLength(pending);
        final CharBuffer target = CharBuffer.allocate(capacity);
        final char[] shared = target.array();

        // Prefix sits at the front; the raw input to encode is parked at
        // the very end of the same array.
        final int inputStart = capacity - pending;
        str.getChars(0, j, shared, 0);
        str.getChars(j, length, shared, inputStart);

        // The duplicate shares the backing array but has its own
        // position/limit, so it serves as the read view over the tail
        // while the original is the write view starting after the prefix.
        final CharBuffer source = target.duplicate();
        source.limit(capacity);
        source.position(inputStart);
        target.position(j);

        final CoderResult result = encoder.encodeArrays(source, target, true);
        if (result.isOverflow()) {
            throw new AssertionError("unexpected result from encoder");
        }
        return new String(shared, 0, target.position());
    }

    /**
     * Writer-producing encode routine. Only the two fixed buffers are
     * used: the input is fed to the encoder in batches, and the output
     * buffer is written to the writer whenever the encoder reports it
     * cannot continue, and once more at the end.
     *
     * @param encoder the encoder to use
     * @param out where to write the encoded output
     * @param str the string to encode
     * @param j the position in the string at which the first character
     * needs encoding.
     * @throws IOException if thrown by the writer.
     */
    void encode(Encoder encoder, Writer out, String str, int j) throws IOException {
        // Everything before j needs no encoding.
        out.write(str, 0, j);

        final int length = str.length();
        _input.clear();
        _output.clear();

        final char[] inChars = _input.array();
        final char[] outChars = _output.array();

        // Index in str of the first character the encoder has not consumed.
        int offset = j;

        while (true) {
            final int pending = length - offset;
            // Non-zero only after compact() kept unconsumed characters.
            final int carried = _input.position();
            final int batch = Math.min(pending, _input.remaining());

            // The batch is always copied from str[offset], i.e. starting
            // at the first unconsumed character, and placed after any
            // carried-over slots; reading resumes at 'carried'.
            str.getChars(offset, offset + batch, inChars, carried);
            _input.limit(carried + batch);

            // True when this batch reaches the end of the string.
            final boolean lastBatch = (batch == pending);

            while (!encoder.encodeArrays(_input, _output, lastBatch).isUnderflow()) {
                // Output buffer full: flush it and let the encoder resume.
                out.write(outChars, 0, _output.position());
                _output.clear();
            }

            // Advance by the number of characters consumed in this batch.
            offset += _input.position() - carried;

            if (offset == length) {
                // All input consumed: flush what is left and finish.
                out.write(outChars, 0, _output.position());
                return;
            }

            // Keep whatever the encoder left unread (presumably a partial
            // sequence waiting for more input -- depends on the Encoder)
            // and reopen the buffer for the next batch.
            _input.compact();
        }
    }
}
}