com.demandware.appsec.secure.manipulation.impl.HTMLManipulator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of content-manipulator Show documentation
Show all versions of content-manipulator Show documentation
Provide a set of Context-Based Encoders and Filterers in Java that allow application developers to sanitize application data for safe output or processing
The newest version!
/*
* Copyright 2015 Demandware Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language governing permissions and limitations under the
* License.
*/
package com.demandware.appsec.secure.manipulation.impl;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import com.demandware.appsec.secure.manipulation.AbstractCharacterManipulator;
import com.demandware.appsec.secure.manipulation.IManipulateOption;
/**
* HTMLManipulator handles all content related to HTML
*
* @author Chris Smith
*/
class HTMLManipulator
extends AbstractCharacterManipulator
{
static enum HTMLManipulatorOption
implements IManipulateOption
{
//@formatter:off
/*
* These characters are additional to the base immune list
* and should also be explicitly allowed
*/
CONTENT ( '\t', '\n', '\r', ' ' ),
UNQUOTED_ATTRIBUTE ( ),
SINGLE_QUOTE_ATTRIBUTE ( '\t', '\n', '\r', ' ', '"' ),
DOUBLE_QUOTE_ATTRIBUTE ( '\t', '\n', '\r', ' ', '\'' ),
/*
* Do not be tempted to add HTMLComment. HTML Comments are sometimes
* used for providing browser directives (esp IE), and so there is no
* good way to protect them
*/
;
//These characters are allowed in any HTML Context, according to the RFC
private final Character[] baseImmune =
{
'!', '#', '$', '%', '^', '(', ')', '*',
'+', ',', '-', '.', '/', ':', ';', '=',
'?', '@', '[', '\\', ']', '_', '{', '|',
'}', '~'
};
//@formatter:on
private final Character[] immune;
private HTMLManipulatorOption( Character... immune )
{
this.immune = ManipulationUtils.combineArrays( immune, this.baseImmune );
}
public Character[] getImmuneCharacters()
{
return this.immune;
}
}
// unmodifiable quick reference for character entities (since there are a lot)
private static final Map characterToEntityMap = createEntityMap();
// for control characters, use the Replacement Character (? symbol in a diamond)
private static final String REPLACE_HEX = "�";
HTMLManipulator( HTMLManipulatorOption manipulatorOption )
{
super( manipulatorOption );
}
@Override
protected String getCorrectCharacter( Character c )
{
String correctedCharacter = "";
HTMLManipulatorOption opt = (HTMLManipulatorOption) this.manipulatorOption;
// if the character is alphanumeric, or should be immune, it is OK
if ( ManipulationUtils.isAlphaNum( c ) || ManipulationUtils.isInList( c, opt.getImmuneCharacters() ) )
{
correctedCharacter = String.valueOf( c );
}
else
{
// Check if the character can be written as an entity to block attacks
String entity = characterToEntityMap.get( c );
if ( entity != null )
{
correctedCharacter = entity;
}
// Otherwise, replace illegal control characters with a safe replacement
// these characters have caused HTML parsing issues in some browsers in the past
else if ( ( c <= 0x1f ) || // lower bounds of control characters
( c >= 0x7f && c <= 0x9f ) ) // DEL through APC control characters
{
correctedCharacter = REPLACE_HEX;
}
// RFC states to output illegal characters as hex replacements
else
{
correctedCharacter = "" + ManipulationUtils.getHexForCharacter( c ) + ";";
}
}
return correctedCharacter;
}
/**
* Build a unmodifiable Map of entity Character to Name for faster lookup List taken from ESAPI HTMLEntityCodec (BSD
* license)
*/
private static synchronized Map createEntityMap()
{
Map map = new HashMap( 252 );
map.put( (char) 34, """ ); /* quotation mark */
map.put( (char) 38, "&" ); /* ampersand */
map.put( (char) 60, "<" ); /* less-than sign */
map.put( (char) 62, ">" ); /* greater-than sign */
map.put( (char) 160, " " ); /* no-break space */
map.put( (char) 161, "¡" ); /* inverted exclamation mark */
map.put( (char) 162, "¢" ); /* cent sign */
map.put( (char) 163, "£" ); /* pound sign */
map.put( (char) 164, "¤" ); /* currency sign */
map.put( (char) 165, "¥" ); /* yen sign */
map.put( (char) 166, "¦" ); /* broken bar */
map.put( (char) 167, "§" ); /* section sign */
map.put( (char) 168, "¨" ); /* diaeresis */
map.put( (char) 169, "©" ); /* copyright sign */
map.put( (char) 170, "ª" ); /* feminine ordinal indicator */
map.put( (char) 171, "«" ); /* left-pointing double angle quotation mark */
map.put( (char) 172, "¬" ); /* not sign */
map.put( (char) 173, "" ); /* soft hyphen */
map.put( (char) 174, "®" ); /* registered sign */
map.put( (char) 175, "¯" ); /* macron */
map.put( (char) 176, "°" ); /* degree sign */
map.put( (char) 177, "±" ); /* plus-minus sign */
map.put( (char) 178, "²" ); /* superscript two */
map.put( (char) 179, "³" ); /* superscript three */
map.put( (char) 180, "´" ); /* acute accent */
map.put( (char) 181, "µ" ); /* micro sign */
map.put( (char) 182, "¶" ); /* pilcrow sign */
map.put( (char) 183, "·" ); /* middle dot */
map.put( (char) 184, "¸" ); /* cedilla */
map.put( (char) 185, "¹" ); /* superscript one */
map.put( (char) 186, "º" ); /* masculine ordinal indicator */
map.put( (char) 187, "»" ); /* right-pointing double angle quotation mark */
map.put( (char) 188, "¼" ); /* vulgar fraction one quarter */
map.put( (char) 189, "½" ); /* vulgar fraction one half */
map.put( (char) 190, "¾" ); /* vulgar fraction three quarters */
map.put( (char) 191, "¿" ); /* inverted question mark */
map.put( (char) 192, "À" ); /* Latin capital letter a with grave */
map.put( (char) 193, "Á" ); /* Latin capital letter a with acute */
map.put( (char) 194, "Â" ); /* Latin capital letter a with circumflex */
map.put( (char) 195, "Ã" ); /* Latin capital letter a with tilde */
map.put( (char) 196, "Ä" ); /* Latin capital letter a with diaeresis */
map.put( (char) 197, "Å" ); /* Latin capital letter a with ring above */
map.put( (char) 198, "Æ" ); /* Latin capital letter ae */
map.put( (char) 199, "Ç" ); /* Latin capital letter c with cedilla */
map.put( (char) 200, "È" ); /* Latin capital letter e with grave */
map.put( (char) 201, "É" ); /* Latin capital letter e with acute */
map.put( (char) 202, "Ê" ); /* Latin capital letter e with circumflex */
map.put( (char) 203, "Ë" ); /* Latin capital letter e with diaeresis */
map.put( (char) 204, "Ì" ); /* Latin capital letter i with grave */
map.put( (char) 205, "Í" ); /* Latin capital letter i with acute */
map.put( (char) 206, "Î" ); /* Latin capital letter i with circumflex */
map.put( (char) 207, "Ï" ); /* Latin capital letter i with diaeresis */
map.put( (char) 208, "Ð" ); /* Latin capital letter eth */
map.put( (char) 209, "Ñ" ); /* Latin capital letter n with tilde */
map.put( (char) 210, "Ò" ); /* Latin capital letter o with grave */
map.put( (char) 211, "Ó" ); /* Latin capital letter o with acute */
map.put( (char) 212, "Ô" ); /* Latin capital letter o with circumflex */
map.put( (char) 213, "Õ" ); /* Latin capital letter o with tilde */
map.put( (char) 214, "Ö" ); /* Latin capital letter o with diaeresis */
map.put( (char) 215, "×" ); /* multiplication sign */
map.put( (char) 216, "Ø" ); /* Latin capital letter o with stroke */
map.put( (char) 217, "Ù" ); /* Latin capital letter u with grave */
map.put( (char) 218, "Ú" ); /* Latin capital letter u with acute */
map.put( (char) 219, "Û" ); /* Latin capital letter u with circumflex */
map.put( (char) 220, "Ü" ); /* Latin capital letter u with diaeresis */
map.put( (char) 221, "Ý" ); /* Latin capital letter y with acute */
map.put( (char) 222, "Þ" ); /* Latin capital letter thorn */
map.put( (char) 223, "ß" ); /* Latin small letter sharp sXCOMMAX German Eszett */
map.put( (char) 224, "à" ); /* Latin small letter a with grave */
map.put( (char) 225, "á" ); /* Latin small letter a with acute */
map.put( (char) 226, "â" ); /* Latin small letter a with circumflex */
map.put( (char) 227, "ã" ); /* Latin small letter a with tilde */
map.put( (char) 228, "ä" ); /* Latin small letter a with diaeresis */
map.put( (char) 229, "å" ); /* Latin small letter a with ring above */
map.put( (char) 230, "æ" ); /* Latin lowercase ligature ae */
map.put( (char) 231, "ç" ); /* Latin small letter c with cedilla */
map.put( (char) 232, "è" ); /* Latin small letter e with grave */
map.put( (char) 233, "é" ); /* Latin small letter e with acute */
map.put( (char) 234, "ê" ); /* Latin small letter e with circumflex */
map.put( (char) 235, "ë" ); /* Latin small letter e with diaeresis */
map.put( (char) 236, "ì" ); /* Latin small letter i with grave */
map.put( (char) 237, "í" ); /* Latin small letter i with acute */
map.put( (char) 238, "î" ); /* Latin small letter i with circumflex */
map.put( (char) 239, "ï" ); /* Latin small letter i with diaeresis */
map.put( (char) 240, "ð" ); /* Latin small letter eth */
map.put( (char) 241, "ñ" ); /* Latin small letter n with tilde */
map.put( (char) 242, "ò" ); /* Latin small letter o with grave */
map.put( (char) 243, "ó" ); /* Latin small letter o with acute */
map.put( (char) 244, "ô" ); /* Latin small letter o with circumflex */
map.put( (char) 245, "õ" ); /* Latin small letter o with tilde */
map.put( (char) 246, "ö" ); /* Latin small letter o with diaeresis */
map.put( (char) 247, "÷" ); /* division sign */
map.put( (char) 248, "ø" ); /* Latin small letter o with stroke */
map.put( (char) 249, "ù" ); /* Latin small letter u with grave */
map.put( (char) 250, "ú" ); /* Latin small letter u with acute */
map.put( (char) 251, "û" ); /* Latin small letter u with circumflex */
map.put( (char) 252, "ü" ); /* Latin small letter u with diaeresis */
map.put( (char) 253, "ý" ); /* Latin small letter y with acute */
map.put( (char) 254, "þ" ); /* Latin small letter thorn */
map.put( (char) 255, "ÿ" ); /* Latin small letter y with diaeresis */
map.put( (char) 338, "Œ" ); /* Latin capital ligature oe */
map.put( (char) 339, "œ" ); /* Latin small ligature oe */
map.put( (char) 352, "Š" ); /* Latin capital letter s with caron */
map.put( (char) 353, "š" ); /* Latin small letter s with caron */
map.put( (char) 376, "Ÿ" ); /* Latin capital letter y with diaeresis */
map.put( (char) 402, "ƒ" ); /* Latin small letter f with hook */
map.put( (char) 710, "ˆ" ); /* modifier letter circumflex accent */
map.put( (char) 732, "˜" ); /* small tilde */
map.put( (char) 913, "Α" ); /* Greek capital letter alpha */
map.put( (char) 914, "Β" ); /* Greek capital letter beta */
map.put( (char) 915, "Γ" ); /* Greek capital letter gamma */
map.put( (char) 916, "Δ" ); /* Greek capital letter delta */
map.put( (char) 917, "Ε" ); /* Greek capital letter epsilon */
map.put( (char) 918, "Ζ" ); /* Greek capital letter zeta */
map.put( (char) 919, "Η" ); /* Greek capital letter eta */
map.put( (char) 920, "Θ" ); /* Greek capital letter theta */
map.put( (char) 921, "Ι" ); /* Greek capital letter iota */
map.put( (char) 922, "Κ" ); /* Greek capital letter kappa */
map.put( (char) 923, "Λ" ); /* Greek capital letter lambda */
map.put( (char) 924, "Μ" ); /* Greek capital letter mu */
map.put( (char) 925, "Ν" ); /* Greek capital letter nu */
map.put( (char) 926, "Ξ" ); /* Greek capital letter xi */
map.put( (char) 927, "Ο" ); /* Greek capital letter omicron */
map.put( (char) 928, "Π" ); /* Greek capital letter pi */
map.put( (char) 929, "Ρ" ); /* Greek capital letter rho */
map.put( (char) 931, "Σ" ); /* Greek capital letter sigma */
map.put( (char) 932, "Τ" ); /* Greek capital letter tau */
map.put( (char) 933, "Υ" ); /* Greek capital letter upsilon */
map.put( (char) 934, "Φ" ); /* Greek capital letter phi */
map.put( (char) 935, "Χ" ); /* Greek capital letter chi */
map.put( (char) 936, "Ψ" ); /* Greek capital letter psi */
map.put( (char) 937, "Ω" ); /* Greek capital letter omega */
map.put( (char) 945, "α" ); /* Greek small letter alpha */
map.put( (char) 946, "β" ); /* Greek small letter beta */
map.put( (char) 947, "γ" ); /* Greek small letter gamma */
map.put( (char) 948, "δ" ); /* Greek small letter delta */
map.put( (char) 949, "ε" ); /* Greek small letter epsilon */
map.put( (char) 950, "ζ" ); /* Greek small letter zeta */
map.put( (char) 951, "η" ); /* Greek small letter eta */
map.put( (char) 952, "θ" ); /* Greek small letter theta */
map.put( (char) 953, "ι" ); /* Greek small letter iota */
map.put( (char) 954, "κ" ); /* Greek small letter kappa */
map.put( (char) 955, "λ" ); /* Greek small letter lambda */
map.put( (char) 956, "μ" ); /* Greek small letter mu */
map.put( (char) 957, "ν" ); /* Greek small letter nu */
map.put( (char) 958, "ξ" ); /* Greek small letter xi */
map.put( (char) 959, "ο" ); /* Greek small letter omicron */
map.put( (char) 960, "π" ); /* Greek small letter pi */
map.put( (char) 961, "ρ" ); /* Greek small letter rho */
map.put( (char) 962, "ς" ); /* Greek small letter final sigma */
map.put( (char) 963, "σ" ); /* Greek small letter sigma */
map.put( (char) 964, "τ" ); /* Greek small letter tau */
map.put( (char) 965, "υ" ); /* Greek small letter upsilon */
map.put( (char) 966, "φ" ); /* Greek small letter phi */
map.put( (char) 967, "χ" ); /* Greek small letter chi */
map.put( (char) 968, "ψ" ); /* Greek small letter psi */
map.put( (char) 969, "ω" ); /* Greek small letter omega */
map.put( (char) 977, "ϑ" ); /* Greek theta symbol */
map.put( (char) 978, "ϒ" ); /* Greek upsilon with hook symbol */
map.put( (char) 982, "ϖ" ); /* Greek pi symbol */
map.put( (char) 8194, " " ); /* en space */
map.put( (char) 8195, " " ); /* em space */
map.put( (char) 8201, " " ); /* thin space */
map.put( (char) 8204, "" ); /* zero width non-joiner */
map.put( (char) 8205, "" ); /* zero width joiner */
map.put( (char) 8206, "" ); /* left-to-right mark */
map.put( (char) 8207, "" ); /* right-to-left mark */
map.put( (char) 8211, "–" ); /* en dash */
map.put( (char) 8212, "—" ); /* em dash */
map.put( (char) 8216, "‘" ); /* left single quotation mark */
map.put( (char) 8217, "’" ); /* right single quotation mark */
map.put( (char) 8218, "‚" ); /* single low-9 quotation mark */
map.put( (char) 8220, "“" ); /* left double quotation mark */
map.put( (char) 8221, "”" ); /* right double quotation mark */
map.put( (char) 8222, "„" ); /* double low-9 quotation mark */
map.put( (char) 8224, "†" ); /* dagger */
map.put( (char) 8225, "‡" ); /* double dagger */
map.put( (char) 8226, "•" ); /* bullet */
map.put( (char) 8230, "…" ); /* horizontal ellipsis */
map.put( (char) 8240, "‰" ); /* per mille sign */
map.put( (char) 8242, "′" ); /* prime */
map.put( (char) 8243, "″" ); /* double prime */
map.put( (char) 8249, "‹" ); /* single left-pointing angle quotation mark */
map.put( (char) 8250, "›" ); /* single right-pointing angle quotation mark */
map.put( (char) 8254, "‾" ); /* overline */
map.put( (char) 8260, "⁄" ); /* fraction slash */
map.put( (char) 8364, "€" ); /* euro sign */
map.put( (char) 8465, "ℑ" ); /* black-letter capital i */
map.put( (char) 8472, "℘" ); /* script capital pXCOMMAX Weierstrass p */
map.put( (char) 8476, "ℜ" ); /* black-letter capital r */
map.put( (char) 8482, "™" ); /* trademark sign */
map.put( (char) 8501, "ℵ" ); /* alef symbol */
map.put( (char) 8592, "←" ); /* leftwards arrow */
map.put( (char) 8593, "↑" ); /* upwards arrow */
map.put( (char) 8594, "→" ); /* rightwards arrow */
map.put( (char) 8595, "↓" ); /* downwards arrow */
map.put( (char) 8596, "↔" ); /* left right arrow */
map.put( (char) 8629, "↵" ); /* downwards arrow with corner leftwards */
map.put( (char) 8656, "⇐" ); /* leftwards double arrow */
map.put( (char) 8657, "⇑" ); /* upwards double arrow */
map.put( (char) 8658, "⇒" ); /* rightwards double arrow */
map.put( (char) 8659, "⇓" ); /* downwards double arrow */
map.put( (char) 8660, "⇔" ); /* left right double arrow */
map.put( (char) 8704, "∀" ); /* for all */
map.put( (char) 8706, "∂" ); /* partial differential */
map.put( (char) 8707, "∃" ); /* there exists */
map.put( (char) 8709, "∅" ); /* empty set */
map.put( (char) 8711, "∇" ); /* nabla */
map.put( (char) 8712, "∈" ); /* element of */
map.put( (char) 8713, "∉" ); /* not an element of */
map.put( (char) 8715, "∋" ); /* contains as member */
map.put( (char) 8719, "∏" ); /* n-ary product */
map.put( (char) 8721, "∑" ); /* n-ary summation */
map.put( (char) 8722, "−" ); /* minus sign */
map.put( (char) 8727, "∗" ); /* asterisk operator */
map.put( (char) 8730, "√" ); /* square root */
map.put( (char) 8733, "∝" ); /* proportional to */
map.put( (char) 8734, "∞" ); /* infinity */
map.put( (char) 8736, "∠" ); /* angle */
map.put( (char) 8743, "∧" ); /* logical and */
map.put( (char) 8744, "∨" ); /* logical or */
map.put( (char) 8745, "∩" ); /* intersection */
map.put( (char) 8746, "∪" ); /* union */
map.put( (char) 8747, "∫" ); /* integral */
map.put( (char) 8756, "∴" ); /* therefore */
map.put( (char) 8764, "∼" ); /* tilde operator */
map.put( (char) 8773, "≅" ); /* congruent to */
map.put( (char) 8776, "≈" ); /* almost equal to */
map.put( (char) 8800, "≠" ); /* not equal to */
map.put( (char) 8801, "≡" ); /* identical toXCOMMAX equivalent to */
map.put( (char) 8804, "≤" ); /* less-than or equal to */
map.put( (char) 8805, "≥" ); /* greater-than or equal to */
map.put( (char) 8834, "⊂" ); /* subset of */
map.put( (char) 8835, "⊃" ); /* superset of */
map.put( (char) 8836, "⊄" ); /* not a subset of */
map.put( (char) 8838, "⊆" ); /* subset of or equal to */
map.put( (char) 8839, "⊇" ); /* superset of or equal to */
map.put( (char) 8853, "⊕" ); /* circled plus */
map.put( (char) 8855, "⊗" ); /* circled times */
map.put( (char) 8869, "⊥" ); /* up tack */
map.put( (char) 8901, "⋅" ); /* dot operator */
map.put( (char) 8968, "⌈" ); /* left ceiling */
map.put( (char) 8969, "⌉" ); /* right ceiling */
map.put( (char) 8970, "⌊" ); /* left floor */
map.put( (char) 8971, "⌋" ); /* right floor */
map.put( (char) 9001, "〈" ); /* left-pointing angle bracket */
map.put( (char) 9002, "〉" ); /* right-pointing angle bracket */
map.put( (char) 9674, "◊" ); /* lozenge */
map.put( (char) 9824, "♠" ); /* black spade suit */
map.put( (char) 9827, "♣" ); /* black club suit */
map.put( (char) 9829, "♥" ); /* black heart suit */
map.put( (char) 9830, "♦" ); /* black diamond suit */
return Collections.unmodifiableMap( map );
}
}