All downloads are free. Search and download functionality uses the official Maven repository.

com.demandware.appsec.secure.manipulation.impl.HTMLManipulator Maven / Gradle / Ivy

Go to download

Provide a set of Context-Based Encoders and Filterers in Java that allow application developers to sanitize application data for safe output or processing

The newest version!
/*
 * Copyright 2015 Demandware Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing permissions and limitations under the
 * License.
 */
package com.demandware.appsec.secure.manipulation.impl;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import com.demandware.appsec.secure.manipulation.AbstractCharacterManipulator;
import com.demandware.appsec.secure.manipulation.IManipulateOption;

/**
 * HTMLManipulator handles all content related to HTML
 *
 * @author Chris Smith
 */
class HTMLManipulator
    extends AbstractCharacterManipulator
{

    static enum HTMLManipulatorOption
        implements IManipulateOption
    {

        //@formatter:off
        
        /*
         * These characters are additional to the base immune list
         * and should also be explicitly allowed
         */
        CONTENT                 ( '\t', '\n', '\r', ' '         ),
        UNQUOTED_ATTRIBUTE      (                               ),
        SINGLE_QUOTE_ATTRIBUTE  ( '\t', '\n', '\r', ' ', '"'    ),
        DOUBLE_QUOTE_ATTRIBUTE  ( '\t', '\n', '\r', ' ', '\''   ),
        /* 
         * Do not be tempted to add HTMLComment. HTML Comments are sometimes
         * used for providing browser directives (esp IE), and so there is no
         * good way to protect them
         */
        ;

        //These characters are allowed in any HTML Context, according to the RFC
        private final Character[] baseImmune =
                    {
                        '!', '#', '$', '%', '^', '(', ')', '*',
                        '+', ',', '-', '.', '/', ':', ';', '=',
                        '?', '@', '[', '\\', ']', '_', '{', '|',
                        '}', '~'
                    };

        //@formatter:on

        private final Character[] immune;

        private HTMLManipulatorOption( Character... immune )
        {
            this.immune = ManipulationUtils.combineArrays( immune, this.baseImmune );
        }

        public Character[] getImmuneCharacters()
        {
            return this.immune;
        }
    }

    // unmodifiable quick reference for character entities (since there are a lot)
    private static final Map characterToEntityMap = createEntityMap();

    // for control characters, use the Replacement Character (? symbol in a diamond)
    private static final String REPLACE_HEX = "�";

    HTMLManipulator( HTMLManipulatorOption manipulatorOption )
    {
        super( manipulatorOption );
    }

    @Override
    protected String getCorrectCharacter( Character c )
    {
        String correctedCharacter = "";
        HTMLManipulatorOption opt = (HTMLManipulatorOption) this.manipulatorOption;

        // if the character is alphanumeric, or should be immune, it is OK
        if ( ManipulationUtils.isAlphaNum( c ) || ManipulationUtils.isInList( c, opt.getImmuneCharacters() ) )
        {
            correctedCharacter = String.valueOf( c );
        }
        else
        {
            // Check if the character can be written as an entity to block attacks
            String entity = characterToEntityMap.get( c );

            if ( entity != null )
            {
                correctedCharacter = entity;
            }
            // Otherwise, replace illegal control characters with a safe replacement
            // these characters have caused HTML parsing issues in some browsers in the past
            else if ( ( c <= 0x1f ) || // lower bounds of control characters
                ( c >= 0x7f && c <= 0x9f ) ) // DEL through APC control characters
            {
                correctedCharacter = REPLACE_HEX;
            }
            // RFC states to output illegal characters as hex replacements
            else
            {
                correctedCharacter = "&#x" + ManipulationUtils.getHexForCharacter( c ) + ";";
            }
        }
        return correctedCharacter;
    }

    /**
     * Build a unmodifiable Map of entity Character to Name for faster lookup List taken from ESAPI HTMLEntityCodec (BSD
     * license)
     */
    private static synchronized Map createEntityMap()
    {
        Map map = new HashMap( 252 );

        map.put( (char) 34, """ ); /* quotation mark */
        map.put( (char) 38, "&" ); /* ampersand */
        map.put( (char) 60, "<" ); /* less-than sign */
        map.put( (char) 62, ">" ); /* greater-than sign */
        map.put( (char) 160, " " ); /* no-break space */
        map.put( (char) 161, "¡" ); /* inverted exclamation mark */
        map.put( (char) 162, "¢" ); /* cent sign */
        map.put( (char) 163, "£" ); /* pound sign */
        map.put( (char) 164, "¤" ); /* currency sign */
        map.put( (char) 165, "¥" ); /* yen sign */
        map.put( (char) 166, "¦" ); /* broken bar */
        map.put( (char) 167, "§" ); /* section sign */
        map.put( (char) 168, "¨" ); /* diaeresis */
        map.put( (char) 169, "©" ); /* copyright sign */
        map.put( (char) 170, "ª" ); /* feminine ordinal indicator */
        map.put( (char) 171, "«" ); /* left-pointing double angle quotation mark */
        map.put( (char) 172, "¬" ); /* not sign */
        map.put( (char) 173, "­" ); /* soft hyphen */
        map.put( (char) 174, "®" ); /* registered sign */
        map.put( (char) 175, "¯" ); /* macron */
        map.put( (char) 176, "°" ); /* degree sign */
        map.put( (char) 177, "±" ); /* plus-minus sign */
        map.put( (char) 178, "²" ); /* superscript two */
        map.put( (char) 179, "³" ); /* superscript three */
        map.put( (char) 180, "´" ); /* acute accent */
        map.put( (char) 181, "µ" ); /* micro sign */
        map.put( (char) 182, "¶" ); /* pilcrow sign */
        map.put( (char) 183, "·" ); /* middle dot */
        map.put( (char) 184, "¸" ); /* cedilla */
        map.put( (char) 185, "¹" ); /* superscript one */
        map.put( (char) 186, "º" ); /* masculine ordinal indicator */
        map.put( (char) 187, "»" ); /* right-pointing double angle quotation mark */
        map.put( (char) 188, "¼" ); /* vulgar fraction one quarter */
        map.put( (char) 189, "½" ); /* vulgar fraction one half */
        map.put( (char) 190, "¾" ); /* vulgar fraction three quarters */
        map.put( (char) 191, "¿" ); /* inverted question mark */
        map.put( (char) 192, "À" ); /* Latin capital letter a with grave */
        map.put( (char) 193, "Á" ); /* Latin capital letter a with acute */
        map.put( (char) 194, "Â" ); /* Latin capital letter a with circumflex */
        map.put( (char) 195, "Ã" ); /* Latin capital letter a with tilde */
        map.put( (char) 196, "Ä" ); /* Latin capital letter a with diaeresis */
        map.put( (char) 197, "Å" ); /* Latin capital letter a with ring above */
        map.put( (char) 198, "Æ" ); /* Latin capital letter ae */
        map.put( (char) 199, "Ç" ); /* Latin capital letter c with cedilla */
        map.put( (char) 200, "È" ); /* Latin capital letter e with grave */
        map.put( (char) 201, "É" ); /* Latin capital letter e with acute */
        map.put( (char) 202, "Ê" ); /* Latin capital letter e with circumflex */
        map.put( (char) 203, "Ë" ); /* Latin capital letter e with diaeresis */
        map.put( (char) 204, "Ì" ); /* Latin capital letter i with grave */
        map.put( (char) 205, "Í" ); /* Latin capital letter i with acute */
        map.put( (char) 206, "Î" ); /* Latin capital letter i with circumflex */
        map.put( (char) 207, "Ï" ); /* Latin capital letter i with diaeresis */
        map.put( (char) 208, "Ð" ); /* Latin capital letter eth */
        map.put( (char) 209, "Ñ" ); /* Latin capital letter n with tilde */
        map.put( (char) 210, "Ò" ); /* Latin capital letter o with grave */
        map.put( (char) 211, "Ó" ); /* Latin capital letter o with acute */
        map.put( (char) 212, "Ô" ); /* Latin capital letter o with circumflex */
        map.put( (char) 213, "Õ" ); /* Latin capital letter o with tilde */
        map.put( (char) 214, "Ö" ); /* Latin capital letter o with diaeresis */
        map.put( (char) 215, "×" ); /* multiplication sign */
        map.put( (char) 216, "Ø" ); /* Latin capital letter o with stroke */
        map.put( (char) 217, "Ù" ); /* Latin capital letter u with grave */
        map.put( (char) 218, "Ú" ); /* Latin capital letter u with acute */
        map.put( (char) 219, "Û" ); /* Latin capital letter u with circumflex */
        map.put( (char) 220, "Ü" ); /* Latin capital letter u with diaeresis */
        map.put( (char) 221, "Ý" ); /* Latin capital letter y with acute */
        map.put( (char) 222, "Þ" ); /* Latin capital letter thorn */
        map.put( (char) 223, "ß" ); /* Latin small letter sharp sXCOMMAX German Eszett */
        map.put( (char) 224, "à" ); /* Latin small letter a with grave */
        map.put( (char) 225, "á" ); /* Latin small letter a with acute */
        map.put( (char) 226, "â" ); /* Latin small letter a with circumflex */
        map.put( (char) 227, "ã" ); /* Latin small letter a with tilde */
        map.put( (char) 228, "ä" ); /* Latin small letter a with diaeresis */
        map.put( (char) 229, "å" ); /* Latin small letter a with ring above */
        map.put( (char) 230, "æ" ); /* Latin lowercase ligature ae */
        map.put( (char) 231, "ç" ); /* Latin small letter c with cedilla */
        map.put( (char) 232, "è" ); /* Latin small letter e with grave */
        map.put( (char) 233, "é" ); /* Latin small letter e with acute */
        map.put( (char) 234, "ê" ); /* Latin small letter e with circumflex */
        map.put( (char) 235, "ë" ); /* Latin small letter e with diaeresis */
        map.put( (char) 236, "ì" ); /* Latin small letter i with grave */
        map.put( (char) 237, "í" ); /* Latin small letter i with acute */
        map.put( (char) 238, "î" ); /* Latin small letter i with circumflex */
        map.put( (char) 239, "ï" ); /* Latin small letter i with diaeresis */
        map.put( (char) 240, "ð" ); /* Latin small letter eth */
        map.put( (char) 241, "ñ" ); /* Latin small letter n with tilde */
        map.put( (char) 242, "ò" ); /* Latin small letter o with grave */
        map.put( (char) 243, "ó" ); /* Latin small letter o with acute */
        map.put( (char) 244, "ô" ); /* Latin small letter o with circumflex */
        map.put( (char) 245, "õ" ); /* Latin small letter o with tilde */
        map.put( (char) 246, "ö" ); /* Latin small letter o with diaeresis */
        map.put( (char) 247, "÷" ); /* division sign */
        map.put( (char) 248, "ø" ); /* Latin small letter o with stroke */
        map.put( (char) 249, "ù" ); /* Latin small letter u with grave */
        map.put( (char) 250, "ú" ); /* Latin small letter u with acute */
        map.put( (char) 251, "û" ); /* Latin small letter u with circumflex */
        map.put( (char) 252, "ü" ); /* Latin small letter u with diaeresis */
        map.put( (char) 253, "ý" ); /* Latin small letter y with acute */
        map.put( (char) 254, "þ" ); /* Latin small letter thorn */
        map.put( (char) 255, "ÿ" ); /* Latin small letter y with diaeresis */
        map.put( (char) 338, "Œ" ); /* Latin capital ligature oe */
        map.put( (char) 339, "œ" ); /* Latin small ligature oe */
        map.put( (char) 352, "Š" ); /* Latin capital letter s with caron */
        map.put( (char) 353, "š" ); /* Latin small letter s with caron */
        map.put( (char) 376, "Ÿ" ); /* Latin capital letter y with diaeresis */
        map.put( (char) 402, "ƒ" ); /* Latin small letter f with hook */
        map.put( (char) 710, "ˆ" ); /* modifier letter circumflex accent */
        map.put( (char) 732, "˜" ); /* small tilde */
        map.put( (char) 913, "Α" ); /* Greek capital letter alpha */
        map.put( (char) 914, "Β" ); /* Greek capital letter beta */
        map.put( (char) 915, "Γ" ); /* Greek capital letter gamma */
        map.put( (char) 916, "Δ" ); /* Greek capital letter delta */
        map.put( (char) 917, "Ε" ); /* Greek capital letter epsilon */
        map.put( (char) 918, "Ζ" ); /* Greek capital letter zeta */
        map.put( (char) 919, "Η" ); /* Greek capital letter eta */
        map.put( (char) 920, "Θ" ); /* Greek capital letter theta */
        map.put( (char) 921, "Ι" ); /* Greek capital letter iota */
        map.put( (char) 922, "Κ" ); /* Greek capital letter kappa */
        map.put( (char) 923, "Λ" ); /* Greek capital letter lambda */
        map.put( (char) 924, "Μ" ); /* Greek capital letter mu */
        map.put( (char) 925, "Ν" ); /* Greek capital letter nu */
        map.put( (char) 926, "Ξ" ); /* Greek capital letter xi */
        map.put( (char) 927, "Ο" ); /* Greek capital letter omicron */
        map.put( (char) 928, "Π" ); /* Greek capital letter pi */
        map.put( (char) 929, "Ρ" ); /* Greek capital letter rho */
        map.put( (char) 931, "Σ" ); /* Greek capital letter sigma */
        map.put( (char) 932, "Τ" ); /* Greek capital letter tau */
        map.put( (char) 933, "Υ" ); /* Greek capital letter upsilon */
        map.put( (char) 934, "Φ" ); /* Greek capital letter phi */
        map.put( (char) 935, "Χ" ); /* Greek capital letter chi */
        map.put( (char) 936, "Ψ" ); /* Greek capital letter psi */
        map.put( (char) 937, "Ω" ); /* Greek capital letter omega */
        map.put( (char) 945, "α" ); /* Greek small letter alpha */
        map.put( (char) 946, "β" ); /* Greek small letter beta */
        map.put( (char) 947, "γ" ); /* Greek small letter gamma */
        map.put( (char) 948, "δ" ); /* Greek small letter delta */
        map.put( (char) 949, "ε" ); /* Greek small letter epsilon */
        map.put( (char) 950, "ζ" ); /* Greek small letter zeta */
        map.put( (char) 951, "η" ); /* Greek small letter eta */
        map.put( (char) 952, "θ" ); /* Greek small letter theta */
        map.put( (char) 953, "ι" ); /* Greek small letter iota */
        map.put( (char) 954, "κ" ); /* Greek small letter kappa */
        map.put( (char) 955, "λ" ); /* Greek small letter lambda */
        map.put( (char) 956, "μ" ); /* Greek small letter mu */
        map.put( (char) 957, "ν" ); /* Greek small letter nu */
        map.put( (char) 958, "ξ" ); /* Greek small letter xi */
        map.put( (char) 959, "ο" ); /* Greek small letter omicron */
        map.put( (char) 960, "π" ); /* Greek small letter pi */
        map.put( (char) 961, "ρ" ); /* Greek small letter rho */
        map.put( (char) 962, "ς" ); /* Greek small letter final sigma */
        map.put( (char) 963, "σ" ); /* Greek small letter sigma */
        map.put( (char) 964, "τ" ); /* Greek small letter tau */
        map.put( (char) 965, "υ" ); /* Greek small letter upsilon */
        map.put( (char) 966, "φ" ); /* Greek small letter phi */
        map.put( (char) 967, "χ" ); /* Greek small letter chi */
        map.put( (char) 968, "ψ" ); /* Greek small letter psi */
        map.put( (char) 969, "ω" ); /* Greek small letter omega */
        map.put( (char) 977, "ϑ" ); /* Greek theta symbol */
        map.put( (char) 978, "ϒ" ); /* Greek upsilon with hook symbol */
        map.put( (char) 982, "ϖ" ); /* Greek pi symbol */
        map.put( (char) 8194, " " ); /* en space */
        map.put( (char) 8195, " " ); /* em space */
        map.put( (char) 8201, " " ); /* thin space */
        map.put( (char) 8204, "‌" ); /* zero width non-joiner */
        map.put( (char) 8205, "‍" ); /* zero width joiner */
        map.put( (char) 8206, "‎" ); /* left-to-right mark */
        map.put( (char) 8207, "‏" ); /* right-to-left mark */
        map.put( (char) 8211, "–" ); /* en dash */
        map.put( (char) 8212, "—" ); /* em dash */
        map.put( (char) 8216, "‘" ); /* left single quotation mark */
        map.put( (char) 8217, "’" ); /* right single quotation mark */
        map.put( (char) 8218, "‚" ); /* single low-9 quotation mark */
        map.put( (char) 8220, "“" ); /* left double quotation mark */
        map.put( (char) 8221, "”" ); /* right double quotation mark */
        map.put( (char) 8222, "„" ); /* double low-9 quotation mark */
        map.put( (char) 8224, "†" ); /* dagger */
        map.put( (char) 8225, "‡" ); /* double dagger */
        map.put( (char) 8226, "•" ); /* bullet */
        map.put( (char) 8230, "…" ); /* horizontal ellipsis */
        map.put( (char) 8240, "‰" ); /* per mille sign */
        map.put( (char) 8242, "′" ); /* prime */
        map.put( (char) 8243, "″" ); /* double prime */
        map.put( (char) 8249, "‹" ); /* single left-pointing angle quotation mark */
        map.put( (char) 8250, "›" ); /* single right-pointing angle quotation mark */
        map.put( (char) 8254, "‾" ); /* overline */
        map.put( (char) 8260, "⁄" ); /* fraction slash */
        map.put( (char) 8364, "€" ); /* euro sign */
        map.put( (char) 8465, "ℑ" ); /* black-letter capital i */
        map.put( (char) 8472, "℘" ); /* script capital pXCOMMAX Weierstrass p */
        map.put( (char) 8476, "ℜ" ); /* black-letter capital r */
        map.put( (char) 8482, "™" ); /* trademark sign */
        map.put( (char) 8501, "ℵ" ); /* alef symbol */
        map.put( (char) 8592, "←" ); /* leftwards arrow */
        map.put( (char) 8593, "↑" ); /* upwards arrow */
        map.put( (char) 8594, "→" ); /* rightwards arrow */
        map.put( (char) 8595, "↓" ); /* downwards arrow */
        map.put( (char) 8596, "↔" ); /* left right arrow */
        map.put( (char) 8629, "↵" ); /* downwards arrow with corner leftwards */
        map.put( (char) 8656, "⇐" ); /* leftwards double arrow */
        map.put( (char) 8657, "⇑" ); /* upwards double arrow */
        map.put( (char) 8658, "⇒" ); /* rightwards double arrow */
        map.put( (char) 8659, "⇓" ); /* downwards double arrow */
        map.put( (char) 8660, "⇔" ); /* left right double arrow */
        map.put( (char) 8704, "∀" ); /* for all */
        map.put( (char) 8706, "∂" ); /* partial differential */
        map.put( (char) 8707, "∃" ); /* there exists */
        map.put( (char) 8709, "∅" ); /* empty set */
        map.put( (char) 8711, "∇" ); /* nabla */
        map.put( (char) 8712, "∈" ); /* element of */
        map.put( (char) 8713, "∉" ); /* not an element of */
        map.put( (char) 8715, "∋" ); /* contains as member */
        map.put( (char) 8719, "∏" ); /* n-ary product */
        map.put( (char) 8721, "∑" ); /* n-ary summation */
        map.put( (char) 8722, "−" ); /* minus sign */
        map.put( (char) 8727, "∗" ); /* asterisk operator */
        map.put( (char) 8730, "√" ); /* square root */
        map.put( (char) 8733, "∝" ); /* proportional to */
        map.put( (char) 8734, "∞" ); /* infinity */
        map.put( (char) 8736, "∠" ); /* angle */
        map.put( (char) 8743, "∧" ); /* logical and */
        map.put( (char) 8744, "∨" ); /* logical or */
        map.put( (char) 8745, "∩" ); /* intersection */
        map.put( (char) 8746, "∪" ); /* union */
        map.put( (char) 8747, "∫" ); /* integral */
        map.put( (char) 8756, "∴" ); /* therefore */
        map.put( (char) 8764, "∼" ); /* tilde operator */
        map.put( (char) 8773, "≅" ); /* congruent to */
        map.put( (char) 8776, "≈" ); /* almost equal to */
        map.put( (char) 8800, "≠" ); /* not equal to */
        map.put( (char) 8801, "≡" ); /* identical toXCOMMAX equivalent to */
        map.put( (char) 8804, "≤" ); /* less-than or equal to */
        map.put( (char) 8805, "≥" ); /* greater-than or equal to */
        map.put( (char) 8834, "⊂" ); /* subset of */
        map.put( (char) 8835, "⊃" ); /* superset of */
        map.put( (char) 8836, "⊄" ); /* not a subset of */
        map.put( (char) 8838, "⊆" ); /* subset of or equal to */
        map.put( (char) 8839, "⊇" ); /* superset of or equal to */
        map.put( (char) 8853, "⊕" ); /* circled plus */
        map.put( (char) 8855, "⊗" ); /* circled times */
        map.put( (char) 8869, "⊥" ); /* up tack */
        map.put( (char) 8901, "⋅" ); /* dot operator */
        map.put( (char) 8968, "⌈" ); /* left ceiling */
        map.put( (char) 8969, "⌉" ); /* right ceiling */
        map.put( (char) 8970, "⌊" ); /* left floor */
        map.put( (char) 8971, "⌋" ); /* right floor */
        map.put( (char) 9001, "⟨" ); /* left-pointing angle bracket */
        map.put( (char) 9002, "⟩" ); /* right-pointing angle bracket */
        map.put( (char) 9674, "◊" ); /* lozenge */
        map.put( (char) 9824, "♠" ); /* black spade suit */
        map.put( (char) 9827, "♣" ); /* black club suit */
        map.put( (char) 9829, "♥" ); /* black heart suit */
        map.put( (char) 9830, "♦" ); /* black diamond suit */

        return Collections.unmodifiableMap( map );
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy