org.owasp.encoder.CDATAEncoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
There is a newer version: 2024.11.18751.20241128T090041Z-241100
// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;

import java.nio.CharBuffer;
import java.nio.charset.CoderResult;

/**
 * CDATAEncoder -- encoder for CDATA sections. CDATA sections are generally good
 * for including large blocks of text that contain characters that normally
 * require encoding (ampersand, quotes, less-than, etc...). The CDATA context
 * however still does not allow invalid characters, and can be closed by the
 * sequence "]]>". This encoder removes invalid XML characters, and encodes
 * "]]>" (to "]]]]><![CDATA[>"). The result is that the data integrity is
 * maintained, but the code receiving the output will have to handle multiple
 * CDATA events. As an alternate approach, the caller could pre-encode "]]>" to
 * something of their choosing (e.g. data.replaceAll("\\]\\]>", "]] >")), then
 * use this encoder to remove any invalid XML characters.
 *
 * @author Jeff Ichnowski
 */
class CDATAEncoder extends Encoder {

    /**
     * The encoding of @{code "]]>"}.
     */
    private static final char[] CDATA_END_ENCODED
            = "]]]]>".toCharArray();

    /**
     * Length of {@code "]]]]>"}.
     */
    private static final int CDATA_END_ENCODED_LENGTH = 15;

    /**
     * Length of {@code "]]>"}.
     */
    private static final int CDATA_END_LENGTH = 3;

    @Override
    protected int maxEncodedLength(int n) {
        // "]" becomes "]" (1 -> 1)
        // "]]" becomes "]]" (2 -> 2)
        // "]]>" becomes "]]]]>" (3 -> 15)
        // "]]>]" becomes "]]]]>]" (3 -> 15 + 1 -> 1)
        // ...

        int worstCase = n / CDATA_END_LENGTH;
        int remainder = n % CDATA_END_LENGTH;

        return worstCase * CDATA_END_ENCODED_LENGTH + remainder;

//        return (n - remainder) * 5 + remainder;
    }

    @Override
    protected int firstEncodedOffset(String input, int off, int len) {
        final int n = off + len;
        //int closeCount = 0; //unused...
        for (int i = off; i < n; ++i) {
            char ch = input.charAt(i);
            if (ch <= Unicode.MAX_ASCII) {
                if (ch != ']') {
                    if (ch < ' ' && ch != '\n' && ch != '\r' && ch != '\t') {
                        return i;
//                    } else {
//                        // valid
                    }

                } else if (i + 1 < n) {
                    if (input.charAt(i + 1) != ']') {
                        // "]x" (next character is safe for this to be ']')
                    } else {
                        // "]]?"
                        // keep looping through ']'
                        for (; i + 2 < n && input.charAt(i + 2) == ']'; ++i) {
                            // valid
                        }
                        // at this point we've looped through a sequence
                        // of 2 or more "]", if the next character is ">"
                        // we need to encode "]]>".
                        if (i + 2 < n) {
                            if (input.charAt(i + 2) == '>') {
                                return i;
//                                } else {
//                                    // valid
                            }

                        } else {
                            return n;
                        }
                    }
                } else {
                    return n;
                }
            } else if (ch < Character.MIN_HIGH_SURROGATE) {
                if (ch <= Unicode.MAX_C1_CTRL_CHAR && ch != Unicode.NEL) {
                    return i;
//                } else {
//                    // valid
                }
            } else if (ch <= Character.MAX_HIGH_SURROGATE) {
                if (i + 1 < n) {
                    if (Character.isLowSurrogate(input.charAt(i + 1))) {
                        int cp = Character.toCodePoint(ch, input.charAt(i + 1));
                        if (Unicode.isNonCharacter(cp)) {
                            return i;
                        } else {
                            ++i;
                            // valid pair
                        }
                    } else {
                        return i;
                    }
                } else {
                    // end of input, high without low = invalid
                    return i;
                }
            } else if (// low surrogate without preceding high surrogate
                    ch <= Character.MAX_LOW_SURROGATE
                    // or non-characters
                    || ch > '\ufffd'
                    || ('\ufdd0' <= ch && ch <= '\ufdef'))
            {
                return i;
//            } else {
//                // valid
            }

        }
        return n;
    }

    @Override
    protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
        final char[] in = input.array();
        final char[] out = output.array();
        int i = input.arrayOffset() + input.position();
        final int n = input.arrayOffset() + input.limit();
        int j = output.arrayOffset() + output.position();
        final int m = output.arrayOffset() + output.limit();

        for (; i < n; ++i) {
            char ch = in[i];
            if (ch <= Unicode.MAX_ASCII) {
                if (ch != ']') {
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    if (ch >= ' ' || ch == '\n' || ch == '\r' || ch == '\t') {
                        out[j++] = ch;
                    } else {
                        out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                    }
                } else if (i + 1 < n) {
                    if (in[i + 1] != ']') {
                        // "]x" (next character is safe for this to be ']')
                        if (j >= m) {
                            return overflow(input, i, output, j);
                        }
                        out[j++] = ']';
                    } else {
                        // "]]?"
                        // keep looping through ']'
                        for (; i + 2 < n && in[i + 2] == ']'; ++i) {
                            if (j >= m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = ']';
                        }
                        // at this point we've looped through a sequence
                        // of 2 or more "]", if the next character is ">"
                        // we need to encode "]]>".
                        if (i + 2 < n) {
                            if (in[i + 2] == '>') {
                                if (j + CDATA_END_ENCODED_LENGTH > m) {
                                    return overflow(input, i, output, j);
                                }
                                System.arraycopy(CDATA_END_ENCODED, 0, out, j, CDATA_END_ENCODED_LENGTH);
                                j += CDATA_END_ENCODED_LENGTH;
                                i += 2;
                            } else {
                                if (j >= m) {
                                    return overflow(input, i, output, j);
                                }
                                out[j++] = ']';
                            }
                        } else if (endOfInput) {
                            if (j + 2 > m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = ']';
                            out[j++] = ']';
                            i = n;
                            break;
                        } else {
                            break;
                        }
                    }
                } else if (endOfInput) {
                    // seen "]", then end of input.
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    out[j++] = ']';
                    i++;
                    break;
                } else {
                    break;
                }
            } else if (ch < Character.MIN_HIGH_SURROGATE) {
                if (ch > Unicode.MAX_C1_CTRL_CHAR || ch == Unicode.NEL) {
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    out[j++] = ch;
                } else {
                    // C1 control code
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                }
            } else if (ch <= Character.MAX_HIGH_SURROGATE) {
                if (i + 1 < n) {
                    if (Character.isLowSurrogate(in[i + 1])) {
                        int cp = Character.toCodePoint(ch, in[i + 1]);
                        if (Unicode.isNonCharacter(cp)) {
                            if (j >= m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                            ++i;
                        } else {
                            if (j + 1 >= m) {
                                return overflow(input, i, output, j);
                            }
                            out[j++] = ch;
                            out[j++] = in[++i];
                        }
                    } else {
                        // high without low
                        if (j >= m) {
                            return overflow(input, i, output, j);
                        }
                        out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                    }
                } else if (endOfInput) {
                    // end of input, high without low = invalid
                    if (j >= m) {
                        return overflow(input, i, output, j);
                    }
                    out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
                } else {
                    break;
                }
            } else if (// low surrogate without preceding high surrogate
                    ch <= Character.MAX_LOW_SURROGATE
                    // or non-characters
                    || ch > '\ufffd'
                    || ('\ufdd0' <= ch && ch <= '\ufdef'))
            {
                if (j >= m) {
                    return overflow(input, i, output, j);
                }
                out[j++] = XMLEncoder.INVALID_CHARACTER_REPLACEMENT;
            } else {
                if (j >= m) {
                    return overflow(input, i, output, j);
                }
                out[j++] = ch;
            }
        }
        return underflow(input, i, output, j);
    }

    @Override
    public String toString() {
        return "CDATAEncoder";
    }
}