All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.myfaces.shared_tomahawk.renderkit.html.util.HTMLEncoder Maven / Gradle / Ivy

Go to download

JSF components and utilities that can be used with any JSF implementation. This library is based on the JSF1.1 version of Tomahawk, but with minor source code and build changes to take advantage of JSF2.1 features. A JSF2.1 implementation is required to use this version of the Tomahawk library.

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.myfaces.shared_tomahawk.renderkit.html.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

/**
 * Converts Strings so that they can be used within HTML-Code.
 */
public abstract class HTMLEncoder
{
    /**
     * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
     */
    public static String encode (String string)
    {
        return encode(string, false, true);
    }

    /**
     * Variant of {@link #encode} where encodeNbsp is true.
     */
    public static String encode (String string, boolean encodeNewline)
    {
        return encode(string, encodeNewline, true);
    }

    /**
     * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
     */
    public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
    {
        return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
    }

    /**
     * Encodes the given string, so that it can be used within a html page.
     * @param string the string to convert
     * @param encodeNewline if true newline characters are converted to <br>'s
     * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s
     * @param encodeNonLatin if true encode non-latin characters as numeric character references
     */
    public static String encode (String string,
                                 boolean encodeNewline,
                                 boolean encodeSubsequentBlanksToNbsp,
                                 boolean encodeNonLatin)
    {
        if (string == null)
        {
            return "";
        }

        StringBuilder sb = null;    //create later on demand
        String app;
        char c;
        for (int i = 0; i < string.length (); ++i)
        {
            app = null;
            c = string.charAt(i);
            
            // All characters before letters
            if ((int)c < 0x41)
            {
                switch (c)
                {
                    case '"': app = """; break;    //"
                    case '&': app = "&"; break;     //&
                    case '<': app = "<"; break;      //<
                    case '>': app = ">"; break;      //>
                    case ' ':
                        if (encodeSubsequentBlanksToNbsp &&
                                (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
                        {
                            //Space at beginning or after another space
                            app = " ";
                        }
                        break;
                    case '\n':
                        if (encodeNewline)
                        {
                            app = "
"; } break; default: break; } } else if (encodeNonLatin && (int)c > 0x80) { switch(c) { //german umlauts case '\u00E4' : app = "ä"; break; case '\u00C4' : app = "Ä"; break; case '\u00F6' : app = "ö"; break; case '\u00D6' : app = "Ö"; break; case '\u00FC' : app = "ü"; break; case '\u00DC' : app = "Ü"; break; case '\u00DF' : app = "ß"; break; //misc //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? case '\u20AC': app = "€"; break; case '\u00AB': app = "«"; break; case '\u00BB': app = "»"; break; case '\u00A0': app = " "; break; default : //encode all non basic latin characters app = "&#" + ((int)c) + ";"; break; } } if (app != null) { if (sb == null) { sb = new StringBuilder(string.substring(0, i)); } sb.append(app); } else { if (sb != null) { sb.append(c); } } } if (sb == null) { return string; } else { return sb.toString(); } } /** * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. */ public static void encode (Writer writer, String string) throws IOException { encode(writer, string, false, true); } /** * Variant of {@link #encode} where encodeNbsp is true. */ public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException { encode(writer, string, encodeNewline, true); } /** * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true */ public static void encode (Writer writer, String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException { encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true); } public static void encode (Writer writer, String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp, boolean encodeNonLatin) throws IOException { if (string == null) { return; } int start = 0; String app; char c; for (int i = 0; i < string.length (); ++i) { app = null; c = string.charAt(i); // All characters before letters if ((int)c < 0x41) { switch (c) { case '"': app = """; break; //" case '&': app = "&"; break; //& case '<': app = "<"; break; //< case '>': app = ">"; break; //> case ' ': if (encodeSubsequentBlanksToNbsp && (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' '))) { //Space at beginning or after another space app = " "; } break; case '\n': if (encodeNewline) { app = "
"; } break; default: break; } } else if (encodeNonLatin && (int)c > 0x80) { switch(c) { //german umlauts case '\u00E4' : app = "ä"; break; case '\u00C4' : app = "Ä"; break; case '\u00F6' : app = "ö"; break; case '\u00D6' : app = "Ö"; break; case '\u00FC' : app = "ü"; break; case '\u00DC' : app = "Ü"; break; case '\u00DF' : app = "ß"; break; //misc //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? case '\u20AC': app = "€"; break; case '\u00AB': app = "«"; break; case '\u00BB': app = "»"; break; case '\u00A0': app = " "; break; default : //encode all non basic latin characters app = "&#" + ((int)c) + ";"; break; } } if (app != null) { //if (sb == null) //{ // sb = new StringBuilder(string.substring(0, i)); //} //sb.append(app); if (start < i) { writer.write(string, start, i-start); } start = i+1; writer.write(app); } //else //{ // if (sb != null) // { // sb.append(c); // } //} } //if (sb == null) //{ // return string; //} //else //{ // return sb.toString(); //} if (start == 0) { writer.write(string); } else if (start < string.length()) { writer.write(string,start,string.length()-start); } } /** * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. */ public static void encode (char[] string, int offset, int length, Writer writer) throws IOException { encode(string, offset, length, false, true, writer); } /** * Variant of {@link #encode} where encodeNbsp is true. */ public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) throws IOException { encode(string, offset, length, encodeNewline, true, writer); } /** * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true */ public static void encode (char[] string, int offset, int length, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException { encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer); } /** * Encodes the given string, so that it can be used within a html page. * @param string the string to convert * @param encodeNewline if true newline characters are converted to <br>'s * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s * @param encodeNonLatin if true encode non-latin characters as numeric character references */ public static void encode (char[] string, int offset, int length, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp, boolean encodeNonLatin, Writer writer) throws IOException { if (string == null || length < 0 || offset >= string.length) { return; } offset = Math.max(0, offset); int realLength = Math.min(length, string.length - offset); //StringBuilder sb = null; //create later on demand String app; char c; int start = offset; for (int i = offset; i < offset + realLength; ++i) { app = null; c = string[i]; // All characters before letters if ((int)c < 0x41) { switch (c) { case '"': app = """; break; //" case '&': app = "&"; break; //& case '<': app = "<"; break; //< case '>': app = ">"; break; //> case ' ': if (encodeSubsequentBlanksToNbsp && (i == 0 || (i - 1 >= 0 && string[i - 1] == ' '))) { //Space at beginning or after another space app = " "; } break; case '\n': if (encodeNewline) { app = "
"; } break; default: break; } } else if (encodeNonLatin && (int)c > 0x80) { switch(c) { //german umlauts case '\u00E4' : app = "ä"; break; case '\u00C4' : app = "Ä"; break; case '\u00F6' : app = "ö"; break; case '\u00D6' : app = "Ö"; break; case '\u00FC' : app = "ü"; break; case '\u00DC' : app = "Ü"; break; case '\u00DF' : app = "ß"; break; //misc //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? case '\u20AC': app = "€"; break; case '\u00AB': app = "«"; break; case '\u00BB': app = "»"; break; case '\u00A0': app = " "; break; default : //encode all non basic latin characters app = "&#" + ((int)c) + ";"; break; } } if (app != null) { //if (sb == null) //{ // sb = new StringBuilder(realLength*2); // sb.append(string, offset, i - offset); //} //sb.append(app); if (start < i) { writer.write(string, start, i-start); } start = i+1; writer.write(app); } /* else { if (sb != null) { sb.append(c); } }*/ } //if (sb == null) //{ // writer.write(string, offset, realLength); //} //else //{ // writer.write(sb.toString()); //} if (start == offset) { writer.write(string, offset, realLength); } else if (start < offset+realLength) { writer.write(string,start,offset+realLength-start); } } private static final String HEX_CHARSET = "0123456789ABCDEF"; private static final String UTF8 = "UTF-8"; /** * Encode an URI, escaping or percent-encoding all required characters and * following the rules mentioned on RFC 3986. * * @param string * @param encodeNonLatin * @return * @throws IOException */ public static String encodeURIAtributte(final String string, final String characterEncoding) throws IOException { StringBuilder sb = null; //create later on demand String app; char c; boolean endLoop = false; for (int i = 0; i < string.length (); ++i) { app = null; c = string.charAt(i); // This are the guidelines to be taken into account by this algorithm to encode: // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters // // control = // space = // delims = "<" | ">" | "#" | "%" | <"> // %3C %3E %23 %25 %22 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" // %7D %7B %7C %5C %5E %5B %5D %60 // // ".... Data corresponding to excluded characters must be escaped in order to // be properly represented within a URI....." // RFC 3986 Section 3. Syntax Components // // "... The generic URI syntax consists of a hierarchical sequence of // components referred to as the scheme, authority, path, query, and // fragment. // // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] // // hier-part = "//" authority path-abempty // / path-absolute // / path-rootless // / path-empty // ...." // RFC 3986 Section 2.2: // Reserved characters (should not be percent-encoded) // reserved = gen-delims / sub-delims // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" // %3A %2F %3F %23 %5B %5D %40 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) // "...those rules were redefined to directly specify the characters allowed...." // There is also other characters moved from excluded list to reserved: // "[" / "]" / "#" // RFC 3986 Section 2.3: // "... for consistency, percent-encoded octets in the ranges of ALPHA // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), // underscore (%5F), or tilde (%7E) should not be created by URI // producers...." // RFC 3986 Section 3.2.2. Host // host = IP-literal / IPv4address / reg-name // The reg-name syntax allows percent-encoded octets in order to // represent non-ASCII registered names in a uniform way that is // independent of the underlying name resolution technology. Non-ASCII // characters must first be encoded according to UTF-8 [STD63], and then // each octet of the corresponding UTF-8 sequence must be percent- // encoded to be represented as URI characters. URI producing // applications must not use percent-encoding in host unless it is used // to represent a UTF-8 character sequence. // RFC 3986 Section 3.4 Query // query = *( pchar / "/" / "?" ) // // "... However, as query components are often used to carry identifying information // in the form of "key=value" pairs and one frequently used value is a reference to // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." // // RFC 3986 Section 2.5 Identifying Data (Apply to query section) // // When a new URI scheme defines a component that represents textual // data consisting of characters from the Universal Character Set [UCS], // the data should first be encoded as octets according to the UTF-8 // character encoding [STD63]; then only those octets that do not // correspond to characters in the unreserved set should be percent- // encoded. For example, the character A would be represented as "A", // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented // as "%C3%80", and the character KATAKANA LETTER A would be represented // as "%E3%82%A2". // // RFC 3986 Section 3.5 Fragment // fragment = *( pchar / "/" / "?" ) // // Note that follows the same as query // Based on the extracts the strategy to apply on this method is: // // On scheme ":" hier-part // // Escape or percent encode chars inside : // // - From %00 to %20, // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of // duplicate encoding, encode it when we are sure // that there are not encoded twice) // - "<" %3C, ">" %3E // - "\" %5C, "^" %5E, "`" %60 // - "{" %7B, "|" %7C, "}" %7D // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this // part of an URI, but it is preferred to encode it that omit it). // // The remaining characters must not be encoded // // Characters after ? or # should be percent encoding but only the necessary ones: // // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of // duplicate encoding, encode it when we are sure // that there are not encoded twice) // - "<" %3C, ">" %3E, // - "\" %5C, "^" %5E, "`" %60 // - "{" %7B, "|" %7C, "}" %7D // - From %7F ad infinitum (each character as many bytes as necessary but take into account // that a single char should contain 2,3 or more bytes!. This data should be encoded // translating from the document character encoding to percent encoding, because this values // could be retrieved from httpRequest.getParameter() and it uses the current character encoding // for decode values) // // "&" should be encoded as "&" because this link is inside an html page, and // put only & is invalid in this context. if ( (c <= (char)0x20) || (c >= (char)0x7F) || c == '"' || c == '<' || c == '>' || c == '\\' || c == '^' || c == '`' || c == '{' || c == '|' || c == '}') { // The percent encoding on this part should be done using UTF-8 charset // as RFC 3986 Section 3.2.2 says. // Also there is a reference on // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars // that recommend use of UTF-8 instead the document character encoding. // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) app = percentEncode(c, "UTF-8"); } else if (c == '%') { if (i + 2 < string.length()) { char c1 = string.charAt(i+1); char c2 = string.charAt(i+2); if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) { // do not percent encode, because it could be already encoded // and we don't want encode it twice } else { app = percentEncode(c, UTF8); } } else { app = percentEncode(c, UTF8); } } else if (c == '?' || c == '#') { if (i+1 < string.length()) { // The remaining part of the URI are data that should be encoded // using the document character encoding. app = c + encodeURIQuery(string.substring(i+1), characterEncoding); endLoop = true; } } else { //No encoding, just do nothing, char will be added later. } if (app != null) { if (sb == null) { sb = new StringBuilder(string.substring(0, i)); } sb.append(app); } else { if (sb != null) { sb.append(c); } } if (endLoop) { break; } } if (sb == null) { return string; } else { return sb.toString(); } } /** * Encode a unicode char value in percentEncode, decoding its bytes using a specified * characterEncoding. * * @param c * @param characterEncoding * @return */ private static String percentEncode(char c, String characterEncoding) { String app = null; if (c > (char)((short)0x007F)) { //percent encode in the proper encoding to be consistent app = percentEncodeNonUsAsciiCharacter(c, characterEncoding); } else { //percent encode US-ASCII char (0x00-0x7F range) app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); } return app; } private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding) { ByteArrayOutputStream baos = new ByteArrayOutputStream(10); StringBuilder builder = new StringBuilder(); try { OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); writer.write(c); writer.flush(); } catch(IOException e) { baos.reset(); return null; } byte [] byteArray = baos.toByteArray(); for (int i=0; i < byteArray.length; i++) { builder.append('%'); builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); } return builder.toString(); } /** * Encode the query part using the document charset encoding provided. * * * @param string * @param characterEncoding * @return */ private static String encodeURIQuery(final String string, final String characterEncoding) { StringBuilder sb = null; //create later on demand String app; char c; boolean endLoop = false; for (int i = 0; i < string.length (); ++i) { app = null; c = string.charAt(i); // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so // we make easier and omit this one) // - "<" %3C, ">" %3E, // - "\" %5C, "^" %5E, "`" %60 // - "{" %7B, "|" %7C, "}" %7D // - From %7F ad infinitum (each character as many bytes as necessary but take into account // that a single char should contain 2,3 or more bytes!. This data should be encoded // translating from the document character encoding to percent encoding) // // "&" should be encoded as "&" because this link is inside an html page, and // put & is invalid in this context if ( (c <= (char)0x20) || (c >= (char)0x7F) || c == '"' || c == '<' || c == '>' || c == '\\' || c == '^' || c == '`' || c == '{' || c == '|' || c == '}') { // The percent encoding on this part should be done using UTF-8 charset // as RFC 3986 Section 3.2.2 says app = percentEncode(c, characterEncoding); } else if (c == '%') { if (i + 2 < string.length()) { char c1 = string.charAt(i+1); char c2 = string.charAt(i+2); if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) { // do not percent encode, because it could be already encoded } else { app = percentEncode(c, characterEncoding); } } else { app = percentEncode(c, characterEncoding); } } else if (c == '&') { if (i+4 < string.length() ) { if ('a' == string.charAt(i+1) && 'm' == string.charAt(i+2) && 'p' == string.charAt(i+3) && ';' == string.charAt(i+4)) { //Skip } else { app = "&"; } } else { app = "&"; } } else { //No encoding, just do nothing, char will be added later. } if (app != null) { if (sb == null) { sb = new StringBuilder(string.substring(0, i)); } sb.append(app); } else { if (sb != null) { sb.append(c); } } if (endLoop) { break; } } if (sb == null) { return string; } else { return sb.toString(); } } /** * Encode an URI, escaping or percent-encoding all required characters and * following the rules mentioned on RFC 3986. * * @param string * @param encodeNonLatin * @return * @throws IOException */ public static void encodeURIAtributte(Writer writer, final String string, final String characterEncoding) throws IOException { //StringBuilder sb = null; //create later on demand int start = 0; String app; char c; boolean endLoop = false; for (int i = 0; i < string.length (); ++i) { app = null; c = string.charAt(i); // This are the guidelines to be taken into account by this algorithm to encode: // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters // // control = // space = // delims = "<" | ">" | "#" | "%" | <"> // %3C %3E %23 %25 %22 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" // %7D %7B %7C %5C %5E %5B %5D %60 // // ".... Data corresponding to excluded characters must be escaped in order to // be properly represented within a URI....." // RFC 3986 Section 3. Syntax Components // // "... The generic URI syntax consists of a hierarchical sequence of // components referred to as the scheme, authority, path, query, and // fragment. // // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] // // hier-part = "//" authority path-abempty // / path-absolute // / path-rootless // / path-empty // ...." // RFC 3986 Section 2.2: // Reserved characters (should not be percent-encoded) // reserved = gen-delims / sub-delims // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" // %3A %2F %3F %23 %5B %5D %40 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) // "...those rules were redefined to directly specify the characters allowed...." // There is also other characters moved from excluded list to reserved: // "[" / "]" / "#" // RFC 3986 Section 2.3: // "... for consistency, percent-encoded octets in the ranges of ALPHA // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), // underscore (%5F), or tilde (%7E) should not be created by URI // producers...." // RFC 3986 Section 3.2.2. Host // host = IP-literal / IPv4address / reg-name // The reg-name syntax allows percent-encoded octets in order to // represent non-ASCII registered names in a uniform way that is // independent of the underlying name resolution technology. Non-ASCII // characters must first be encoded according to UTF-8 [STD63], and then // each octet of the corresponding UTF-8 sequence must be percent- // encoded to be represented as URI characters. URI producing // applications must not use percent-encoding in host unless it is used // to represent a UTF-8 character sequence. // RFC 3986 Section 3.4 Query // query = *( pchar / "/" / "?" ) // // "... However, as query components are often used to carry identifying information // in the form of "key=value" pairs and one frequently used value is a reference to // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." // // RFC 3986 Section 2.5 Identifying Data (Apply to query section) // // When a new URI scheme defines a component that represents textual // data consisting of characters from the Universal Character Set [UCS], // the data should first be encoded as octets according to the UTF-8 // character encoding [STD63]; then only those octets that do not // correspond to characters in the unreserved set should be percent- // encoded. For example, the character A would be represented as "A", // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented // as "%C3%80", and the character KATAKANA LETTER A would be represented // as "%E3%82%A2". // // RFC 3986 Section 3.5 Fragment // fragment = *( pchar / "/" / "?" ) // // Note that follows the same as query // Based on the extracts the strategy to apply on this method is: // // On scheme ":" hier-part // // Escape or percent encode chars inside : // // - From %00 to %20, // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of // duplicate encoding, encode it when we are sure // that there are not encoded twice) // - "<" %3C, ">" %3E // - "\" %5C, "^" %5E, "`" %60 // - "{" %7B, "|" %7C, "}" %7D // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this // part of an URI, but it is preferred to encode it that omit it). // // The remaining characters must not be encoded // // Characters after ? or # should be percent encoding but only the necessary ones: // // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of // duplicate encoding, encode it when we are sure // that there are not encoded twice) // - "<" %3C, ">" %3E, // - "\" %5C, "^" %5E, "`" %60 // - "{" %7B, "|" %7C, "}" %7D // - From %7F ad infinitum (each character as many bytes as necessary but take into account // that a single char should contain 2,3 or more bytes!. This data should be encoded // translating from the document character encoding to percent encoding, because this values // could be retrieved from httpRequest.getParameter() and it uses the current character encoding // for decode values) // // "&" should be encoded as "&" because this link is inside an html page, and // put only & is invalid in this context. if ( (c <= (char)0x20) || (c >= (char)0x7F) || c == '"' || c == '<' || c == '>' || c == '\\' || c == '^' || c == '`' || c == '{' || c == '|' || c == '}') { // The percent encoding on this part should be done using UTF-8 charset // as RFC 3986 Section 3.2.2 says. // Also there is a reference on // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars // that recommend use of UTF-8 instead the document character encoding. // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) //app = percentEncode(c, "UTF-8"); if (start < i) { writer.write(string, start, i-start); } start = i+1; percentEncode(writer, c, "UTF-8"); } else if (c == '%') { if (i + 2 < string.length()) { char c1 = string.charAt(i+1); char c2 = string.charAt(i+2); if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) { // do not percent encode, because it could be already encoded // and we don't want encode it twice } else { //app = percentEncode(c, UTF8); if (start < i) { writer.write(string, start, i-start); } start = i+1; percentEncode(writer, c, UTF8); } } else { //app = percentEncode(c, UTF8); if (start < i) { writer.write(string, start, i-start); } start = i+1; percentEncode(writer, c, UTF8); } } else if (c == '?' || c == '#') { if (i+1 < string.length()) { // The remaining part of the URI are data that should be encoded // using the document character encoding. //app = c + encodeURIQuery(string.substring(i+1), characterEncoding); if (start < i) { writer.write(string, start, i-start); } start = i+1; writer.write(c); //encodeURIQuery(writer, string.substring(i+1), characterEncoding); encodeURIQuery(writer, string, i+1, characterEncoding); endLoop = true; } } else { //No encoding, just do nothing, char will be added later. } if (app != null) { //if (sb == null) //{ // sb = new StringBuilder(string.substring(0, i)); //} //sb.append(app); if (start < i) { writer.write(string, start, i-start); } start = i+1; writer.write(app); } //else //{ // if (sb != null) // { // sb.append(c); // } //} if (endLoop) { start = string.length(); break; } } //if (sb == null) //{ // return string; //} //else //{ // return sb.toString(); //} if (start == 0) { writer.write(string); } else if (start < string.length()) { writer.write(string,start,string.length()-start); } } /** * Encode a unicode char value in percentEncode, decoding its bytes using a specified * characterEncoding. * * @param c * @param characterEncoding * @return */ private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException { String app = null; if (c > (char)((short)0x007F)) { //percent encode in the proper encoding to be consistent //app = percentEncodeNonUsAsciiCharacter(writer c, characterEncoding); percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding); } else { //percent encode US-ASCII char (0x00-0x7F range) //app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); writer.write('%'); writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10))); writer.write(HEX_CHARSET.charAt(c % 0x10)); } //return app; } private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(10); StringBuilder builder = new StringBuilder(); try { OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); writer.write(c); writer.flush(); } catch(IOException e) { baos.reset(); return; } byte [] byteArray = baos.toByteArray(); for (int i=0; i < byteArray.length; i++) { //builder.append('%'); //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); currentWriter.write('%'); currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); } //return builder.toString(); } /** * Encode the query part using the document charset encoding provided. * * * @param string * @param characterEncoding * @return */ private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding) throws IOException { //StringBuilder sb = null; //create later on demand int start = offset; int realLength = string.length()-offset; String app; char c; //boolean endLoop = false; for (int i = offset; i < offset+realLength; ++i) { app = null; c = string.charAt(i); // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so // we make easier and omit this one) // - "<" %3C, ">" %3E, // - "\" %5C, "^" %5E, "`" %60 // - "{" %7B, "|" %7C, "}" %7D // - From %7F ad infinitum (each character as many bytes as necessary but take into account // that a single char should contain 2,3 or more bytes!. This data should be encoded // translating from the document character encoding to percent encoding) // // "&" should be encoded as "&" because this link is inside an html page, and // put & is invalid in this context if ( (c <= (char)0x20) || (c >= (char)0x7F) || c == '"' || c == '<' || c == '>' || c == '\\' || c == '^' || c == '`' || c == '{' || c == '|' || c == '}') { // The percent encoding on this part should be done using UTF-8 charset // as RFC 3986 Section 3.2.2 says //app = percentEncode(c, characterEncoding); if (start < i) { writer.write(string, start, i-start); } start = i+1; percentEncode(writer, c, characterEncoding); } else if (c == '%') { if (i + 2 < string.length()) { char c1 = string.charAt(i+1); char c2 = string.charAt(i+2); if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) { // do not percent encode, because it could be already encoded } else { //app = percentEncode(c, characterEncoding); if (start < i) { writer.write(string, start, i-start); } start = i+1; percentEncode(writer, c, characterEncoding); } } else { //app = percentEncode(c, characterEncoding); if (start < i) { writer.write(string, start, i-start); } start = i+1; percentEncode(writer, c, characterEncoding); } } else if (c == '&') { if (i+4 < string.length() ) { if ('a' == string.charAt(i+1) && 'm' == string.charAt(i+2) && 'p' == string.charAt(i+3) && ';' == string.charAt(i+4)) { //Skip } else { app = "&"; } } else { app = "&"; } } else { //No encoding, just do nothing, char will be added later. } if (app != null) { //if (sb == null) //{ // sb = new StringBuilder(string.substring(0, i)); //} //sb.append(app); if (start < i) { writer.write(string, start, i-start); } start = i+1; writer.write(app); } //else //{ // if (sb != null) // { // sb.append(c); // } //} //if (endLoop) //{ // break; //} } //if (sb == null) //{ // return string; //} //else //{ // return sb.toString(); //} if (start == offset) { writer.write(string, offset, realLength); } else if (start < offset+realLength) { writer.write(string,start,offset+realLength-start); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy