org.semanticwb.base.util.URLEncoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of SWBBase Show documentation
Code utilities for SemanticWebBuilder
The newest version!
/*
 * SemanticWebBuilder es una plataforma para el desarrollo de portales y aplicaciones de integración,
 * colaboración y conocimiento, que gracias al uso de tecnología semántica puede generar contextos de
 * información alrededor de algún tema de interés o bien integrar información y aplicaciones de diferentes
 * fuentes, donde a la información se le asigna un significado, de forma que pueda ser interpretada y
 * procesada por personas y/o sistemas, es una creación original del Fondo de Información y Documentación
 * para la Industria INFOTEC, cuyo registro se encuentra actualmente en trámite.
 *
 * INFOTEC pone a su disposición la herramienta SemanticWebBuilder a través de su licenciamiento abierto al público (‘open source’),
 * en virtud del cual, usted podrá usarlo en las mismas condiciones con que INFOTEC lo ha diseñado y puesto a su disposición;
 * aprender de él; distribuirlo a terceros; acceder a su código fuente y modificarlo, y combinarlo o enlazarlo con otro software,
 * todo ello de conformidad con los términos y condiciones de la LICENCIA ABIERTA AL PÚBLICO que otorga INFOTEC para la utilización
 * del SemanticWebBuilder 4.0.
 *
 * INFOTEC no otorga garantía sobre SemanticWebBuilder, de ninguna especie y naturaleza, ni implícita ni explícita,
 * siendo usted completamente responsable de la utilización que le dé y asumiendo la totalidad de los riesgos que puedan derivar
 * de la misma.
 *
 * Si usted tiene cualquier duda o comentario sobre SemanticWebBuilder, INFOTEC pone a su disposición la siguiente
 * dirección electrónica:
 *  http://www.semanticwebbuilder.org
 */
package org.semanticwb.base.util;

import java.io.ByteArrayOutputStream;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
import java.security.AccessController;
import org.semanticwb.SWBUtils;
import sun.security.action.GetPropertyAction;

// TODO: Auto-generated Javadoc
/**
 * Utility class for HTML form encoding. This class contains static methods
 * for converting a String to the application/x-www-form-urlencoded MIME
 * format. For more information about HTML form encoding, consult the HTML
 * specification.
 *
 * 
 * When encoding a String, the following rules apply:
 *
 * 

 * 

 * The alphanumeric characters "a" through
 *     "z", "A" through
 *     "Z" and "0"
 *     through "9" remain the same.
 * 
The special characters ".",
 *     "-", "*", and
 *     "_" remain the same.
 * 
The space character " " is
 *     converted into a plus sign "+".
 * 
All other characters are unsafe and are first converted into
 *     one or more bytes using some encoding scheme. Then each byte is
 *     represented by the 3-character string
 *     "%xy", where xy is the
 *     two-digit hexadecimal representation of the byte.
 *     The recommended encoding scheme to use is UTF-8. However,
 *     for compatibility reasons, if an encoding is not specified,
 *     then the default encoding of the platform is used.
 * 
 *
 * 
 * For example using UTF-8 as the encoding scheme the string "The
 * string ü@foo-bar" would get converted to
 * "The+string+%C3%BC%40foo-bar" because in UTF-8 the character
 * ü is encoded as two bytes C3 (hex) and BC (hex), and the
 * character @ is encoded as one byte 40 (hex).
 *
 * @author  Herb Jellinek
 * @version 1.26, 04/15/02
 * @since   JDK1.0
 */
public class URLEncoder
{
    
    /** The dont need encoding. */
    static BitSet dontNeedEncoding;
    
    /** The Constant caseDiff. */
    static final int caseDiff = ('a' - 'A');
    
    /** The dflt enc name. */
    static String dfltEncName = null;

    static
    {

        /* The list of characters that are not encoded has been
         * determined as follows:
         *
         * RFC 2396 states:
         * -----
         * Data characters that are allowed in a URI but do not have a
         * reserved purpose are called unreserved.  These include upper
         * and lower case letters, decimal digits, and a limited set of
         * punctuation marks and symbols.
         *
         * unreserved  = alphanum | mark
         *
         * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
         *
         * Unreserved characters can be escaped without changing the
         * semantics of the URI, but this should not be done unless the
         * URI is being used in a context that does not allow the
         * unescaped character to appear.
         * -----
         *
         * It appears that both Netscape and Internet Explorer escape
         * all special characters from this list with the exception
         * of "-", "_", ".", "*". While it is not clear why they are
         * escaping the other characters, perhaps it is safest to
         * assume that there might be contexts in which the others
         * are unsafe if not escaped. Therefore, we will use the same
         * list. It is also noteworthy that this is consistent with
         * O'Reilly's "HTML: The Definitive Guide" (page 164).
         *
         * As a last note, Intenet Explorer does not encode the "@"
         * character which is clearly not unreserved according to the
         * RFC. We are being consistent with the RFC in this matter,
         * as is Netscape.
         *
         */

        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++)
        {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++)
        {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++)
        {
            dontNeedEncoding.set(i);
        }
        //dontNeedEncoding.set(' '); /* encoding a space to a + is done
        //			    * in the encode() method */
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');

        try
        {
            dfltEncName = (String) AccessController.doPrivileged(
                new GetPropertyAction("file.encoding")
            );
        }catch(Exception e)
        {
            dfltEncName=SWBUtils.TEXT.getDafaultEncoding();
        }
        //System.out.println("dfltEncName:"+dfltEncName);
    }

    /**
     * You can't call the constructor.
     */
    private URLEncoder()
    {
    }

    /**
     * Translates a string into x-www-form-urlencoded
     * format. This method uses the platform's default encoding
     * as the encoding scheme to obtain the bytes for unsafe characters.
     * 
     * @param s the s
     * @return  the translated .
     */
    public static String encode(String s)
    {

        String str = null;

        if (s != null) {
            try
            {
                str = encode(s, dfltEncName);
            } catch (UnsupportedEncodingException e)
            {
                // The system should always have the platform default
            }
        }

        return str;
    }

    /**
     * Translates a string into application/x-www-form-urlencoded
     * format using a specific encoding scheme. This method uses the
     * supplied encoding scheme to obtain the bytes for unsafe
     * characters.
     * 
     * Note: The 
     * World Wide Web Consortium Recommendation states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilites.
     * 
     * @param s the s
     * @param enc the enc
     * @return  the translated .
     * @throws UnsupportedEncodingException the unsupported encoding exception
     * @exception  UnsupportedEncodingException
     * If the named encoding is not supported
     * @see URLDecoder#decode(java.lang.String, java.lang.String)
     * @since 1.4
     */
    public static String encode(String s, String enc)
            throws UnsupportedEncodingException
    {

        boolean needToChange = false;
        boolean wroteUnencodedChar = false;
        int maxBytesPerChar = 10; // rather arbitrary limit, but safe for now
        StringBuffer out = new StringBuffer(s.length());
        ByteArrayOutputStream buf = new ByteArrayOutputStream(maxBytesPerChar);

        BufferedWriter writer =
                new BufferedWriter(new OutputStreamWriter(buf, enc));

        for (int i = 0; i < s.length(); i++)
        {
            int c = (int) s.charAt(i);
            //System.out.println("Examining character: " + c);
            if (dontNeedEncoding.get(c))
            {
                if (c == ' ')
                {
                    c = '+';
                    needToChange = true;
                }
                //System.out.println("Storing: " + c);
                out.append((char) c);
                wroteUnencodedChar = true;
            } else
            {
                // convert to external encoding before hex conversion
                try
                {
                    if (wroteUnencodedChar)
                    { // Fix for 4407610
                        writer = new BufferedWriter(new OutputStreamWriter(buf, enc));
                        wroteUnencodedChar = false;
                    }
                    writer.write(c);
                    /*
                     * If this character represents the start of a Unicode
                     * surrogate pair, then pass in two characters. It's not
                     * clear what should be done if a bytes reserved in the
                     * surrogate pairs range occurs outside of a legal
                     * surrogate pair. For now, just treat it as if it were
                     * any other character.
                     */
                    if (c >= 0xD800 && c <= 0xDBFF)
                    {
                        /*
                          System.out.println(Integer.toHexString(c)
                          + " is high surrogate");
                        */
                        if ((i + 1) < s.length())
                        {
                            int d = (int) s.charAt(i + 1);
                            /*
                              System.out.println("\tExamining "
                              + Integer.toHexString(d));
                            */
                            if (d >= 0xDC00 && d <= 0xDFFF)
                            {
                                /*
                                  System.out.println("\t"
                                  + Integer.toHexString(d)
                                  + " is low surrogate");
                                */
                                writer.write(d);
                                i++;
                            }
                        }
                    }
                    writer.flush();
                } catch (IOException e)
                {
                    buf.reset();
                    continue;
                }
                byte[] ba = buf.toByteArray();
                for (int j = 0; j < ba.length; j++)
                {
                    out.append('%');
                    char ch = Character.forDigit((ba[j] >> 4) & 0xF, 16);
                    // converting to use uppercase letter as part of
                    // the hex value if ch is a letter.
                    if (Character.isLetter(ch))
                    {
                        ch -= caseDiff;
                    }
                    out.append(ch);
                    ch = Character.forDigit(ba[j] & 0xF, 16);
                    if (Character.isLetter(ch))
                    {
                        ch -= caseDiff;
                    }
                    out.append(ch);
                }
                buf.reset();
                needToChange = true;
            }
        }

        return (needToChange ? out.toString() : s);
    }
}