org.owasp.esapi.codecs.PercentCodec Maven / Gradle / Ivy

/**
 * OWASP Enterprise Security API (ESAPI)
 * 
 * This file is part of the Open Web Application Security Project (OWASP)
 * Enterprise Security API (ESAPI) project. For details, please see
 * http://www.owasp.org/index.php/ESAPI.
 *
 * Copyright (c) 2007 - The OWASP Foundation
 * 
 * The ESAPI is published by OWASP under the BSD license. You should read and accept the
 * LICENSE before you use, modify, and/or redistribute this software.
 * 
 * @author Jeff Williams Aspect Security
 * @created 2007
 */
package org.owasp.esapi.codecs;

import java.io.UnsupportedEncodingException;
import java.util.Set;

import org.owasp.esapi.util.CollectionsUtil;

/**
 * Implementation of the Codec interface for percent encoding (aka URL encoding).
 * 
 * @author Jeff Williams (jeff.williams .at. aspectsecurity.com) Aspect Security
 * @since June 1, 2007
 * @see org.owasp.esapi.Encoder
 */
public class PercentCodec extends AbstractCharacterCodec
{
	private static final String ALPHA_NUMERIC_STR = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
	@SuppressWarnings("unused")
	private static final String RFC3986_RESERVED_STR = ":/?#[]@!$&'()*+,;=";
	private static final String RFC3986_NON_ALPHANUMERIC_UNRESERVED_STR = "-._~";
		// rfc3986 2.3: For consistency, percent-encoded octets
		// in the ranges of ALPHA (%41-%5A and %61-%7A), DIGIT
		// (%30-%39), hyphen (%2D), period (%2E), underscore
		// (%5F), or tilde (%7E) should not be created by URI
		// producers
	private static final boolean ENCODED_NON_ALPHA_NUMERIC_UNRESERVED = true;
	private static final String UNENCODED_STR = ALPHA_NUMERIC_STR +
		(ENCODED_NON_ALPHA_NUMERIC_UNRESERVED ? "" : RFC3986_NON_ALPHANUMERIC_UNRESERVED_STR);
	private static final Set UNENCODED_SET = CollectionsUtil.strToUnmodifiableSet(UNENCODED_STR);

	/**
	 * Convinence method to encode a string into UTF-8. This
	 * wraps the {@link UnsupportedEncodingException} that
	 * {@link String#getBytes(String)} throws in a
	 * {@link IllegalStateException} as UTF-8 support is required
	 * by the Java spec and should never throw this exception.
	 * @param str the string to encode
	 * @return str encoded in UTF-8 as bytes.
	 * @throws IllegalStateException wrapped {@link
	 *	UnsupportedEncodingException} if
	 *	{@link String.getBytes(String)} throws it.
	 */
	private static byte[] toUtf8Bytes(String str)
	{
		try
		{
			return str.getBytes("UTF-8");
		}
		catch(UnsupportedEncodingException e)
		{
			throw new IllegalStateException("The Java spec requires UTF-8 support.", e);
		}
	}

	/**
	 * Append the two upper case hex characters for a byte.
	 * @param sb The string buffer to append to.
	 * @param b The byte to hexify
	 * @return sb with the hex characters appended.
	 */
	// rfc3986 2.1: For consistency, URI producers 
	// should use uppercase hexadecimal digits for all percent-
	// encodings.
	private static StringBuilder appendTwoUpperHex(StringBuilder sb, int b)
	{
		if(b < Byte.MIN_VALUE || b > Byte.MAX_VALUE)
			throw new IllegalArgumentException("b is not a byte (was " + b + ')');
		b &= 0xFF;
		if(b<0x10)
			sb.append('0');
		return sb.append(Integer.toHexString(b).toUpperCase());
	}

	/**
	 * Encode a character for URLs
	 * @param immune Additional characters not to encode. Note this could
     *               break URL encoding as referenced in RFC 3986. You should
     *               especially be wary of including '%' in this list of immune
     *               characters since it is used as the "escape" character for
     *               the hex encoding and including it may result in subsequent
     *               and/or dangerous results when decoding.
	 * @param c character to encode
	 * @return the encoded string representing c
	 */
	public String encodeCharacter( char[] immune, Character c )
	{
		String cStr = String.valueOf(c.charValue());
		byte[] bytes;
		StringBuilder sb;

        // check for user specified immune characters
        if ( immune != null && containsCharacter( c.charValue(), immune ) )
            return cStr;

        // check for standard characters (e.g., alphanumeric, etc.)
		if(UNENCODED_SET.contains(c))
			return cStr;

		bytes = toUtf8Bytes(cStr);
		sb = new StringBuilder(bytes.length * 3);
		for(byte b : bytes)
			appendTwoUpperHex(sb.append('%'), b);
		return sb.toString();
	}

	/**
	 * {@inheritDoc}
	 * 
	 * Formats all are legal both upper/lower case:
	 *   %hh;
	 *   
	 * @param input
	 * 			encoded character using percent characters (such as URL encoding)
	 */
	public Character decodeCharacter( PushbackSequence input ) {
		input.mark();
		Character first = input.next();
		if ( first == null ) {
			input.reset();
			return null;
		}

		// if this is not an encoded character, return null
		if (first != '%' ) {
			input.reset();
			return null;
		}

		// Search for exactly 2 hex digits following
		StringBuilder sb = new StringBuilder();
		for ( int i=0; i<2; i++ ) {
			Character c = input.nextHex();
			if ( c != null ) sb.append( c );
		}
		if ( sb.length() == 2 ) {
			try {
				// parse the hex digit and create a character
				int i = Integer.parseInt(sb.toString(), 16);
				if (Character.isValidCodePoint(i)) {
					return (char) i;
				}
			} catch( NumberFormatException ignored ) { }
		}
		input.reset();
		return null;
	}

}