ch.sentric.PercentCodec Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of url-normalization_2.10 Show documentation
url-normalization
The newest version!
/**
 * Copyright 2013 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ch.sentric;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.Charset;

/**
 * 
 * Percent-encoding, also known as URL encoding, is a mechanism for encoding
 * information in a Uniform Resource Identifier (URI) under certain
 * circumstances. Although it is known as URL encoding it is, in fact, used more
 * generally within the main Uniform Resource Identifier (URI) set, which
 * includes both Uniform Resource Locator (URL) and Uniform Resource Name (URN).
 * As such it is also used in the preparation of data of the
 * "application/x-www-form-urlencoded" media type, as is often used in the
 * submission of HTML form data in HTTP requests.

 * See http://en.wikipedia.org/wiki/Percent-encoding.
 * 
 * Some of this code has been copied from the bixo project
 * src/main/java/bixo/urldb/SimpleUrlNormalizer.java.
 */
public class PercentCodec {
    // !...*...'...(...)...;...:...@...&...=...+...$...,.../...?...%...#...[
    // %21 %2A %27 %28 %29 %3B %3A %40 %26 %3D %2B %24 %2C %2F %3F %25 %23 %5B
    // ]
    // %5D

    // You only need to encode "reserved purpose" characters, and that sub-set
    // of the reserved chars varies depending upon the protocol and the
    // component. Since we only are really worried about normalizing http(s)
    // URLs Not really sure about ':' and '?' being reserved in queries private

    private static final String RESERVED_QUERY_CHARS = "%&;=:?#";
    private static final String RESERVED_PATH_CHARS = "%/?#";
    private static final String HEX_CODES = "0123456789abcdefABCDEF";

    public String encodePathPart(final String pathPart) {
	return encode(pathPart, RESERVED_PATH_CHARS);
    }

    public String encodeQueryComponent(final String queryComponent) {
	return encode(queryComponent, RESERVED_QUERY_CHARS);
    }

    public String encode(final String component, final String reservedChars) {
	final StringBuilder result = new StringBuilder();
	for (int i = 0; i < component.length();) {
	    final int codePoint = component.codePointAt(i);
	    if (codePoint == 0x0020) {
		result.append('+');
	    } else if (codePoint >= 0x007F) {
		result.append(encodeCodePoint(codePoint));
	    } else if ((codePoint < 0x0020) || (reservedChars.indexOf((char) codePoint) != -1)) {
		result.append(String.format("%%%02x", codePoint));
	    } else {
		result.append((char) codePoint);
	    }

	    i += Character.charCount(codePoint);
	}

	return result.toString();
    }

    public String decode(final String url) {
	// FUTURE - handle unsupported %uHHHH sequences for Unicode code points.
	// FUTURE - detect & handle incorrectly encoded URLs

	// First, try to catch unescaped '%' characters.
	final String result = escapeIsolatedPercentSigns(url);

	try {
	    return URLDecoder.decode(result, Charset.defaultCharset().toString());
	} catch (final UnsupportedEncodingException e) {
	    throw new IllegalStateException("Unexpected exception during URL decoding", e);
	}
    }

    private static String encodeCodePoint(final int codepoint) {
	try {
	    final int[] codepoints = { codepoint };
	    final byte[] bytes = new String(codepoints, 0, 1).getBytes(Charset.defaultCharset().toString());

	    final StringBuilder result = new StringBuilder();
	    for (final byte value : bytes) {
		result.append(String.format("%%%02x", value));
	    }

	    return result.toString();
	} catch (final UnsupportedEncodingException e) {
	    throw new IllegalStateException("Unexpected exception during URL encoding", e);
	}
    }

    private static String escapeIsolatedPercentSigns(final String in) {
	StringBuilder builder = null;
	int offset = 0;
	int lastOffset = 0;
	while ((offset = in.indexOf('%', offset)) != -1) {

	    offset += 1;

	    if (offset > in.length() - 2 || HEX_CODES.indexOf(in.charAt(offset)) == -1 || HEX_CODES.indexOf(in.charAt(offset + 1)) == -1) {
		if (null == builder) {
		    builder = new StringBuilder();
		}
		builder.append(in.substring(lastOffset, offset));
		builder.append("25");
		lastOffset = offset;
	    }
	}
	if (null != builder) {
	    builder.append(in.substring(lastOffset));
	    return builder.toString();
	}

	return in;
    }
}