// org.archive.url.LaxURLCodec Maven / Gradle / Ivy
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.url;
import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;
import org.apache.commons.codec.net.URLCodec;
import com.google.common.base.Charsets;
/**
* @author gojomo
*/
public class LaxURLCodec extends URLCodec {
public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
// passthrough constructor
public LaxURLCodec(String encoding) {
super(encoding);
}
/**
* Decodes an array of URL safe 7-bit characters into an array of
* original bytes. Escaped characters are converted back to their
* original representation.
*
* Differs from URLCodec.decodeUrl() in that it throws no
* exceptions; bad or incomplete escape sequences are ignored
* and passed into result undecoded. This matches the behavior
* of browsers, which will use inconsistently-encoded URIs
* in HTTP request-lines.
*
* @param bytes array of URL safe characters
* @return array of original bytes
*/
public static final byte[] decodeUrlLoose(byte[] bytes)
{
if (bytes == null) {
return null;
}
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
for (int i = 0; i < bytes.length; i++) {
int b = bytes[i];
if (b == '+') {
buffer.write(' ');
continue;
}
if (b == '%') {
if(i+2 -1 && l > -1) {
// good encoding
int c = ((u << 4) + l);
buffer.write((char)c);
i += 2;
continue;
} // else: bad encoding digits, leave '%' in place
} // else: insufficient encoding digits, leave '%' in place
}
buffer.write(b);
}
return buffer.toByteArray();
}
/**
* A more expansive set of ASCII URI characters to consider as 'safe' to
* leave unencoded, based on actual browser behavior.
*/
public static BitSet EXPANDED_URI_SAFE = new BitSet(256);
static {
// alpha characters
for (int i = 'a'; i <= 'z'; i++) {
EXPANDED_URI_SAFE.set(i);
}
for (int i = 'A'; i <= 'Z'; i++) {
EXPANDED_URI_SAFE.set(i);
}
// numeric characters
for (int i = '0'; i <= '9'; i++) {
EXPANDED_URI_SAFE.set(i);
}
// special chars
EXPANDED_URI_SAFE.set('-');
EXPANDED_URI_SAFE.set('~');
EXPANDED_URI_SAFE.set('_');
EXPANDED_URI_SAFE.set('.');
EXPANDED_URI_SAFE.set('*');
EXPANDED_URI_SAFE.set('/');
EXPANDED_URI_SAFE.set('=');
EXPANDED_URI_SAFE.set('&');
EXPANDED_URI_SAFE.set('+');
EXPANDED_URI_SAFE.set(',');
EXPANDED_URI_SAFE.set(':');
EXPANDED_URI_SAFE.set(';');
EXPANDED_URI_SAFE.set('@');
EXPANDED_URI_SAFE.set('$');
EXPANDED_URI_SAFE.set('!');
EXPANDED_URI_SAFE.set(')');
EXPANDED_URI_SAFE.set('(');
// experiments indicate: Firefox (1.0.6) never escapes '%'
EXPANDED_URI_SAFE.set('%');
// experiments indicate: Firefox (1.0.6) does not escape '|' or '''
EXPANDED_URI_SAFE.set('|');
EXPANDED_URI_SAFE.set('\'');
}
public static BitSet QUERY_SAFE = new BitSet(256);
static {
QUERY_SAFE.or(EXPANDED_URI_SAFE);
// Tests indicate Firefox (1.0.7-1) doesn't escape curlies in query str.
QUERY_SAFE.set('{');
QUERY_SAFE.set('}');
// nor any of these: [ ] ^ ?
QUERY_SAFE.set('[');
QUERY_SAFE.set(']');
QUERY_SAFE.set('^');
QUERY_SAFE.set('?');
}
/**
* Encodes a string into its URL safe form using the specified
* string charset. Unsafe characters are escaped.
*
* This method is analogous to superclass encode() methods,
* additionally offering the ability to specify a different
* 'safe' character set (such as EXPANDED_URI_SAFE).
*
* @param safe BitSet of characters that don't need to be encoded
* @param pString String to encode
* @param cs Name of character set to use
* @return Encoded version of pString
.
* @throws UnsupportedEncodingException
*/
public String encode(BitSet safe, String pString, String cs)
throws UnsupportedEncodingException {
if (pString == null) {
return null;
}
return new String(encodeUrl(safe,pString.getBytes(cs)), Charsets.US_ASCII);
}
}