main.java.io.mola.galimatias.URLParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of galimatias Show documentation
Show all versions of galimatias Show documentation
Fork of the galimatias URL parsing and normalization library.
The newest version!
/*
* Copyright (c) 2013 Santiago M. Mola
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package io.mola.galimatias;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import static io.mola.galimatias.URLUtils.*;
final class URLParser {
// Base URL used when the input turns out to be relative; null when the caller supplied none.
private final URL base;
// Raw text to be parsed.
private final String input;
// Existing URL whose components seed parse(); null when the caller supplied none.
private final URL url;
// State in which parse() starts instead of SCHEME_START; null selects SCHEME_START.
private final ParseURLState stateOverride;
// Parsing settings; provides the error handler used by the handle*Error methods.
private URLParsingSettings settings;
// Index (in chars) of the first position of input that is considered.
private int startIdx;
// Index (in chars) one past the last position of input that is considered.
private int endIdx;
// Current position in input, in chars.
private int idx;
// True when idx is at or beyond endIdx (maintained by setIdx).
private boolean isEOF;
// Code point at idx, or 0x00 when at EOF or before startIdx (maintained by setIdx).
private int c;
/**
 * Creates a parser for {@code input} with no base URL, no seed URL and no state override.
 *
 * @param input text to parse
 */
public URLParser(final String input) {
this(null, input, null, null);
}
/**
 * Creates a parser for {@code input} that uses {@code base} for relative input,
 * with no seed URL and no state override.
 *
 * @param base  base URL; may be null
 * @param input text to parse
 */
public URLParser(final URL base, final String input) {
this(base, input, null, null);
}
/**
 * Creates a parser with no base URL that starts from an existing URL and an
 * explicit start state.
 *
 * @param input         text to parse
 * @param url           URL whose components seed the parse; may be null
 * @param stateOverride state in which parsing starts; may be null
 */
public URLParser(final String input, final URL url, final ParseURLState stateOverride) {
this(null, input, url, stateOverride);
}
/**
 * Canonical constructor; the other constructors delegate here.
 * Stores the arguments as given and installs the settings returned by
 * {@code URLParsingSettings.create()}.
 *
 * @param base          base URL; may be null
 * @param input         text to parse (a null value is only rejected later, by parse())
 * @param url           URL whose components seed the parse; may be null
 * @param stateOverride state in which parsing starts; may be null
 */
public URLParser(final URL base, final String input, final URL url, final ParseURLState stateOverride) {
this.base = base;
this.input = input;
this.url = url;
this.stateOverride = stateOverride;
this.settings = URLParsingSettings.create();
}
/**
 * Replaces the parsing settings used by this parser.
 *
 * @param settings settings to use from now on; not checked for null here
 * @return this parser, to allow call chaining
 */
public URLParser settings(final URLParsingSettings settings) {
this.settings = settings;
return this;
}
/**
 * Parse URL states as defined by the WHATWG URL spec. These are the states
 * that the state machine in {@code parse()} switches on; one of them may also
 * be passed to the constructor as a state override to start parsing there.
 *
 * http://url.spec.whatwg.org/#scheme-start-state
 */
public static enum ParseURLState {
SCHEME_START,
SCHEME,
SCHEME_DATA,
NO_SCHEME,
RELATIVE_OR_AUTHORITY,
RELATIVE,
RELATIVE_SLASH,
AUTHORITY_FIRST_SLASH,
AUTHORITY_SECOND_SLASH,
AUTHORITY_IGNORE_SLASHES,
AUTHORITY,
FILE_HOST,
HOST,
PORT,
RELATIVE_PATH_START,
RELATIVE_PATH,
QUERY,
FRAGMENT
}
/**
 * Moves the cursor to {@code i} and refreshes the derived state:
 * {@code isEOF} becomes true when {@code i} is at or past {@code endIdx}, and
 * {@code c} becomes the code point at {@code i}, or 0x00 when the cursor is at
 * EOF or before {@code startIdx}.
 *
 * @param i new cursor position, in chars
 */
private void setIdx(final int i) {
    this.idx = i;
    this.isEOF = !(i < endIdx);
    if (isEOF || idx < startIdx) {
        // Outside the parsed window there is no character to read.
        this.c = 0x00;
    } else {
        this.c = input.codePointAt(i);
    }
}
/**
 * Advances the cursor past the current code point {@code c}
 * (one or two chars, as reported by {@code Character.charCount}).
 */
private void incIdx() {
    setIdx(this.idx + Character.charCount(this.c));
}
/**
 * Moves the cursor back by one code point. At or before {@code startIdx}
 * there is no preceding code point to measure, so the cursor simply moves
 * back by one char.
 */
private void decrIdx() {
    final int step = (idx <= startIdx)
            ? 1
            : Character.charCount(this.input.codePointBefore(idx));
    setIdx(this.idx - step);
}
/**
 * Looks at the char at position {@code i} without moving the cursor.
 *
 * @param i position in {@code input}, in chars
 * @return the char at {@code i}, or 0x00 when {@code i} is at or past {@code endIdx}
 */
private char at(final int i) {
    return (i < endIdx) ? input.charAt(i) : 0x00;
}
/**
 * Passes a recoverable parse error to the error handler from the current settings.
 *
 * @param parseException the error to report
 * @throws GalimatiasParseException declared because the handler's {@code error} call may throw
 */
private void handleError(GalimatiasParseException parseException) throws GalimatiasParseException {
this.settings.errorHandler().error(parseException);
}
/**
 * Reports a recoverable parse error with the given message at the current position.
 *
 * @param message description of the problem
 * @throws GalimatiasParseException propagated from {@code handleError(GalimatiasParseException)}
 */
private void handleError(String message) throws GalimatiasParseException {
handleError(new GalimatiasParseException(message, idx));
}
/**
 * Passes a fatal parse error to the error handler's {@code fatalError} and then
 * throws it, so this method never returns normally.
 *
 * @param parseException the error to report and throw
 * @throws GalimatiasParseException always
 */
private void handleFatalError(GalimatiasParseException parseException) throws GalimatiasParseException {
this.settings.errorHandler().fatalError(parseException);
throw parseException;
}
/**
 * Reports and throws a fatal parse error with the given message at the current position.
 *
 * @param message description of the problem
 * @throws GalimatiasParseException always (see {@code handleFatalError(GalimatiasParseException)})
 */
private void handleFatalError(String message) throws GalimatiasParseException {
handleFatalError(new GalimatiasParseException(message, idx));
}
/**
 * Reports a recoverable INVALID_PERCENT_ENCODING error at the current position.
 *
 * @throws GalimatiasParseException propagated from {@code handleError}
 */
private void handleInvalidPercentEncodingError() throws GalimatiasParseException {
    final GalimatiasParseException invalidEncoding = GalimatiasParseException.builder()
            .withMessage("Percentage (\"%\") is not followed by two hexadecimal digits")
            .withParseIssue(ParseIssue.INVALID_PERCENT_ENCODING)
            .withPosition(idx)
            .build();
    handleError(invalidEncoding);
}
/**
 * Reports a recoverable BACKSLASH_AS_DELIMITER error at the current position.
 *
 * @throws GalimatiasParseException propagated from {@code handleError}
 */
private void handleBackslashAsDelimiterError() throws GalimatiasParseException {
    final GalimatiasParseException backslashDelimiter = GalimatiasParseException.builder()
            .withMessage("Backslash (\"\\\") used as path segment delimiter")
            .withParseIssue(ParseIssue.BACKSLASH_AS_DELIMITER)
            .withPosition(idx)
            .build();
    handleError(backslashDelimiter);
}
/**
 * Reports a recoverable ILLEGAL_WHITESPACE error at the current position.
 *
 * @throws GalimatiasParseException propagated from {@code handleError}
 */
private void handleIllegalWhitespaceError() throws GalimatiasParseException {
    final GalimatiasParseException illegalWhitespace = GalimatiasParseException.builder()
            .withMessage("Tab, new line or carriage return found")
            .withParseIssue(ParseIssue.ILLEGAL_WHITESPACE)
            .withPosition(idx)
            .build();
    handleError(illegalWhitespace);
}
/**
 * Reports a recoverable ILLEGAL_CHARACTER error at the current position.
 * The message is completed with a description of the offending code point;
 * space, tab, line break and carriage return are spelled out by name, any
 * other code point is quoted literally.
 *
 * @param message   prefix describing where the character was found
 * @param codePoint the offending code point
 * @throws GalimatiasParseException propagated from {@code handleError}
 */
private void handleIllegalCharacterError(String message, int codePoint) throws GalimatiasParseException {
    final String detail;
    switch (codePoint) {
        case ' ':
            detail = ": space is not allowed";
            break;
        case '\t':
            detail = ": tab is not allowed";
            break;
        case '\n':
            detail = ": line break is not allowed";
            break;
        case '\r':
            detail = ": carriage return is not allowed";
            break;
        default:
            detail = ": \u201c" + new String(Character.toChars(codePoint)) + "\u201d is not allowed";
            break;
    }
    final GalimatiasParseException illegalCharacter = GalimatiasParseException.builder()
            .withMessage(message + detail)
            .withParseIssue(ParseIssue.ILLEGAL_CHARACTER)
            .withPosition(idx)
            .build();
    handleError(illegalCharacter);
}
/**
 * Reports and throws a fatal MISSING_SCHEME error at the current position.
 *
 * @throws GalimatiasParseException always (via {@code handleFatalError})
 */
private void handleFatalMissingSchemeError() throws GalimatiasParseException {
    final GalimatiasParseException missingScheme = GalimatiasParseException.builder()
            .withMessage("Missing scheme")
            .withPosition(idx)
            .withParseIssue(ParseIssue.MISSING_SCHEME)
            .build();
    handleFatalError(missingScheme);
}
/**
 * Reports and throws a fatal ILLEGAL_CHARACTER error at the current position.
 * The offending code point is quoted literally in the message.
 *
 * @param message   prefix describing where the character was found
 * @param codePoint the offending code point
 * @throws GalimatiasParseException always (via {@code handleFatalError})
 */
private void handleFatalIllegalCharacterError(String message, int codePoint) throws GalimatiasParseException {
    final String offending = new String(Character.toChars(codePoint));
    final String fullMessage = message + ": \u201c" + offending + "\u201d is not allowed";
    final GalimatiasParseException illegalCharacter = GalimatiasParseException.builder()
            .withMessage(fullMessage)
            .withParseIssue(ParseIssue.ILLEGAL_CHARACTER)
            .withPosition(idx)
            .build();
    handleFatalError(illegalCharacter);
}
/**
 * Reports and throws a fatal INVALID_HOST error at the current position,
 * keeping the original exception as the cause.
 *
 * @param exception the failure raised while parsing the host
 * @throws GalimatiasParseException always (via {@code handleFatalError})
 */
private void handleFatalInvalidHostError(Exception exception) throws GalimatiasParseException {
    final String fullMessage = "Invalid host: " + exception.getMessage();
    final GalimatiasParseException invalidHost = GalimatiasParseException.builder()
            .withMessage(fullMessage)
            .withParseIssue(ParseIssue.INVALID_HOST)
            .withPosition(idx)
            .withCause(exception)
            .build();
    handleFatalError(invalidHost);
}
// Based on http://src.chromium.org/viewvc/chrome/trunk/src/url/third_party/mozilla/url_parse.cc
// http://url.spec.whatwg.org/#parsing
//
/**
 * Runs the URL parsing state machine over {@code input} and builds a URL from
 * the collected components.
 *
 * Components start from those of the seed {@code url} when one was given.
 * Leading and trailing whitespace (as defined by {@code Character.isWhitespace})
 * is excluded from the parsed window. Parsing starts in {@code stateOverride}
 * when given, otherwise in SCHEME_START, and processes one code point per
 * iteration until the cursor moves past the end of the window or a state
 * requests termination.
 *
 * Recoverable problems are passed to the configured error handler through
 * {@code handleError}; fatal problems are reported and thrown through
 * {@code handleFatalError}.
 *
 * All "reprocess the current code point in the next state" steps go through
 * {@code decrIdx()} so that {@code c} is refreshed together with {@code idx};
 * the following {@code incIdx()} then lands on a code point boundary even when
 * the reprocessed code point is outside the BMP.
 *
 * @return a new URL built from the parsed components
 * @throws NullPointerException     if {@code input} is null
 * @throws GalimatiasParseException on a fatal parse error, or if the error handler
 *                                  throws for a recoverable one
 */
public URL parse() throws GalimatiasParseException {
    if (input == null) {
        throw new NullPointerException("null input");
    }

    final StringBuilder buffer = new StringBuilder(input.length()*2);
    // NOTE(review): encodingOverride is assigned but never read in this method.
    String encodingOverride = "utf-8";
    String scheme = (url == null)? null : url.scheme();
    StringBuilder schemeData = (url == null)? new StringBuilder() : new StringBuilder(url.schemeData());
    String username = (url == null)? null : url.username();
    String password = (url == null)? null : url.password();
    Host host = (url == null)? null : url.host();
    int port = (url == null)? -1 : url.port();
    boolean relativeFlag = (url != null) && url.isHierarchical();
    boolean atFlag = false; // @-flag
    boolean bracketsFlag = false; // []-flag
    // NOTE(review): raw List; the elements added below are Strings. Confirm against
    // URL.pathSegments() before parameterizing.
    List pathSegments = (url == null || stateOverride == ParseURLState.RELATIVE_PATH_START)? new ArrayList() : url.pathSegments();
    StringBuilder query = (url == null || url.query() == null || stateOverride == ParseURLState.QUERY)? null : new StringBuilder(url.query());
    StringBuilder fragment = (url == null || url.fragment() == null|| stateOverride == ParseURLState.FRAGMENT)? null : new StringBuilder(url.fragment());
    final StringBuilder usernameBuffer = new StringBuilder(buffer.length());
    StringBuilder passwordBuffer = null;

    endIdx = input.length();
    setIdx(startIdx);

    // Skip leading and trailing spaces
    while (Character.isWhitespace(c)) {
        incIdx();
        startIdx++;
    }
    while (endIdx > startIdx && Character.isWhitespace(input.charAt(endIdx - 1))) {
        endIdx--;
    }

    ParseURLState state = (stateOverride == null)? ParseURLState.SCHEME_START : stateOverride;

    // WHATWG URL 5.2.8: Keep running the following state machine by switching on state, increasing pointer by one
    // after each time it is run, as long as pointer does not point past the end of input.
    boolean terminate = false;
    while (!terminate) {
        if (idx > endIdx) {
            break;
        }
        //log.trace("STATE: {} | IDX: {} | C: {} | {}", state.name(), idx, c, new String(Character.toChars(c)));
        switch (state) {

            case SCHEME_START: {
                // WHATWG URL .8.1: If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
                if (isASCIIAlpha(c)) {
                    buffer.appendCodePoint(Character.toLowerCase(c));
                    state = ParseURLState.SCHEME;
                } else {
                    // WHATWG URL .8.2: Otherwise, if state override is not given, set state to no scheme state,
                    // and decrease pointer by one.
                    if (stateOverride == null) {
                        state = ParseURLState.NO_SCHEME;
                        decrIdx();
                    } else {
                        handleFatalError("Scheme must start with alpha character.");
                    }
                }
                break;
            }

            case SCHEME: {
                // WHATWG URL .8.1: If c is an ASCII alphanumeric, "+", "-", or ".", append c, lowercased, to buffer.
                if (isASCIIAlphanumeric(c) || c == '+' || c == '-' || c == '.') {
                    buffer.appendCodePoint(Character.toLowerCase(c));
                }
                // WHATWG URL .8.2: Otherwise, if c is ":", set url's scheme to buffer, buffer to the empty string,
                // and then run these substeps:
                else if (c == ':') {
                    scheme = buffer.toString();
                    buffer.setLength(0);
                    // WHATWG URL .1: If state override is given, terminate this algorithm.
                    if (stateOverride != null) {
                        terminate = true;
                        break;
                    }
                    // WHATWG URL .2: If url's scheme is a relative scheme, set url's relative flag.
                    relativeFlag = isRelativeScheme(scheme);
                    //XXX: This is a deviation from the URL Specification in its current form, in favour of
                    // URIs as specified in RFC 3986. That is, if we find scheme://, we expect a hierarchical URI.
                    // See https://www.w3.org/Bugs/Public/show_bug.cgi?id=24170
                    //TODO: We left this out to pass W3C's web-platform-tests. It should probably be back for old RFCs?
                    //if (!relativeFlag) {
                    // relativeFlag = input.regionMatches(idx + 1, "//", 0, 2);
                    //}
                    // WHATWG URL .3: If url's scheme is "file", set state to relative state.
                    if ("file".equals(scheme)) {
                        state = ParseURLState.RELATIVE;
                    }
                    // WHATWG URL .4: Otherwise, if url's relative flag is set, base is not null and base's
                    // scheme is equal to url's scheme, set state to relative or authority state.
                    else if (relativeFlag && base != null && base.scheme().equals(scheme)) {
                        state = ParseURLState.RELATIVE_OR_AUTHORITY;
                    }
                    // WHATWG URL .5: Otherwise, if url's relative flag is set, set state to authority first slash state.
                    else if (relativeFlag) {
                        state = ParseURLState.AUTHORITY_FIRST_SLASH;
                    }
                    // WHATWG URL .6: Otherwise, set state to scheme data state.
                    else {
                        state = ParseURLState.SCHEME_DATA;
                    }
                }
                // WHATWG URL: Otherwise, if state override is not given, set buffer to the empty string,
                // state to no scheme state, and start over (from the first code point in input).
                else if (stateOverride == null) {
                    buffer.setLength(0);
                    state = ParseURLState.NO_SCHEME;
                    idx = -1; // Sentinel: the code after the switch restarts at startIdx
                }
                // WHATWG URL: Otherwise, if c is the EOF code point, terminate this algorithm.
                else if (isEOF) {
                    terminate = true;
                }
                // WHATWG URL: Otherwise, parse error, terminate this algorithm.
                else {
                    handleFatalIllegalCharacterError("Illegal character in scheme", c);
                }
                break;
            }

            case SCHEME_DATA: {
                // WHATWG URL: If c is "?", set url's query to the empty string and state to query state.
                if (c == '?') {
                    query = new StringBuilder();
                    state = ParseURLState.QUERY;
                }
                // WHATWG URL: Otherwise, if c is "#", set url's fragment to the empty string and state to fragment state.
                else if (c == '#') {
                    fragment = new StringBuilder();
                    state = ParseURLState.FRAGMENT;
                }
                // WHATWG URL: Otherwise, run these substeps:
                else {
                    // WHATWG URL: If c is not the EOF code point, not a URL code point, and not "%", parse error.
                    if (!isEOF && c != '%' && !isURLCodePoint(c)) {
                        handleIllegalCharacterError("Illegal character in scheme data", c);
                    }
                    if (c == '%') {
                        // WHATWG URL: If c is "%" and remaining does not start with two ASCII hex digits, parse error.
                        if (!isASCIIHexDigit(at(idx+1)) || !isASCIIHexDigit(at(idx+2))) {
                            handleInvalidPercentEncodingError();
                        } else {
                            // Copy the escape with upper-cased hex digits and skip over it.
                            schemeData.append((char)c)
                                    .append(Character.toUpperCase(input.charAt(idx+1)))
                                    .append(Character.toUpperCase(input.charAt(idx+2)));
                            setIdx(idx+2);
                            break;
                        }
                    }
                    // WHATWG URL: If c is none of EOF code point, U+0009, U+000A, and U+000D, utf-8 percent encode
                    // c using the simple encode set, and append the result to url's scheme data.
                    if (!isEOF && c != 0x0009 && c != 0x000A && c != 0x000D) {
                        utf8PercentEncode(c, EncodeSet.SIMPLE, schemeData);
                    }
                    //TODO: Shouldn't the "else" clause give parse error?
                }
                break;
            }

            case NO_SCHEME: {
                if (base == null || !isRelativeScheme(base.scheme())) {
                    handleFatalMissingSchemeError();
                }
                state = ParseURLState.RELATIVE;
                // Reprocess the current code point in the relative state.
                decrIdx();
                break;
            }

            case RELATIVE_OR_AUTHORITY: {
                if (c == '/' && at(idx+1) == '/') {
                    state = ParseURLState.AUTHORITY_IGNORE_SLASHES;
                    // Skip the second slash; c is still '/', so the increment after the switch moves one more char.
                    idx++;
                } else {
                    handleError("Relative scheme (" + scheme + ") is not followed by \"://\"");
                    state = ParseURLState.RELATIVE;
                    decrIdx();
                }
                break;
            }

            case RELATIVE: {
                relativeFlag = true;
                if (!"file".equals(scheme)) {
                    scheme = (base == null)? null : base.scheme();
                }
                if (isEOF) {
                    host = (base == null)? null : base.host();
                    port = (base == null || base.port() == base.defaultPort())? -1 : base.port();
                    pathSegments = (base == null)? null : base.pathSegments();
                    query = (base == null || base.query() == null)? null : new StringBuilder(base.query());
                } else if (c == '/' || c == '\\') {
                    if (c == '\\') {
                        handleBackslashAsDelimiterError();
                    }
                    state = ParseURLState.RELATIVE_SLASH;
                } else if (c == '?') {
                    host = (base == null)? null : base.host();
                    port = (base == null || base.port() == base.defaultPort())? -1 : base.port();
                    pathSegments = (base == null)? null : base.pathSegments();
                    query = new StringBuilder();
                    state = ParseURLState.QUERY;
                } else if (c == '#') {
                    host = (base == null)? null : base.host();
                    port = (base == null || base.port() == base.defaultPort())? -1 : base.port();
                    pathSegments = (base == null)? null : base.pathSegments();
                    query = (base == null || base.query() == null)? null : new StringBuilder(base.query());
                    fragment = new StringBuilder();
                    state = ParseURLState.FRAGMENT;
                } else {
                    // Inherit from base unless the input looks like a file-scheme drive letter
                    // (letter followed by ':' or '|' and then a delimiter).
                    if (!"file".equals(scheme) ||
                            !isASCIIAlpha(c) ||
                            (at(idx+1) != ':' && at(idx+1) != '|') ||
                            (idx + 1 == endIdx - 1) ||
                            (idx + 2 < endIdx &&
                                    at(idx+2) != '/' && at(idx+2) != '\\' && at(idx+2) != '?' && at(idx+2) != '#')
                            ) {
                        host = (base == null)? null : base.host();
                        port = (base == null || base.port() == base.defaultPort())? -1 : base.port();
                        pathSegments = (base == null)? new ArrayList() : base.pathSegments();
                        // Pop path
                        if (!pathSegments.isEmpty()) {
                            pathSegments.remove(pathSegments.size() - 1);
                        }
                    }
                    state = ParseURLState.RELATIVE_PATH;
                    decrIdx();
                }
                break;
            }

            case RELATIVE_SLASH: {
                if (c == '/' || c == '\\') {
                    if (c == '\\') {
                        handleBackslashAsDelimiterError();
                    }
                    if ("file".equals(scheme)) {
                        state = ParseURLState.FILE_HOST;
                    } else {
                        state = ParseURLState.AUTHORITY_IGNORE_SLASHES;
                    }
                } else {
                    if (!"file".equals(scheme)) {
                        host = (base == null)? null : base.host();
                        port = (base == null || base.port() == base.defaultPort())? -1 : base.port();
                    }
                    state = ParseURLState.RELATIVE_PATH;
                    decrIdx();
                }
                break;
            }

            case AUTHORITY_FIRST_SLASH: {
                if (c == '/') {
                    state = ParseURLState.AUTHORITY_SECOND_SLASH;
                } else {
                    handleError("Expected a slash (\"/\")");
                    state = ParseURLState.AUTHORITY_IGNORE_SLASHES;
                    decrIdx();
                }
                break;
            }

            case AUTHORITY_SECOND_SLASH: {
                if (c == '/') {
                    state = ParseURLState.AUTHORITY_IGNORE_SLASHES;
                } else {
                    handleError("Expected a slash (\"/\")");
                    state = ParseURLState.AUTHORITY_IGNORE_SLASHES;
                    decrIdx();
                }
                break;
            }

            case AUTHORITY_IGNORE_SLASHES: {
                if (c != '/' && c != '\\') {
                    state = ParseURLState.AUTHORITY;
                    decrIdx();
                } else {
                    handleError("Unexpected slash or backslash");
                }
                break;
            }

            case AUTHORITY: {
                // If c is "@", run these substeps:
                if (c == '@') {
                    if (atFlag) {
                        handleError("User or password contains an at symbol (\"@\") not percent-encoded");
                        buffer.insert(0, "%40");
                    }
                    atFlag = true;

                    // Walk the collected user info by code point, not by char, so that a
                    // supplementary character is validated and percent-encoded as a whole
                    // instead of as two lone surrogates.
                    int i = 0;
                    while (i < buffer.length()) {
                        final int otherChar = buffer.codePointAt(i);
                        final int pos = i;
                        i += Character.charCount(otherChar);

                        if (
                                otherChar == 0x0009 ||
                                otherChar == 0x000A ||
                                otherChar == 0x000D
                                ) {
                            handleIllegalWhitespaceError();
                            continue;
                        }
                        if (!isURLCodePoint(otherChar) && otherChar != '%') {
                            handleIllegalCharacterError("Illegal character in user or password", otherChar);
                        }
                        if (otherChar == '%') {
                            if (pos + 2 >= buffer.length() || !isASCIIHexDigit(buffer.charAt(pos+1)) || !isASCIIHexDigit(buffer.charAt(pos+2))) {
                                handleInvalidPercentEncodingError();
                            } else {
                                // Valid escape: normalize the two hex digits to upper case in place.
                                buffer.setCharAt(pos + 1, Character.toUpperCase(buffer.charAt(pos + 1)));
                                buffer.setCharAt(pos + 2, Character.toUpperCase(buffer.charAt(pos + 2)));
                            }
                        }
                        // The first ':' separates username from password and is not itself stored.
                        if (otherChar == ':' && passwordBuffer == null) {
                            passwordBuffer = new StringBuilder(buffer.length() - pos);
                            continue;
                        }
                        if (passwordBuffer != null) {
                            utf8PercentEncode(otherChar, EncodeSet.DEFAULT, passwordBuffer);
                        } else {
                            utf8PercentEncode(otherChar, EncodeSet.DEFAULT, usernameBuffer);
                        }
                    }
                    buffer.setLength(0);
                } else if (isEOF || c == '/' || c == '\\' || c == '?' || c == '#') {
                    // Rewind to just before the buffered text so the host state re-reads it.
                    setIdx(idx - buffer.length() - 1);
                    if (atFlag) {
                        username = usernameBuffer.toString();
                        if (passwordBuffer != null) {
                            password = passwordBuffer.toString();
                        }
                    }
                    buffer.setLength(0);
                    state = ParseURLState.HOST;
                } else {
                    buffer.appendCodePoint(c);
                }
                break;
            }

            case FILE_HOST: {
                if (isEOF || c == '/' || c == '\\' || c == '?' || c == '#') {
                    decrIdx();
                    if (buffer.length() == 2 && isASCIIAlpha(buffer.charAt(0)) &&
                            (buffer.charAt(1) == ':' || buffer.charAt(1) == '|')) {
                        // Drive letter: keep it in buffer so it becomes the first path segment.
                        state = ParseURLState.RELATIVE_PATH;
                    } else if (buffer.length() == 0) {
                        state = ParseURLState.RELATIVE_PATH_START;
                    } else {
                        try {
                            host = Host.parseHost(buffer.toString());
                        } catch (GalimatiasParseException ex) {
                            handleFatalInvalidHostError(ex);
                        }
                        buffer.setLength(0);
                        state = ParseURLState.RELATIVE_PATH_START;
                    }
                } else if (c == 0x0009 || c == 0x000A || c == 0x000D) {
                    handleIllegalWhitespaceError();
                } else {
                    buffer.appendCodePoint(c);
                }
                break;
            }

            case HOST: { //XXX: WHATWG defines HOSTNAME as an alias, useless here.
                if (c == ':' && !bracketsFlag) {
                    try {
                        host = Host.parseHost(buffer.toString());
                    } catch (GalimatiasParseException ex) {
                        handleFatalInvalidHostError(ex);
                    }
                    buffer.setLength(0);
                    state = ParseURLState.PORT;
                    if (stateOverride == ParseURLState.HOST) {
                        terminate = true;
                    }
                } else if (isEOF || c == '/' || c == '\\' || c == '?' || c == '#') {
                    decrIdx();
                    try {
                        host = Host.parseHost(buffer.toString());
                    } catch (GalimatiasParseException ex) {
                        handleFatalInvalidHostError(ex);
                    }
                    buffer.setLength(0);
                    state = ParseURLState.RELATIVE_PATH_START;
                    if (stateOverride != null) {
                        terminate = true;
                    }
                } else if (c == 0x0009 || c == 0x000A || c == 0x000D) {
                    handleIllegalWhitespaceError();
                } else {
                    // Inside [...] a ':' belongs to the host and does not start the port.
                    if (c == '[') {
                        bracketsFlag = true;
                    } else if (c == ']') {
                        bracketsFlag = false;
                    }
                    buffer.appendCodePoint(c);
                }
                break;
            }

            case PORT: {
                if (isASCIIDigit(c)) {
                    buffer.appendCodePoint(c);
                } else if (isEOF || c == '/' || c == '\\' || c == '?' || c == '#') {
                    // Remove leading zeroes
                    while (buffer.length() > 0 && buffer.charAt(0) == 0x0030 && buffer.length() > 1) {
                        buffer.deleteCharAt(0);
                    }
                    //XXX: This is redundant with URL constructor
                    if (buffer.toString().equals(getDefaultPortForScheme(scheme))) {
                        buffer.setLength(0);
                    }
                    if (buffer.length() == 0) {
                        port = -1;
                    } else {
                        // NOTE(review): an out-of-range port is only reported as a recoverable
                        // error; port keeps the parsed value (or its previous value when the
                        // number does not fit in an int). Confirm this is intended.
                        String portMsg = "Port number must be less than 65536";
                        try {
                            port = Integer.parseInt(buffer.toString());
                            if (port > 65535) {
                                handleError(portMsg);
                            }
                        } catch (NumberFormatException e) {
                            handleError(portMsg);
                        }
                    }
                    if (stateOverride != null) {
                        terminate = true;
                    } else {
                        buffer.setLength(0);
                        state = ParseURLState.RELATIVE_PATH_START;
                        decrIdx();
                    }
                } else if (c == 0x0009 || c == 0x000A || c == 0x000D) {
                    handleIllegalWhitespaceError();
                } else {
                    handleFatalIllegalCharacterError("Illegal character in port", c);
                }
                break;
            }

            case RELATIVE_PATH_START: {
                if (c == '\\') {
                    handleBackslashAsDelimiterError();
                }
                state = ParseURLState.RELATIVE_PATH;
                // A leading delimiter is consumed here; anything else is reprocessed as path content.
                if (c != '/' && c != '\\') {
                    decrIdx();
                }
                break;
            }

            case RELATIVE_PATH: {
                if (isEOF || c == '/' || c == '\\' || (stateOverride == null && (c == '?' || c == '#'))) {
                    if (c == '\\') {
                        handleBackslashAsDelimiterError();
                    }
                    // Treat percent-encoded dots as literal dots for "." and ".." detection.
                    final String lowerCasedBuffer = buffer.toString().toLowerCase(Locale.ENGLISH);
                    if ("%2e".equals(lowerCasedBuffer)) {
                        buffer.setLength(0);
                        buffer.append('.');
                    } else if (
                            ".%2e".equals(lowerCasedBuffer) ||
                            "%2e.".equals(lowerCasedBuffer) ||
                            "%2e%2e".equals(lowerCasedBuffer)
                            ) {
                        buffer.setLength(0);
                        buffer.append("..");
                    }
                    if ("..".equals(buffer.toString())) {
                        // Pop path
                        if (!pathSegments.isEmpty()) {
                            pathSegments.remove(pathSegments.size() - 1);
                        }
                        if (c != '/' && c != '\\') {
                            pathSegments.add("");
                        }
                    } else if (".".equals(buffer.toString()) && c != '/' && c != '\\') {
                        pathSegments.add("");
                    } else if (!".".equals(buffer.toString())) {
                        // A file-scheme drive letter written as "X|" is normalized to "X:".
                        if ("file".equals(scheme) && pathSegments.isEmpty() &&
                                buffer.length() == 2 &&
                                isASCIIAlpha(buffer.charAt(0)) &&
                                buffer.charAt(1) == '|') {
                            buffer.setCharAt(1, ':');
                        }
                        pathSegments.add(buffer.toString());
                    }
                    buffer.setLength(0);
                    if (c == '?') {
                        query = new StringBuilder();
                        state = ParseURLState.QUERY;
                    } else if (c == '#') {
                        fragment = new StringBuilder();
                        state = ParseURLState.FRAGMENT;
                    }
                } else if (c == 0x0009 || c == 0x000A || c == 0x000D) {
                    handleIllegalWhitespaceError();
                } else {
                    if (!isURLCodePoint(c) && c != '%') {
                        handleIllegalCharacterError("Illegal character in path segment", c);
                    }
                    if (c == '%') {
                        if (!isASCIIHexDigit(at(idx+1)) || !isASCIIHexDigit(at(idx+2))) {
                            handleInvalidPercentEncodingError();
                        } else {
                            // Copy the escape with upper-cased hex digits and skip over it.
                            buffer.append((char)c)
                                    .append(Character.toUpperCase(input.charAt(idx+1)))
                                    .append(Character.toUpperCase(input.charAt(idx+2)));
                            setIdx(idx+2);
                            break;
                        }
                    }
                    utf8PercentEncode(c, EncodeSet.DEFAULT, buffer);
                }
                break;
            }

            case QUERY: {
                //XXX: When we come from stateOverride, query buffer is null
                if (query == null) {
                    query = new StringBuilder();
                }
                if (isEOF || (stateOverride == null && c == '#')) {
                    if (relativeFlag) {
                        encodingOverride = "utf-8";
                    }
                    // Encode the collected query byte-wise; bytes are signed, so every
                    // non-ASCII UTF-8 byte is negative and falls under "b < 0x21".
                    final byte[] bytes = buffer.toString().getBytes(UTF_8);
                    for (int i = 0; i < bytes.length; i++) {
                        final byte b = bytes[i];
                        if (b < 0x21 || b > 0x7E || b == 0x22 || b == 0x23 || b == 0x3C || b == 0x3E || b == 0x60) {
                            percentEncode(b, query);
                        } else {
                            query.append((char) b);
                        }
                    }
                    buffer.setLength(0);
                    if (c == '#') {
                        fragment = new StringBuilder();
                        state = ParseURLState.FRAGMENT;
                    }
                } else if (c == 0x0009 || c == 0x000A || c == 0x000D) {
                    handleIllegalWhitespaceError();
                } else {
                    if (!isURLCodePoint(c) && c != '%') {
                        handleIllegalCharacterError("Illegal character in query", c);
                    }
                    if (c == '%') {
                        if (!isASCIIHexDigit(at(idx+1)) || !isASCIIHexDigit(at(idx+2))) {
                            handleInvalidPercentEncodingError();
                        } else {
                            // Copy the escape with upper-cased hex digits and skip over it.
                            buffer.append((char)c)
                                    .append(Character.toUpperCase(input.charAt(idx+1)))
                                    .append(Character.toUpperCase(input.charAt(idx+2)));
                            setIdx(idx+2);
                            break;
                        }
                    }
                    buffer.appendCodePoint(c);
                }
                break;
            }

            case FRAGMENT: {
                //XXX: When we come from stateOverride, fragment buffer is null
                if (fragment == null) {
                    fragment = new StringBuilder();
                }
                if (isEOF) {
                    // Do nothing
                } else if (c == 0x0009 || c == 0x000A || c == 0x000D) {
                    handleIllegalWhitespaceError();
                } else {
                    if (!isURLCodePoint(c) && c != '%') {
                        handleIllegalCharacterError("Illegal character in fragment", c);
                    }
                    if (c == '%') {
                        if (!isASCIIHexDigit(at(idx+1)) || !isASCIIHexDigit(at(idx+2))) {
                            handleInvalidPercentEncodingError();
                        } else {
                            // Copy the escape with upper-cased hex digits and skip over it.
                            fragment.append((char)c)
                                    .append(Character.toUpperCase(input.charAt(idx+1)))
                                    .append(Character.toUpperCase(input.charAt(idx+2)));
                            setIdx(idx+2);
                            break;
                        }
                    }
                    utf8PercentEncode(c, EncodeSet.SIMPLE, fragment);
                }
                break;
            }
        }

        // idx == -1 is the "start over" sentinel; otherwise step past the current code point.
        if (idx == -1) {
            setIdx(startIdx);
        } else {
            incIdx();
        }
    }

    return new URL(scheme, schemeData.toString(),
            username, password,
            host, port, pathSegments,
            (query == null)? null : query.toString(),
            (fragment == null)? null : fragment.toString(),
            relativeFlag);
}
/**
 * Percent-encodes the whole of {@code input} using the username encode set.
 * Resets the parsing window to cover the entire input.
 *
 * @return the encoded username
 */
String parseUsername() {
    final StringBuilder encoded = new StringBuilder(input.length() * 2);
    startIdx = 0;
    endIdx = input.length();
    for (setIdx(0); !isEOF; incIdx()) {
        utf8PercentEncode(c, EncodeSet.USERNAME, encoded);
    }
    return encoded.toString();
}
/**
 * Percent-encodes the whole of {@code input} using the password encode set.
 * Resets the parsing window to cover the entire input.
 *
 * @return the encoded password
 */
String parsePassword() {
    final StringBuilder encoded = new StringBuilder(input.length() * 2);
    startIdx = 0;
    endIdx = input.length();
    for (setIdx(0); !isEOF; incIdx()) {
        utf8PercentEncode(c, EncodeSet.PASSWORD, encoded);
    }
    return encoded.toString();
}
/**
 * Selects which code points {@code utf8PercentEncode} percent-encodes.
 * Each set contains the previous one: SIMPLE, then DEFAULT, then PASSWORD,
 * then USERNAME (see the isIn*EncodeSet methods).
 */
private static enum EncodeSet {
SIMPLE,
DEFAULT,
PASSWORD,
USERNAME
}
/**
 * Appends code point {@code c} to {@code buffer}, percent-encoding its UTF-8
 * bytes when {@code c} belongs to the given encode set. A null
 * {@code encodeSet} means "always encode".
 *
 * @param c         code point to append
 * @param encodeSet set deciding whether {@code c} must be encoded; may be null
 * @param buffer    destination
 */
private void utf8PercentEncode(final int c, final EncodeSet encodeSet, final StringBuilder buffer) {
    final boolean mustEncode;
    if (encodeSet == null) {
        mustEncode = true;
    } else {
        switch (encodeSet) {
            case SIMPLE:
                mustEncode = isInSimpleEncodeSet(c);
                break;
            case DEFAULT:
                mustEncode = isInDefaultEncodeSet(c);
                break;
            case PASSWORD:
                mustEncode = isInPasswordEncodeSet(c);
                break;
            case USERNAME:
                mustEncode = isInUsernameEncodeSet(c);
                break;
            default:
                mustEncode = true;
                break;
        }
    }

    if (!mustEncode) {
        buffer.appendCodePoint(c);
        return;
    }

    final String asString = new String(Character.toChars(c));
    for (final byte b : asString.getBytes(UTF_8)) {
        percentEncode(b, buffer);
    }
}
/**
 * Tells whether {@code c} is in the simple encode set: every code point
 * outside the range U+0020 to U+007E inclusive.
 */
private boolean isInSimpleEncodeSet(final int c) {
    return !(c >= 0x0020 && c <= 0x007E);
}
/**
 * Tells whether {@code c} is in the default encode set: the simple encode set
 * plus space, double quote, '#', '<', '>', '?' and backtick.
 */
private boolean isInDefaultEncodeSet(final int c) {
    if (isInSimpleEncodeSet(c)) {
        return true;
    }
    switch (c) {
        case ' ':
        case '"':
        case '#':
        case '<':
        case '>':
        case '?':
        case '`':
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether {@code c} is in the password encode set: the default encode
 * set plus '/', '@' and backslash.
 */
private boolean isInPasswordEncodeSet(final int c) {
    if (isInDefaultEncodeSet(c)) {
        return true;
    }
    switch (c) {
        case '/':
        case '@':
        case '\\':
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether {@code c} is in the username encode set: the password encode
 * set plus ':'.
 */
private boolean isInUsernameEncodeSet(final int c) {
    return c == ':' || isInPasswordEncodeSet(c);
}
}