// nl.basjes.parse.httpdlog.dissectors.HttpUriDissector Maven / Gradle / Ivy
/*
* Apache HTTPD & NGINX Access log parsing made easy
* Copyright (C) 2011-2017 Niels Basjes
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.basjes.parse.httpdlog.dissectors;
import nl.basjes.parse.core.Casts;
import nl.basjes.parse.core.Dissector;
import nl.basjes.parse.core.Parsable;
import nl.basjes.parse.core.ParsedField;
import nl.basjes.parse.core.exceptions.DissectionFailure;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.commons.lang3.StringEscapeUtils;
import java.net.URI;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.EnumSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HttpUriDissector extends Dissector {
// --------------------------------------------
private static final String INPUT_TYPE = "HTTP.URI";
@Override
public String getInputType() {
return INPUT_TYPE;
}
// --------------------------------------------
@Override
public List getPossibleOutput() {
List result = new ArrayList<>();
result.add("HTTP.PROTOCOL:protocol");
result.add("HTTP.USERINFO:userinfo");
result.add("HTTP.HOST:host");
result.add("HTTP.PORT:port");
result.add("HTTP.PATH:path");
result.add("HTTP.QUERYSTRING:query");
result.add("HTTP.REF:ref");
return result;
}
// --------------------------------------------
@Override
public boolean initializeFromSettingsParameter(String settings) {
return true; // Everything went right.
}
// --------------------------------------------
@Override
protected void initializeNewInstance(Dissector newInstance) {
// Nothing to do
}
private boolean wantProtocol = false;
private boolean wantUserinfo = false;
private boolean wantHost = false;
private boolean wantPort = false;
private boolean wantPath = false;
private boolean wantQuery = false;
private boolean wantRef = false;
@Override
public EnumSet prepareForDissect(final String inputname, final String outputname) {
String name = extractFieldName(inputname, outputname);
if ("protocol".equals(name)) {
wantProtocol = true;
return Casts.STRING_ONLY;
}
if ("userinfo".equals(name)) {
wantUserinfo = true;
return Casts.STRING_ONLY;
}
if ("host".equals(name)) {
wantHost = true;
return Casts.STRING_ONLY;
}
if ("port".equals(name)) {
wantPort = true;
return Casts.STRING_OR_LONG;
}
if ("path".equals(name)) {
wantPath = true;
return Casts.STRING_ONLY;
}
if ("query".equals(name)) {
wantQuery = true;
return Casts.STRING_ONLY;
}
if ("ref".equals(name)) {
wantRef = true;
return Casts.STRING_ONLY;
}
return null;
}
@Override
public void prepareForRun() {
// We do not do anything extra here
}
// --------------------------------------------
private static BitSet badUriChars = new BitSet(256);
static {
badUriChars.set(0, 255, true);
badUriChars.andNot(org.apache.commons.httpclient.URI.unwise);
badUriChars.andNot(org.apache.commons.httpclient.URI.space);
badUriChars.andNot(org.apache.commons.httpclient.URI.control);
badUriChars.set('<', false);
badUriChars.set('>', false);
badUriChars.set('"', false);
}
// Match % encoded chars that are NOT followed by hex chars (may be at the end of the string)
private static final Pattern BAD_EXCAPE_PATTERN = Pattern.compile("%([^0-9a-fA-F]|[0-9a-fA-F][^0-9a-fA-F]|.$|$)");
private static final Pattern EQUALS_HASH_PATTERN = Pattern.compile("=#");
private static final Pattern HASH_AMP_PATTERN = Pattern.compile("#&");
private static final Pattern DOUBLE_HASH_PATTERN = Pattern.compile("#(.*)#");
private static final Pattern ALMOST_HTML_ENCODED = Pattern.compile("([^&])(#x[0-9a-fA-F][0-9a-fA-F];)");
@Override
public void dissect(final Parsable> parsable, final String inputname) throws DissectionFailure {
final ParsedField field = parsable.getParsableField(INPUT_TYPE, inputname);
String uriString = field.getValue().getString();
if (uriString == null || uriString.isEmpty()) {
return; // Nothing to do here
}
// First we cleanup the URI so we fail less often over 'garbage' URIs.
// See: http://stackoverflow.com/questions/11038967/brackets-in-a-request-url-are-legal-but-not-in-a-uri-java
try {
uriString = URIUtil.encode(uriString, badUriChars, "UTF-8");
} catch (URIException e) {
throw new DissectionFailure("Failed to parse URI >>" + field.getValue().getString()+"<< because of : " +e.getMessage());
}
// Before we hand it to the standard parser we hack it around a bit so we can parse
// nasty edge cases that are illegal yet do occur in real clickstreams.
// Also we force the query string to start with ?& so the returned query string starts with &
// Which leads to more consistent output after parsing.
int firstQuestionMark = uriString.indexOf('?');
int firstAmpersand = uriString.indexOf('&');
// Now we can have one of 3 situations:
// 1) No query string
// 2) Query string starts with a '?'
// (and optionally followed by one or more '&' or '?' )
// 3) Query string starts with a '&'. This is invalid but does occur!
// We may have ?x=x&y=y?z=z so we normalize it always
// to: ?&x=x&y=y&z=z
if (firstAmpersand != -1 || firstQuestionMark != -1) {
uriString = uriString.replaceAll("\\?", "&");
uriString = uriString.replaceFirst("&", "?&");
}
// We find that people muck up the URL by putting % signs in the URLs that are NOT escape sequences
// So any % that is not followed by a two 'hex' letters is fixed
uriString = BAD_EXCAPE_PATTERN.matcher(uriString).replaceAll("%25$1");
uriString = BAD_EXCAPE_PATTERN.matcher(uriString).replaceAll("%25$1");
// We have URIs with fragments like this:
// /path/?_requestid=1234#x3D;12341234&Referrer=blablabla
// So first we repair the broken encoded char
uriString = ALMOST_HTML_ENCODED.matcher(uriString).replaceAll("$1&$2");
uriString = StringEscapeUtils.unescapeHtml4(uriString);
// And we see URIs with this:
// /path/?Referrer=ADV1234#&f=API&subid=#&name=12341234
uriString = EQUALS_HASH_PATTERN.matcher(uriString).replaceAll("=");
uriString = HASH_AMP_PATTERN.matcher(uriString).replaceAll("&");
// If we still have multiple '#' in here we replace them with something else: '~'
while (true) {
Matcher doubleHashMatcher = DOUBLE_HASH_PATTERN.matcher(uriString);
if (!doubleHashMatcher.find()) {
break;
}
uriString = doubleHashMatcher.replaceAll("~$1#");
}
boolean isUrl = true;
URI uri;
try {
if (uriString.charAt(0) == '/') {
uri = URI.create("dummy-protocol://dummy.host.name" + uriString);
isUrl = false; // I.e. we do not return the values we just faked.
} else {
uri = URI.create(uriString);
}
} catch (IllegalArgumentException e) {
throw new DissectionFailure("Failed to parse URI >>" + field.getValue().getString()+"<< because of : " +e.getMessage());
}
if (wantQuery || wantPath || wantRef) {
if (wantQuery) {
String query = uri.getRawQuery();
if (query == null) {
query = "";
}
parsable.addDissection(inputname, "HTTP.QUERYSTRING", "query", query);
}
if (wantPath) {
parsable.addDissection(inputname, "HTTP.PATH", "path", uri.getPath());
}
if (wantRef) {
parsable.addDissection(inputname, "HTTP.REF", "ref", uri.getFragment());
}
}
if (isUrl) {
if (wantProtocol) {
parsable.addDissection(inputname, "HTTP.PROTOCOL", "protocol", uri.getScheme());
}
if (wantUserinfo) {
parsable.addDissection(inputname, "HTTP.USERINFO", "userinfo", uri.getUserInfo());
}
if (wantHost) {
parsable.addDissection(inputname, "HTTP.HOST", "host", uri.getHost());
}
if (wantPort) {
if (uri.getPort() != -1) {
parsable.addDissection(inputname, "HTTP.PORT", "port", uri.getPort());
}
}
}
}
// --------------------------------------------
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy