All downloads are free. The search and download functionality uses the official Maven repository.

nl.basjes.parse.httpdlog.ApacheHttpdLogFormatDissector Maven / Gradle / Ivy

There is a newer version: 5.11.0
Show newest version
/*
 * Apache HTTPD logparsing made easy
 * Copyright (C) 2011-2016 Niels Basjes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.basjes.parse.httpdlog;

import nl.basjes.parse.core.Casts;
import nl.basjes.parse.httpdlog.dissectors.HttpFirstLineDissector;
import nl.basjes.parse.httpdlog.dissectors.tokenformat.NamedTokenParser;
import nl.basjes.parse.httpdlog.dissectors.tokenformat.TokenFormatDissector;
import nl.basjes.parse.httpdlog.dissectors.tokenformat.TokenParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@SuppressWarnings({
    "PMD.LongVariable", // I like my variable names this way
    "PMD.CyclomaticComplexity", "PMD.OnlyOneReturn",
    "PMD.BeanMembersShouldSerialize", // No beans here
    "PMD.DataflowAnomalyAnalysis" // Results in a lot of mostly useless messages.
    })
public final class ApacheHttpdLogFormatDissector extends TokenFormatDissector {

    private static final Logger LOG = LoggerFactory.getLogger(ApacheHttpdLogFormatDissector.class);

    /**
     * Creates a dissector for the given logformat.
     * The logformat is handed to the {@link TokenFormatDissector} constructor and the
     * input type of this dissector is set to {@code HttpdLogFormatDissector.INPUT_TYPE}.
     *
     * @param logFormat the logformat specification this dissector must handle.
     */
    public ApacheHttpdLogFormatDissector(final String logFormat) {
        super(logFormat);
        this.setInputType(HttpdLogFormatDissector.INPUT_TYPE);
    }

    /**
     * Creates a dissector without a logformat; only the input type is set
     * (to {@code HttpdLogFormatDissector.INPUT_TYPE}).
     */
    public ApacheHttpdLogFormatDissector() {
        super();
        this.setInputType(HttpdLogFormatDissector.INPUT_TYPE);
    }

    /**
     * Replaces a logformat with its expanded form: the mapping is reported at
     * debug level and the expanded form is then passed to the superclass.
     *
     * @param requestedLogformat the logformat as it was originally specified (only used for logging).
     * @param expandedLogformat  the logformat that is actually handed to {@code super.setLogFormat}.
     */
    private void overrideLogFormat(final String requestedLogformat, final String expandedLogformat) {
        // Log first so the mapping is visible even if the superclass rejects the format.
        LOG.debug("Specified logformat \"{}\" was mapped to {}", requestedLogformat, expandedLogformat);
        super.setLogFormat(expandedLogformat);
    }

    /**
     * Sets the logformat, expanding the well known format nicknames from the
     * Apache Httpd manuals (matched case insensitively) to their full definition.
     * Anything that is not one of these nicknames is passed to the superclass unchanged.
     *
     * @param logformat either a nickname (common, combined, combinedio, referer, agent)
     *                  or a full logformat specification.
     */
    @Override
    public void setLogFormat(final String logformat) {
        // Commonly used logformats as documented in the manuals of the Apache Httpd
        // LogFormat "%h %l %u %t \"%r\" %>s %b" common
        // LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
        // LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio
        // LogFormat "%{Referer}i -> %U" referer
        // LogFormat "%{User-agent}i" agent

        // Locale.ROOT: the nicknames are plain ASCII identifiers. With the JVM default
        // locale (for example Turkish) an uppercase 'I' is lowercased to a dotless 'i'
        // and "COMBINEDIO" would no longer match "combinedio".
        switch (logformat.toLowerCase(Locale.ROOT)) {
            case "common":
                overrideLogFormat(logformat, "%h %l %u %t \"%r\" %>s %b");
                break;
            case "combined":
                overrideLogFormat(logformat, "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"");
                break;
            case "combinedio":
                overrideLogFormat(logformat, "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O");
                break;
            case "referer":
                overrideLogFormat(logformat, "%{Referer}i -> %U");
                break;
            case "agent":
                overrideLogFormat(logformat, "%{User-agent}i");
                break;
            default:
                super.setLogFormat(logformat);
                break;
        }
    }

    // --------------------------------------------

    /** Matches every "%{...}" in a logformat; group 1 is the text between the braces. */
    private static final Pattern BRACED_NAME_PATTERN = Pattern.compile("%\\{([^\\}]*)\\}");

    /**
     * Converts the text between the braces of every {@code %{...}} in the logformat to lowercase.
     * Everything outside such braces is copied unchanged.
     *
     * @param logformat the logformat to convert.
     * @return the logformat with all {@code %{...}} names in lowercase.
     */
    protected String makeHeaderNamesLowercaseInLogFormat(String logformat) {
        // In vim I would simply do: %s@{\([^}]*\)}@{\L\1\E@g
        // But such an expression is not (yet) possible in Java
        StringBuffer sb = new StringBuffer(logformat.length());
        Matcher m = BRACED_NAME_PATTERN.matcher(logformat);
        while (m.find()) {
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (for example the Turkish dotless 'i').
            String lowercased = "%{" + m.group(1).toLowerCase(Locale.ROOT) + '}';
            // appendReplacement interprets '$' and '\' in the replacement, so the
            // literal text must be quoted or such names throw or lose characters.
            m.appendReplacement(sb, Matcher.quoteReplacement(lowercased));
        }
        m.appendTail(sb);

        return sb.toString();
    }

    /**
     * Strips the status code modifiers from all tokens in the logformat.
     *
     * @param tokenLogFormat the logformat that may contain modifiers.
     * @return the logformat in which every '%' followed by a modifier is reduced to just '%'.
     */
    protected String removeModifiersFromLogformat(String tokenLogFormat) {
        // Modifiers (from the Apache Httpd documentation):
        // An item can be restricted to responses with specific HTTP status codes by
        // putting a comma-separated list of status codes directly after the "%".
        // A leading "!" negates the list.
        //
        // %400,501{User-agent}i     User-agent is logged only on 400 and 501 errors;
        //                           for all other status codes a literal "-" is logged.
        // %!200,304,302{Referer}i   Referer is logged for every request that does NOT
        //                           return one of these three codes, "-" otherwise.
        //
        // The modifier does not change how a logged value looks, so it is simply dropped.
        final Pattern statusCodeModifier = Pattern.compile("%!?[0-9]{3}(?:,[0-9]{3})*");
        final Matcher modifiers = statusCodeModifier.matcher(tokenLogFormat);
        return modifiers.replaceAll("%");
    }

    /**
     * Wraps every plain {@code %t} in the logformat in square brackets.
     *
     * @param tokenLogFormat the logformat to adjust.
     * @return the logformat in which each "%t" has become "[%t]".
     */
    protected String fixTimestampFormat(String tokenLogFormat) {
        // Apache writes the %t value surrounded by '[' and ']'.
        // By putting the brackets in the format here, the rest of the parsing can
        // work with the clean time format (mainly needed for reuse with Nginx parsing).
        // NOTE: A %{...}t time format does NOT get these brackets; it does not contain
        //       the literal text "%t" so it is left untouched here.
        final String plainTimeToken = "%t";
        final String bracketedTimeToken = "[%t]";
        return tokenLogFormat.replace(plainTimeToken, bracketedTimeToken);
    }

    /**
     * Normalizes a logformat in three steps: the status code modifiers are removed,
     * the names inside {@code %{...}} are lowercased and {@code %t} is wrapped in brackets.
     *
     * @param tokenLogFormat the logformat as specified.
     * @return the normalized logformat.
     */
    @Override
    protected String cleanupLogFormat(String tokenLogFormat) {
        // The modifiers go first: only after "%400,501{User-agent}i" has become
        // "%{User-agent}i" does the name directly follow the '%' as the lowercasing expects.
        final String withoutModifiers = removeModifiersFromLogformat(tokenLogFormat);
        final String withLowercaseNames = makeHeaderNamesLowercaseInLogFormat(withoutModifiers);
        return fixTimestampFormat(withLowercaseNames);
    }

    /** When true a user field containing " - " is treated as 'not specified'. */
    private boolean enableJettyFix = false;

    /**
     * Enables the workaround for logfiles written by Jetty versions that
     * logged an empty user field as " - " instead of "-".
     */
    public void enableJettyFix() {
        this.enableJettyFix = true;
    }

    /**
     * Converts a raw value taken from a logline into the value that is handed on.
     *
     * @param tokenName the name of the token the value was extracted for.
     * @param value     the raw text as found in the logline.
     * @return {@code null} when the value means 'not specified' ("-", or " - " for the
     *         user field when the Jetty fix is enabled); the input itself when it is
     *         {@code null} or empty; the decoded value for the first line and the
     *         request/response headers; otherwise the unchanged value.
     */
    @Override
    public String decodeExtractedValue(String tokenName, String value) {
        if (value == null || value.isEmpty()) {
            return value;
        }

        // In Apache logfiles a '-' means a 'not specified' / 'empty' value.
        if (value.equals("-")) {
            return null;
        }

        // Jetty has suffered from this historical problem:
        // An empty user field was logged in the past as " - " instead of "-"
        // See: https://github.com/eclipse/jetty.project/commit/2332b4f
        if (this.enableJettyFix
            && "connection.client.user".equals(tokenName)
            && value.equals(" - ")) {
            return null;
        }

        // http://httpd.apache.org/docs/current/mod/mod_log_config.html#formats
        // Format Notes
        // For security reasons, starting with version 2.0.46, non-printable and other special characters
        // in %r, %i and %o are escaped using \xhh sequences, where hh stands for the hexadecimal representation of
        // the raw byte. Exceptions from this rule are " and \, which are escaped by prepending a backslash, and
        // all whitespace characters, which are written in their C-style notation (\n, \t, etc).
        // In versions prior to 2.0.46, no escaping was performed on these strings so you had to be quite careful
        // when dealing with raw log files.

        // The decision must be based on the NAME of the token, not on the logged value:
        // testing the value meant the escaped fields were (almost) never decoded.
        // NOTE(review): the explicit overrides (request.user-agent, request.referer,
        // request.cookies, response.cookies) are also %i/%o values but do not match
        // these prefixes - confirm whether they should be decoded here too.
        if (tokenName != null
            && (tokenName.equals("request.firstline")      // %r         First line of request.
                || tokenName.startsWith("request.header.")     // %{Foobar}i The contents of Foobar: request header line(s).
                || tokenName.startsWith("response.header."))) { // %{Foobar}o The contents of Foobar: response header line(s).
            return Utils.decodeApacheHTTPDLogValue(value);
        }

        return value;
    }

    // --------------------------------------------
    /**
     * Creates the parsers for all Apache HTTPD logformat tokens this dissector knows.
     * Each parser binds a token from the logformat to the name and type of the
     * value it yields, the allowed casts and the pattern the logged value must match.
     *
     * @return a new list with a parser for every supported token.
     */
    @Override
    protected List<TokenParser> createAllTokenParsers() {
        List<TokenParser> parsers = new ArrayList<>(60);

        // Quote from
        // http://httpd.apache.org/docs/2.2/mod/mod_log_config.html#logformat
        // Format String Description
        // -------
        // %% The percent sign
        parsers.add(new FixedStringTokenParser("%%", "%"));

        // -------
        // %a Remote IP-address
        parsers.add(new TokenParser("%a",
                "connection.client.ip", "IP",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_IP));

        // %{c}a Underlying peer IP address of the connection (see the mod_remoteip module).
        parsers.add(new TokenParser("%{c}a",
                "connection.client.peerip", "IP",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_IP));

        // -------
        // %A Local IP-address
        parsers.add(new TokenParser("%A",
                "connection.server.ip", "IP",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_IP));

        // -------
        // %B Size of response in bytes, excluding HTTP headers.
        parsers.add(new TokenParser("%B",
                "response.body.bytes", "BYTES",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %b Size of response in bytes, excluding HTTP headers. In CLF format,
        // i.e. a '-' rather than a 0 when no bytes are sent.
        parsers.add(new TokenParser("%b",
                "response.body.bytesclf", "BYTES",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_NUMBER));

        // -------
        // %{Foobar}C The contents of cookie Foobar in the request sent to the server.
        parsers.add(new NamedTokenParser("\\%\\{([a-z0-9\\-_]*)\\}C",
                "request.cookies.", "HTTP.COOKIE",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %D The time taken to serve the request, in microseconds.
        parsers.add(new TokenParser("%D",
                "server.process.time", "MICROSECONDS",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %{FOOBAR}e The contents of the environment variable FOOBAR
        parsers.add(new NamedTokenParser("\\%\\{([a-z0-9\\-_]*)\\}e",
                "server.environment.", "VARIABLE",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %f Filename
        parsers.add(new TokenParser("%f",
                "server.filename", "FILENAME",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %h Remote host
        parsers.add(new TokenParser("%h",
                "connection.client.host", "IP",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %H The request protocol
        parsers.add(new TokenParser("%H",
                "request.protocol", "PROTOCOL",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %{Foobar}i The contents of Foobar: header line(s) in the request sent
        // to the server. Changes made by other modules (e.g. mod_headers)
        // affect this.
        parsers.add(new NamedTokenParser("\\%\\{([a-z0-9\\-_]*)\\}i",
                "request.header.", "HTTP.HEADER",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %k Number of keepalive requests handled on this connection.
        // Interesting if KeepAlive is being used, so that, for example,
        // a '1' means the first keepalive request after the initial one,
        // '2' the second, etc...;
        // otherwise this is always 0 (indicating the initial request).
        // Available in versions 2.2.11 and later.
        parsers.add(new TokenParser("%k",
                "connection.keepalivecount", "NUMBER",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %l Remote logname (from identd, if supplied). This will return a dash
        // unless mod_ident is present and IdentityCheck is set On.
        parsers.add(new TokenParser("%l",
                "connection.client.logname", "NUMBER",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_NUMBER));

        // -------
        // %L The request log ID from the error log (or '-' if nothing has been logged to the error log for this request).
        // Look for the matching error log line to see what request caused what error.
        parsers.add(new TokenParser("%L",
                "request.errorlogid", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %m The request method
        parsers.add(new TokenParser("%m",
                "request.method", "HTTP.METHOD",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %{Foobar}n The contents of note Foobar from another module.
        parsers.add(new NamedTokenParser("\\%\\{([a-z0-9\\-_]*)\\}n",
                "server.module_note.", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %{Foobar}o The contents of Foobar: header line(s) in the response.
        // The '_' is accepted in the name, just like for the other named tokens
        // (%{..}C, %{..}e, %{..}i and %{..}n).
        parsers.add(new NamedTokenParser("\\%\\{([a-z0-9\\-_]*)\\}o",
                "response.header.", "HTTP.HEADER",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %p The canonical port of the server serving the request
        parsers.add(new TokenParser("%p",
                "request.server.port.canonical", "PORT",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %{format}p The canonical port of the server serving the request or
        // the server's actual port or the client's actual port. Valid formats
        // are canonical, local, or remote.
        parsers.add(new TokenParser("%{canonical}p",
                "connection.server.port.canonical", "PORT",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{local}p",
                "connection.server.port", "PORT",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{remote}p",
                "connection.client.port", "PORT",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %P The process ID of the child that serviced the request.
        parsers.add(new TokenParser("%P",
                "connection.server.child.processid", "NUMBER",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %{format}P The process ID or thread id of the child that serviced the
        // request. Valid formats are pid, tid, and hextid. hextid requires
        // APR 1.2.0 or higher.
        parsers.add(new TokenParser("%{pid}P",
                "connection.server.child.processid", "NUMBER",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{tid}P",
                "connection.server.child.threadid", "NUMBER",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{hextid}P",
                "connection.server.child.hexthreadid", "NUMBER",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_HEXNUMBER));

        // -------
        // %q The query string (prepended with a ? if a query string exists,
        // otherwise an empty string)
        parsers.add(new TokenParser("%q",
                "request.querystring", "HTTP.QUERYSTRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %r First line of request
        parsers.add(new TokenParser("%r",
                "request.firstline", "HTTP.FIRSTLINE",
                Casts.STRING_ONLY, HttpFirstLineDissector.FIRSTLINE_REGEX));

        // -------
        // %R The handler generating the response (if any).
        parsers.add(new TokenParser("%R",
                "request.handler", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %s Status. For requests that got internally redirected, this is the
        // status of the *original* request --- %>s for the last.
        parsers.add(new TokenParser("%s",
                "request.status.original", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        parsers.add(new TokenParser("%>s",
                "request.status.last", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %t Time the request was received (standard english format)
        parsers.add(new TokenParser("%t",
                "request.receive.time", "TIME.STAMP",
                Casts.STRING_ONLY, TokenParser.FORMAT_STANDARD_TIME_US));

        // %{format}t The time, in the form given by format, which should be in
        // strftime(3) format. (potentially localized)
        // FIXME: Implement %{format}t "should be in strftime(3) format. (potentially localized)"
        // If the format starts with begin: (default) the time is taken at the beginning of the request processing.
        // If it starts with end: it is the time when the log entry gets written, close to the end of the request processing.
        // In addition to the formats supported by strftime(3), the following format tokens are supported:
        // sec number of seconds since the Epoch
        // msec number of milliseconds since the Epoch
        // usec number of microseconds since the Epoch
        // msec_frac millisecond fraction
        // usec_frac microsecond fraction
        // These tokens can not be combined with each other or strftime(3) formatting in the same format string.
        // You can use multiple %{format}t tokens instead.

        // msec
        parsers.add(new TokenParser("%{msec}t",
                "request.receive.time.begin.msec", "TIME.EPOCH",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{begin:msec}t",
                "request.receive.time.begin.msec", "TIME.EPOCH",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{end:msec}t",
                "request.receive.time.end.msec", "TIME.EPOCH",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // usec
        parsers.add(new TokenParser("%{usec}t",
                "request.receive.time.begin.usec", "TIME.EPOCH.USEC",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{begin:usec}t",
                "request.receive.time.begin.usec", "TIME.EPOCH.USEC",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{end:usec}t",
                "request.receive.time.end.usec", "TIME.EPOCH.USEC",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // msec_frac
        parsers.add(new TokenParser("%{msec_frac}t",
                "request.receive.time.begin.msec_frac", "TIME.EPOCH",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{begin:msec_frac}t",
                "request.receive.time.begin.msec_frac", "TIME.EPOCH",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{end:msec_frac}t",
                "request.receive.time.end.msec_frac", "TIME.EPOCH",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // usec_frac
        parsers.add(new TokenParser("%{usec_frac}t",
                "request.receive.time.begin.usec_frac", "TIME.EPOCH.USEC_FRAC",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{begin:usec_frac}t",
                "request.receive.time.begin.usec_frac", "TIME.EPOCH.USEC_FRAC",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        parsers.add(new TokenParser("%{end:usec_frac}t",
                "request.receive.time.end.usec_frac", "TIME.EPOCH.USEC_FRAC",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // This next parser is created to only extract the localized string !!!
        parsers.add(new NamedTokenParser("\\%\\{[^\\}]*\\}t",
                "request.receive.time", "TIME.LOCALIZEDSTRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_LOCALIZED_TIME)
            .setWarningMessageWhenUsed("Fully parsing localized timestamps is NOT yet supported")
        );

        // -------
        // %T The time taken to serve the request, in seconds.
        parsers.add(new TokenParser("%T",
                "response.server.processing.time", "SECONDS",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_NUMBER));

        // -------
        // %u Remote user (from auth; may be bogus if return status (%s) is 401)
        parsers.add(new TokenParser("%u",
                "connection.client.user", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING));

        // -------
        // %U The URL path requested, not including any query string.
        parsers.add(new TokenParser("%U",
                "request.urlpath", "URI",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %v The canonical ServerName of the server serving the request.
        parsers.add(new TokenParser("%v",
                "connection.server.name.canonical", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %V The server name according to the UseCanonicalName setting.
        parsers.add(new TokenParser("%V",
                "connection.server.name", "STRING",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %X Connection status when response is completed:
        // X = connection aborted before the response completed.
        // + = connection may be kept alive after the response is sent.
        // - = connection will be closed after the response is sent.
        // (This directive was %c in late versions of Apache 1.3, but this
        // conflicted with the historical ssl %{var}c syntax.)
        parsers.add(new TokenParser("%X",
                "response.connection.status", "HTTP.CONNECTSTATUS",
                Casts.STRING_ONLY, TokenParser.FORMAT_NO_SPACE_STRING));

        // -------
        // %I Bytes received, including request and headers, cannot be zero. You
        // need to enable mod_logio to use this.
        // NOTE: In reality this CAN be 0 (in case of HTTP 408 error code)
        parsers.add(new TokenParser("%I",
                "request.bytes", "BYTES",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_NUMBER));

        // -------
        // %O Bytes sent, including headers, cannot be zero. You need to enable
        // mod_logio to use this.
        // NOTE: In reality this CAN be 0 (in case of HTTP 408 error code)
        parsers.add(new TokenParser("%O",
                "response.bytes", "BYTES",
                Casts.STRING_OR_LONG, TokenParser.FORMAT_CLF_NUMBER));

        // -------
        // %S Bytes transferred (received and sent), including request and headers, cannot be zero.
        // This is the combination of %I and %O. You need to enable mod_logio to use this.
        // TODO: Implement %S. I have not been able to test this one yet.
//        parsers.add(new TokenParser("%S",
//                "total.bytes", "BYTES",
//                Casts.STRING_OR_LONG, TokenParser.FORMAT_NON_ZERO_NUMBER));

        // -------
        // %{VARNAME}^ti The contents of VARNAME: trailer line(s) in the request sent to the server.
        // TODO: Implement %{VARNAME}^ti

        // -------
        // %{VARNAME}^to The contents of VARNAME: trailer line(s) in the response sent from the server.
        // TODO: Implement %{VARNAME}^to

        // Some explicit type overrides.
        // The '1' at the end indicates this is more important than the default TokenParser (which has an implicit 0).
        parsers.add(new TokenParser("%{cookie}i",
                "request.cookies",    "HTTP.COOKIES",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING, 1));
        parsers.add(new TokenParser("%{set-cookie}o",
                "response.cookies",   "HTTP.SETCOOKIES",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING, 1));
        parsers.add(new TokenParser("%{user-agent}i",
                "request.user-agent", "HTTP.USERAGENT",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING, 1));
        parsers.add(new TokenParser("%{referer}i",
                "request.referer",    "HTTP.URI",
                Casts.STRING_ONLY, TokenParser.FORMAT_STRING, 1));

        return parsers;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy