All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.datatorrent.lib.logs.ApacheLogParseOperator Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.logs;

import java.text.ParseException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.annotation.OperatorAnnotation;
import com.datatorrent.api.annotation.Stateless;
import com.datatorrent.common.util.BaseOperator;

/**
 * Parse Apache log lines one line at a time. 
 * Regex (getAccessLogRegex) is used as a parser. 
 * The fields extracted include i/p (outputIPAddress), url (outputUrl),
 * status code (outputStatusCode), bytes (outputBytes), referer (outputReferer),
 * and agent (outputAgent).
 * 

* This is a pass through operator
*
* StateFull : No
* Partitions : Yes, No dependency among input values.
*
* Ports:
* data: expects String
* outputIPAddress: emits String
* outputUrl: emits String
* outputStatusCode: emits String
* outputBytes: emits String
* outputReferer: emits String
* outputAgent: emits String
*
* Properties: none
*

* @displayName Apache Log Parse * @category Tuple Converters * @tags apache, parse * * @since 0.3.3 */ @Stateless @OperatorAnnotation(partitionable = true) public class ApacheLogParseOperator extends BaseOperator { /** * This is the input port which receives apache log lines. */ public final transient DefaultInputPort data = new DefaultInputPort() { @Override public void process(String s) { try { processTuple(s); } catch (ParseException ex) { // ignore } } }; /** * Client IP address, output port. */ public final transient DefaultOutputPort outputIPAddress = new DefaultOutputPort(); /** * Access url port, output port. */ public final transient DefaultOutputPort outputUrl = new DefaultOutputPort(); /** * Apache status log, output port. */ public final transient DefaultOutputPort outputStatusCode = new DefaultOutputPort(); /** * Number of bytes served, output port. */ public final transient DefaultOutputPort outputBytes = new DefaultOutputPort(); /** * Referer name, output port. */ public final transient DefaultOutputPort outputReferer = new DefaultOutputPort(); /** * IP Agent, output port. */ public final transient DefaultOutputPort outputAgent = new DefaultOutputPort(); /** * Get apache log pattern regex. * @return regex string. */ protected static String getAccessLogRegex() { String regex1 = "^([\\d\\.]+)"; // Client IP String regex2 = " (\\S+)"; // - String regex3 = " (\\S+)"; // - String regex4 = " \\[([\\w:/]+\\s[+\\-]\\d{4})\\]"; // Date String regex5 = " \"[A-Z]+ (.+?) HTTP/\\S+\""; // url String regex6 = " (\\d{3})"; // HTTP code String regex7 = " (\\d+)"; // Number of bytes String regex8 = " \"([^\"]+)\""; // Referer String regex9 = " \"([^\"]+)\""; // Agent String regex10 = ".*"; // ignore the rest return regex1 + regex2 + regex3 + regex4 + regex5 + regex6 + regex7 + regex8 + regex9 + regex10; } /** * Parses Apache combined access log, and prints out the following
* 1. Requester IP
* 2. Date of Request
* 3. Requested Page Path * * @param line * : tuple to parsee * @throws ParseException */ public void processTuple(String line) throws ParseException { // Apapche log attaributes on each line. String url; String httpStatusCode; long numOfBytes; String referer; String agent; String ipAddr; // Parse each log line. Pattern accessLogPattern = Pattern.compile(getAccessLogRegex(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher accessLogEntryMatcher; accessLogEntryMatcher = accessLogPattern.matcher(line); if (accessLogEntryMatcher.matches()) { ipAddr = accessLogEntryMatcher.group(1); url = accessLogEntryMatcher.group(5); httpStatusCode = accessLogEntryMatcher.group(6); numOfBytes = Long.parseLong(accessLogEntryMatcher.group(7)); referer = accessLogEntryMatcher.group(8); agent = accessLogEntryMatcher.group(9); outputIPAddress.emit(ipAddr); outputUrl.emit(url); outputStatusCode.emit(httpStatusCode); outputBytes.emit(numOfBytes); outputReferer.emit(referer); outputAgent.emit(agent); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy