// Many resources are needed to download a project. Please understand that we have to cover our server costs.
// Thank you in advance. The project price is only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.log.syslog;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.hive.common.type.Timestamp;
/**
* A Syslog protocol parser.
* It should be capable of parsing RFC 3164 (BSD syslog) streams as well as RFC 5424 (published in 2009).
* Adapted from https://github.com/spotify/flume-syslog-source2 and modified to match several assumptions
* about the way Hive logs in syslog format (specifically RFC 5424).
*
* This implementation also parses structured data, returns all parsed fields as map and also un-escapes messages.
* This parser also gracefully handles some corner cases, such as an empty 'msg' or a line that starts with '<'
* but is not in valid RFC5424 format.
*
* Assumption:
* 1) This parser assumes that linebreaks '\n' (in stack traces, for example) have been replaced by '\r' so that each
* message is a single line. The reader will replace '\r' with '\n' at read time.
* 2) This parser assumes structured data values are html escaped. So it will html unescape when parsing structured
* data. (hive writes log lines directly to stderr that look like rfc5424 layout starting with '<' so the expectation
* from log4j2 is to escape those lines using html escaping).
* 3) Read event returns List<Object> conforming to sys.logs table schema in hive. The schema for sys.logs table is
* expected to be (facility STRING, severity STRING, version STRING, ts TIMESTAMP, hostname STRING, app_name STRING,
* proc_id STRING, msg_id STRING, structured_data map<STRING,STRING>, msg BINARY, unmatched BINARY)
* 4) Timestamps are in UTC
*
* This parser is tested with Log4j2's RFC5424 layout generated using the following properties
* appenders = console
* appender.console.layout.type = Rfc5424Layout
* appender.console.layout.appName = ${env:APP_NAME}
* appender.console.layout.facility = USER
* appender.console.layout.includeMDC = true
* appender.console.layout.mdcId = mdc
* appender.console.layout.messageId = ${env:MSG_ID}
* appender.console.layout.newLine = true
* appender.console.layout.newLineEscape = \\r
* appender.console.layout.exceptionPattern = %ex{full}
* appender.console.layout.loggerfields.type = LoggerFields
* appender.console.layout.loggerfields.pairs1.type = KeyValuePair
* appender.console.layout.loggerfields.pairs1.key = level
* appender.console.layout.loggerfields.pairs1.value = %p
* appender.console.layout.loggerfields.pairs2.type = KeyValuePair
* appender.console.layout.loggerfields.pairs2.key = thread
* appender.console.layout.loggerfields.pairs2.value = %enc{%t}
* appender.console.layout.loggerfields.pairs3.type = KeyValuePair
* appender.console.layout.loggerfields.pairs3.key = class
* appender.console.layout.loggerfields.pairs3.value = %c{2}
*/
public class SyslogParser implements Closeable {
// RFC 5424 section 6.
// SYSLOG-MSG format
// PRI VERSION SP TIMESTAMP SP HOSTNAME SP APP-NAME SP PROCID SP MSGID SP STRUCTURED-DATA [SP MSG]
// facility + severity forms PRI
// Version 0 is the RFC 3164 format, 1 for RFC 5424 format.
// Read event returns the following schema
// facility STRING
// severity STRING
// version STRING
// ts TIMESTAMP
// hostname STRING
// app_name STRING
// proc_id STRING
// msg_id STRING
// structured_data map
// msg BINARY
// unmatched BINARY
// ---- Constants ----

// Number of columns in the event schema listed above (facility through unmatched).
private static final int EXPECTED_COLUMNS = 11;

// Highest syslog protocol version handled, as defined in RFC 5424.
private static final int MAX_SUPPORTED_VERSION = 1;

// Syslog facility names.
// NOTE(review): presumably indexed by the numeric facility code decoded from PRI - confirm at the lookup site.
private static final String[] FACILITIES = {
  "KERN", "USER", "MAIL", "DAEMON", "AUTH", "SYSLOG", "LPR", "NEWS",
  "UUCP", "CRON", "AUTHPRIV", "FTP", "NTP", "AUDIT", "ALERT", "CLOCK",
  "LOCAL0", "LOCAL1", "LOCAL2", "LOCAL3", "LOCAL4", "LOCAL5", "LOCAL6", "LOCAL7"
};

private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
private static final Charset UTF8 = StandardCharsets.UTF_8;

// ---- Per-parser state ----

// Stream the syslog bytes are read from; null when the no-arg constructor was used.
private InputStream in;

// True to parse the "tag[pid]:" prefix, false to leave it as part of the message body.
private boolean parseTag;

// Encoding used for the various byte-to-string conversions.
private Charset charset;

// Push-back buffer; holds -1 while it is empty.
private int pushBack = -1;
/**
 * Construct a Syslog protocol parser that has no input stream attached.
 * Delegates to {@link #SyslogParser(InputStream)} with a null stream, so
 * tags are parsed and the encoding is UTF-8.
 */
public SyslogParser() {
  this((InputStream) null);
}
/**
 * Construct a new Syslog protocol parser with the default settings:
 * tag parsing is enabled and strings are decoded as UTF-8.
 *
 * @param in the stream to read data from.
 */
public SyslogParser(InputStream in) {
  this(in, true, StandardCharsets.UTF_8);
}
/**
 * Construct a new Syslog protocol parser with explicit settings.
 *
 * @param in       the stream to read data from. InputStream#read() is
 *                 called heavily, so pass a buffered stream.
 * @param parseTag true to parse the "tag[pid]:" prefix, false to keep it
 *                 as part of the message body.
 * @param encoding the encoding used for the various string conversions,
 *                 most notably the hostname.
 */
public SyslogParser(InputStream in, boolean parseTag, Charset encoding) {
  // Plain field initialisation; the three assignments are independent of each other.
  this.charset = encoding;
  this.parseTag = parseTag;
  this.in = in;
}
/**
 * Free the resources used by this parser.
 * Note that the parser cannot be reused. Closes the underlying input
 * stream if one is attached; when the parser holds no stream (for example
 * after using the no-arg constructor) this call does nothing.
 *
 * @throws IOException if closing the underlying stream fails.
 */
@Override
public void close() throws IOException {
  // The no-arg constructor stores a null stream; guard so close() cannot throw a NullPointerException.
  if (in != null) {
    in.close();
  }
}
/**
* Read the next Syslog message from the stream.
*
* @return a parsed map of object, or null on EOF.
* @throws IOException if the underlying stream fails, or unexpected
* bytes are seen.
*/
public List