/*
* Apache HTTPD & NGINX Access log parsing made easy
* Copyright (C) 2011-2023 Niels Basjes
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.basjes.hadoop.input;
import nl.basjes.parse.core.Casts;
import nl.basjes.parse.core.Dissector;
import nl.basjes.parse.core.Parser;
import nl.basjes.parse.core.exceptions.DissectionFailure;
import nl.basjes.parse.core.exceptions.InvalidDissectorException;
import nl.basjes.parse.core.exceptions.MissingDissectorsException;
import nl.basjes.parse.httpdlog.HttpdLoglineParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
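/**
 * A Hadoop {@link RecordReader} that parses Apache HTTPD / NGINX access log lines
 * into {@link ParsedRecord} values using the {@link HttpdLoglineParser}.
 * The key is the same byte offset a plain TextInputFormat would produce.
 */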
@SuppressWarnings({ "PMD.OnlyOneReturn", "PMD.BeanMembersShouldSerialize" })
public class ApacheHttpdLogfileRecordReader extends
RecordReader<LongWritable, ParsedRecord> {
private static final Logger LOG = LoggerFactory.getLogger(ApacheHttpdLogfileRecordReader.class);
private static final String HTTPD_LOGFILE_INPUT_FORMAT = "HTTPD Access Logfile InputFormat";
public static final String FIELDS = "fields";
// --------------------------------------------
private final LineRecordReader lineReader = new LineRecordReader();
private Parser<ParsedRecord> parser;
private List<String> fieldList = null;
private final ParsedRecord currentValue = new ParsedRecord();
private String logformat = null;
private final Set<String> requestedFields = new HashSet<>();
private Map<String, Set<String>> typeRemappings = new HashMap<>(16);
private List<Dissector> additionalDissectors;
// --------------------------------------------
@SuppressWarnings("unused") // Used by the Hadoop framework
public ApacheHttpdLogfileRecordReader() {
// Nothing to do here
}
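/**
 * @param logformat            the Apache LogFormat (or NGINX log_format) the input files were written with
 * @param requestedFields      the names of the fields that must be extracted from each line
 * @param typeRemappings       extra type remappings to register on the parser
 * @param additionalDissectors extra dissectors to register on the parser
 */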
public ApacheHttpdLogfileRecordReader(String logformat,
Set<String> requestedFields,
Map<String, Set<String>> typeRemappings,
List<Dissector> additionalDissectors) throws IOException {
setLogFormat(logformat);
// Mappings and additional parsers MUST come before the requested fields
this.typeRemappings = typeRemappings;
this.additionalDissectors = additionalDissectors;
addRequestedFields(requestedFields);
}
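/** Registers extra requested fields and (re)runs the field setup. */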
private void addRequestedFields(Set<String> newRequestedFields) throws IOException {
requestedFields.addAll(newRequestedFields);
fieldList = new ArrayList<>(requestedFields);
try {
setupFields();
} catch (NoSuchMethodException | MissingDissectorsException | InvalidDissectorException e) {
throw new IOException("RecordReader initialization failed", e);
}
}
private void setLogFormat(String newLogformat) {
if (newLogformat == null) {
return;
}
logformat = newLogformat;
}
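// State for the special "fields" mode: when the only requested field is the literal
// "fields", nextKeyValue() emits the names of all possible fields instead of parsed records.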
private boolean outputAllPossibleFields = false;
private String allPossiblePathsFieldName;
private List<String> allPossiblePaths = null;
private Counter counterLinesRead;
private Counter counterGoodLines;
private Counter counterBadLines;
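/**
 * Initializes the underlying LineRecordReader and the Hadoop counters.
 * When no log format or fields were passed in via the constructor they are read from the
 * configuration keys "nl.basjes.parse.apachehttpdlogline.format" (default "common") and
 * "nl.basjes.parse.apachehttpdlogline.fields".
 */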
@Override
public void initialize(final InputSplit split,
final TaskAttemptContext context) throws IOException {
lineReader.initialize(split, context);
final Configuration conf = context.getConfiguration();
counterLinesRead = context.getCounter(HTTPD_LOGFILE_INPUT_FORMAT, "1:Lines read");
counterGoodLines = context.getCounter(HTTPD_LOGFILE_INPUT_FORMAT, "2:Good lines");
counterBadLines = context.getCounter(HTTPD_LOGFILE_INPUT_FORMAT, "3:Bad lines");
if (logformat == null || requestedFields.isEmpty()) {
if (logformat == null) {
logformat = conf.get("nl.basjes.parse.apachehttpdlogline.format", "common");
}
if (requestedFields.isEmpty()) {
String fields = conf.get(
"nl.basjes.parse.apachehttpdlogline.fields", null);
if (fields != null) {
fieldList = Arrays.asList(fields.split(","));
}
} else {
fieldList = new ArrayList<>(requestedFields);
}
}
if (fieldList != null) {
if (logformat != null && parser == null) {
parser = createParser();
}
for (String field : fieldList) {
currentValue.declareRequestedFieldname(field);
}
}
try {
setupFields();
} catch (NoSuchMethodException | MissingDissectorsException | InvalidDissectorException e) {
throw new IOException("RecordReader initialization failed", e);
}
}
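/** Creates the actual log line parser with the configured type remappings and additional dissectors. */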
protected Parser<ParsedRecord> instantiateParser(String logFormat) {
return new HttpdLoglineParser<>(ParsedRecord.class, logFormat)
.setTypeRemappings(typeRemappings)
.addDissectors(additionalDissectors);
}
private Map<String, EnumSet<Casts>> allCasts;
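/**
 * Detects the special case where only the literal field "fields" was requested: in that mode the
 * reader lists every path the parser can produce, and a throwaway parser is built purely to
 * collect the possible Casts for each of those paths.
 */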
private void setupFields() throws MissingDissectorsException, InvalidDissectorException, NoSuchMethodException, IOException {
if (fieldList == null || fieldList.isEmpty()) {
return; // Nothing to do here
}
String firstField = fieldList.get(0);
if (fieldList.size() == 1 &&
firstField.toLowerCase().trim().equals(FIELDS)) {
outputAllPossibleFields = true;
allPossiblePaths = getParser().getPossiblePaths();
allPossiblePathsFieldName = firstField;
Parser<ParsedRecord> newParser = instantiateParser(logformat)
.addParseTarget(ParsedRecord.class.getMethod("set", String.class, String.class), allPossiblePaths)
.addTypeRemappings(typeRemappings);
allCasts = newParser.getAllCasts();
}
}
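/** Returns the possible {@link Casts} (STRING, LONG, DOUBLE) the parser can produce for the named field. */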
public EnumSet<Casts> getCasts(String name) throws IOException {
if (outputAllPossibleFields) {
return allCasts.get(name);
}
try {
return getParser().getCasts(name);
} catch (MissingDissectorsException | InvalidDissectorException e) {
throw new IOException("Fatal error in the parser", e);
}
}
public Parser<ParsedRecord> getParser() throws IOException {
if (parser == null) {
parser = createParser();
}
return parser;
}
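/**
 * Builds the parser and wires every requested field to a setter on {@link ParsedRecord}:
 * wildcard fields (ending in ".*") go to the multi-value String setter, all other fields are
 * registered for the String, Long and Double setters.
 */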
private Parser<ParsedRecord> createParser() throws IOException {
if (fieldList == null || logformat == null) {
return null;
}
Parser<ParsedRecord> newParser;
try {
newParser = instantiateParser(logformat);
for (String field: fieldList) {
if (field.endsWith(".*")) {
newParser.addParseTarget(ParsedRecord.class.getMethod("setMultiValueString",
String.class, String.class), field);
} else {
newParser.addParseTarget(ParsedRecord.class.getMethod("set",
String.class, String.class), field);
newParser.addParseTarget(ParsedRecord.class.getMethod("set",
String.class, Long.class), field);
newParser.addParseTarget(ParsedRecord.class.getMethod("set",
String.class, Double.class), field);
}
}
} catch (NoSuchMethodException | SecurityException e) {
throw new IOException(e.toString());
}
return newParser;
}
// --------------------------------------------
private int errorLinesLogged = 0;
private static final int MAX_ERROR_LINES_LOGGED = 10;
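/**
 * In the "fields" mode this emits one possible field name per call until the list is exhausted.
 * Otherwise it reads lines until one parses successfully: unparseable lines are counted, logged
 * (up to MAX_ERROR_LINES_LOGGED times) and skipped; fatal parser problems end the input.
 */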
@Override
public boolean nextKeyValue() throws IOException {
if (outputAllPossibleFields) {
// We now ONLY return the possible names of the fields that can be requested
if (allPossiblePaths.isEmpty()) {
return false;
}
currentValue.clear();
String value = allPossiblePaths.get(0);
allPossiblePaths.remove(0);
currentValue.set(allPossiblePathsFieldName, value);
return true;
} else {
boolean haveValue = false;
while (!haveValue) {
if (!lineReader.nextKeyValue()) {
return false;
}
counterLinesRead.increment(1L);
currentValue.clear();
String inputLine = lineReader.getCurrentValue().toString();
try {
getParser().parse(currentValue, inputLine);
counterGoodLines.increment(1L);
haveValue = true;
} catch (DissectionFailure e) {
counterBadLines.increment(1L);
if (errorLinesLogged < MAX_ERROR_LINES_LOGGED) {
LOG.error("Parse error >>>{}<<< in line: >>>{}<<<", e.getMessage(), inputLine);
errorLinesLogged++;
if (errorLinesLogged == MAX_ERROR_LINES_LOGGED) {
LOG.error(">>>>>>>>>>> We now stop logging parse errors! <<<<<<<<<<<");
}
}
// Ignore bad lines and simply continue
} catch (InvalidDissectorException e) {
LOG.error("InvalidDissectorException >>>{}<<<", e.getMessage());
return false;
} catch (MissingDissectorsException e) {
LOG.error("MissingDissectorsException >>>{}<<<", e.getMessage());
return false;
}
}
}
return true;
}
@Override
public LongWritable getCurrentKey() {
// The key we return is the same byte offset as the TextInputFormat
// would give.
return lineReader.getCurrentKey();
}
@Override
public ParsedRecord getCurrentValue() {
return currentValue;
}
@Override
public float getProgress() throws IOException {
return lineReader.getProgress();
}
// --------------------------------------------
@Override
public void close() throws IOException {
lineReader.close();
}
// --------------------------------------------
}
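// ------------------------------------------------------------------
// Illustrative usage sketch (not part of this class): a job can select this reader through a
// companion InputFormat and the configuration keys read in initialize() above. The InputFormat
// class name (ApacheHttpdLogfileInputFormat) and the example field names are assumptions based
// on this library's conventions, not guaranteed by this file.
//
//   Configuration conf = new Configuration();
//   conf.set("nl.basjes.parse.apachehttpdlogline.format",
//            "%h %l %u %t \"%r\" %>s %b");                        // the LogFormat of the input files
//   conf.set("nl.basjes.parse.apachehttpdlogline.fields",
//            "IP:connection.client.host,STRING:request.status.last");
//   Job job = Job.getInstance(conf, "Parse access log");
//   job.setInputFormatClass(ApacheHttpdLogfileInputFormat.class); // assumed companion class
//   // The mapper then receives LongWritable keys and ParsedRecord values.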