/*
* Apache HTTPD & NGINX Access log parsing made easy
* Copyright (C) 2011-2023 Niels Basjes
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.basjes.hadoop.input;

import nl.basjes.parse.core.Dissector;
import nl.basjes.parse.httpdlog.HttpdLoglineParser;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
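
/**
 * A Hadoop {@link FileInputFormat} that reads Apache HTTPD / NGINX access log
 * files and yields one {@link ParsedRecord} per log line, keyed by a
 * {@link LongWritable}. The LogFormat specification, the requested fields,
 * the type remappings and any additional {@link Dissector}s are handed to the
 * {@link ApacheHttpdLogfileRecordReader} that does the actual parsing.
 */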
public class ApacheHttpdLogfileInputFormat extends
        FileInputFormat<LongWritable, ParsedRecord> {

    private String logFormat = null;
    private final Set<String> requestedFields = new HashSet<>();
    private Map<String, Set<String>> typeRemappings;
    private List<Dissector> additionalDissectors;
    private ApacheHttpdLogfileRecordReader theRecordReader;

    // --------------------------------------------
    public List<String> listPossibleFields(String logformat) {
        return listPossibleFields(logformat, typeRemappings, additionalDissectors);
    }

    public static List<String> listPossibleFields(
            String logformat,
            Map<String, Set<String>> typeRemappings,
            List<Dissector> additionalDissectors) {
        HttpdLoglineParser<ParsedRecord> parser =
                new HttpdLoglineParser<>(ParsedRecord.class, logformat);
        parser.setTypeRemappings(typeRemappings);
        parser.addDissectors(additionalDissectors);
        return parser.getPossiblePaths();
    }
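
    // A minimal sketch of listing the field paths a LogFormat can yield; the
    // format string (the standard Apache "common" LogFormat) and the printed
    // example path are purely illustrative:
    //
    //   List<String> possible = ApacheHttpdLogfileInputFormat.listPossibleFields(
    //       "%h %l %u %t \"%r\" %>s %b",
    //       Collections.emptyMap(),    // no type remappings
    //       Collections.emptyList());  // no additional dissectors
    //   possible.forEach(System.out::println);  // e.g. IP:connection.client.host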

    public String getLogFormat() {
        return logFormat;
    }

    public Set<String> getRequestedFields() {
        return requestedFields;
    }

    public Map<String, Set<String>> getTypeRemappings() {
        return typeRemappings;
    }

    public List<Dissector> getAdditionalDissectors() {
        return additionalDissectors;
    }

    public ApacheHttpdLogfileInputFormat() {
        super();
    }

    public ApacheHttpdLogfileInputFormat(
            String logformat,
            Collection<String> requestedFields,
            Map<String, Set<String>> typeRemappings,
            List<Dissector> additionalDissectors) {
        super();
        this.logFormat = logformat;
        this.requestedFields.addAll(requestedFields);
        this.typeRemappings = typeRemappings;
        this.additionalDissectors = additionalDissectors;
    }

    // --------------------------------------------
    public ApacheHttpdLogfileRecordReader createRecordReader() {
        try {
            return new ApacheHttpdLogfileRecordReader(
                    getLogFormat(), getRequestedFields(),
                    getTypeRemappings(), getAdditionalDissectors());
        } catch (IOException e) {
            // Construction failed; callers must be prepared for a null reader.
            return null;
        }
    }

    // Lazily creates and caches a single record reader instance.
    public ApacheHttpdLogfileRecordReader getRecordReader() {
        if (theRecordReader == null) {
            theRecordReader = createRecordReader();
        }
        return theRecordReader;
    }

    @Override
    public RecordReader<LongWritable, ParsedRecord> createRecordReader(
            final InputSplit split, final TaskAttemptContext context) {
        return getRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Uncompressed files can always be split; compressed files only when
        // the codec supports splitting (e.g. bzip2 does, plain gzip does not).
        final CompressionCodec codec =
                new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        return (null == codec) || codec instanceof SplittableCompressionCodec;
    }

    public void setTypeRemappings(Map<String, Set<String>> newTypeRemappings) {
        this.typeRemappings = newTypeRemappings;
    }
}
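
/*
 * A minimal usage sketch, assuming the InputFormat is constructed directly
 * (as a wrapping loader would do); the format string and the requested field
 * name are illustrative:
 *
 *   ApacheHttpdLogfileInputFormat inputFormat =
 *       new ApacheHttpdLogfileInputFormat(
 *           "%h %l %u %t \"%r\" %>s %b",    // Apache "common" LogFormat
 *           Collections.singletonList("IP:connection.client.host"),
 *           Collections.emptyMap(),         // no type remappings
 *           Collections.emptyList());       // no additional dissectors
 *
 *   // In a MapReduce job the framework creates and initializes the reader;
 *   // when driving it by hand it must still be initialized with an
 *   // InputSplit and a TaskAttemptContext before nextKeyValue() is called.
 *   ApacheHttpdLogfileRecordReader reader = inputFormat.getRecordReader();
 */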