nl.basjes.pig.input.apachehttpdlog.Loader Maven / Gradle / Ivy
/*
* Apache HTTPD logparsing made easy
* Copyright (C) 2013 Niels Basjes
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package nl.basjes.pig.input.apachehttpdlog;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import nl.basjes.hadoop.input.ApacheHttpdLogfileInputFormat;
import nl.basjes.hadoop.input.ApacheHttpdLogfileRecordReader;
import nl.basjes.hadoop.input.ParsedRecord;
import nl.basjes.parse.core.Casts;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
public class Loader
extends LoadFunc
implements LoadMetadata {
// Reader that getNext() pulls records from; the constructor takes it from the input format.
@SuppressWarnings("rawtypes")
private ApacheHttpdLogfileRecordReader reader;
// Set when "fields" is among the requested fields; getNext() then returns a single
// example tuple and clears the flag again.
private boolean isBuildingFields;
// The first (non-null) constructor parameter.
private String logformat;
// Every constructor parameter that follows the logformat, in the order given.
private final List<String> requestedFields = new ArrayList<>();
// Factory used to create the tuples handed back by getNext().
private final TupleFactory tupleFactory;
// Input format built from the logformat and the requested fields; returned by getInputFormat().
private ApacheHttpdLogfileInputFormat theInputFormat;
// ------------------------------------------
/**
 * Creates the loader from the string arguments given in the Pig script.
 *
 * <p>Pig loaders only take string parameters, so this constructor is really the
 * only interaction the user has with the loader from the script. The first
 * non-null parameter is taken as the logformat; every parameter after it is
 * recorded as a requested field. If one of those later parameters equals
 * "fields" (ignoring case), the first call to {@link #getNext()} returns the
 * tuple built from {@code createPigExample()} instead of a parsed record.
 *
 * @param parameters specified from the call within the pig code
 * @throws IllegalArgumentException if no logformat was specified
 */
public Loader(String... parameters) {
    for (String param : parameters) {
        if (logformat == null) {
            logformat = param;
        } else {
            requestedFields.add(param);
            // Compare without going through the default locale: a plain
            // toLowerCase() maps 'I' to a dotless i under e.g. a Turkish
            // locale, so "FIELDS" would silently not be recognised.
            isBuildingFields = isBuildingFields || param.equalsIgnoreCase("fields");
        }
    }
    if (logformat == null) {
        throw new IllegalArgumentException("Must specify the logformat");
    }
    // Use the fields directly rather than the public getters: calling
    // overridable methods from a constructor would run subclass code before
    // the subclass has been initialised.
    theInputFormat = new ApacheHttpdLogfileInputFormat(logformat, requestedFields);
    reader = theInputFormat.getRecordReader();
    tupleFactory = TupleFactory.getInstance();
}
// ------------------------------------------
@Override
public InputFormat, ?> getInputFormat()
throws IOException {
return theInputFormat;
}
// ------------------------------------------
/**
 * Gives access to the logformat this loader was configured with.
 *
 * @return the logformat string taken from the constructor parameters
 */
public String getLogformat() {
    return this.logformat;
}
/**
 * Returns the requested fields, in the order they were passed to the constructor.
 *
 * <p>The returned list is the loader's own internal list, not a copy.
 *
 * @return the list of requested fields
 */
public List<String> getRequestedFields() {
    return requestedFields;
}
// ------------------------------------------
@Override
public Tuple getNext()
throws IOException {
Tuple tuple = null;
if (isBuildingFields) {
isBuildingFields = false; // Terminate on the next iteration
return tupleFactory.newTuple(createPigExample());
}
boolean notDone = reader.nextKeyValue();
if (!notDone) {
return null;
}
ParsedRecord value = (ParsedRecord)reader.getCurrentValue();
if (value != null) {
List