// com.twitter.elephantbird.util.W3CLogParser (Maven / Gradle / Ivy)
package com.twitter.elephantbird.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/*
* A parser for W3C-style log lines. See LzoW3CLogInputFormat for more details about
* the format itself. Create the parser with an InputStream open to a list of
* hash/field-list lines that give all the field-sets ever used for logging. That is,
* if you logged a set of fields for two days, then added a parameter and logged that set of
* fields for a day, then added another parameter and removed a different one and logged
* that set of fields for a while, the field definition file should have three lines
* with the hashes and corresponding set of fields for each.
*/
public class W3CLogParser {
protected static final Logger LOG = LoggerFactory.getLogger(W3CLogParser.class);
private Map> fieldDef_ = Maps.newHashMap();
private final static String DELIMITER = "\\s+";
// Initialize a W3CLogParser From a InputStream
public W3CLogParser(InputStream is) throws IOException {
BufferedReader buf = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = buf.readLine()) != null) {
List fields = Arrays.asList(line.split(DELIMITER));
fieldDef_.put(fields.get(0), fields.subList(1, fields.size()));
}
}
// Parse a line using the field definition, and put results into a Map of
//
public Map parse(String line) throws IOException {
Map map = new HashMap();
// Get the version CRC and find the field definition
List fields = Arrays.asList(line.split(DELIMITER));
List fieldNames = fieldDef_.get(fields.get(0));
if (fieldNames == null) {
throw new IOException("cannot find matching field definition for CRC "
+ fields.get(0));
}
if (fieldNames.size() != fields.size()) {
throw new IOException("W3C field definition and input line for CRC "
+ fields.get(0) + " does not match:\n" + line);
}
// Map values to field names
for (int fieldNum = 1; fieldNum < fieldNames.size(); fieldNum++) {
map.put(fieldNames.get(fieldNum), fields.get(fieldNum));
}
return map;
}
}