/**
* Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy of the
* License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.warc;
import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.spout.FileSpout;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.netpreserve.jwarc.HttpMessage;
import org.netpreserve.jwarc.HttpRequest;
import org.netpreserve.jwarc.HttpResponse;
import org.netpreserve.jwarc.IOUtils;
import org.netpreserve.jwarc.MediaType;
import org.netpreserve.jwarc.ParsingException;
import org.netpreserve.jwarc.WarcPayload;
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRecord;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;
import org.netpreserve.jwarc.WarcTruncationReason;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Read WARC files from the local file system and emit the WARC captures as tuples into the
* topology, the same way as done by {@link com.digitalpebble.stormcrawler.bolt.FetcherBolt}.
*/
public class WARCSpout extends FileSpout {
private static final Logger LOG = LoggerFactory.getLogger(WARCSpout.class);
private int maxContentSize = -1;
private int contentBufferSize = 8192;
private boolean storeHTTPHeaders = false;
private String protocolMDprefix = "";
private WarcReader warcReader;
private String warcFileInProgress;
private WarcRequest precedingWarcRequest;
private Optional<WarcRecord> record;
private MultiCountMetric eventCounter;
protected transient Configuration hdfsConfig;
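/** Creates a spout reading the given list of WARC files (local paths, Hadoop paths or http(s) URLs). */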
public WARCSpout(String... files) {
super(false, files);
}
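/** Creates a spout reading all WARC files in a directory whose names match the given filter. */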
public WARCSpout(String dir, String filter) {
super(dir, filter, false);
}
/**
* Holder of truncation status when WARC payload exceeding the content length limit
* (http.content.limit) is truncated.
*/
public static class TruncationStatus {
boolean isTruncated = false;
long originalSize = -1;
public void set(boolean isTruncated) {
this.isTruncated = isTruncated;
}
public boolean get() {
return isTruncated;
}
public void setOriginalSize(long size) {
originalSize = size;
}
public long getOriginalSize() {
return originalSize;
}
}
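/**
 * Opens the next WARC file taken from the head of the input buffer: closes any previously opened
 * reader, deserializes the file name emitted by the underlying {@link FileSpout} and wraps the
 * opened channel in a {@link WarcReader}.
 */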
private void openWARC() {
if (warcReader != null) {
try {
warcReader.close();
} catch (IOException e) {
LOG.warn("Failed to close open WARC file", e);
}
warcReader = null;
}
byte[] head = buffer.removeFirst();
List<Object> fields = _scheme.deserialize(ByteBuffer.wrap(head));
warcFileInProgress = (String) fields.get(0);
if (warcFileInProgress == null) return;
LOG.info("Reading WARC file {}", warcFileInProgress);
ReadableByteChannel warcChannel = null;
try {
warcChannel = openChannel(warcFileInProgress);
warcReader = new WarcReader(warcChannel);
} catch (IOException e) {
LOG.error("Failed to open WARC file " + warcFileInProgress, e);
warcFileInProgress = null;
if (warcChannel != null) {
try {
warcChannel.close();
} catch (IOException ex) {
}
}
}
}
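/**
 * Opens a readable channel for a WARC file: http(s) URLs are streamed directly, all other paths
 * are resolved via the Hadoop FileSystem API (local file system, HDFS, etc.).
 */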
private ReadableByteChannel openChannel(String path) throws IOException {
if (path.matches("^https?://.*")) {
URL warcUrl = new URL(path);
return Channels.newChannel(warcUrl.openStream());
}
org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(path);
return Channels.newChannel(hdfsPath.getFileSystem(hdfsConfig).open(hdfsPath));
}
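/** Closes the reader of the WARC file currently being processed. */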
private void closeWARC() {
LOG.info("Finished reading WARC file {}", warcFileInProgress);
try {
warcReader.close();
} catch (IOException e) {
LOG.warn("Failed to close WARC reader", e);
}
warcReader = null;
}
/**
* Proceed to the next WARC record, calculate the length of the current record and add it to the
* metadata.
*/
private void nextRecord(long offset, Metadata metadata) {
long nextOffset = nextRecord();
if (nextOffset > offset) {
metadata.addValue("warc.record.length", Long.toString(nextOffset - offset));
} else {
LOG.error(
"Implausible offset of next WARC record: {} - current offset: {}",
nextOffset,
offset);
}
}
/**
* Proceed to the next WARC record.
*
* @return offset of the next record in the WARC file
*/
private long nextRecord() {
long nextOffset;
while (warcReader == null && !buffer.isEmpty()) {
openWARC();
}
if (warcReader == null) {
// failed to open any new WARC file
record = Optional.empty();
return -1;
}
try {
record = warcReader.next();
nextOffset = warcReader.position();
if (!record.isPresent()) {
closeWARC();
}
} catch (IOException e) {
LOG.error(
"Failed to read WARC {} at position {}:",
warcFileInProgress,
warcReader.position(),
e);
nextOffset = warcReader.position();
record = Optional.empty();
closeWARC();
}
return nextOffset;
}
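/**
 * @return true if the record is a WARC response record wrapping an HTTP response (content type
 *     {@code application/http; msgtype=response}), false otherwise
 */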
private boolean isHttpResponse(Optional<WarcRecord> record) {
if (!record.isPresent()) return false;
if (!(record.get() instanceof WarcResponse)) return false;
if (record.get().contentType().equals(MediaType.HTTP_RESPONSE)) return true;
return false;
}
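/**
 * Reads the HTTP payload of a WARC response record into a byte array, decoding the payload if a
 * gzip or deflate Content-Encoding is set and truncating it at maxContentSize
 * (http.content.limit) bytes. Truncation and the original payload size are reported via the
 * passed {@link TruncationStatus}.
 */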
private byte[] getContent(WarcResponse record, TruncationStatus isTruncated)
throws IOException {
Optional<WarcPayload> payload = record.payload();
if (!payload.isPresent()) {
return new byte[0];
}
long size = payload.get().body().size();
ReadableByteChannel body = payload.get().body();
// Check HTTP Content-Encoding header whether payload needs decoding
List<String> contentEncodings = record.http().headers().all("Content-Encoding");
try {
if (contentEncodings.size() > 1) {
LOG.error("Multiple Content-Encodings not supported: {}", contentEncodings);
LOG.warn("Trying to read payload of {} without Content-Encoding", record.target());
} else if (contentEncodings.isEmpty()
|| contentEncodings.get(0).equalsIgnoreCase("identity")
|| contentEncodings.get(0).equalsIgnoreCase("none")) {
// no need for decoding
} else if (contentEncodings.get(0).equalsIgnoreCase("gzip")
|| contentEncodings.get(0).equalsIgnoreCase("x-gzip")) {
LOG.debug(
"Decoding payload of {} from Content-Encoding {}",
record.target(),
contentEncodings.get(0));
body = IOUtils.gunzipChannel(body);
body.read(ByteBuffer.allocate(0));
size = -1;
} else if (contentEncodings.get(0).equalsIgnoreCase("deflate")) {
LOG.debug(
"Decoding payload of {} from Content-Encoding {}",
record.target(),
contentEncodings.get(0));
body = IOUtils.inflateChannel(body);
body.read(ByteBuffer.allocate(0));
size = -1;
} else {
LOG.error("Content-Encoding not supported: {}", contentEncodings.get(0));
LOG.warn("Trying to read payload of {} without Content-Encoding", record.target());
}
} catch (IOException e) {
LOG.error(
"Failed to read payload with Content-Encoding {}: {}",
contentEncodings.get(0),
e.getMessage());
LOG.warn("Trying to read payload of {} without Content-Encoding", record.target());
body = payload.get().body();
}
isTruncated.set(false);
if (size > maxContentSize) {
LOG.info(
"WARC payload of size {} to be truncated to {} bytes for {}",
size,
maxContentSize,
record.target());
size = maxContentSize;
}
ByteBuffer buf;
if (size >= 0) {
buf = ByteBuffer.allocate((int) size);
} else {
buf = ByteBuffer.allocate(contentBufferSize);
}
// dynamically growing list of buffers for large content of unknown size
ArrayList<ByteBuffer> bufs = new ArrayList<>();
int r, read = 0;
while (read < maxContentSize) {
try {
if ((r = body.read(buf)) < 0) break; // eof
} catch (ParsingException e) {
LOG.error("Failed to read chunked content of {}: {}", record.target(), e);
/*
 * caused by an invalid chunked Transfer-Encoding, or by a `Transfer-Encoding:
 * chunked` HTTP header still present although the chunked encoding itself was
 * removed when the WARC file was written
 */
// TODO: should retry without chunked Transfer-Encoding
break;
} catch (IOException e) {
LOG.error("Failed to read content of {}: {}", record.target(), e);
break;
}
if (r == 0 && !buf.hasRemaining()) {
buf.flip();
bufs.add(buf);
buf = ByteBuffer.allocate(Math.min(contentBufferSize, (maxContentSize - read)));
}
read += r;
}
buf.flip();
if (read == maxContentSize) {
// to mark truncation: check whether there is more content
r = body.read(ByteBuffer.allocate(1));
if (r > -1) {
isTruncated.set(true);
// read remaining body (also to figure out original length)
long truncatedLength = r;
ByteBuffer buffer = ByteBuffer.allocate(8192);
try {
while ((r = body.read(buffer)) >= 0) {
buffer.clear();
truncatedLength += r;
}
} catch (IOException e) {
// log and ignore, it's about unused content
LOG.info("Exception while determining length of truncation:", e);
}
isTruncated.setOriginalSize(read + truncatedLength);
LOG.info(
"WARC payload of size {} is truncated to {} bytes for {}",
isTruncated.getOriginalSize(),
maxContentSize,
record.target());
}
}
if (read == size) {
// short-cut: return buffer-internal array
return buf.array();
}
// copy buffers into result byte[]
byte[] arr = new byte[read];
int pos = 0;
for (ByteBuffer b : bufs) {
r = b.remaining();
b.get(arr, pos, r);
pos += r;
}
buf.get(arr, pos, buf.remaining());
return arr;
}
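/** @return the serialized HTTP header block of the message as a UTF-8 string */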
private static String httpHeadersVerbatim(HttpMessage http) {
return new String(http.serializeHeader(), StandardCharsets.UTF_8);
}
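/**
 * Adds capture time, response IP address (if recorded) and the verbatim HTTP request and
 * response headers to the metadata, using the configured protocol metadata prefix.
 */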
private void addVerbatimHttpHeaders(
Metadata metadata, WarcResponse response, HttpResponse http, HttpRequest request) {
metadata.addValue(
protocolMDprefix + ProtocolResponse.REQUEST_TIME_KEY,
Long.toString(response.date().toEpochMilli()));
if (response.ipAddress().isPresent()) {
metadata.addValue(
protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY,
response.ipAddress().get().getHostAddress());
}
if (request != null) {
metadata.addValue(
protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY,
httpHeadersVerbatim(request));
}
metadata.addValue(
protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY,
httpHeadersVerbatim(http));
}
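/**
 * Reads the configuration (http.content.limit, http.store.headers, the protocol metadata
 * prefix, fetcher.metrics.time.bucket.secs and the Hadoop settings under hdfs.config.key) and
 * registers the spout metrics.
 */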
@Override
public void open(
Map<String, Object> conf, TopologyContext context, SpoutOutputCollector collector) {
super.open(conf, context, collector);
record = Optional.empty();
maxContentSize = ConfUtils.getInt(conf, "http.content.limit", -1);
if (maxContentSize == -1 || maxContentSize > Constants.MAX_ARRAY_SIZE) {
// maximum possible payload length, must fit into an array
maxContentSize = Constants.MAX_ARRAY_SIZE;
}
if (contentBufferSize > maxContentSize) {
// no need to buffer more content than max. used
contentBufferSize = maxContentSize;
}
storeHTTPHeaders = ConfUtils.getBoolean(conf, "http.store.headers", false);
protocolMDprefix =
ConfUtils.getString(
conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, protocolMDprefix);
int metricsTimeBucketSecs = ConfUtils.getInt(conf, "fetcher.metrics.time.bucket.secs", 10);
eventCounter =
context.registerMetric(
"warc_spout_counter", new MultiCountMetric(), metricsTimeBucketSecs);
hdfsConfig = new Configuration();
String configKey = ConfUtils.getString(conf, "hdfs.config.key", "hdfs");
Map<String, Object> map = (Map<String, Object>) conf.get(configKey);
if (map != null) {
for (String key : map.keySet()) {
this.hdfsConfig.set(key, String.valueOf(map.get(key)));
}
}
}
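/**
 * Emits the next WARC capture: refills the file buffer if needed, skips WARC records which are
 * not HTTP responses (keeping request records when HTTP headers are stored), and emits fetched
 * pages as (url, content, metadata) on the default stream or, for redirects and errors, as
 * (url, metadata, status) on the status stream.
 */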
@Override
public void nextTuple() {
if (!active) return;
if (buffer.isEmpty()) {
try {
populateBuffer();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
if (warcReader == null && buffer.isEmpty()) {
// input exhausted
return;
}
if (!record.isPresent()) nextRecord();
while (record.isPresent() && !isHttpResponse(record)) {
String warcType = record.get().type();
if (warcType == null) {
LOG.warn("No type for {}", record.get().getClass());
} else {
eventCounter.scope("warc_skipped_record_of_type_" + warcType).incr();
LOG.debug("Skipped WARC record of type {}", warcType);
}
if (storeHTTPHeaders && record.get() instanceof WarcRequest) {
// store request records to be able to add HTTP request
// header to metadata
precedingWarcRequest = (WarcRequest) record.get();
try {
// need to read and parse HTTP header right now
// (otherwise it's skipped)
precedingWarcRequest.http();
} catch (IOException e) {
LOG.error(
"Failed to read HTTP request for {} in {}: {}",
precedingWarcRequest.target(),
warcFileInProgress,
e);
precedingWarcRequest = null;
}
}
nextRecord();
}
if (!record.isPresent()) return;
eventCounter.scope("warc_http_response_record").incr();
WarcResponse w = (WarcResponse) record.get();
String url = w.target();
HttpResponse http;
try {
http = w.http();
} catch (IOException e) {
LOG.error("Failed to read HTTP response for {} in {}: {}", url, warcFileInProgress, e);
nextRecord();
return;
}
LOG.info("Fetched {} with status {}", url, http.status());
eventCounter.scope("fetched").incrBy(1);
final Status status = Status.fromHTTPCode(http.status());
eventCounter.scope("status_" + http.status()).incrBy(1);
Metadata metadata = new Metadata();
// add HTTP status code expected by schedulers
metadata.addValue("fetch.statusCode", Integer.toString(http.status()));
// add time when page was fetched (capture time)
metadata.addValue(
protocolMDprefix + ProtocolResponse.REQUEST_TIME_KEY,
Long.toString(w.date().toEpochMilli()));
// Add HTTP response headers to metadata
for (Map.Entry<String, List<String>> e : http.headers().map().entrySet()) {
metadata.addValues(protocolMDprefix + e.getKey(), e.getValue());
}
if (storeHTTPHeaders) {
// if recording HTTP headers: add IP address, fetch date time,
// literal request and response headers
HttpRequest req = null;
if (precedingWarcRequest != null
&& (w.concurrentTo().contains(precedingWarcRequest.id())
|| w.target().equals(precedingWarcRequest.target()))) {
try {
req = precedingWarcRequest.http();
} catch (IOException e) {
// ignore, missing HTTP request headers are not an issue
}
}
addVerbatimHttpHeaders(metadata, w, http, req);
}
// add WARC record information
metadata.addValue("warc.file.name", warcFileInProgress);
long offset = warcReader.position();
metadata.addValue("warc.record.offset", Long.toString(offset));
/*
* note: warc.record.length must be calculated after WARC record has been
* entirely processed
*/
if (status == Status.FETCHED && http.status() != 304) {
byte[] content;
TruncationStatus isTruncated = new TruncationStatus();
try {
content = getContent(w, isTruncated);
} catch (IOException e) {
LOG.error("Failed to read payload for {} in {}: {}", url, warcFileInProgress, e);
content = new byte[0];
}
eventCounter.scope("bytes_fetched").incrBy(content.length);
if (isTruncated.get() || w.truncated() != WarcTruncationReason.NOT_TRUNCATED) {
WarcTruncationReason reason = WarcTruncationReason.LENGTH;
if (w.truncated() != WarcTruncationReason.NOT_TRUNCATED) {
reason = w.truncated();
}
metadata.setValue(protocolMDprefix + ProtocolResponse.TRIMMED_RESPONSE_KEY, "true");
metadata.setValue(
protocolMDprefix + ProtocolResponse.TRIMMED_RESPONSE_REASON_KEY,
reason.toString().toLowerCase(Locale.ROOT));
}
nextRecord(offset, metadata); // proceed and calculate length
_collector.emit(new Values(url, content, metadata), url);
return;
}
nextRecord(offset, metadata); // proceed and calculate length
// redirects, 404s, etc.
_collector.emit(Constants.StatusStreamName, new Values(url, metadata, status), url);
}
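/** Declares the default stream (url, content, metadata) and the status stream (url, metadata, status). */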
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declareStream(Constants.StatusStreamName, new Fields("url", "metadata", "status"));
declarer.declare(new Fields("url", "content", "metadata"));
}
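/** Failed tuples are only logged: WARC records are not buffered and cannot be replayed. */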
@Override
public void fail(Object msgId) {
LOG.error("Failed - unable to replay WARC record of: {}", msgId);
}
}