com.digitalpebble.stormcrawler.warc.WARCSpout
/**
 * Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License. You may obtain a copy of the
 * License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.digitalpebble.stormcrawler.warc;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.spout.FileSpout;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.netpreserve.jwarc.HttpMessage;
import org.netpreserve.jwarc.HttpRequest;
import org.netpreserve.jwarc.HttpResponse;
import org.netpreserve.jwarc.IOUtils;
import org.netpreserve.jwarc.MediaType;
import org.netpreserve.jwarc.ParsingException;
import org.netpreserve.jwarc.WarcPayload;
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRecord;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;
import org.netpreserve.jwarc.WarcTruncationReason;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Read WARC files from the local file system and emit the WARC captures as tuples into the
 * topology, the same way as done by {@link com.digitalpebble.stormcrawler.bolt.FetcherBolt}.
 */
public class WARCSpout extends FileSpout {

    private static final Logger LOG = LoggerFactory.getLogger(WARCSpout.class);

    private int maxContentSize = -1;
    private int contentBufferSize = 8192;
    private boolean storeHTTPHeaders = false;
    private String protocolMDprefix = "";

    private WarcReader warcReader;
    private String warcFileInProgress;
    private WarcRequest precedingWarcRequest;
    private Optional<WarcRecord> record;

    private MultiCountMetric eventCounter;

    protected transient Configuration hdfsConfig;

    public WARCSpout(String... files) {
        super(false, files);
    }

    public WARCSpout(String dir, String filter) {
        super(dir, filter, false);
    }
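    /*
     * Illustrative constructor usage (the paths below are hypothetical
     * examples, not defaults of this class):
     *
     *   // read the given WARC files in order
     *   new WARCSpout("/data/warc/part-001.warc.gz", "/data/warc/part-002.warc.gz");
     *
     *   // read all files from a directory whose names match a filter,
     *   // with the filter syntax interpreted by the underlying FileSpout
     *   new WARCSpout("/data/warc", "*.warc.gz");
     */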
    /**
     * Holder of the truncation status when a WARC payload exceeding the content length limit
     * (http.content.limit) is truncated.
     */
    public static class TruncationStatus {
        boolean isTruncated = false;
        long originalSize = -1;

        public void set(boolean isTruncated) {
            this.isTruncated = isTruncated;
        }

        public boolean get() {
            return isTruncated;
        }

        public void setOriginalSize(long size) {
            originalSize = size;
        }

        public long getOriginalSize() {
            return originalSize;
        }
    }

    private void openWARC() {
        if (warcReader != null) {
            try {
                warcReader.close();
            } catch (IOException e) {
                LOG.warn("Failed to close open WARC file", e);
            }
            warcReader = null;
        }

        byte[] head = buffer.removeFirst();
        List<Object> fields = _scheme.deserialize(ByteBuffer.wrap(head));
        warcFileInProgress = (String) fields.get(0);
        if (warcFileInProgress == null) return;

        LOG.info("Reading WARC file {}", warcFileInProgress);
        ReadableByteChannel warcChannel = null;
        try {
            warcChannel = openChannel(warcFileInProgress);
            warcReader = new WarcReader(warcChannel);
        } catch (IOException e) {
            LOG.error("Failed to open WARC file " + warcFileInProgress, e);
            warcFileInProgress = null;
            if (warcChannel != null) {
                try {
                    warcChannel.close();
                } catch (IOException ex) {
                    // ignore, the channel is abandoned anyway
                }
            }
        }
    }

    private ReadableByteChannel openChannel(String path) throws IOException {
        if (path.matches("^https?://.*")) {
            URL warcUrl = new URL(path);
            return Channels.newChannel(warcUrl.openStream());
        }
        org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(path);
        return Channels.newChannel(hdfsPath.getFileSystem(hdfsConfig).open(hdfsPath));
    }

    private void closeWARC() {
        LOG.info("Finished reading WARC file {}", warcFileInProgress);
        try {
            warcReader.close();
        } catch (IOException e) {
            LOG.warn("Failed to close WARC reader", e);
        }
        warcReader = null;
    }

    /**
     * Proceed to the next WARC record, calculate the record length of the current record and add
     * the length to the metadata.
     */
    private void nextRecord(long offset, Metadata metadata) {
        long nextOffset = nextRecord();
        if (nextOffset > offset) {
            metadata.addValue("warc.record.length", Long.toString(nextOffset - offset));
        } else {
            LOG.error(
                    "Implausible offset of next WARC record: {} - current offset: {}",
                    nextOffset,
                    offset);
        }
    }

    /**
     * Proceed to the next WARC record.
     *
     * @return offset of the next record in the WARC file
     */
    private long nextRecord() {
        long nextOffset;
        while (warcReader == null && !buffer.isEmpty()) {
            openWARC();
        }
        if (warcReader == null) {
            // failed to open any new WARC file
            record = Optional.empty();
            return -1;
        }
        try {
            record = warcReader.next();
            nextOffset = warcReader.position();
            if (!record.isPresent()) {
                closeWARC();
            }
        } catch (IOException e) {
            LOG.error(
                    "Failed to read WARC {} at position {}:",
                    warcFileInProgress,
                    warcReader.position(),
                    e);
            nextOffset = warcReader.position();
            record = Optional.empty();
            closeWARC();
        }
        return nextOffset;
    }

    private boolean isHttpResponse(Optional<WarcRecord> record) {
        if (!record.isPresent()) return false;
        if (!(record.get() instanceof WarcResponse)) return false;
        return record.get().contentType().equals(MediaType.HTTP_RESPONSE);
    }

    private byte[] getContent(WarcResponse record, TruncationStatus isTruncated)
            throws IOException {
        Optional<WarcPayload> payload = record.payload();
        if (!payload.isPresent()) {
            return new byte[0];
        }
        long size = payload.get().body().size();
        ReadableByteChannel body = payload.get().body();

        // check the HTTP Content-Encoding header whether the payload needs decoding
        List<String> contentEncodings = record.http().headers().all("Content-Encoding");
        try {
            if (contentEncodings.size() > 1) {
                LOG.error("Multiple Content-Encodings not supported: {}", contentEncodings);
                LOG.warn("Trying to read payload of {} without Content-Encoding", record.target());
            } else if (contentEncodings.isEmpty()
                    || contentEncodings.get(0).equalsIgnoreCase("identity")
                    || contentEncodings.get(0).equalsIgnoreCase("none")) {
                // no need for decoding
            } else if (contentEncodings.get(0).equalsIgnoreCase("gzip")
                    || contentEncodings.get(0).equalsIgnoreCase("x-gzip")) {
                LOG.debug(
                        "Decoding payload of {} from Content-Encoding {}",
                        record.target(),
                        contentEncodings.get(0));
                body = IOUtils.gunzipChannel(body);
                body.read(ByteBuffer.allocate(0));
                size = -1;
            } else if (contentEncodings.get(0).equalsIgnoreCase("deflate")) {
                LOG.debug(
                        "Decoding payload of {} from Content-Encoding {}",
                        record.target(),
                        contentEncodings.get(0));
                body = IOUtils.inflateChannel(body);
                body.read(ByteBuffer.allocate(0));
                size = -1;
            } else {
                LOG.error("Content-Encoding not supported: {}", contentEncodings.get(0));
                LOG.warn("Trying to read payload of {} without Content-Encoding", record.target());
            }
        } catch (IOException e) {
            LOG.error(
                    "Failed to read payload with Content-Encoding {}: {}",
                    contentEncodings.get(0),
                    e.getMessage());
            LOG.warn("Trying to read payload of {} without Content-Encoding", record.target());
            body = payload.get().body();
        }

        isTruncated.set(false);
        if (size > maxContentSize) {
            LOG.info(
                    "WARC payload of size {} to be truncated to {} bytes for {}",
                    size,
                    maxContentSize,
                    record.target());
            size = maxContentSize;
        }

        ByteBuffer buf;
        if (size >= 0) {
            buf = ByteBuffer.allocate((int) size);
        } else {
            buf = ByteBuffer.allocate(contentBufferSize);
        }
        // dynamically growing list of buffers for large content of unknown size
        List<ByteBuffer> bufs = new ArrayList<>();
        int r, read = 0;
        while (read < maxContentSize) {
            try {
                if ((r = body.read(buf)) < 0) break; // eof
            } catch (ParsingException e) {
                LOG.error("Failed to read chunked content of {}: {}", record.target(), e);
                /*
                 * caused by an invalid Transfer-Encoding, or by a `Transfer-Encoding:
                 * chunked` header kept in the record although the chunked encoding
                 * itself was removed when the WARC file was written
                 */
                // TODO: should retry without chunked Transfer-Encoding
                break;
            } catch (IOException e) {
                LOG.error("Failed to read content of {}: {}", record.target(), e);
                break;
            }
            if (r == 0 && !buf.hasRemaining()) {
                buf.flip();
                bufs.add(buf);
                buf = ByteBuffer.allocate(Math.min(contentBufferSize, (maxContentSize - read)));
            }
            read += r;
        }
        buf.flip();

        if (read == maxContentSize) {
            // to mark truncation: check whether there is more content
            r = body.read(ByteBuffer.allocate(1));
            if (r > -1) {
                isTruncated.set(true);
                // read the remaining body (also to figure out the original length)
                long truncatedLength = r;
                ByteBuffer buffer = ByteBuffer.allocate(8192);
                try {
                    while ((r = body.read(buffer)) >= 0) {
                        buffer.clear();
                        truncatedLength += r;
                    }
                } catch (IOException e) {
                    // log and ignore, it's about unused content
                    LOG.info("Exception while determining length of truncation:", e);
                }
                isTruncated.setOriginalSize(read + truncatedLength);
                LOG.info(
                        "WARC payload of size {} is truncated to {} bytes for {}",
                        isTruncated.getOriginalSize(),
                        maxContentSize,
                        record.target());
            }
        }

        if (read == size) {
            // short-cut: return the buffer-internal array
            return buf.array();
        }

        // copy the buffers into the result byte[]
        byte[] arr = new byte[read];
        int pos = 0;
        for (ByteBuffer b : bufs) {
            r = b.remaining();
            b.get(arr, pos, r);
            pos += r;
        }
        buf.get(arr, pos, buf.remaining());
        return arr;
    }

    private static String httpHeadersVerbatim(HttpMessage http) {
        return new String(http.serializeHeader(), StandardCharsets.UTF_8);
    }

    private void addVerbatimHttpHeaders(
            Metadata metadata, WarcResponse response, HttpResponse http, HttpRequest request) {
        metadata.addValue(
                protocolMDprefix + ProtocolResponse.REQUEST_TIME_KEY,
                Long.toString(response.date().toEpochMilli()));
        if (response.ipAddress().isPresent()) {
            metadata.addValue(
                    protocolMDprefix + ProtocolResponse.RESPONSE_IP_KEY,
                    response.ipAddress().get().getHostAddress());
        }
        if (request != null) {
            metadata.addValue(
                    protocolMDprefix + ProtocolResponse.REQUEST_HEADERS_KEY,
                    httpHeadersVerbatim(request));
        }
        metadata.addValue(
                protocolMDprefix + ProtocolResponse.RESPONSE_HEADERS_KEY,
                httpHeadersVerbatim(http));
    }
    @Override
    public void open(
            Map<String, Object> conf, TopologyContext context, SpoutOutputCollector collector) {
        super.open(conf, context, collector);
        record = Optional.empty();
        maxContentSize = ConfUtils.getInt(conf, "http.content.limit", -1);
        if (maxContentSize == -1 || maxContentSize > Constants.MAX_ARRAY_SIZE) {
            // maximum possible payload length, must fit into an array
            maxContentSize = Constants.MAX_ARRAY_SIZE;
        }
        if (contentBufferSize > maxContentSize) {
            // no need to buffer more content than maximally used
            contentBufferSize = maxContentSize;
        }
        storeHTTPHeaders = ConfUtils.getBoolean(conf, "http.store.headers", false);
        protocolMDprefix =
                ConfUtils.getString(
                        conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, protocolMDprefix);
        int metricsTimeBucketSecs = ConfUtils.getInt(conf, "fetcher.metrics.time.bucket.secs", 10);
        eventCounter =
                context.registerMetric(
                        "warc_spout_counter", new MultiCountMetric(), metricsTimeBucketSecs);

        hdfsConfig = new Configuration();
        String configKey = ConfUtils.getString(conf, "hdfs.config.key", "hdfs");
        Map<String, Object> map = (Map<String, Object>) conf.get(configKey);
        if (map != null) {
            for (String key : map.keySet()) {
                this.hdfsConfig.set(key, String.valueOf(map.get(key)));
            }
        }
    }
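    /*
     * Summary of the configuration keys read in open() above (the values shown
     * are illustrative; only the defaults in the code are authoritative):
     *
     *   http.content.limit: 1048576            truncate payloads larger than 1 MiB
     *   http.store.headers: true               keep verbatim HTTP request/response headers
     *   fetcher.metrics.time.bucket.secs: 10   time bucket for the event counter metrics
     *   hdfs.config.key: "hdfs"                name of the config sub-map copied into the Hadoop Configuration
     */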
    @Override
    public void nextTuple() {
        if (!active) return;

        if (buffer.isEmpty()) {
            try {
                populateBuffer();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        if (warcReader == null && buffer.isEmpty()) {
            // input exhausted
            return;
        }

        if (!record.isPresent()) nextRecord();
        while (record.isPresent() && !isHttpResponse(record)) {
            String warcType = record.get().type();
            if (warcType == null) {
                LOG.warn("No type for {}", record.get().getClass());
            } else {
                eventCounter.scope("warc_skipped_record_of_type_" + warcType).incr();
                LOG.debug("Skipped WARC record of type {}", warcType);
            }
            if (storeHTTPHeaders && record.get() instanceof WarcRequest) {
                // store request records to be able to add the HTTP request
                // header to the metadata
                precedingWarcRequest = (WarcRequest) record.get();
                try {
                    // need to read and parse the HTTP header right now
                    // (otherwise it's skipped)
                    precedingWarcRequest.http();
                } catch (IOException e) {
                    LOG.error(
                            "Failed to read HTTP request for {} in {}: {}",
                            precedingWarcRequest.target(),
                            warcFileInProgress,
                            e);
                    precedingWarcRequest = null;
                }
            }
            nextRecord();
        }
        if (!record.isPresent()) return;

        eventCounter.scope("warc_http_response_record").incr();
        WarcResponse w = (WarcResponse) record.get();
        String url = w.target();
        HttpResponse http;
        try {
            http = w.http();
        } catch (IOException e) {
            LOG.error("Failed to read HTTP response for {} in {}: {}", url, warcFileInProgress, e);
            nextRecord();
            return;
        }

        LOG.info("Fetched {} with status {}", url, http.status());
        eventCounter.scope("fetched").incrBy(1);
        final Status status = Status.fromHTTPCode(http.status());
        eventCounter.scope("status_" + http.status()).incrBy(1);

        Metadata metadata = new Metadata();
        // add the HTTP status code expected by schedulers
        metadata.addValue("fetch.statusCode", Integer.toString(http.status()));
        // add the time when the page was fetched (capture time)
        metadata.addValue(
                protocolMDprefix + ProtocolResponse.REQUEST_TIME_KEY,
                Long.toString(w.date().toEpochMilli()));
        // add the HTTP response headers to the metadata
        for (Map.Entry<String, List<String>> e : http.headers().map().entrySet()) {
            metadata.addValues(protocolMDprefix + e.getKey(), e.getValue());
        }
        if (storeHTTPHeaders) {
            // if recording HTTP headers: add the IP address, fetch date time,
            // literal request and response headers
            HttpRequest req = null;
            if (precedingWarcRequest != null
                    && (w.concurrentTo().contains(precedingWarcRequest.id())
                            || w.target().equals(precedingWarcRequest.target()))) {
                try {
                    req = precedingWarcRequest.http();
                } catch (IOException e) {
                    // ignore, missing HTTP request headers are not an issue
                }
            }
            addVerbatimHttpHeaders(metadata, w, http, req);
        }

        // add WARC record information
        metadata.addValue("warc.file.name", warcFileInProgress);
        long offset = warcReader.position();
        metadata.addValue("warc.record.offset", Long.toString(offset));
        /*
         * note: warc.record.length must be calculated after the WARC record has
         * been entirely processed
         */

        if (status == Status.FETCHED && http.status() != 304) {
            byte[] content;
            TruncationStatus isTruncated = new TruncationStatus();
            try {
                content = getContent(w, isTruncated);
            } catch (IOException e) {
                LOG.error("Failed to read payload for {} in {}: {}", url, warcFileInProgress, e);
                content = new byte[0];
            }
            eventCounter.scope("bytes_fetched").incrBy(content.length);
            if (isTruncated.get() || w.truncated() != WarcTruncationReason.NOT_TRUNCATED) {
                WarcTruncationReason reason = WarcTruncationReason.LENGTH;
                if (w.truncated() != WarcTruncationReason.NOT_TRUNCATED) {
                    reason = w.truncated();
                }
                metadata.setValue(protocolMDprefix + ProtocolResponse.TRIMMED_RESPONSE_KEY, "true");
                metadata.setValue(
                        protocolMDprefix + ProtocolResponse.TRIMMED_RESPONSE_REASON_KEY,
                        reason.toString().toLowerCase(Locale.ROOT));
            }
            nextRecord(offset, metadata); // proceed and calculate the record length
            _collector.emit(new Values(url, content, metadata), url);
            return;
        }

        nextRecord(offset, metadata); // proceed and calculate the record length
        // redirects, 404s, etc.
        _collector.emit(Constants.StatusStreamName, new Values(url, metadata, status), url);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declareStream(Constants.StatusStreamName, new Fields("url", "metadata", "status"));
        declarer.declare(new Fields("url", "content", "metadata"));
    }

    @Override
    public void fail(Object msgId) {
        LOG.error("Failed - unable to replay WARC record of: {}", msgId);
    }
}
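For orientation, here is a minimal sketch of wiring this spout into a Storm topology. It assumes Storm 2.x and StormCrawler on the classpath; the PrintBolt, the WARC path, and the topology name are illustrative placeholders, while the spout constructor, the emitted field names, and the configuration keys are taken from the class above. The status stream declared by the spout is left unconsumed here for brevity.

import java.util.Map;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
import com.digitalpebble.stormcrawler.warc.WARCSpout;

public class WARCSpoutExample {

    /** Trivial sink bolt: prints the URL and content length of each capture. */
    public static class PrintBolt extends BaseRichBolt {
        private OutputCollector collector;

        @Override
        public void prepare(Map<String, Object> conf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
        }

        @Override
        public void execute(Tuple input) {
            byte[] content = input.getBinaryByField("content");
            System.out.println(input.getStringByField("url") + " (" + content.length + " bytes)");
            collector.ack(input);
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {}
    }

    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        // read a single WARC file (hypothetical path)
        builder.setSpout("warc", new WARCSpout("/data/warc/part-001.warc.gz"));
        // consume the default stream's (url, content, metadata) tuples
        builder.setBolt("print", new PrintBolt()).localOrShuffleGrouping("warc");

        Config conf = new Config();
        conf.put("http.content.limit", 1024 * 1024); // truncate payloads above 1 MiB
        conf.put("http.store.headers", true);        // record verbatim HTTP headers in metadata

        try (LocalCluster cluster = new LocalCluster()) {
            cluster.submitTopology("warc-reader", conf, builder.createTopology());
            Thread.sleep(60_000); // let the spout run for a minute, then shut down
        }
    }
}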