All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.writer.WARCWriterChainProcessor Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
package org.archive.modules.writer;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.modules.warc.DnsResponseRecordBuilder;
import org.archive.modules.warc.FtpControlConversationRecordBuilder;
import org.archive.modules.warc.FtpResponseRecordBuilder;
import org.archive.modules.warc.HttpRequestRecordBuilder;
import org.archive.modules.warc.HttpResponseRecordBuilder;
import org.archive.modules.warc.MetadataRecordBuilder;
import org.archive.modules.warc.RevisitRecordBuilder;
import org.archive.modules.warc.WARCRecordBuilder;
import org.archive.modules.warc.WhoisResponseRecordBuilder;
import org.archive.spring.HasKeyedProperties;

/**
 * WARC writer processor. The types of records that to be written can be
 * configured by including or excluding {@link WARCRecordBuilder}
 * implementations (see {@link #setChain(List)}).
 *
 * 

This is the default chain: *

 *   <property name="chain">
 *    <list>
 *     <bean class="org.archive.modules.warc.DnsResponseRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.HttpResponseRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.WhoisResponseRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.FtpControlConversationRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.FtpResponseRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.RevisitRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.HttpRequestRecordBuilder"/>
 *     <bean class="org.archive.modules.warc.MetadataRecordBuilder"/>
 *    </list>
 *   </property>
 * 
* *

* Replaces {@link WARCWriterProcessor}. * * @see WARCRecordBuilder * @author nlevitt */ public class WARCWriterChainProcessor extends BaseWARCWriterProcessor implements HasKeyedProperties { private static final Logger logger = Logger.getLogger(WARCWriterChainProcessor.class.getName()); { setChain(Arrays.asList( new DnsResponseRecordBuilder(), new HttpResponseRecordBuilder(), new WhoisResponseRecordBuilder(), new FtpControlConversationRecordBuilder(), new FtpResponseRecordBuilder(), new RevisitRecordBuilder(), new HttpRequestRecordBuilder(), new MetadataRecordBuilder())); } @SuppressWarnings("unchecked") public List getChain() { return (List) kp.get("chain"); } public void setChain(List chain) { kp.put("chain", chain); } @Override protected boolean shouldWrite(CrawlURI curi) { if (getSkipIdenticalDigests() && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) { curi.getAnnotations().add(ANNOTATION_UNWRITTEN + ":identicalDigest"); return false; } // WARCWriterProcessor has seemingly unnecessarily complicated logic if (curi.getFetchStatus() <= 0) { curi.getAnnotations().add(ANNOTATION_UNWRITTEN + ":status"); return false; } return true; } @Override protected ProcessResult innerProcessResult(CrawlURI curi) { try { if (shouldWrite(curi)) { return write(curi); } else { copyForwardWriteTagIfDupe(curi); } } catch (IOException e) { curi.getNonFatalFailures().add(e); logger.log(Level.SEVERE, "Failed write of Records: " + curi.toString(), e); } return ProcessResult.PROCEED; } protected ProcessResult write(final CrawlURI curi) throws IOException { WARCWriter writer = (WARCWriter) getPool().borrowFile(); // Reset writer temp stats so they reflect only this set of records. writer.resetTmpStats(); writer.resetTmpRecordLog(); long position = writer.getPosition(); try { // Roll over to new warc file if we've exceeded maxBytes. writer.checkSize(); if (writer.getPosition() != position) { // We rolled over to a new warc and wrote a warcinfo record. // Tally stats and reset temp stats, to avoid including warcinfo // record in stats for current url. setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); addStats(writer.getTmpStats()); writer.resetTmpStats(); writer.resetTmpRecordLog(); position = writer.getPosition(); } writeRecords(curi, writer); } catch (IOException e) { // Invalidate this file (It gets a '.invalid' suffix). getPool().invalidateFile(writer); // Set the writer to null otherwise the pool accounting // of how many active writers gets skewed if we subsequently // do a returnWriter call on this object in the finally block. writer = null; throw e; } finally { if (writer != null) { updateMetadataAfterWrite(curi, writer, position); getPool().returnFile(writer); } } // XXX this looks wrong, check should happen *before* writing the // record, the way checkBytesWritten() currently works return checkBytesWritten(); } protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException { URI concurrentTo = null; for (WARCRecordBuilder recordBuilder: getChain()) { if (recordBuilder.shouldBuildRecord(curi)) { WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo); if (record != null) { writer.writeRecord(record); if (concurrentTo == null) { concurrentTo = record.getRecordId(); } } } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy