/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.writer;

import static org.archive.format.warc.WARCConstants.FTP_CONTROL_CONVERSATION_MIMETYPE;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_PROFILE;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_TRUNCATED;
import static org.archive.format.warc.WARCConstants.HTTP_REQUEST_MIMETYPE;
import static org.archive.format.warc.WARCConstants.HTTP_RESPONSE_MIMETYPE;
import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_HEAD;
import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_TIME;
import static org.archive.format.warc.WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST;
import static org.archive.format.warc.WARCConstants.TYPE;
import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL;
import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION;
import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;
import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS;
import static org.archive.modules.CoreAttributeConstants.A_WARC_STATS;
import static org.archive.modules.CoreAttributeConstants.HEADER_TRUNC;
import static org.archive.modules.CoreAttributeConstants.LENGTH_TRUNC;
import static org.archive.modules.CoreAttributeConstants.TIMER_TRUNC;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_COUNT;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILENAME;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILE_OFFSET;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ReplayInputStream;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPool;
import org.archive.io.warc.WARCWriterPoolSettings;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.revisit.IdenticalPayloadDigestRevisit;
import org.archive.modules.revisit.RevisitProfile;
import org.archive.spring.ConfigPath;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * WARCWriterProcessor.
 * Intends to follow the WARC/1.0 specification.
 * 
 * TODO: Remove ANVLRecord. Rename NameValue or use RFC822
 * (commons-httpclient?) or find something else.
 *
 * @author stack
 */
public class WARCWriterProcessor extends WriterPoolProcessor
        implements WARCWriterPoolSettings {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 6182850087635847443L;

    private static final Logger logger =
            Logger.getLogger(WARCWriterProcessor.class.getName());

    private ConcurrentMap<String, ConcurrentMap<String, AtomicLong>> stats =
            new ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>>();
    public ConcurrentMap<String, ConcurrentMap<String, AtomicLong>> getStats() {
        return stats;
    }

    private AtomicLong urlsWritten = new AtomicLong();

    public long getDefaultMaxFileSize() {
        return 1000000000L; // 1 SI giga-byte (10^9 bytes), per WARC appendix A
    }

    public List<ConfigPath> getDefaultStorePaths() {
        List<ConfigPath> paths = new ArrayList<ConfigPath>();
        paths.add(new ConfigPath("warcs default store path", "warcs"));
        return paths;
    }

    /**
     * Whether to write 'request' type records. Default is true.
     */
    {
        setWriteRequests(true);
    }
    public boolean getWriteRequests() {
        return (Boolean) kp.get("writeRequests");
    }
    public void setWriteRequests(boolean writeRequests) {
        kp.put("writeRequests", writeRequests);
    }

    /**
     * Whether to write 'metadata' type records. Default is true.
     */
    {
        setWriteMetadata(true);
    }
    public boolean getWriteMetadata() {
        return (Boolean) kp.get("writeMetadata");
    }
    public void setWriteMetadata(boolean writeMetadata) {
        kp.put("writeMetadata", writeMetadata);
    }

    /**
     * Generator for record IDs
     */
    protected RecordIDGenerator generator = new UUIDGenerator();
    public RecordIDGenerator getRecordIDGenerator() {
        return generator;
    }
    public void setRecordIDGenerator(RecordIDGenerator generator) {
        this.generator = generator;
    }

    @Deprecated
    public void setWriteRevisitForIdenticalDigests(boolean writeRevisits) {
        logger.warning("setting writeRevisitForIdenticalDigests is deprecated, value ignored");
    }

    @Deprecated
    public void setWriteRevisitForNotModified(boolean writeRevisits) {
        logger.warning("setting writeRevisitForNotModified is deprecated, value ignored");
    }

    private transient List<String> cachedMetadata;

    public WARCWriterProcessor() {
    }

    @Override
    protected void setupPool(final AtomicInteger serialNo) {
        setPool(new WARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs()));
    }

    /**
     * Writes a CrawlURI and its associated data to store file.
     *
     * Currently this method understands the following uri types: dns, http, and
     * https.
     *
     * @param curi CrawlURI to process.
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi) {
        String scheme = curi.getUURI().getScheme().toLowerCase();
        try {
            if (shouldWrite(curi)) {
                return write(scheme, curi);
            } else {
                copyForwardWriteTagIfDupe(curi);
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            logger.log(Level.SEVERE, "Failed write of Records: " + curi.toString(), e);
        }
        return ProcessResult.PROCEED;
    }

    protected ProcessResult write(final String lowerCaseScheme, final CrawlURI curi)
            throws IOException {
        WARCWriter writer = (WARCWriter) getPool().borrowFile();

        // Reset writer temp stats so they reflect only this set of records.
        writer.resetTmpStats();
        writer.resetTmpRecordLog();

        long position = writer.getPosition();
        try {
            // Roll over to new warc file if we've exceeded maxBytes.
            writer.checkSize();
            if (writer.getPosition() != position) {
                // We rolled over to a new warc and wrote a warcinfo record.
                // Tally stats and reset temp stats, to avoid including warcinfo
                // record in stats for current url.
                setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position));
                addStats(writer.getTmpStats());
                writer.resetTmpStats();
                writer.resetTmpRecordLog();
                position = writer.getPosition();
            }

            // Write a request, response, and metadata all in the one
            // 'transaction'.
            final URI baseid = getRecordID();
            final String timestamp = ArchiveUtils.getLog14Date(curi.getFetchBeginTime());
            if (lowerCaseScheme.startsWith("http")) {
                writeHttpRecords(curi, writer, baseid, timestamp);
            } else if (lowerCaseScheme.equals("dns")) {
                writeDnsRecords(curi, writer, baseid, timestamp);
            } else if (lowerCaseScheme.equals("ftp")) {
                writeFtpRecords(writer, curi, baseid, timestamp);
            } else if (lowerCaseScheme.equals("whois")) {
                writeWhoisRecords(writer, curi, baseid, timestamp);
            } else {
                logger.warning("No handler for scheme " + lowerCaseScheme);
            }
        } catch (IOException e) {
            // Invalidate this file (It gets a '.invalid' suffix).
            getPool().invalidateFile(writer);
            // Set the writer to null otherwise the pool accounting
            // of how many active writers gets skewed if we subsequently
            // do a returnWriter call on this object in the finally block.
            writer = null;
            throw e;
        } finally {
            if (writer != null) {
                updateMetadataAfterWrite(curi, writer, position);
                getPool().returnFile(writer);
            }
        }

        // XXX this looks wrong, check should happen *before* writing the
        // record, the way checkBytesWritten() currently works
        return checkBytesWritten();
    }

    protected Map<String, Map<String, Long>> copyStats(Map<String, Map<String, Long>> orig) {
        Map<String, Map<String, Long>> copy = new HashMap<String, Map<String, Long>>(orig.size());
        for (String k: orig.keySet()) {
            copy.put(k, new HashMap<String, Long>(orig.get(k)));
        }
        return copy;
    }

    protected void updateMetadataAfterWrite(final CrawlURI curi,
            WARCWriter writer, long startPosition) {
        if (WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.NUM_RECORDS) > 0l) {
            addStats(writer.getTmpStats());
            urlsWritten.incrementAndGet();
        }
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("wrote "
                    + WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK)
                    + " bytes to " + writer.getFile().getName() + " for " + curi);
        }
        setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - startPosition));
        curi.addExtraInfo("warcFilename", writer.getFilenameWithoutOccupiedSuffix());
        curi.addExtraInfo("warcFileOffset", startPosition);

        curi.getData().put(A_WARC_STATS, copyStats(writer.getTmpStats()));

        // history for uri-based dedupe
        Map<String, Object>[] history = curi.getFetchHistory();
        if (history != null && history[0] != null) {
            history[0].put(A_WRITE_TAG, writer.getFilenameWithoutOccupiedSuffix());
        }

        // history for uri-agnostic, content digest based dedupe
        if (curi.getContentDigest() != null && curi.hasContentDigestHistory()) {
            for (WARCRecordInfo warcRecord: writer.getTmpRecordLog()) {
                if ((warcRecord.getType() == WARCRecordType.response
                            || warcRecord.getType() == WARCRecordType.resource)
                        && warcRecord.getContentStream() != null
                        && warcRecord.getContentLength() > 0) {
                    curi.getContentDigestHistory().put(A_ORIGINAL_URL, warcRecord.getUrl());
                    curi.getContentDigestHistory().put(A_WARC_RECORD_ID, warcRecord.getRecordId().toString());
                    curi.getContentDigestHistory().put(A_WARC_FILENAME, warcRecord.getWARCFilename());
                    curi.getContentDigestHistory().put(A_WARC_FILE_OFFSET, warcRecord.getWARCFileOffset());
                    curi.getContentDigestHistory().put(A_ORIGINAL_DATE, warcRecord.getCreate14DigitDate());
                    curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, 1);
                } else if (warcRecord.getType() == WARCRecordType.revisit
                        && curi.getRevisitProfile() instanceof IdenticalPayloadDigestRevisit) {
                    Integer oldCount = (Integer) curi.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT);
                    if (oldCount == null) {
                        // shouldn't happen, log a warning?
                        oldCount = 1;
                    }
                    curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, oldCount + 1);
                }
            }
        }
    }

    protected void addStats(Map<String, Map<String, Long>> substats) {
        for (String key: substats.keySet()) {
            // intentionally redundant here -- if statement avoids creating
            // unused empty map every time; putIfAbsent() ensures thread safety
            if (stats.get(key) == null) {
                stats.putIfAbsent(key, new ConcurrentHashMap<String, AtomicLong>());
            }
            for (String subkey: substats.get(key).keySet()) {
                AtomicLong oldValue = stats.get(key).get(subkey);
                if (oldValue == null) {
                    oldValue = stats.get(key).putIfAbsent(subkey, new AtomicLong(substats.get(key).get(subkey)));
                }
                if (oldValue != null) {
                    oldValue.addAndGet(substats.get(key).get(subkey));
                }
            }
        }
    }

    protected void writeDnsRecords(final CrawlURI curi, WARCWriter w,
            final URI baseid, final String timestamp) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(curi.getContentType());
        recordInfo.setRecordId(baseid);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);

        String ip = (String) curi.getData().get(A_DNS_SERVER_IP_LABEL);
        if (ip != null && ip.length() > 0) {
            recordInfo.addExtraHeader(HEADER_KEY_IP, ip);
        }

        ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);

        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        recordInfo.getRecordId();
    }

    protected void writeWhoisRecords(WARCWriter w, CrawlURI curi, URI baseid,
            String timestamp) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(curi.getContentType());
        recordInfo.setRecordId(baseid);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);

        Object whoisServerIP = curi.getData().get(CoreAttributeConstants.A_WHOIS_SERVER_IP);
        if (whoisServerIP != null) {
            recordInfo.addExtraHeader(HEADER_KEY_IP, whoisServerIP.toString());
        }

        ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);

        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        recordInfo.getRecordId();
    }

    protected void writeHttpRecords(final CrawlURI curi, WARCWriter w,
            final URI baseid, final String timestamp) throws IOException {
        // Add named fields for ip, checksum, and relate the metadata
        // and request to the resource field.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or
        // use RFC822 (commons-httpclient?).
        ANVLRecord headers = new ANVLRecord();
        headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));

        URI rid;
        if (curi.isRevisit()) {
            rid = writeRevisit(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers);
        } else {
            if (curi.getContentDigest() != null) {
                headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString());
            }
            // Check for truncated annotation
            String value = null;
            Collection<String> anno = curi.getAnnotations();
            if (anno.contains(TIMER_TRUNC)) {
                value = NAMED_FIELD_TRUNCATED_VALUE_TIME;
            } else if (anno.contains(LENGTH_TRUNC)) {
                value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
            } else if (anno.contains(HEADER_TRUNC)) {
                value = NAMED_FIELD_TRUNCATED_VALUE_HEAD;
            }
            // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED
            if (value != null) {
                headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
            }
            rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers);
        }

        headers = new ANVLRecord();
        headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');

        if (getWriteRequests()) {
            writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE, baseid, curi, headers);
        }
        if (getWriteMetadata()) {
            writeMetadata(w, timestamp, baseid, curi, headers);
        }
    }

    protected void writeFtpRecords(WARCWriter w, final CrawlURI curi,
            final URI baseid, final String timestamp) throws IOException {
        ANVLRecord headers = new ANVLRecord();
        headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));

        String controlConversation = curi.getData().get(A_FTP_CONTROL_CONVERSATION).toString();
        URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation);

        if (curi.getContentDigest() != null) {
            headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString());
        }

        if (curi.getRecorder() != null) {
            if (curi.isRevisit()) {
                rid = writeRevisit(w, timestamp, null, baseid, curi, headers, 0);
            } else {
                headers = new ANVLRecord();
                // Check for truncated annotation
                String value = null;
                Collection<String> anno = curi.getAnnotations();
                if (anno.contains(TIMER_TRUNC)) {
                    value = NAMED_FIELD_TRUNCATED_VALUE_TIME;
                } else if (anno.contains(LENGTH_TRUNC)) {
                    value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
                } else if (anno.contains(HEADER_TRUNC)) {
                    value = NAMED_FIELD_TRUNCATED_VALUE_HEAD;
                }
                // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED
                if (value != null) {
                    headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
                }
                if (curi.getContentDigest() != null) {
                    headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString());
                }
                headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
                rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers);
            }
        }
        if (getWriteMetadata()) {
            headers = new ANVLRecord();
            headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
            writeMetadata(w, timestamp, baseid, curi, headers);
        }
    }

    protected URI writeFtpControlConversation(WARCWriter w, String timestamp,
            URI baseid, CrawlURI curi, ANVLRecord headers,
            String controlConversation) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setUrl(curi.toString());
        recordInfo.setMimetype(FTP_CONTROL_CONVERSATION_MIMETYPE);
        recordInfo.setExtraHeaders(headers);
        recordInfo.setEnforceLength(true);
        recordInfo.setType(WARCRecordType.metadata);
        recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));

        byte[] b = controlConversation.getBytes("UTF-8");
        recordInfo.setContentStream(new ByteArrayInputStream(b));
        recordInfo.setContentLength((long) b.length);

        w.writeRecord(recordInfo);
        return recordInfo.getRecordId();
    }

    protected URI writeRequest(final WARCWriter w, final String timestamp,
            final String mimetype, final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.request);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setContentLength(curi.getRecorder().getRecordedOutput().getSize());
        recordInfo.setEnforceLength(true);

        final URI uid = qualifyRecordID(baseid, TYPE, WARCRecordType.request.toString());
        recordInfo.setRecordId(uid);

        ReplayInputStream ris = curi.getRecorder().getRecordedOutput().getReplayInputStream();
        recordInfo.setContentStream(ris);

        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        return recordInfo.getRecordId();
    }

    protected URI writeResponse(final WARCWriter w, final String timestamp,
            final String mimetype, final URI baseid, final CrawlURI curi,
            final ANVLRecord suppliedFields) throws IOException {
        ANVLRecord namedFields = suppliedFields;
        if (curi.getData().containsKey(A_WARC_RESPONSE_HEADERS)) {
            namedFields = namedFields.clone();
            for (Object headerObj : curi.getDataList(A_WARC_RESPONSE_HEADERS)) {
                String[] kv = StringUtils.split(((String) headerObj), ":", 2);
                namedFields.addLabelValue(kv[0].trim(), kv[1].trim());
            }
        }

        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setRecordId(baseid);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);

        ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);

        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        return recordInfo.getRecordId();
    }

    protected URI writeResource(final WARCWriter w, final String timestamp,
            final String mimetype, final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.resource);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setRecordId(baseid);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);

        ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);

        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        return recordInfo.getRecordId();
    }

    protected URI writeRevisit(final WARCWriter w, final String timestamp,
            final String mimetype, final URI baseid, final CrawlURI curi,
            final ANVLRecord headers) throws IOException {
        long revisedLength = 0; // By default, truncate all data
        if (curi.getRevisitProfile().getProfileName().equals(PROFILE_REVISIT_IDENTICAL_DIGEST)) {
            // Save response from identical digest matches
            revisedLength = curi.getRecorder().getRecordedInput().getContentBegin();
            revisedLength = revisedLength > 0
                    ? revisedLength
                    : curi.getRecorder().getRecordedInput().getSize();
        }
        return writeRevisit(w, timestamp, mimetype, baseid, curi, headers, revisedLength);
    }

    protected URI writeRevisit(final WARCWriter w, final String timestamp,
            final String mimetype, final URI baseid, final CrawlURI curi,
            final ANVLRecord headers, final long contentLength) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.revisit);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setRecordId(baseid);
        recordInfo.setContentLength(contentLength);
        recordInfo.setEnforceLength(false);

        RevisitProfile revisitProfile = curi.getRevisitProfile();

        headers.addLabelValue(HEADER_KEY_PROFILE, revisitProfile.getProfileName());
        headers.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);

        Map<String, String> revisitHeaders = revisitProfile.getWarcHeaders();
        if (!revisitHeaders.isEmpty()) {
            recordInfo.setExtraHeaders(headers);
            for (String key : revisitHeaders.keySet()) {
                headers.addLabelValue(key, revisitHeaders.get(key));
            }
        }

        ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);

        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        return recordInfo.getRecordId();
    }

    /**
     * Saves a header from the given HTTP operation into the
     * provider headers under a new name
     */
    protected void saveHeader(CrawlURI curi, ANVLRecord warcHeaders,
            String origName, String newName) {
        String value = curi.getHttpResponseHeader(origName);
        if (value != null) {
            warcHeaders.addLabelValue(newName, value);
        }
    }

    protected URI writeMetadata(final WARCWriter w, final String timestamp,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.metadata);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(ANVLRecord.MIMETYPE);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setEnforceLength(true);
        recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));

        // Get some metadata from the curi.
        // TODO: Get all curi metadata.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or use
        // RFC822 (commons-httpclient?).
        ANVLRecord r = new ANVLRecord();
        if (curi.isSeed()) {
            r.addLabel("seed");
        } else {
            if (curi.forceFetch()) {
                r.addLabel("force-fetch");
            }
            if (StringUtils.isNotBlank(flattenVia(curi))) {
                r.addLabelValue("via", flattenVia(curi));
            }
            if (StringUtils.isNotBlank(curi.getPathFromSeed())) {
                r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
            }
            if (curi.containsDataKey(A_SOURCE_TAG)) {
                r.addLabelValue("sourceTag", (String) curi.getData().get(A_SOURCE_TAG));
            }
        }
        long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
        if (duration > -1) {
            r.addLabelValue("fetchTimeMs", Long.toString(duration));
        }

        if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
            r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString());
        }

        if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
            r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
        }

        for (String annotation: curi.getAnnotations()) {
            if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
                String[] kv = annotation.split(":", 2);
                r.addLabelValue(kv[0], kv[1]);
            }
        }

        // Add outlinks though they are effectively useless without anchor text.
        Collection<CrawlURI> links = curi.getOutLinks();
        if (links != null && links.size() > 0) {
            for (CrawlURI link: links) {
                r.addLabelValue("outlink", link.getURI() + " " + link.getLastHop() + " " + link.getViaContext());
            }
        }

        // TODO: Other curi fields to write to metadata.
        //
        // Credentials
        //
        // fetch-began-time: 1154569278774
        // fetch-completed-time: 1154569281816
        //
        // Annotations.

        byte[] b = r.getUTF8Bytes();
        recordInfo.setContentStream(new ByteArrayInputStream(b));
        recordInfo.setContentLength((long) b.length);

        w.writeRecord(recordInfo);

        return recordInfo.getRecordId();
    }

    protected URI getRecordID() throws IOException {
        return generator.getRecordID();
    }

    protected URI qualifyRecordID(final URI base, final String key,
            final String value) throws IOException {
        Map<String, String> qualifiers = new HashMap<String, String>(1);
        qualifiers.put(key, value);
        return generator.qualifyRecordID(base, qualifiers);
    }

    public List<String> getMetadata() {
        if (cachedMetadata != null) {
            return cachedMetadata;
        }
        ANVLRecord record = new ANVLRecord();
        record.addLabelValue("software", "Heritrix/" +
                ArchiveUtils.VERSION + " http://crawler.archive.org");
        try {
            InetAddress host = InetAddress.getLocalHost();
            record.addLabelValue("ip", host.getHostAddress());
            record.addLabelValue("hostname", host.getCanonicalHostName());
        } catch (UnknownHostException e) {
            logger.log(Level.WARNING, "unable top obtain local crawl engine host", e);
        }

        // conforms to ISO 28500:2009 as of May 2009
        // as described at http://bibnum.bnf.fr/WARC/
        // latest draft as of November 2008
        record.addLabelValue("format", "WARC File Format 1.0");
        record.addLabelValue("conformsTo",
                "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

        // Get other values from metadata provider
        CrawlMetadata provider = getMetadataProvider();
        addIfNotBlank(record, "operator", provider.getOperator());
        addIfNotBlank(record, "publisher", provider.getOrganization());
        addIfNotBlank(record, "audience", provider.getAudience());
        addIfNotBlank(record, "isPartOf", provider.getJobName());

        // TODO: make date match 'job creation date' as in Heritrix 1.x
        // until then, leave out (plenty of dates already in WARC records)
//        String rawDate = provider.getBeginDate();
//        if (StringUtils.isNotBlank(rawDate)) {
//            Date date;
//            try {
//                date = ArchiveUtils.parse14DigitDate(rawDate);
//                addIfNotBlank(record, "created", ArchiveUtils.getLog14Date(date));
//            } catch (ParseException e) {
//                logger.log(Level.WARNING, "obtaining warc created date", e);
//            }
//        }

        addIfNotBlank(record, "description", provider.getDescription());
        addIfNotBlank(record, "robots", provider.getRobotsPolicyName().toLowerCase());
        addIfNotBlank(record, "http-header-user-agent", provider.getUserAgent());
        addIfNotBlank(record, "http-header-from", provider.getOperatorFrom());

        // really ugly to return as List<String>, but changing would require
        // larger refactoring
        return Collections.singletonList(record.toString());
    }

    protected void addIfNotBlank(ANVLRecord record, String label, String value) {
        if (StringUtils.isNotBlank(value)) {
            record.addLabelValue(label, value);
        }
    }

    @Override
    protected JSONObject toCheckpointJson() throws JSONException {
        JSONObject json = super.toCheckpointJson();
        json.put("urlsWritten", urlsWritten);
        json.put("stats", stats);
        return json;
    }

    @Override
    protected void fromCheckpointJson(JSONObject json) throws JSONException {
        super.fromCheckpointJson(json);

        // conditionals below are for backward compatibility with old checkpoints
        if (json.has("urlsWritten")) {
            urlsWritten.set(json.getLong("urlsWritten"));
        }
        if (json.has("stats")) {
            HashMap<String, Map<String, Long>> cpStats = new HashMap<String, Map<String, Long>>();
            JSONObject jsonStats = json.getJSONObject("stats");
            if (JSONObject.getNames(jsonStats) != null) {
                for (String key1: JSONObject.getNames(jsonStats)) {
                    JSONObject jsonSubstats = jsonStats.getJSONObject(key1);
                    if (!cpStats.containsKey(key1)) {
                        cpStats.put(key1, new HashMap<String, Long>());
                    }
                    Map<String, Long> substats = cpStats.get(key1);
                    for (String key2: JSONObject.getNames(jsonSubstats)) {
                        long value = jsonSubstats.getLong(key2);
                        substats.put(key2, value);
                    }
                }
                addStats(cpStats);
            }
        }
    }

    @Override
    public String report() {
        // XXX note in report that stats include recovered checkpoint?
        logger.info("final stats: " + stats);

        StringBuilder buf = new StringBuilder();
        buf.append("Processor: " + getClass().getName() + "\n");
        buf.append("  Function: Writes WARCs\n");
        buf.append("  Total CrawlURIs: " + urlsWritten + "\n");
        buf.append("  Revisit records: "
                + WARCWriter.getStat(stats, WARCRecordType.revisit.toString(), WARCWriter.NUM_RECORDS) + "\n");

        long bytes = WARCWriter.getStat(stats, WARCRecordType.response.toString(), WARCWriter.CONTENT_BYTES)
                + WARCWriter.getStat(stats, WARCRecordType.resource.toString(), WARCWriter.CONTENT_BYTES);
        buf.append("  Crawled content bytes (including http headers): "
                + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n");

        bytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES);
        buf.append("  Total uncompressed bytes (including all warc records): "
                + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n");

        buf.append("  Total size on disk (" + (getCompress() ? "compressed" : "uncompressed") + "): "
                + getTotalBytesWritten() + " (" + ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten()) + ")\n");

        return buf.toString();
    }
}
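
/*
 * Usage sketch (editorial addition, not part of the Heritrix source). In a
 * crawl job this processor is normally wired up as a Spring bean in the job's
 * crawler-beans.cxml rather than constructed directly. A rough programmatic
 * equivalent using the setters defined in this class would look like the
 * following; setCompress() is assumed to be inherited from
 * WriterPoolProcessor, since report() above calls getCompress().
 *
 *   WARCWriterProcessor warcWriter = new WARCWriterProcessor();
 *   warcWriter.setWriteRequests(true);                    // also emit 'request' records
 *   warcWriter.setWriteMetadata(true);                    // also emit 'metadata' records
 *   warcWriter.setRecordIDGenerator(new UUIDGenerator()); // default record-id scheme
 *   warcWriter.setCompress(true);                         // gzip records (assumed inherited setter)
 *
 * Once installed in the disposition chain, innerProcessResult() routes each
 * fetched CrawlURI by scheme to the http/dns/ftp/whois record writers above.
 */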




