
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.extractor;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.lang.StringUtils;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.util.TextUtils;

/**
 * A processor for calculating custom HTTP content digests in place of the 
 * default (if any) computed by the HTTP fetcher processors.
 *
 * <p>
 * This processor allows the user to specify a regular expression called
 * strip-reg-expr. Any segment of a document (text only; binary files will
 * be skipped) that matches this regular expression will be rewritten with
 * the blank character (character 32 in the ANSI character set) for the
 * purpose of the digest. This has no effect on the document for subsequent
 * processing or archiving.
 * <p>
 * NOTE: The content digest only accounts for the document body, not headers.
 * <p>
 * The operator can also specify a maximum length for documents being
 * evaluated by this processor. Documents exceeding that length will be
 * ignored.
 * <p>
 * To further discriminate by file type or URL, an operator should use the
 * override and refinement options.
 * <p>
 * It is generally recommended that this recalculation only be performed
 * when absolutely needed (because of stripping data that changes
 * automatically each time the URL is fetched), as this is an expensive
 * operation.
 * <p>
 * NOTE: This processor may open a ReplayCharSequence from the CrawlURI's
 * Recorder, without closing that ReplayCharSequence, to allow reuse by
 * later processors in sequence. In the usual (Heritrix) case, a call after
 * all processing to the Recorder's endReplays() method ensures timely close
 * of any reused ReplayCharSequences. Reuse of this processor elsewhere
 * should ensure a similar cleanup call to Recorder.endReplays() occurs.
 *
 * @author Kristinn Sigurdsson
 */
public class HTTPContentDigest extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static Logger logger =
        Logger.getLogger(HTTPContentDigest.class.getName());

    /**
     * A regular expression that matches those portions of downloaded
     * documents that need to be ignored when calculating the content
     * digest. Segments matching this expression will be rewritten with the
     * blank character for the content digest.
     */
    {
        setStripRegex("");
    }
    public String getStripRegex() {
        return (String) kp.get("stripRegex");
    }
    public void setStripRegex(String regex) {
        kp.put("stripRegex", regex);
    }

    /** Maximum file size to digest - longer files will be ignored.
     *  -1 = unlimited. */
    {
        setMaxSizeToDigest(1*1024*1024L); // 1 MB
    }
    public long getMaxSizeToDigest() {
        return (Long) kp.get("maxSizeToDigest");
    }
    public void setMaxSizeToDigest(long threshold) {
        kp.put("maxSizeToDigest", threshold);
    }

    private static final String SHA1 = "SHA1";

    /**
     * Constructor.
     */
    public HTTPContentDigest() {
    }

    protected boolean shouldProcess(CrawlURI uri) {
        // Only text documents are candidates; binary files are skipped.
        if (!uri.getContentType().startsWith("text")) {
            return false;
        }
        // Ignore documents larger than the configured cap (-1 = unlimited).
        long maxSize = getMaxSizeToDigest();
        if ((maxSize > -1) && (maxSize < uri.getContentSize())) {
            return false;
        }
        return true;
    }

    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        // Ok, if we got this far we need to calculate the content digest.
        // Get the regex
        String regex = getStripRegex();

        // Get a replay of the document character seq.
        ReplayCharSequence cs = null;
        try {
            cs = curi.getRecorder().getContentReplayCharSequence();
            // Create a MessageDigest
            MessageDigest digest = null;
            try {
                digest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e1) {
                e1.printStackTrace();
                return;
            }

            digest.reset();

            String s = null;
            if (StringUtils.isEmpty(regex)) {
                s = cs.toString();
            } else {
                // Blank out segments matching the strip regex before digesting.
                Matcher m = TextUtils.getMatcher(regex, cs);
                s = m.replaceAll(" ");
                TextUtils.recycleMatcher(m);
            }
            // Note: getBytes() uses the platform default charset here.
            digest.update(s.getBytes());

            // Get the new digest value
            byte[] newDigestValue = digest.digest();

            // Save new digest value
            curi.setContentDigest(SHA1, newDigestValue);
        } catch (Exception e) {
            curi.getNonFatalFailures().add(e);
            logger.warning("Failed get of replay char sequence " +
                curi.toString() + " " + e.getMessage() + " " +
                Thread.currentThread().getName());
            return; // Can't proceed if this happens.
        }
    }
}
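
// ---------------------------------------------------------------------
// Illustration only - not part of the Heritrix source above. A minimal
// sketch of why the strip regex matters: two fetches of a page that
// differ only in a volatile segment (here, a hypothetical generated-at
// comment) digest to the same SHA1 value once the matched segment is
// blanked out, mirroring what innerProcess() does. The class name and
// sample strings below are invented for illustration.
// ---------------------------------------------------------------------
class StripDigestSketch {

    // Blank matching segments with a single space, then digest, just as
    // innerProcess() does via Matcher.replaceAll(" ").
    static byte[] strippedSha1(String body, String stripRegex)
            throws NoSuchAlgorithmException {
        String stripped = body.replaceAll(stripRegex, " ");
        return MessageDigest.getInstance("SHA1").digest(stripped.getBytes());
    }

    public static void main(String[] args) throws NoSuchAlgorithmException {
        String regex = "<!-- generated: [^>]* -->";
        String fetch1 = "<html><!-- generated: 2015-06-01 -->same body</html>";
        String fetch2 = "<html><!-- generated: 2015-06-02 -->same body</html>";
        // Prints true: same digest despite the differing timestamps.
        System.out.println(java.util.Arrays.equals(
                strippedSha1(fetch1, regex), strippedSha1(fetch2, regex)));
    }
}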

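// ---------------------------------------------------------------------
// Illustration only. In a Heritrix 3 crawl this processor is normally
// wired into the processing chain via the Spring crawler-beans
// configuration; the equivalent programmatic setup, using only the
// setters defined above, would look like this (the regex value is a
// hypothetical example):
// ---------------------------------------------------------------------
class HTTPContentDigestConfigSketch {
    static HTTPContentDigest newDigester() {
        HTTPContentDigest digester = new HTTPContentDigest();
        // Strip session tokens for digest purposes only.
        digester.setStripRegex("jsessionid=[0-9A-Za-z]+");
        // Digest documents up to 5 MB instead of the default 1 MB.
        digester.setMaxSizeToDigest(5 * 1024 * 1024L);
        return digester;
    }
}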

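// ---------------------------------------------------------------------
// Illustration only. Per the class comment, this processor may leave a
// ReplayCharSequence open for reuse by later processors, so a driver
// running it outside Heritrix should close replays once processing ends.
// This hypothetical driver assumes the inherited Processor.process()
// entry point and the Recorder.endReplays() call named in the class
// comment above.
// ---------------------------------------------------------------------
class ReplayCleanupSketch {
    static void digestOutsideHeritrix(HTTPContentDigest digester, CrawlURI curi)
            throws InterruptedException {
        try {
            digester.process(curi); // runs shouldProcess()/innerProcess()
        } finally {
            curi.getRecorder().endReplays(); // timely close of reused replays
        }
    }
}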