/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.lang.StringUtils;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.util.TextUtils;
/**
* A processor for calculating custom HTTP content digests in place of the
* default (if any) computed by the HTTP fetcher processors.
*
 * This processor allows the user to specify a regular expression via the
 * stripRegex property. Any segment of a document (text only; binary files
 * will be skipped) that matches this regular expression will be rewritten
 * with the blank character (character 32 in the ANSI character set) for the
 * purpose of the digest. This has no effect on the document for subsequent
 * processing or archiving.
*
* NOTE: Content digest only accounts for the document body, not headers.
*
 * The operator may also specify a maximum length for documents evaluated by
 * this processor (the maxSizeToDigest property). Documents exceeding that
 * length will be ignored.
*
* To further discriminate by file type or URL, an operator should use the
* override and refinement options.
*
 * It is generally recommended that this recalculation only be performed when
 * absolutely needed (that is, when stripping data that changes automatically
 * each time the URL is fetched), as it is an expensive operation.
*
* NOTE: This processor may open a ReplayCharSequence from the
* CrawlURI's Recorder, without closing that ReplayCharSequence, to allow
* reuse by later processors in sequence. In the usual (Heritrix) case, a
* call after all processing to the Recorder's endReplays() method ensures
* timely close of any reused ReplayCharSequences. Reuse of this processor
* elsewhere should ensure a similar cleanup call to Recorder.endReplays()
* occurs.
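 *
 * As a rough configuration sketch (not a drop-in recipe), the processor can
 * be declared as a bean in a Heritrix 3 crawler-beans.cxml and placed in the
 * processor chain after the HTTP fetcher. The bean id and the regular
 * expression below are illustrative only; the property names match this
 * class's setters:
 * <pre>{@code
 * <bean id="httpContentDigest"
 *       class="org.archive.modules.extractor.HTTPContentDigest">
 *   <property name="stripRegex" value="jsessionid=[0-9A-Za-z]+"/>
 *   <property name="maxSizeToDigest" value="1048576"/>
 * </bean>
 * }</pre>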
*
* @author Kristinn Sigurdsson
*/
public class HTTPContentDigest extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
private static Logger logger =
Logger.getLogger(HTTPContentDigest.class.getName());
/**
* A regular expression that matches those portions of downloaded documents
* that need to be ignored when calculating the content digest. Segments
* matching this expression will be rewritten with the blank character for
* the content digest.
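 *
 * A brief programmatic sketch; the regular expression shown is illustrative
 * only (for instance, to ignore HTML comments that embed per-request
 * timestamps):
 * <pre>{@code
 * HTTPContentDigest digester = new HTTPContentDigest();
 * digester.setStripRegex("<!--.*?-->");
 * digester.setMaxSizeToDigest(1024 * 1024); // 1MB, the default set below
 * }</pre>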
*/
{
setStripRegex("");
}
public String getStripRegex() {
return (String) kp.get("stripRegex");
}
public void setStripRegex(String regex) {
kp.put("stripRegex",regex);
}
    /** Maximum size of documents to digest - longer documents will be ignored. -1 = unlimited. */
{
setMaxSizeToDigest(1*1024*1024L); // 1MB
}
public long getMaxSizeToDigest() {
return (Long) kp.get("maxSizeToDigest");
}
public void setMaxSizeToDigest(long threshold) {
kp.put("maxSizeToDigest",threshold);
}
private static final String SHA1 = "SHA1";
/**
* Constructor.
*/
public HTTPContentDigest() {
}
protected boolean shouldProcess(CrawlURI uri) {
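        // Only text documents are digested here; binary content is skipped
        // (see class javadoc).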
if (!uri.getContentType().startsWith("text")) {
return false;
}
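        // Honor the configured size limit; -1 means no limit.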
long maxSize = getMaxSizeToDigest();
        if ((maxSize > -1) && (maxSize < uri.getContentSize())) {
return false;
}
return true;
}
protected void innerProcess(CrawlURI curi) throws InterruptedException {
// Ok, if we got this far we need to calculate the content digest.
// Get the regex
String regex = getStripRegex();
// Get a replay of the document character seq.
ReplayCharSequence cs = null;
try {
cs = curi.getRecorder().getContentReplayCharSequence();
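            // The ReplayCharSequence is deliberately left open so later
            // processors can reuse it; the Recorder's endReplays() call
            // closes it after processing (see class javadoc).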
// Create a MessageDigest
MessageDigest digest = null;
try {
digest = MessageDigest.getInstance(SHA1);
} catch (NoSuchAlgorithmException e1) {
                logger.severe("SHA1 message digest algorithm unavailable: " + e1);
return;
}
digest.reset();
String s = null;
if (StringUtils.isEmpty(regex)) {
s = cs.toString();
} else {
// Process the document
Matcher m = TextUtils.getMatcher(regex, cs);
s = m.replaceAll(" ");
TextUtils.recycleMatcher(m);
}
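            // Digest the (possibly stripped) text, encoded with the platform
            // default charset.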
digest.update(s.getBytes());
// Get the new digest value
byte[] newDigestValue = digest.digest();
// Save new digest value
curi.setContentDigest(SHA1, newDigestValue);
} catch (Exception e) {
curi.getNonFatalFailures().add(e);
logger.warning("Failed get of replay char sequence " +
curi.toString() + " " + e.getMessage() + " " +
Thread.currentThread().getName());
return; // Can't proceed if this happens.
}
}
}