org.archive.modules.writer.Kw3WriterProcessor Maven / Gradle / Ivy
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
/* Created on 2006-Oct-03
*
* Copyright (C) 2006 National Library of Sweden.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package org.archive.modules.writer;
import static org.archive.modules.writer.Kw3Constants.ARCHIVE_TIME_KEY;
import static org.archive.modules.writer.Kw3Constants.COLLECTION_KEY;
import static org.archive.modules.writer.Kw3Constants.CONTENT_LENGTH_KEY;
import static org.archive.modules.writer.Kw3Constants.CONTENT_MD5_KEY;
import static org.archive.modules.writer.Kw3Constants.HARVESTER_KEY;
import static org.archive.modules.writer.Kw3Constants.HEADER_LENGTH_KEY;
import static org.archive.modules.writer.Kw3Constants.HEADER_MD5_KEY;
import static org.archive.modules.writer.Kw3Constants.IP_ADDRESS_KEY;
import static org.archive.modules.writer.Kw3Constants.STATUS_CODE_KEY;
import static org.archive.modules.writer.Kw3Constants.URL_KEY;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.io.ReplayInputStream;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.spring.ConfigPath;
import org.archive.util.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;
/**
* Processor module that writes the results of successful fetches to
* files on disk. These files are MIME-files of the type used by the
* Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/].
*
* Each URI gets written to its own file and has a path consisting of:
*
* - A dir named with the first two chars of the website's md5.
* - A dir named after the website.
* - 'current' - a dir indicating that this is the directory being written
* to by the ongoing crawl.
* - A file named <md5 of url>.<fetchtime in seconds>
*
* Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
*
* The MIME-file itself consists of three parts:
*
* - 1. ArchiveInfo - Metadata about the file and its content.
* - 2. Header - The HTTP response header.
* - 3. Content - The HTTP response content, plus content-type.
*
*
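* A minimal programmatic configuration sketch (illustrative values; in a
* running Heritrix crawl these properties are normally set through the Spring
* bean configuration, and the ServerCache is autowired by the container):
*
* <pre>{@code
* Kw3WriterProcessor writer = new Kw3WriterProcessor();
* writer.setPath(new ConfigPath("Kw3Writer subdirectory", "arcs"));
* writer.setCollection("kw3");
* writer.setHarvester("heritrix");
* writer.setMaxFileSizeBytes(100000000L);
* writer.setServerCache(serverCache); // assumed to come from the crawl context
* }</pre>
*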
* @author oskar
*/
public class Kw3WriterProcessor extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
private static final String COLON = ":";
private static final String WS = " ";
private static final String LF = "\n";
/**
* Logger.
*/
private static final Logger logger =
Logger.getLogger(Kw3WriterProcessor.class.getName());
/**
* Top-level directory for archive files.
*/
protected ConfigPath path = new ConfigPath("Kw3Writer subdirectory","arcs");
public ConfigPath getPath() {
return this.path;
}
public void setPath(ConfigPath s) {
this.path = s;
}
/**
* Maximum size, in bytes, for each file.
*/
protected long maxFileSizeBytes = 100000000L;
public long getMaxFileSizeBytes() {
return maxFileSizeBytes;
}
public void setMaxFileSizeBytes(long maxFileSizeBytes) {
this.maxFileSizeBytes = maxFileSizeBytes;
}
/**
* Whether permissions should be changed for newly created directories.
*/
protected boolean chmod = false;
public boolean getChmod() {
return chmod;
}
public void setChmod(boolean chmod) {
this.chmod = chmod;
}
/**
* The permissions to set, given as three octal digits as for the UNIX
* 'chmod' command, e.g. 777 for all permissions to everyone.
*/
protected String chmodValue = "777";
public String getChmodValue() {
return this.chmodValue;
}
public void setChmodValue(String s) {
this.chmodValue = s;
}
/**
* Key for the maximum ARC bytes to write attribute.
*/
public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write";
/**
* Name of collection.
*/
protected String collection = "kw3";
public String getCollection() {
return this.collection;
}
public void setCollection(String s) {
this.collection = s;
}
/**
* Name of the harvester that is used for the web harvesting.
*/
protected String harvester = "heritrix";
public String getHarvester() {
return this.harvester;
}
public void setHarvester(String s) {
this.harvester = s;
}
/**
* The server cache to use.
*/
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
private static final String BOUNDARY_START = "KulturArw3_";
/**
* Constructor.
*/
public Kw3WriterProcessor() {
}
protected boolean shouldProcess(CrawlURI curi) {
// Only successful fetches are written.
if (!curi.isSuccess()) {
return false;
}
// Only http and https schemes are supported.
String scheme = curi.getUURI().getScheme().toLowerCase();
if (!"http".equalsIgnoreCase(scheme)
&& !"https".equalsIgnoreCase(scheme)) {
return false;
}
return true;
}
protected void innerProcess(CrawlURI curi) {
// Write the MIME-file
try {
writeMimeFile(curi);
} catch (IOException e) {
logger.log(Level.WARNING, "i/o error", e);
}
}
/**
* The actual writing of the Kulturarw3 MIME-file.
*
* The MIME-file consists of three parts:
* 1. ArchiveInfo - Metadata about the file and its content.
* 2. Header - The HTTP response header.
* 3. Content - The HTTP response content, plus content-type.
*
* For more on this format, see '?'.
*/
protected void writeMimeFile(CrawlURI curi) throws IOException {
ReplayInputStream ris = null;
OutputStream out = null;
try {
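// Multipart boundary for this record: the fixed prefix plus the MD5 of the URI.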
String boundary = BOUNDARY_START + stringToMD5(curi.toString());
ris = curi.getRecorder().getRecordedInput().
getReplayInputStream();
out = initOutputStream(curi);
// Part 1: Archive info
writeArchiveInfoPart(boundary, curi, ris, out);
// Part 2: Header info + HTTP header
writeHeaderPart(boundary, ris, out);
// Part 3: Content info + HTTP content
writeContentPart(boundary, curi, ris, out);
// And finally the terminator string
String terminator = "\n--" + boundary + "--\n";
out.write(terminator.getBytes());
} finally {
if (ris != null)
ris.close();
if (out != null)
out.close();
}
}
/**
* Get the OutputStream for the file to write to.
*
* It has a path consisting of:
* 1. A dir named with the first two chars of the website's md5.
* 2. A dir named after the website.
* 3. 'current' - a dir indicating that this is the directory being written
* to by the ongoing crawl.
* 4. A file named <md5 of url>.<fetchtime in seconds>
*
* Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
*/
protected OutputStream initOutputStream(CrawlURI curi) throws IOException {
String uri = curi.toString();
int port = curi.getUURI().getPort();
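// Omit the port from the directory name for the default HTTP port (80) or when no port was given.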
String host = (port == 80 || port <= 0) ?
curi.getUURI().getHost() : curi.getUURI().getHost() + ":" + port;
long fetchTime = curi.getFetchBeginTime() / 1000;
String md5 = stringToMD5(host);
File dir = new File(getPath().getFile(), md5.substring(0, 2) + "/" + host +
"/current");
if (!dir.exists()) {
FileUtils.ensureWriteableDirectory(dir);
if (this.chmod)
chmods(dir, getPath().getFile());
}
md5 = stringToMD5(uri);
File arcFile = new File(dir, md5 + "." + fetchTime);
return new FastBufferedOutputStream(new FileOutputStream(arcFile));
}
protected void writeArchiveInfoPart(String boundary, CrawlURI curi,
ReplayInputStream ris, OutputStream out)
throws IOException {
// Get things we need to write in this part
String uri = curi.toString();
String ip = getHostAddress(curi);
long headerLength = ris.getHeaderSize();
long contentLength = ris.getContentSize();
long archiveTime = System.currentTimeMillis() / 1000; // Archive time in seconds since the epoch
int statusCode = curi.getFetchStatus();
String headerMd5 = null;
Object contentMd5 = null;
// Get headerMd5
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ris.readHeaderTo(baos);
headerMd5 = stringToMD5(baos.toString());
// Get contentMd5
contentMd5 = curi.getContentDigest();
if (contentMd5 != null)
contentMd5 = getHexString((byte[]) contentMd5);
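// Assemble the ArchiveInfo part: key/value metadata lines terminated by a blank line.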
StringBuffer buffer = new StringBuffer();
buffer.append("MIME-version: 1.1" + LF);
buffer.append("Content-Type: multipart/mixed; boundary=" + boundary + LF);
buffer.append("HTTP-Part: ArchiveInfo" + LF);
buffer.append(COLLECTION_KEY + COLON + WS + this.collection + LF);
buffer.append(HARVESTER_KEY + COLON + WS + this.harvester + LF);
buffer.append(URL_KEY + COLON + WS + uri + LF);
buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF);
buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength + LF);
buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF);
buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength + LF);
buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF);
buffer.append(ARCHIVE_TIME_KEY + COLON + WS + archiveTime + LF);
buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF + LF);
out.write(buffer.toString().getBytes());
}
protected void writeHeaderPart(String boundary, ReplayInputStream ris,
OutputStream out)
throws IOException {
StringBuffer buffer = new StringBuffer();
buffer.append("--" + boundary + LF);
buffer.append("Content-Type: text/plain; charset=\"US-ascii\"" + LF);
buffer.append("HTTP-Part: Header" + LF + LF );
out.write(buffer.toString().getBytes());
ris.readHeaderTo(out);
}
protected void writeContentPart(String boundary, CrawlURI curi,
ReplayInputStream ris, OutputStream out)
throws IOException {
// Get things we need to write in this part
String uri = curi.toString();
String contentType = curi.getContentType();
long contentLength = ris.getContentSize();
// Only write content if there is some
if (contentLength == 0) return;
StringBuffer buffer = new StringBuffer();
buffer.append("--" + boundary + LF);
buffer.append("Content-Type: " + contentType + LF);
buffer.append("HTTP-Part: Content" + LF + LF);
out.write(buffer.toString().getBytes());
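// Write the response body, truncating it at the configured maximum file size.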
if (contentLength > getMaxFileSizeBytes()) {
ris.readContentTo(out, getMaxFileSizeBytes());
logger.info(" Truncated url: " + uri + ", Size: " + contentLength +
", Content-type: " + contentType);
} else {
ris.readContentTo(out);
}
}
// --- Private helper functions --- //
/*
* Compute the MD5 checksum of a String.
*/
private String stringToMD5(String str) {
try {
byte b[] = str.getBytes();
MessageDigest md = MessageDigest.getInstance("MD5");
md.update(b);
byte[] digest = md.digest();
return getHexString(digest);
} catch (NoSuchAlgorithmException e) {
logger.log(Level.WARNING, "md5 error", e);
}
return null;
}
/*
* Convert a byte array to a hex string, zero-padding each byte to two hex digits.
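* Example: {0x0f, 0xa3} becomes "0fa3".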
*/
private String getHexString(byte[] b) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < b.length; i++) {
String tmp = Integer.toHexString(b[i] & 0xff);
if (tmp.length() < 2)
sb.append("0" + tmp);
else
sb.append(tmp);
}
return sb.toString();
}
/*
* Chmods for all newly created directories.
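* Applies the configured permissions to 'dir' and to each parent directory
* up to, but not including, the top-level archive directory.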
*/
private void chmods(File dir, File arcsDir) {
String topdir = arcsDir.getAbsolutePath();
chmod(dir, this.chmodValue);
File parent = dir.getParentFile();
while (!parent.getAbsolutePath().equalsIgnoreCase(topdir)) {
chmod(parent, this.chmodValue);
parent = parent.getParentFile();
}
}
/*
* Chmod for a specific file or directory.
*/
private void chmod(File file, String permissions) {
Process proc = null;
try {
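// Shell out to the platform's 'chmod' command; this only works on Unix-like systems where 'chmod' is available.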
proc = Runtime.getRuntime().exec("chmod " + permissions + " " +
file.getAbsolutePath());
proc.waitFor();
proc.getInputStream().close();
proc.getOutputStream().close();
proc.getErrorStream().close();
} catch (IOException e) {
logger.log(Level.WARNING, "chmod failed", e);
} catch (InterruptedException e) {
logger.log(Level.WARNING, "chmod failed", e);
}
}
private String getHostAddress(CrawlURI curi) {
CrawlHost h = serverCache.getHostFor(curi.getUURI());
if (h == null) {
throw new NullPointerException("Crawlhost is null for " + curi + " " +
curi.getVia());
}
InetAddress a = h.getIP();
if (a == null) {
throw new NullPointerException("Address is null for " + curi + " " +
curi.getVia() + ". Address " +
((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ?
"was never looked up." :
(System.currentTimeMillis() - h.getIpFetched()) + " ms ago."));
}
return a.getHostAddress();
}
}