org.archive.crawler.util.RecoveryLogMapper

/* RecoveryLogMapper
 *
 * $Id$
 *
 * Created on Mar 9, 2005
 *
 * Copyright (C) 2005 Mike Schwartz.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.archive.crawler.util;

import org.archive.crawler.frontier.FrontierJournal;
import org.archive.util.ArchiveUtils;

import java.io.File;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.io.FileOutputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Parses a Heritrix recovery log file (recover.gz), and builds maps
 * that allow a caller to look up any seed URL and get back an Iterator of all
 * URLs successfully crawled from given seed.
 *
 * Also allows lookup on any crawled
 * URL to find the seed URL from which the crawler reached that URL (through 1
 * or more discovered URL hops, which are collapsed in this lookup).
 *
 *
 * <p>This code creates some fairly large collections (proportionate in size
 * to the number of discovered URLs), so make sure you allocate it a large
 * heap to work in. It also takes a while to process a recover log.</p>
 *
 * <p>See the {@link #main(String[])} method at the end for test/demo
 * code.</p>
 *
 * @author Mike Schwartz, schwartz at CodeOnTheRoad dot com
 */
public class RecoveryLogMapper {
    private static final char LOG_LINE_START_CHAR =
            FrontierJournal.F_ADD.charAt(0);

    private static final Logger logger =
            Logger.getLogger(RecoveryLogMapper.class.getName());

    private PrintWriter seedNotFoundPrintWriter = null;

    /**
     * Tracks seed for each crawled URL.
     */
    private Map<String, String> crawledUrlToSeedMap =
            new HashMap<String, String>();

    /**
     * Maps seed URLs to Set of discovered URLs.
     */
    private Map<String, Set<String>> seedUrlToDiscoveredUrlsMap =
            new HashMap<String, Set<String>>();

    /**
     * Tracks which URLs were successfully crawled.
     */
    private Set<String> successfullyCrawledUrls = new HashSet<String>();

    /**
     * Normal constructor - if a not-found seed is encountered while loading
     * recoverLogFileName, throws SeedUrlNotFoundException. Use
     * {@link #RecoveryLogMapper(String, String)} if you want to just log
     * such cases and keep going. (Those should not happen if the recover
     * log is written correctly, but we see them in practice.)
     *
     * @param recoverLogFileName
     * @throws java.io.FileNotFoundException
     * @throws java.io.IOException
     * @throws SeedUrlNotFoundException
     */
    public RecoveryLogMapper(String recoverLogFileName)
            throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        load(recoverLogFileName);
    }

    /**
     * Constructor to use if you want to allow not-found seeds, logging them
     * to seedNotFoundLogFileName. In contrast,
     * {@link #RecoveryLogMapper(String)} will throw SeedUrlNotFoundException
     * when a seed isn't found.
     *
     * @param recoverLogFileName
     * @param seedNotFoundLogFileName
     */
    public RecoveryLogMapper(String recoverLogFileName,
            String seedNotFoundLogFileName)
            throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        seedNotFoundPrintWriter = new PrintWriter(
                new FileOutputStream(seedNotFoundLogFileName));
        load(recoverLogFileName);
    }

    protected void load(String recoverLogFileName)
            throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        LineNumberReader reader = new LineNumberReader(
                ArchiveUtils.getBufferedReader(new File(recoverLogFileName)));
        String curLine = null;
        while ((curLine = reader.readLine()) != null) {
            if (curLine.length() == 0
                    || curLine.charAt(0) != LOG_LINE_START_CHAR) {
                continue;
            }
            String args[] = curLine.split("\\s+");
            int curLineNumWords = args.length;
            String firstUrl = args[1];
            // Ignore DNS log entries
            if (firstUrl.startsWith("dns:")) {
                continue;
            }
            if (curLine.startsWith(FrontierJournal.F_ADD)) {
                // Seed URL
                if (curLineNumWords == 2) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 2 words --> seed URL ("
                                + firstUrl + ")");
                    }
                    // Add seed the first time we find it
                    if (seedUrlToDiscoveredUrlsMap.get(firstUrl) == null) {
                        seedUrlToDiscoveredUrlsMap.put(firstUrl,
                                new HashSet<String>());
                    }
                } else {
                    // URL found via an earlier seeded / discovered URL.
                    // Look for the seed from which firstUrl came, so we can
                    // collapse firstUrl back to it.
                    String viaUrl = args[curLineNumWords - 1];
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 3+ words --> new URL "
                                + firstUrl + " via URL " + viaUrl);
                    }
                    String seedForFirstUrl = crawledUrlToSeedMap.get(viaUrl);
                    if (seedForFirstUrl == null) {
                        // viaUrl is itself a seed URL
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL is a seed");
                        }
                        crawledUrlToSeedMap.put(firstUrl, viaUrl);
                        seedForFirstUrl = viaUrl;
                    } else {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL discovered via seed URL "
                                    + seedForFirstUrl);
                        }
                        // Collapse firstUrl back to the seed
                        crawledUrlToSeedMap.put(firstUrl, seedForFirstUrl);
                    }
                    Set<String> theSeedUrlList =
                            seedUrlToDiscoveredUrlsMap.get(seedForFirstUrl);
                    if (theSeedUrlList == null) {
                        String message = "recover log " + recoverLogFileName
                                + " at line " + reader.getLineNumber()
                                + " listed F+ URL (" + viaUrl
                                + ") for which found no seed list.";
                        if (seedNotFoundPrintWriter != null) {
                            seedNotFoundPrintWriter.println(message);
                        } else {
                            throw new SeedUrlNotFoundException(message);
                        }
                    } else {
                        theSeedUrlList.add(firstUrl);
                    }
                }
            } else if (curLine.startsWith(FrontierJournal.F_SUCCESS)) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("F_SUCCESS for URL " + firstUrl);
                }
                successfullyCrawledUrls.add(firstUrl);
            }
        }
        reader.close();
        if (seedNotFoundPrintWriter != null) {
            seedNotFoundPrintWriter.close();
        }
    }

    /**
     * Returns seed for urlString (null if seed not found).
     *
     * @param urlString
     * @return Seed.
     */
    public String getSeedForUrl(String urlString) {
        return (seedUrlToDiscoveredUrlsMap.get(urlString) != null)
                ? urlString
                : crawledUrlToSeedMap.get(urlString);
    }

    /**
     * @return Returns the seedUrlToDiscoveredUrlsMap.
     */
    public Map<String, Set<String>> getSeedUrlToDiscoveredUrlsMap() {
        return this.seedUrlToDiscoveredUrlsMap;
    }

    /**
     * @return Returns the successfullyCrawledUrls.
     */
    public Set<String> getSuccessfullyCrawledUrls() {
        return this.successfullyCrawledUrls;
    }

    /**
     * @return Returns the logger.
     */
    public static Logger getLogger() {
        return logger;
    }

    private class SuccessfullyCrawledURLsIterator
            implements Iterator<String> {
        private String nextValue = null;
        private Iterator<String> discoveredUrlsIterator;

        public SuccessfullyCrawledURLsIterator(String seedUrlString)
                throws SeedUrlNotFoundException {
            Set<String> discoveredUrlList =
                    getSeedUrlToDiscoveredUrlsMap().get(seedUrlString);
            if (discoveredUrlList == null) {
                throw new SeedUrlNotFoundException("Seed URL "
                        + seedUrlString + " not found in seed list");
            }
            discoveredUrlsIterator = discoveredUrlList.iterator();
        }

        /**
         * Idempotent method (because of null check on nextValue).
         */
        private void populateNextValue() {
            while (nextValue == null && discoveredUrlsIterator.hasNext()) {
                String curDiscoveredUrl = discoveredUrlsIterator.next();
                boolean succCrawled = getSuccessfullyCrawledUrls()
                        .contains(curDiscoveredUrl);
                if (getLogger().isLoggable(Level.FINE)) {
                    getLogger().fine("populateNextValue: curDiscoveredUrl="
                            + curDiscoveredUrl + ", succCrawled="
                            + succCrawled);
                }
                if (succCrawled) {
                    nextValue = curDiscoveredUrl;
                }
            }
        }

        public boolean hasNext() {
            populateNextValue();
            return (nextValue != null);
        }

        public String next() {
            populateNextValue();
            if (nextValue == null) {
                // Iterator contract: signal exhaustion rather than
                // returning null
                throw new NoSuchElementException();
            }
            String returnValue = nextValue;
            nextValue = null;
            return returnValue;
        }

        /**
         * Remove operation is unsupported in this Iterator (will throw
         * UnsupportedOperationException if called).
         */
        public void remove() {
            throw new UnsupportedOperationException(
                    "SuccessfullyCrawledURLsIterator.remove: not supported.");
        }
    }

    public Iterator<String> getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
            String seedUrlString) throws SeedUrlNotFoundException {
        return new SuccessfullyCrawledURLsIterator(seedUrlString);
    }

    public Collection<String> getSeedCollection() {
        return seedUrlToDiscoveredUrlsMap.keySet();
    }

    public static void main(String args[]) {
        if (args.length < 1) {
            System.out.println("Usage: RecoveryLogMapper recoverLogFileName");
            Runtime.getRuntime().exit(-1);
        }
        String recoverLogFileName = args[0];
        try {
            RecoveryLogMapper myRecoveryLogMapper =
                    new RecoveryLogMapper(recoverLogFileName);
            for (String curSeedUrl : myRecoveryLogMapper.getSeedCollection()) {
                System.out.println("URLs successfully crawled from seed URL "
                        + curSeedUrl);
                Iterator<String> iteratorOfUrlsCrawledFromSeedUrl =
                        myRecoveryLogMapper
                        .getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
                                curSeedUrl);
                while (iteratorOfUrlsCrawledFromSeedUrl.hasNext()) {
                    String curCrawledUrlString =
                            iteratorOfUrlsCrawledFromSeedUrl.next();
                    System.out.println(" -> " + curCrawledUrlString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
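
For callers embedding the class rather than running its main() demo, the sketch below shows the intended lookup pattern: construct with the tolerant two-argument constructor, collapse a crawled URL back to its seed with getSeedForUrl, then walk the successfully crawled URLs for one seed. This is a minimal sketch, not part of the published source: the demo class name, file paths, and example URLs are hypothetical, the shapes of the journal lines shown in the comments are inferred from the parsing code in load() rather than quoted from Heritrix output, and SeedUrlNotFoundException (the companion class created alongside this one) is assumed to be on the classpath.

package org.archive.crawler.util;

import java.util.Iterator;

/** Hypothetical usage sketch for RecoveryLogMapper. */
public class RecoveryLogMapperDemo {
    public static void main(String[] args) throws Exception {
        // Per load(), the journal lines consumed look roughly like:
        //   F+ http://example.com/            (2 words: a seed URL)
        //   F+ http://example.com/a ... via   (3+ words: last word is the
        //                                      URL it was discovered via)
        //   Fs http://example.com/a           (successful fetch)

        // Tolerant constructor: unknown seeds are logged, not fatal.
        // Both paths here are hypothetical.
        RecoveryLogMapper mapper = new RecoveryLogMapper(
                "jobs/my-crawl/logs/recover.gz",
                "seed-not-found.log");

        // Collapse any crawled URL back to the seed it was reached from
        String seed = mapper.getSeedForUrl("http://example.com/some/page");
        System.out.println("reached from seed: " + seed);

        // Walk the successfully crawled URLs for one seed
        Iterator<String> it = mapper
                .getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
                        "http://example.com/");
        while (it.hasNext()) {
            System.out.println(it.next());
        }
    }
}

Note that getSeedForUrl returns its argument unchanged when the argument is itself a seed, so the lookup above is safe to apply to seed and non-seed URLs alike.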




