All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.crawler.processor.CrawlMapper Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.processor;

import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_CUSTOM_PROCESSOR;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;

import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.fingerprint.ArrayLongFPCache;
import org.springframework.context.Lifecycle;

import st.ata.util.FPGenerator;

/**
 * A simple crawl splitter/mapper, dividing up CrawlURIs/CrawlURIs
 * between crawlers by diverting some range of URIs to local log files
 * (which can then be imported to other crawlers). 
 * 
 * May operate on a CrawlURI (typically early in the processing chain) or
 * its CrawlURI outlinks (late in the processing chain, after 
 * LinksScoper), or both (if inserted and configured in both places). 
 * 
 * 

Applies a map() method, supplied by a concrete subclass, to * classKeys to map URIs to crawlers by name. * *

One crawler name is distinguished as the 'local name'; URIs mapped to * this name are not diverted, but continue to be processed normally. * *

If using the JMX importUris operation importing URLs dropped by * a {@link CrawlMapper} instance, use recoveryLog style. * * @author gojomo * @version $Date$, $Revision$ */ public abstract class CrawlMapper extends Processor implements Lifecycle { /** * PrintWriter which remembers the File to which it writes. */ private class FilePrintWriter extends PrintWriter { File file; public FilePrintWriter(File file) throws FileNotFoundException { super(new BufferedOutputStream(new FileOutputStream(file))); this.file = file; } public File getFile() { return file; } } /** * Whether to apply the mapping to a URI being processed itself, for example * early in processing (while its status is still 'unattempted'). */ protected boolean checkUri = true; public boolean getCheckUri() { return this.checkUri; } public void setCheckUri(boolean check) { this.checkUri = check; } /** * Whether to apply the mapping to discovered outlinks, for example after * extraction has occurred. */ protected boolean checkOutlinks = true; public boolean getCheckOutlinks() { return this.checkOutlinks; } public void setCheckOutlinks(boolean check) { this.checkOutlinks = check; } /** * Decide rules to determine if an outlink is subject to mapping. */ protected DecideRule outlinkRule = new AcceptDecideRule(); public DecideRule getOutlinkRule() { return this.outlinkRule; } public void setOutlinkRule(DecideRule rule) { this.outlinkRule = rule; } /** * Name of local crawler node; mappings to this name result in normal * processing (no diversion). */ protected String localName = "."; public String getLocalName() { return this.localName; } public void setLocalName(String name) { this.localName = name; } /** * Directory to write diversion logs. */ protected ConfigPath diversionDir = new ConfigPath("diverted URIs subdirectory","diversions"); public ConfigPath getDiversionDir() { return this.diversionDir; } public void setDiversionDir(ConfigPath path) { this.diversionDir = path; } /** * Number of timestamp digits to use as prefix of log names (grouping all * diversions from that period in a single log). Default is 10 (hourly log * rotation). * */ protected int rotationDigits = 10; public int getRotationDigits() { return this.rotationDigits; } public void setRotationDigits(int digits) { this.rotationDigits = digits; } /** * Mapping of target crawlers to logs (PrintWriters) */ protected HashMap diversionLogs = new HashMap(); /** * Truncated timestamp prefix for diversion logs; when * current time doesn't match, it's time to close all * current logs. */ protected String logGeneration = ""; protected ArrayLongFPCache cache; /** * Constructor. */ public CrawlMapper() { super(); } @Override protected boolean shouldProcess(CrawlURI puri) { return true; } @Override protected void innerProcess(CrawlURI puri) { throw new AssertionError(); } @Override protected ProcessResult innerProcessResult(CrawlURI puri) { CrawlURI curi = (CrawlURI)puri; String nowGeneration = ArchiveUtils.get14DigitDate().substring( 0, getRotationDigits()); if(!nowGeneration.equals(logGeneration)) { updateGeneration(nowGeneration); } if (curi.getFetchStatus() <= 0 // unfetched/unsuccessful && getCheckUri()) { // apply mapping to the CrawlURI itself String target = map(curi); if(!localName.equals(target)) { // CrawlURI is mapped to somewhere other than here curi.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR); curi.getAnnotations().add("to:"+target); divertLog(curi,target); return ProcessResult.FINISH; } else { // localName means keep locally; do nothing } } if (getCheckOutlinks()) { // consider outlinks for mapping Iterator iter = curi.getOutLinks().iterator(); while(iter.hasNext()) { CrawlURI cauri = iter.next(); if (decideToMapOutlink(cauri)) { // apply mapping to the CrawlURI String target = map(cauri); if(!localName.equals(target)) { // CrawlURI is mapped to somewhere other than here iter.remove(); divertLog(cauri,target); } else { // localName means keep locally; do nothing } } } } return ProcessResult.PROCEED; } protected boolean decideToMapOutlink(CrawlURI cauri) { DecideRule rule = getOutlinkRule(); boolean rejected = rule.decisionFor(cauri) .equals(DecideResult.REJECT); return !rejected; } /** * Close and mark as finished all existing diversion logs, and * arrange for new logs to use the new generation prefix. * * @param nowGeneration new generation (timestamp prefix) to use */ protected synchronized void updateGeneration(String nowGeneration) { // all existing logs are of a previous generation Iterator iter = diversionLogs.values().iterator(); while(iter.hasNext()) { FilePrintWriter writer = (FilePrintWriter) iter.next(); writer.close(); writer.getFile().renameTo( new File(writer.getFile().getAbsolutePath() .replaceFirst("\\.open$", ".divert"))); } diversionLogs.clear(); logGeneration = nowGeneration; } /** * Look up the crawler node name to which the given CrawlURI * should be mapped. * * @param cauri CrawlURI to consider * @return String node name which should handle URI */ protected abstract String map(CrawlURI cauri); /** * Note the given CrawlURI in the appropriate diversion log. * * @param cauri CrawlURI to append to a diversion log * @param target String node name (log name) to receive URI */ protected synchronized void divertLog(CrawlURI cauri, String target) { if(recentlySeen(cauri)) { return; } PrintWriter diversionLog = getDiversionLog(target); diversionLog.print(cauri.getClassKey()); diversionLog.print(" "); cauri.shortReportLineTo(diversionLog); diversionLog.println(); } /** * Consult the cache to determine if the given URI * has been recently seen -- entering it if not. * * @param cauri CrawlURI to test * @return true if URI was already in the cache; false otherwise */ private boolean recentlySeen(CrawlURI cauri) { long fp = FPGenerator.std64.fp(cauri.toString()); return ! cache.add(fp); } /** * Get the diversion log for a given target crawler node node. * * @param target crawler node name of requested log * @return PrintWriter open on an appropriately-named * log file */ protected PrintWriter getDiversionLog(String target) { FilePrintWriter writer = (FilePrintWriter) diversionLogs.get(target); if(writer == null) { File divertDir = getDiversionDir().getFile(); divertDir.mkdirs(); File divertLog = new File(divertDir, logGeneration+"-"+localName+"-to-"+target+".open"); try { writer = new FilePrintWriter(divertLog); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); throw new RuntimeException(e); } diversionLogs.put(target,writer); } return writer; } public void start() { if(isRunning()) { return; } cache = new ArrayLongFPCache(); } public boolean isRunning() { return cache != null; } public void stop() { // XXX this happens at finish; move to teardown? cache = null; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy