// org.archive.crawler.processor.CrawlMapper
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.processor;
import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_CUSTOM_PROCESSOR;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Logger;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.fingerprint.ArrayLongFPCache;
import org.springframework.context.Lifecycle;
import st.ata.util.FPGenerator;
/**
* A simple crawl splitter/mapper, dividing up CrawlURIs
* between crawlers by diverting some range of URIs to local log files
* (which can then be imported to other crawlers).
*
* May operate on a CrawlURI (typically early in the processing chain) or
* its CrawlURI outlinks (late in the processing chain, after
* LinksScoper), or both (if inserted and configured in both places).
*
* Applies a map() method, supplied by a concrete subclass, to
* classKeys to map URIs to crawlers by name.
*
*
* <p>One crawler name is distinguished as the 'local name'; URIs mapped to
* this name are not diverted, but continue to be processed normally.
*
* <p>If using the JMX importUris operation to import URLs dropped by
* a {@link CrawlMapper} instance, use <code>recoveryLog</code> style.
*
* @author gojomo
* @version $Date$, $Revision$
*/
public abstract class CrawlMapper extends Processor implements Lifecycle {

    private static final Logger LOGGER =
        Logger.getLogger(CrawlMapper.class.getName());

    /**
     * PrintWriter which remembers the File to which it writes.
     *
     * <p>Declared static because it needs nothing from the enclosing
     * CrawlMapper instance.
     */
    private static final class FilePrintWriter extends PrintWriter {
        private final File file;

        /**
         * Opens a buffered writer on the given file.
         *
         * @param file file to write to
         * @throws FileNotFoundException if the file cannot be opened for
         *         writing
         */
        public FilePrintWriter(File file) throws FileNotFoundException {
            super(new BufferedOutputStream(new FileOutputStream(file)));
            this.file = file;
        }

        /**
         * @return the File this writer was opened on
         */
        public File getFile() {
            return file;
        }
    }

    /**
     * Whether to apply the mapping to a URI being processed itself, for
     * example early in processing (while its status is still 'unattempted').
     */
    protected boolean checkUri = true;

    public boolean getCheckUri() {
        return this.checkUri;
    }

    public void setCheckUri(boolean check) {
        this.checkUri = check;
    }

    /**
     * Whether to apply the mapping to discovered outlinks, for example after
     * extraction has occurred.
     */
    protected boolean checkOutlinks = true;

    public boolean getCheckOutlinks() {
        return this.checkOutlinks;
    }

    public void setCheckOutlinks(boolean check) {
        this.checkOutlinks = check;
    }

    /**
     * Decide rules to determine if an outlink is subject to mapping.
     * Defaults to a rule instance of {@link AcceptDecideRule}.
     */
    protected DecideRule outlinkRule = new AcceptDecideRule();

    public DecideRule getOutlinkRule() {
        return this.outlinkRule;
    }

    public void setOutlinkRule(DecideRule rule) {
        this.outlinkRule = rule;
    }

    /**
     * Name of local crawler node; mappings to this name result in normal
     * processing (no diversion).
     */
    protected String localName = ".";

    public String getLocalName() {
        return this.localName;
    }

    public void setLocalName(String name) {
        this.localName = name;
    }

    /**
     * Directory to write diversion logs.
     */
    protected ConfigPath diversionDir =
        new ConfigPath("diverted URIs subdirectory","diversions");

    public ConfigPath getDiversionDir() {
        return this.diversionDir;
    }

    public void setDiversionDir(ConfigPath path) {
        this.diversionDir = path;
    }

    /**
     * Number of timestamp digits to use as prefix of log names (grouping all
     * diversions from that period in a single log). Default is 10 (hourly log
     * rotation). Values outside the range of the timestamp's length are
     * clamped to that range when the generation prefix is computed.
     */
    protected int rotationDigits = 10;

    public int getRotationDigits() {
        return this.rotationDigits;
    }

    public void setRotationDigits(int digits) {
        this.rotationDigits = digits;
    }

    /**
     * Mapping of target crawlers to logs (PrintWriters). Accessed while
     * holding this object's monitor.
     */
    protected HashMap<String,PrintWriter> diversionLogs =
        new HashMap<String,PrintWriter>();

    /**
     * Truncated timestamp prefix for diversion logs; when
     * current time doesn't match, it's time to close all
     * current logs.
     */
    protected String logGeneration = "";

    /**
     * Cache of fingerprints of recently diverted URIs, used to suppress
     * repeated log entries. Non-null only between start() and stop().
     */
    protected ArrayLongFPCache cache;

    /**
     * Constructor.
     */
    public CrawlMapper() {
        super();
    }

    /**
     * Every URI is considered by this processor.
     *
     * @param puri URI offered for processing
     * @return always true
     */
    @Override
    protected boolean shouldProcess(CrawlURI puri) {
        return true;
    }

    /**
     * Not supported; this class does its work in
     * {@link #innerProcessResult(CrawlURI)}.
     *
     * @param puri ignored
     * @throws AssertionError always
     */
    @Override
    protected void innerProcess(CrawlURI puri) {
        throw new AssertionError();
    }

    /**
     * Applies the mapping to the URI itself and/or to its outlinks.
     *
     * <p>If checkUri is enabled and the URI's fetch status is not positive,
     * a URI mapped to a non-local node is marked
     * S_BLOCKED_BY_CUSTOM_PROCESSOR, annotated with "to:" plus the target
     * name, written to the target's diversion log, and processing ends with
     * {@link ProcessResult#FINISH}. Otherwise, if checkOutlinks is enabled,
     * each outlink not rejected by the outlink rule and mapped to a
     * non-local node is removed from the outlink collection and written to
     * the target's diversion log.
     *
     * @param puri URI being processed
     * @return FINISH if the URI itself was diverted, PROCEED otherwise
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        String nowGeneration = currentGeneration();
        if (!nowGeneration.equals(logGeneration)) {
            updateGeneration(nowGeneration);
        }

        if (puri.getFetchStatus() <= 0 // unfetched/unsuccessful
                && getCheckUri()) {
            // apply mapping to the CrawlURI itself
            String target = map(puri);
            if (!localName.equals(target)) {
                // CrawlURI is mapped to somewhere other than here
                puri.setFetchStatus(S_BLOCKED_BY_CUSTOM_PROCESSOR);
                puri.getAnnotations().add("to:"+target);
                divertLog(puri,target);
                return ProcessResult.FINISH;
            }
            // localName means keep locally; fall through
        }

        if (getCheckOutlinks()) {
            // consider outlinks for mapping; an explicit iterator is used
            // so that diverted outlinks can be removed during traversal
            Iterator<CrawlURI> iter = puri.getOutLinks().iterator();
            while (iter.hasNext()) {
                CrawlURI cauri = iter.next();
                if (!decideToMapOutlink(cauri)) {
                    continue;
                }
                String target = map(cauri);
                if (!localName.equals(target)) {
                    // outlink is mapped to somewhere other than here
                    iter.remove();
                    divertLog(cauri,target);
                }
                // localName means keep locally; do nothing
            }
        }
        return ProcessResult.PROCEED;
    }

    /**
     * Compute the generation prefix for the current time: the leading
     * rotationDigits characters of the 14-digit timestamp.
     *
     * @return timestamp prefix; the digit count is clamped to the
     *         timestamp's length so a misconfigured rotationDigits cannot
     *         make substring() throw on every URI
     */
    private String currentGeneration() {
        String stamp = ArchiveUtils.get14DigitDate();
        int digits = Math.max(0, Math.min(getRotationDigits(), stamp.length()));
        return stamp.substring(0, digits);
    }

    /**
     * Determine whether the outlink rule permits mapping of an outlink.
     *
     * @param cauri outlink to consider
     * @return true unless the outlink rule's decision is REJECT
     */
    protected boolean decideToMapOutlink(CrawlURI cauri) {
        DecideRule rule = getOutlinkRule();
        boolean rejected = rule.decisionFor(cauri)
                .equals(DecideResult.REJECT);
        return !rejected;
    }

    /**
     * Close and mark as finished all existing diversion logs, and
     * arrange for new logs to use the new generation prefix.
     *
     * @param nowGeneration new generation (timestamp prefix) to use
     */
    protected synchronized void updateGeneration(String nowGeneration) {
        // The caller compares generations without holding the monitor, so
        // another thread may already have rotated to this generation;
        // rotating again would close logs just opened for it.
        if (nowGeneration != null && nowGeneration.equals(logGeneration)) {
            return;
        }
        // all existing logs are of a previous generation
        closeDiversionLogs();
        logGeneration = nowGeneration;
    }

    /**
     * Close every open diversion log, rename each from its ".open" name to
     * the corresponding ".divert" name, and forget them. Failures are
     * logged as warnings rather than thrown.
     */
    private synchronized void closeDiversionLogs() {
        for (PrintWriter log : diversionLogs.values()) {
            log.close();
            // PrintWriter swallows IOExceptions; surface them here
            boolean writeTrouble = log.checkError();
            if (!(log instanceof FilePrintWriter)) {
                if (writeTrouble) {
                    LOGGER.warning("error writing a diversion log");
                }
                continue;
            }
            File openFile = ((FilePrintWriter) log).getFile();
            if (writeTrouble) {
                LOGGER.warning("error writing diversion log " + openFile);
            }
            File finishedFile = new File(openFile.getAbsolutePath()
                    .replaceFirst("\\.open$", ".divert"));
            if (!openFile.renameTo(finishedFile)) {
                LOGGER.warning("unable to rename diversion log " + openFile
                        + " to " + finishedFile);
            }
        }
        diversionLogs.clear();
    }

    /**
     * Look up the crawler node name to which the given CrawlURI
     * should be mapped.
     *
     * @param cauri CrawlURI to consider
     * @return String node name which should handle URI
     */
    protected abstract String map(CrawlURI cauri);

    /**
     * Note the given CrawlURI in the appropriate diversion log, unless it
     * was recently seen. Each entry is the URI's class key, a space, and
     * its short report line.
     *
     * @param cauri CrawlURI to append to a diversion log
     * @param target String node name (log name) to receive URI
     */
    protected synchronized void divertLog(CrawlURI cauri, String target) {
        if (recentlySeen(cauri)) {
            return;
        }
        PrintWriter diversionLog = getDiversionLog(target);
        diversionLog.print(cauri.getClassKey());
        diversionLog.print(" ");
        cauri.shortReportLineTo(diversionLog);
        diversionLog.println();
    }

    /**
     * Consult the cache to determine if the given URI
     * has been recently seen -- entering it if not.
     *
     * @param cauri CrawlURI to test
     * @return true if URI was already in the cache; false otherwise
     */
    private boolean recentlySeen(CrawlURI cauri) {
        long fp = FPGenerator.std64.fp(cauri.toString());
        return !cache.add(fp);
    }

    /**
     * Get the diversion log for a given target crawler node, opening it
     * (and creating the diversion directory) if necessary. Within this
     * class it is called from {@link #divertLog(CrawlURI, String)} while
     * this object's monitor is held.
     *
     * @param target crawler node name of requested log
     * @return PrintWriter open on an appropriately-named
     * log file
     * @throws RuntimeException if the log file cannot be opened
     */
    protected PrintWriter getDiversionLog(String target) {
        PrintWriter writer = diversionLogs.get(target);
        if (writer == null) {
            File divertDir = getDiversionDir().getFile();
            // a failed mkdirs() surfaces below as FileNotFoundException
            divertDir.mkdirs();
            File divertLog =
                new File(divertDir,
                        logGeneration+"-"+localName+"-to-"+target+".open");
            try {
                writer = new FilePrintWriter(divertLog);
            } catch (FileNotFoundException e) {
                throw new RuntimeException(
                        "unable to open diversion log " + divertLog, e);
            }
            diversionLogs.put(target,writer);
        }
        return writer;
    }

    /**
     * Allocate the recently-seen cache, if not already running.
     */
    @Override
    public void start() {
        if (isRunning()) {
            return;
        }
        cache = new ArrayLongFPCache();
    }

    /**
     * @return true while the recently-seen cache is allocated
     */
    @Override
    public boolean isRunning() {
        return cache != null;
    }

    /**
     * Close and mark as finished any open diversion logs, then release the
     * recently-seen cache. The logs are buffered, so without the close
     * their final entries would never reach disk and the files would stay
     * named ".open".
     */
    @Override
    public void stop() {
        // XXX this happens at finish; move to teardown?
        closeDiversionLogs();
        cache = null;
    }
}