// org.archive.crawler.frontier.BdbFrontier (latest published version)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.frontier;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.management.openmbean.CompositeData;
import org.apache.commons.collections.Closure;
import org.apache.commons.io.IOUtils;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.bdb.StoredQueue;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.Supplier;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;
/**
* A Frontier using several BerkeleyDB JE Databases to hold its record of
* known hosts (queues), and pending URIs.
*
* @author Gordon Mohr
*/
public class BdbFrontier extends WorkQueueFrontier
implements Checkpointable, BeanNameAware {
@SuppressWarnings("unused")
private static final long serialVersionUID = 1L;
private static final Logger logger =
Logger.getLogger(BdbFrontier.class.getName());
/**
* All 'inactive' queues, not yet in active rotation.
* Linked-list of keys for the queues.
*/
protected SortedMap> inactiveQueuesByPrecedence;
/**
* 'retired' queues, no longer considered for activation.
* Linked-list of keys for queues.
*/
protected StoredQueue retiredQueues;
/** all URIs scheduled to be crawled */
protected transient BdbMultipleWorkQueues pendingUris;
protected BdbModule bdb;
@Autowired
public void setBdbModule(BdbModule bdb) {
this.bdb = bdb;
}
protected String beanName;
public void setBeanName(String name) {
this.beanName = name;
}
protected boolean dumpPendingAtClose = false;
public boolean getDumpPendingAtClose() {
return dumpPendingAtClose;
}
public void setDumpPendingAtClose(boolean dumpPendingAtClose) {
this.dumpPendingAtClose = dumpPendingAtClose;
}
/* (non-Javadoc)
* @see org.archive.crawler.frontier.WorkQueueFrontier#getInactiveQueuesByPrecedence()
*/
@Override
protected SortedMap> getInactiveQueuesByPrecedence() {
return inactiveQueuesByPrecedence;
}
/* (non-Javadoc)
* @see org.archive.crawler.frontier.WorkQueueFrontier#getRetiredQueues()
*/
@Override
protected Queue getRetiredQueues() {
return retiredQueues;
}
/**
* Create the single object (within which is one BDB database)
* inside which all the other queues live.
*
* @return the created BdbMultipleWorkQueues
* @throws DatabaseException
*/
protected BdbMultipleWorkQueues createMultipleWorkQueues()
throws DatabaseException {
Database db;
boolean recycle = (recoveryCheckpoint != null);
BdbModule.BdbConfig dbConfig = new BdbModule.BdbConfig();
dbConfig.setAllowCreate(!recycle);
// Make database deferred write: URLs that are added then removed
// before a page-out is required need never cause disk IO.
db = bdb.openDatabase("pending", dbConfig, recycle);
return new BdbMultipleWorkQueues(db, bdb.getClassCatalog());
}
/**
* Return the work queue for the given classKey, or null
* if no such queue exists.
*
* @param classKey key to look for
* @return the found WorkQueue
*/
protected WorkQueue getQueueFor(final String classKey) {
WorkQueue wq = allQueues.getOrUse(
classKey,
new Supplier() {
public BdbWorkQueue get() {
String qKey = new String(classKey); // ensure private minimal key
BdbWorkQueue q = new BdbWorkQueue(qKey, BdbFrontier.this);
q.setTotalBudget(getQueueTotalBudget());
getQueuePrecedencePolicy().queueCreated(q);
return q;
}});
return wq;
}
@Override
public FrontierGroup getGroup(CrawlURI curi) {
return getQueueFor(curi.getClassKey());
}
/**
* Return list of urls.
* @param marker
* @param numberOfMatches
* @param verbose
* @return List of URIs (strings).
*/
public CompositeData getURIsList(String marker,
int numberOfMatches, String pattern, final boolean verbose) {
try {
Pattern p = Pattern.compile(pattern);
return pendingUris.getFrom(marker, numberOfMatches, p, verbose);
} catch (DatabaseException e) {
throw new IllegalStateException(e);
}
}
/* (non-Javadoc)
* @see org.archive.crawler.frontier.AbstractFrontier#finalTasks()
*/
@Override
protected void finalTasks() {
super.finalTasks();
// before closing/releasing, dump if requested
if (getDumpPendingAtClose()) {
try {
dumpAllPendingToLog();
} catch (Exception e) {
logger.log(Level.WARNING, "dump pending problem", e);
}
}
}
/* (non-Javadoc)
* @see org.archive.crawler.frontier.WorkQueueFrontier#close()
*/
@Override
public void close() {
ArchiveUtils.closeQuietly(pendingUris);
super.close();
}
protected BdbMultipleWorkQueues getWorkQueues() {
return pendingUris;
}
protected boolean workQueueDataOnDisk() {
return true;
}
public BdbFrontier() {
super();
}
public void startCheckpoint(Checkpoint checkpointInProgress) {
dispositionInProgressLock.writeLock().lock();
}
public void doCheckpoint(Checkpoint checkpointInProgress) {
// An explicit sync on any deferred write dbs is needed to make the
// db recoverable. Sync'ing the environment is insufficient
this.pendingUris.sync();
// object caches will be sync()d by BdbModule
// save simple instance fields & inactive-levels summary
JSONObject json = new JSONObject();
try {
json.put("nextOrdinal", nextOrdinal.get());
json.put("queuedUriCount", queuedUriCount.get());
json.put("futureUriCount", futureUriCount.get());
json.put("succeededFetchCount", succeededFetchCount.get());
json.put("failedFetchCount", failedFetchCount.get());
json.put("disregardedUriCount", disregardedUriCount.get());
json.put("totalProcessedBytes", totalProcessedBytes.get());
json.put("inactivePrecedences", inactiveQueuesByPrecedence.keySet());
checkpointInProgress.saveJson(beanName, json);
} catch (JSONException e) {
// impossible
throw new RuntimeException(e);
}
// write all active (inProcess, ready, snoozed) queues to list for quick-resume-use
PrintWriter activeQueuesWriter = null;
try {
activeQueuesWriter = new PrintWriter(checkpointInProgress.saveWriter(beanName, "active"));
for(WorkQueue q : inProcessQueues) {
activeQueuesWriter.println(q.getClassKey());
}
for(String qk : readyClassQueues) {
activeQueuesWriter.println(qk);
}
for(DelayedWorkQueue q : snoozedClassQueues) {
activeQueuesWriter.println(q.getClassKey());
}
for(DelayedWorkQueue q : snoozedOverflow.values()) {
activeQueuesWriter.println(q.getClassKey());
}
} catch (IOException ioe) {
checkpointInProgress.setSuccess(false);
logger.log(Level.SEVERE,"problem writing checkpoint", ioe);
} finally {
IOUtils.closeQuietly(activeQueuesWriter);
}
// rotate recover log, if any
if(this.recover!=null) {
recover.rotateForCheckpoint(checkpointInProgress);
}
}
public void finishCheckpoint(Checkpoint checkpointInProgress) {
dispositionInProgressLock.writeLock().unlock();
}
protected Checkpoint recoveryCheckpoint;
@Autowired(required=false)
public void setRecoveryCheckpoint(Checkpoint checkpoint) {
this.recoveryCheckpoint = checkpoint;
}
@Override
protected void initAllQueues() throws DatabaseException {
boolean isRecovery = (recoveryCheckpoint != null);
this.allQueues = bdb.getObjectCache("allqueues", isRecovery, WorkQueue.class, BdbWorkQueue.class);
if(isRecovery) {
// restore simple instance fields
JSONObject json = recoveryCheckpoint.loadJson(beanName);
try {
nextOrdinal.set(json.getLong("nextOrdinal"));
queuedUriCount.set(json.getLong("queuedUriCount"));
futureUriCount.set(json.getLong("futureUriCount"));
succeededFetchCount.set(json.getLong("succeededFetchCount"));
failedFetchCount.set(json.getLong("failedFetchCount"));
disregardedUriCount.set(json.getLong("disregardedUriCount"));
totalProcessedBytes.set(json.getLong("totalProcessedBytes"));
JSONArray inactivePrecedences = json.getJSONArray("inactivePrecedences");
// restore all intended inactiveQueues
for(int i = 0; i < inactivePrecedences.length(); i++) {
int precedence = inactivePrecedences.getInt(i);
inactiveQueuesByPrecedence.put(precedence,createInactiveQueueForPrecedence(precedence,true));
}
} catch (JSONException e) {
throw new RuntimeException(e);
}
// retired queues already restored with prior data in initOtherQueues
// restore ready queues (those not already on inactive, retired)
BufferedReader activeQueuesReader = null;
try {
activeQueuesReader = recoveryCheckpoint.loadReader(beanName,"active");
String line;
while((line = activeQueuesReader.readLine())!=null) {
readyClassQueues.add(line);
}
} catch (IOException ioe) {
throw new RuntimeException(ioe);
} finally {
IOUtils.closeQuietly(activeQueuesReader);
}
// TODO: restore largestQueues topNset?
}
}
@Override
protected void initOtherQueues() throws DatabaseException {
boolean recycle = (recoveryCheckpoint != null);
// tiny risk of OutOfMemoryError: if giant number of snoozed
// queues all wake-to-ready at once
readyClassQueues = new LinkedBlockingQueue();
inactiveQueuesByPrecedence = new ConcurrentSkipListMap>();
retiredQueues = bdb.getStoredQueue("retiredQueues", String.class, recycle);
// primary snoozed queues
snoozedClassQueues = new DelayQueue();
// just in case: overflow for extreme situations
snoozedOverflow = bdb.getStoredMap(
"snoozedOverflow", Long.class, DelayedWorkQueue.class, true, false);
this.futureUris = bdb.getStoredMap(
"futureUris", Long.class, CrawlURI.class, true, recoveryCheckpoint!=null);
// initialize master map in which other queues live
this.pendingUris = createMultipleWorkQueues();
}
/* (non-Javadoc)
* @see org.archive.crawler.frontier.WorkQueueFrontier#createInactiveQueueForPrecedence(int)
*/
@Override
protected Queue createInactiveQueueForPrecedence(int precedence) {
return createInactiveQueueForPrecedence(precedence, false);
}
/**
* Optionally reuse prior data, for use when resuming from a checkpoint
*/
protected Queue createInactiveQueueForPrecedence(int precedence, boolean usePriorData) {
return bdb.getStoredQueue("inactiveQueues-"+precedence, String.class, usePriorData);
}
/**
* Dump all still-enqueued URIs to the crawl.log -- without actually
* dequeuing. Useful for understanding what was remaining in a crawl that
* was ended early, for example at a time limit.
*
* @throws DatabaseException
*/
public void dumpAllPendingToLog() throws DatabaseException {
Closure tolog = new Closure() {
public void execute(Object curi) {
log((CrawlURI) curi);
}
};
forAllPendingDo(tolog);
}
public void forAllPendingDo(Closure closure) {
pendingUris.forAllPendingDo(closure);
}
/**
* Run a self-consistency check over queue collections, queues-of-queues,
* etc. for testing purposes. Requires one of the same locks as for PAUSE,
* so should only be run while crawl is running.
*/
public void consistencyCheck() {
// outboundLock.writeLock().lock();
dispositionInProgressLock.writeLock().lock();
System.err.println("<< queueSummaries =
bdb.getStoredMap(
null,
String.class,
String.class,
false,
false);
// mark every queue with the 'managed' collections it's in
consistencyMarkup(queueSummaries, inProcessQueues, "i");
consistencyMarkup(queueSummaries,readyClassQueues, "r");
consistencyMarkup(queueSummaries,snoozedClassQueues, "s");
consistencyMarkup(queueSummaries,snoozedOverflow.values(), "S");
for( Entry> entry : getInactiveQueuesByPrecedence().entrySet()) {
consistencyMarkup(queueSummaries,entry.getValue(),Integer.toString(entry.getKey()));
}
consistencyMarkup(queueSummaries,retiredQueues, "R");
// report problems where a queue isn't as expected or ideal
int anomalies = 0;
for(String q : allQueues.keySet()) {
WorkQueue wq = allQueues.get(q);
String summary = queueSummaries.get(q);
if(wq.getCount()>0 && summary == null) {
// every non-empty queue should have been in at least one collection
System.err.println("FRONTIER ANOMALY: "+q+" "+wq.getCount()+" "+wq.isManaged()+" but not in managed collections");
// System.err.println(wq.shortReportLegend()+"\n"+inactiveByClass.get(q)+"\n"+wq.shortReportLine());
anomalies++;
}
if(wq.getCount()==0 && summary == null && wq.isManaged()) {
// any empty queue should only report isManaged if in a collection
System.err.println("FRONTIER ANOMALY: "+q+" "+wq.getCount()+" "+wq.isManaged()+" but not in managed collections");
// System.err.println(wq.shortReportLegend()+"\n"+inactiveByClass.get(q)+"\n"+wq.shortReportLine());
anomalies++;
}
}
System.err.println(anomalies+" ANOMALIES");
int concerns = 0;
for(String q : queueSummaries.keySet()) {
String summary = queueSummaries.get(q);
if(summary != null && summary.split(",").length>1) {
// ideally queues won't be more than one place (though frontier
// should operate if they are, and changing precedence values
// will cause multiple entries by design)
WorkQueue wq = allQueues.get(q);
System.err.println("FRONTIER CONCERN: "+q+" "+wq.getCount()+" multiple places: "+summary);
System.err.println("\n"+wq.shortReportLegend()+"\n"+wq.shortReportLine());
concerns++;
}
}
System.err.println(concerns+" CONCERNS");
System.err.println("END CHECKING FRONTIER>>>");
queueSummaries.dispose();
dispositionInProgressLock.writeLock().unlock();
// outboundLock.writeLock().unlock();
}
protected void consistencyMarkup(
DisposableStoredSortedMap queueSummaries,
Iterable> queues, String mark) {
for(Object qq : queues) {
String key = (qq instanceof String)
? (String)qq
: (qq instanceof WorkQueue)
? ((WorkQueue)qq).getClassKey()
: ((DelayedWorkQueue)qq).getClassKey();
String val = queueSummaries.get(key);
val = (val==null) ? mark : val+","+mark;
queueSummaries.put(key, val);
}
}
@Override
public long exportPendingUris(PrintWriter writer) {
if (pendingUris == null) {
return -5L;
}
return pendingUris.exportPendingUris(writer);
}
@Override
public ObjectIdentityCache getAllQueues() {
return allQueues;
}
@Override
public BlockingQueue getReadyClassQueues() {
return readyClassQueues;
}
@Override
public Set getInProcessQueues() {
return inProcessQueues;
}
}