All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.crawler.frontier.BdbFrontier Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.openmbean.CompositeData;

import org.apache.commons.collections.Closure;
import org.apache.commons.io.IOUtils;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.bdb.StoredQueue;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.Supplier;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;

/**
 * A Frontier using several BerkeleyDB JE Databases to hold its record of
 * known hosts (queues), and pending URIs. 
 *
 * @author Gordon Mohr
 */
public class BdbFrontier extends WorkQueueFrontier 
implements Checkpointable, BeanNameAware {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger logger =
        Logger.getLogger(BdbFrontier.class.getName());

    /** 
     * All 'inactive' queues, not yet in active rotation.
     * Linked-list of keys for the queues.
     */
    protected SortedMap> inactiveQueuesByPrecedence;

    /**
     * 'retired' queues, no longer considered for activation.
     * Linked-list of keys for queues.
     */
    protected StoredQueue retiredQueues;
    
    /** all URIs scheduled to be crawled */
    protected transient BdbMultipleWorkQueues pendingUris;

    /**
     * Module through which this frontier opens its database, stored
     * queues, stored maps and object cache.
     */
    protected BdbModule bdb;

    /**
     * Supplies the {@link BdbModule} this frontier should use; annotated
     * for autowiring.
     *
     * @param bdb the module to use
     */
    @Autowired
    public void setBdbModule(final BdbModule bdb) {
        this.bdb = bdb;
    }
    
    /**
     * Name given to this bean; used as the identifier under which
     * checkpoint data is saved and loaded.
     */
    protected String beanName;

    /**
     * Records the name assigned to this bean.
     *
     * @param name the bean name
     */
    public void setBeanName(final String name) {
        beanName = name;
    }
    
    /**
     * Whether {@link #finalTasks()} should dump all still-pending URIs to
     * the log; off by default.
     */
    protected boolean dumpPendingAtClose = false;

    /**
     * @return true if pending URIs are to be dumped during final tasks
     */
    public boolean getDumpPendingAtClose() {
        return this.dumpPendingAtClose;
    }

    /**
     * @param dumpPendingAtClose true to dump pending URIs during final tasks
     */
    public void setDumpPendingAtClose(final boolean dumpPendingAtClose) {
        this.dumpPendingAtClose = dumpPendingAtClose;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.WorkQueueFrontier#getInactiveQueuesByPrecedence()
     */
    @Override
    protected SortedMap> getInactiveQueuesByPrecedence() {
        return inactiveQueuesByPrecedence;
    }

    /**
     * Exposes the queue of class keys for retired queues.
     *
     * @return the {@code retiredQueues} field itself (not a copy)
     * @see org.archive.crawler.frontier.WorkQueueFrontier#getRetiredQueues()
     */
    @Override
    protected Queue<String> getRetiredQueues() {
        return retiredQueues;
    }
    
    /**
     * Opens the "pending" database and wraps it in the single
     * {@link BdbMultipleWorkQueues} object inside which all the
     * individual queues live.
     *
     * When a recovery checkpoint is set, database creation is disallowed
     * and the checkpoint flag is forwarded to
     * {@code BdbModule.openDatabase} as its third ('recycle') argument.
     *
     * @return the created BdbMultipleWorkQueues
     * @throws DatabaseException
     */
    protected BdbMultipleWorkQueues createMultipleWorkQueues()
    throws DatabaseException {
        final boolean resuming = recoveryCheckpoint != null;

        final BdbModule.BdbConfig pendingConfig = new BdbModule.BdbConfig();
        // a resumed crawl must find the existing database, never a new one
        pendingConfig.setAllowCreate(!resuming);

        // Intent is a deferred-write database: URLs that are added then
        // removed before a page-out is required need never cause disk IO.
        // NOTE(review): nothing here sets deferred-write explicitly;
        // presumably BdbModule.openDatabase arranges it -- confirm.
        final Database pendingDb =
            bdb.openDatabase("pending", pendingConfig, resuming);

        return new BdbMultipleWorkQueues(pendingDb, bdb.getClassCatalog());
    }


    /**
     * Return the work queue for the given classKey, looked up through
     * {@code allQueues.getOrUse}. A supplier is passed along that builds
     * a new {@link BdbWorkQueue} (with the configured total budget, and
     * announced to the queue precedence policy); presumably the cache
     * only invokes it when no queue exists yet for the key.
     * 
     * @param classKey key to look for
     * @return the WorkQueue returned by the queue cache for classKey
     */
    protected WorkQueue getQueueFor(final String classKey) {
        return allQueues.getOrUse(
                classKey,
                new Supplier<WorkQueue>() {
                    public BdbWorkQueue get() {
                        // deliberate copy: ensure private minimal key
                        String qKey = new String(classKey);
                        BdbWorkQueue q = new BdbWorkQueue(qKey, BdbFrontier.this);
                        q.setTotalBudget(getQueueTotalBudget());
                        getQueuePrecedencePolicy().queueCreated(q);
                        return q;
                    }});
    }
    
    /**
     * Resolves the group for a URI: the work queue selected by the URI's
     * class key.
     *
     * @param curi URI whose class key selects the queue
     * @return the work queue obtained from {@link #getQueueFor(String)}
     */
    @Override
    public FrontierGroup getGroup(CrawlURI curi) {
        final String classKey = curi.getClassKey();
        return getQueueFor(classKey);
    }
    
    /**
     * Fetch a batch of pending URIs by delegating to
     * {@code pendingUris.getFrom}.
     *
     * @param marker passed through to the pending-URI store
     * @param numberOfMatches passed through to the pending-URI store
     * @param pattern regular expression, compiled here before delegating
     * @param verbose passed through to the pending-URI store
     * @return the CompositeData produced by the pending-URI store
     * @throws IllegalStateException wrapping any DatabaseException
     */
    public CompositeData getURIsList(String marker, 
            int numberOfMatches, String pattern, final boolean verbose) {
        final Pattern compiled = Pattern.compile(pattern);
        try {
            return pendingUris.getFrom(
                    marker, numberOfMatches, compiled, verbose);
        } catch (DatabaseException dbe) {
            throw new IllegalStateException(dbe);
        }
    }
    
    
    /**
     * Runs the superclass final tasks, then -- only if
     * {@link #getDumpPendingAtClose()} is set -- dumps every pending URI
     * to the log. A failure of the dump is logged as a warning and
     * otherwise ignored.
     *
     * @see org.archive.crawler.frontier.AbstractFrontier#finalTasks()
     */
    @Override
    protected void finalTasks() {
        super.finalTasks();
        if (!getDumpPendingAtClose()) {
            return;
        }
        // dump happens here, before closing/releasing
        try {
            dumpAllPendingToLog();
        } catch (Exception e) {
            logger.log(Level.WARNING, "dump pending problem", e);
        }
    }
    
    /**
     * Quietly closes the pending-URI store first, then lets the
     * superclass perform its own close.
     *
     * @see org.archive.crawler.frontier.WorkQueueFrontier#close()
     */
    @Override
    public void close() {
        // pendingUris goes first; super.close() follows
        ArchiveUtils.closeQuietly(this.pendingUris);
        super.close();
    }
        
    /**
     * @return the store holding all pending URIs (the {@code pendingUris}
     * field)
     */
    protected BdbMultipleWorkQueues getWorkQueues() {
        return this.pendingUris;
    }

    /**
     * @return always true for this frontier
     */
    protected boolean workQueueDataOnDisk() {
        final boolean onDisk = true;
        return onDisk;
    }

    public BdbFrontier() {
        super();
    }
    
    /**
     * Begins a checkpoint by taking the write side of
     * {@code dispositionInProgressLock}. The lock is not released here;
     * {@link #finishCheckpoint(Checkpoint)} releases it.
     *
     * @param checkpointInProgress the checkpoint being taken (unused here)
     */
    public void startCheckpoint(Checkpoint checkpointInProgress) {
        this.dispositionInProgressLock.writeLock().lock();
    }

    /**
     * Writes this frontier's state into the checkpoint:
     * <ol>
     * <li>syncs the pending-URI database;</li>
     * <li>saves the counters and the set of inactive precedence levels as
     * JSON under this bean's name;</li>
     * <li>writes the class keys of all active (in-process, ready, snoozed
     * and snoozed-overflow) queues, one per line, to the "active"
     * writer;</li>
     * <li>rotates the recover log, if there is one.</li>
     * </ol>
     * A failure while writing the active-queues list marks the checkpoint
     * unsuccessful and is logged, but does not abort the method.
     *
     * @param checkpointInProgress the checkpoint to write into
     */
    public void doCheckpoint(Checkpoint checkpointInProgress) {
        // An explicit sync on any deferred write dbs is needed to make the
        // db recoverable. Sync'ing the environment is insufficient
        this.pendingUris.sync();
        // object caches will be sync()d by BdbModule
        
        // save simple instance fields & inactive-levels summary
        JSONObject json = new JSONObject();
        try {
            json.put("nextOrdinal", nextOrdinal.get());
            json.put("queuedUriCount", queuedUriCount.get());
            json.put("futureUriCount", futureUriCount.get());
            json.put("succeededFetchCount", succeededFetchCount.get());
            json.put("failedFetchCount", failedFetchCount.get());
            json.put("disregardedUriCount", disregardedUriCount.get());
            json.put("totalProcessedBytes", totalProcessedBytes.get());
            json.put("inactivePrecedences", inactiveQueuesByPrecedence.keySet());
            checkpointInProgress.saveJson(beanName, json);
        } catch (JSONException e) {
            // impossible
            throw new RuntimeException(e);
        }
        // write all active (inProcess, ready, snoozed) queues to list for quick-resume-use
        PrintWriter activeQueuesWriter = null;
        try {
            activeQueuesWriter = new PrintWriter(checkpointInProgress.saveWriter(beanName, "active"));
            for(WorkQueue q : inProcessQueues) {
                activeQueuesWriter.println(q.getClassKey());
            }
            for(String qk : readyClassQueues) {
                activeQueuesWriter.println(qk);
            }
            for(DelayedWorkQueue q : snoozedClassQueues) {
                activeQueuesWriter.println(q.getClassKey());
            }
            for(DelayedWorkQueue q : snoozedOverflow.values()) {
                activeQueuesWriter.println(q.getClassKey());
            }
            // PrintWriter never throws on a failed write; it only sets an
            // internal error flag. checkError() flushes and reports it, so
            // a truncated list cannot pass as a good checkpoint.
            if (activeQueuesWriter.checkError()) {
                checkpointInProgress.setSuccess(false);
                logger.log(Level.SEVERE,
                        "problem writing checkpoint: active queues list incomplete");
            }
        } catch (IOException ioe) {
            checkpointInProgress.setSuccess(false);
            logger.log(Level.SEVERE,"problem writing checkpoint", ioe);
        } finally {
            IOUtils.closeQuietly(activeQueuesWriter);
        }
        // rotate recover log, if any
        if(this.recover!=null) {
            recover.rotateForCheckpoint(checkpointInProgress);
        }
    }

    /**
     * Ends a checkpoint by releasing the write side of
     * {@code dispositionInProgressLock}, which
     * {@link #startCheckpoint(Checkpoint)} acquired.
     *
     * @param checkpointInProgress the checkpoint being taken (unused here)
     */
    public void finishCheckpoint(Checkpoint checkpointInProgress) {
        this.dispositionInProgressLock.writeLock().unlock();
    }

    /**
     * Checkpoint to resume from, or null for a fresh start. Its presence
     * switches the init methods into reuse-prior-data mode.
     */
    protected Checkpoint recoveryCheckpoint;

    /**
     * Optionally autowired: sets the checkpoint to recover from.
     *
     * @param checkpoint checkpoint to resume from; may be left unset
     */
    @Autowired(required=false)
    public void setRecoveryCheckpoint(final Checkpoint checkpoint) {
        recoveryCheckpoint = checkpoint;
    }
    
    /**
     * Obtains the cache of all work queues from the BDB module and, when
     * resuming from a checkpoint, restores what
     * {@link #doCheckpoint(Checkpoint)} saved: the counters, one inactive
     * queue per saved precedence level (reusing prior data), and the saved
     * list of active queue keys, every one of which is put on the ready
     * queue.
     *
     * Uses {@code inactiveQueuesByPrecedence} and {@code readyClassQueues},
     * so those must already have been created (see
     * {@link #initOtherQueues()}).
     *
     * @throws DatabaseException
     */
    @Override
    protected void initAllQueues() throws DatabaseException {
        final boolean resuming = recoveryCheckpoint != null;
        this.allQueues = bdb.getObjectCache(
                "allqueues", resuming, WorkQueue.class, BdbWorkQueue.class);
        if (!resuming) {
            // fresh crawl: nothing to restore
            return;
        }

        // restore simple instance fields
        final JSONObject saved = recoveryCheckpoint.loadJson(beanName);
        try {
            nextOrdinal.set(saved.getLong("nextOrdinal"));
            queuedUriCount.set(saved.getLong("queuedUriCount"));
            futureUriCount.set(saved.getLong("futureUriCount"));
            succeededFetchCount.set(saved.getLong("succeededFetchCount"));
            failedFetchCount.set(saved.getLong("failedFetchCount"));
            disregardedUriCount.set(saved.getLong("disregardedUriCount"));
            totalProcessedBytes.set(saved.getLong("totalProcessedBytes"));

            // restore all intended inactiveQueues
            final JSONArray savedLevels =
                saved.getJSONArray("inactivePrecedences");
            final int levelCount = savedLevels.length();
            for (int idx = 0; idx < savedLevels.length() && idx < levelCount; idx++) {
                final int precedence = savedLevels.getInt(idx);
                inactiveQueuesByPrecedence.put(
                        precedence,
                        createInactiveQueueForPrecedence(precedence, true));
            }
        } catch (JSONException e) {
            throw new RuntimeException(e);
        }

        // retired queues already restored with prior data in initOtherQueues

        // restore ready queues (those not already on inactive, retired)
        BufferedReader savedActive = null;
        try {
            savedActive = recoveryCheckpoint.loadReader(beanName, "active");
            for (String classKey = savedActive.readLine();
                    classKey != null;
                    classKey = savedActive.readLine()) {
                readyClassQueues.add(classKey);
            }
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        } finally {
            IOUtils.closeQuietly(savedActive);
        }

        // TODO: restore largestQueues topNset?
    }

    @Override
    protected void initOtherQueues() throws DatabaseException {
        boolean recycle = (recoveryCheckpoint != null);
        
        // tiny risk of OutOfMemoryError: if giant number of snoozed
        // queues all wake-to-ready at once
        readyClassQueues = new LinkedBlockingQueue();

        inactiveQueuesByPrecedence = new ConcurrentSkipListMap>();
        
        retiredQueues = bdb.getStoredQueue("retiredQueues", String.class, recycle);

        // primary snoozed queues
        snoozedClassQueues = new DelayQueue();
        // just in case: overflow for extreme situations
        snoozedOverflow = bdb.getStoredMap(
                "snoozedOverflow", Long.class, DelayedWorkQueue.class, true, false);
            
        this.futureUris = bdb.getStoredMap(
                "futureUris", Long.class, CrawlURI.class, true, recoveryCheckpoint!=null);
        
        // initialize master map in which other queues live
        this.pendingUris = createMultipleWorkQueues();
    }


    /**
     * Creates the inactive queue for a precedence level without reusing
     * any prior data.
     *
     * @param precedence precedence level the queue is for
     * @return queue of class keys for that precedence
     * @see org.archive.crawler.frontier.WorkQueueFrontier#createInactiveQueueForPrecedence(int)
     */
    @Override
    protected Queue<String> createInactiveQueueForPrecedence(int precedence) {
        return createInactiveQueueForPrecedence(precedence, false);
    }
    
    /**
     * Obtains the stored queue named {@code "inactiveQueues-" + precedence}
     * from the BDB module, optionally reusing prior data (for use when
     * resuming from a checkpoint).
     *
     * @param precedence precedence level the queue is for
     * @param usePriorData passed through to {@code BdbModule.getStoredQueue}
     * @return queue of class keys for that precedence
     */
    protected Queue<String> createInactiveQueueForPrecedence(int precedence, boolean usePriorData) {
        return bdb.getStoredQueue("inactiveQueues-"+precedence, String.class, usePriorData);
    }
    
    /**
     * Passes every still-enqueued URI to {@code log(CrawlURI)} without
     * dequeuing anything. Helpful for seeing what remained in a crawl
     * that was ended early, for example at a time limit.
     * 
     * @throws DatabaseException
     */
    public void dumpAllPendingToLog() throws DatabaseException {
        forAllPendingDo(new Closure() {
            public void execute(Object pending) {
                // the pending store hands over CrawlURI instances
                log((CrawlURI) pending);
            }
        });
    }

    /**
     * Applies the closure to all pending URIs by delegating to the
     * pending-URI store.
     *
     * @param closure action to run for each pending item
     */
    public void forAllPendingDo(Closure closure) {
        this.pendingUris.forAllPendingDo(closure);
    }

    /**
     * Run a self-consistency check over queue collections, queues-of-queues, 
     * etc. for testing purposes. Requires one of the same locks as for PAUSE, 
     * so should only be run while crawl is running. 
     */
    public void consistencyCheck() {
//        outboundLock.writeLock().lock(); 
        dispositionInProgressLock.writeLock().lock();
        System.err.println("<< queueSummaries = 
            bdb.getStoredMap(
                    null,
                    String.class,
                    String.class,
                    false,
                    false);
        // mark every queue with the 'managed' collections it's in
        consistencyMarkup(queueSummaries, inProcessQueues, "i");
        consistencyMarkup(queueSummaries,readyClassQueues, "r");
        consistencyMarkup(queueSummaries,snoozedClassQueues, "s");
        consistencyMarkup(queueSummaries,snoozedOverflow.values(), "S");
        for( Entry> entry : getInactiveQueuesByPrecedence().entrySet()) {
            consistencyMarkup(queueSummaries,entry.getValue(),Integer.toString(entry.getKey()));
        }
        consistencyMarkup(queueSummaries,retiredQueues, "R");
        
        // report problems where a queue isn't as expected or ideal
        int anomalies = 0; 
        for(String q : allQueues.keySet()) {
            WorkQueue wq = allQueues.get(q); 
            String summary = queueSummaries.get(q);
            if(wq.getCount()>0 && summary == null) {
                // every non-empty queue should have been in at least one collection
                System.err.println("FRONTIER ANOMALY: "+q+" "+wq.getCount()+" "+wq.isManaged()+" but not in managed collections");
//                System.err.println(wq.shortReportLegend()+"\n"+inactiveByClass.get(q)+"\n"+wq.shortReportLine());
                anomalies++;
            }
            if(wq.getCount()==0 && summary == null && wq.isManaged()) {
                // any empty queue should only report isManaged if in a collection
                System.err.println("FRONTIER ANOMALY: "+q+" "+wq.getCount()+" "+wq.isManaged()+" but not in managed collections");
//                System.err.println(wq.shortReportLegend()+"\n"+inactiveByClass.get(q)+"\n"+wq.shortReportLine());
                anomalies++;
            }
        }
        System.err.println(anomalies+" ANOMALIES");
        int concerns = 0; 
        for(String q : queueSummaries.keySet()) {
            String summary = queueSummaries.get(q);
            if(summary != null && summary.split(",").length>1) {
                // ideally queues won't be more than one place (though frontier
                // should operate if they are, and changing precedence values 
                // will cause multiple entries by design)
                WorkQueue wq = allQueues.get(q); 
                System.err.println("FRONTIER CONCERN: "+q+" "+wq.getCount()+" multiple places: "+summary);
                System.err.println("\n"+wq.shortReportLegend()+"\n"+wq.shortReportLine());
                concerns++;
            }
        }
        System.err.println(concerns+" CONCERNS");
        System.err.println("END CHECKING FRONTIER>>>");
        
        queueSummaries.dispose();
        dispositionInProgressLock.writeLock().unlock();
//        outboundLock.writeLock().unlock(); 
    }
    /**
     * Appends {@code mark} to the summary of every queue found in
     * {@code queues}. Summaries are comma-separated lists of marks, keyed
     * by the queue's class key.
     *
     * @param queueSummaries map from class key to accumulated marks;
     * updated in place
     * @param queues collection to scan; elements may be class-key Strings,
     * WorkQueues, or DelayedWorkQueues (anything else causes a
     * ClassCastException)
     * @param mark short label identifying the collection being scanned
     */
    protected void consistencyMarkup(
            DisposableStoredSortedMap<String, String> queueSummaries,
            Iterable<?> queues, String mark) {
        for (Object qq : queues) {
            String key;
            if (qq instanceof String) {
                key = (String) qq;
            } else if (qq instanceof WorkQueue) {
                key = ((WorkQueue) qq).getClassKey();
            } else {
                // only remaining supported element type
                key = ((DelayedWorkQueue) qq).getClassKey();
            }
            String val = queueSummaries.get(key);
            val = (val == null) ? mark : val + "," + mark;
            queueSummaries.put(key, val);
        }
    }

    /**
     * Exports the pending URIs to the writer by delegating to the
     * pending-URI store.
     *
     * @param writer destination for the export
     * @return the store's result, or -5 when the store has not been
     * created yet (NOTE(review): the meaning of this particular sentinel
     * is not documented here -- confirm with callers)
     */
    @Override
    public long exportPendingUris(PrintWriter writer) {
        return (pendingUris == null)
                ? -5L
                : pendingUris.exportPendingUris(writer);
    }

    /**
     * @return the cache of all work queues (the {@code allQueues} field
     * itself, not a copy)
     */
    @Override
    public ObjectIdentityCache<WorkQueue> getAllQueues() {
        return allQueues;
    }

    /**
     * @return the queue of class keys of ready queues (the
     * {@code readyClassQueues} field itself, not a copy)
     */
    @Override
    public BlockingQueue<String> getReadyClassQueues() {
        return readyClassQueues;
    }

    /**
     * @return the set of in-process work queues (the
     * {@code inProcessQueues} field itself, not a copy)
     */
    @Override
    public Set<WorkQueue> getInProcessQueues() {
        return inProcessQueues;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy