All downloads are free. The search and download functionality uses the official Maven repository.

org.archive.crawler.frontier.BdbWorkQueue Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier;

import java.io.IOException;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.bdb.AutoKryo;
import org.archive.crawler.frontier.precedence.SimplePrecedenceProvider;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchStats;
import org.archive.util.ArchiveUtils;

import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;


/**
 * One independent queue of items with the same 'classKey' (eg host).
 *
 * <p>The queue does not hold its items itself: every operation is delegated
 * to the {@link BdbMultipleWorkQueues} obtained from the frontier, and this
 * queue's items are identified there by a shared key prefix
 * ({@link #origin}).
 *
 * @author gojomo
 */
public class BdbWorkQueue extends WorkQueue
implements Serializable {
    private static final long serialVersionUID = 1L;
    private static final Logger LOGGER =
        Logger.getLogger(BdbWorkQueue.class.getName());


    /**
     * All items in this queue have this same 'origin'
     * prefix to their keys.
     */
    private byte[] origin;

    /**
     * Create a virtual queue inside the given frontier's
     * BdbMultipleWorkQueues.
     *
     * @param classKey key shared by all items of this queue; also used to
     * calculate the 'origin' key prefix
     * @param frontier frontier whose BdbMultipleWorkQueues receives the
     * queue-front 'cap' entry for this queue
     */
    public BdbWorkQueue(String classKey, BdbFrontier frontier) {
        super(classKey);
        this.origin = BdbMultipleWorkQueues.calculateOriginKey(classKey);
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine(getPrefixClassKey(this.origin) + " " + classKey);
        }
        // add the queue-front 'cap' entry; see...
        // http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102
        frontier.getWorkQueues().addCap(origin);
    }

    /**
     * Delete items matching the given expression from this queue by
     * delegating to the frontier's BdbMultipleWorkQueues.
     *
     * @param frontier must be a {@link BdbFrontier}
     * @param match match expression, passed through unchanged
     * @return the value returned by
     * BdbMultipleWorkQueues.deleteMatchingFromQueue (presumably the number
     * of items deleted -- confirm against that class)
     * @throws IOException wrapping any DatabaseException
     */
    protected long deleteMatchingFromQueue(final WorkQueueFrontier frontier,
            final String match) throws IOException {
        try {
            final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)
                .getWorkQueues();
            return queues.deleteMatchingFromQueue(match, classKey,
                new DatabaseEntry(origin));
        } catch (DatabaseException e) {
            throw new IOException(e);
        }
    }

    /**
     * Delete the given item via the frontier's BdbMultipleWorkQueues.
     *
     * @param frontier must be a {@link BdbFrontier}
     * @param peekItem item to delete
     * @throws IOException wrapping any DatabaseException
     */
    protected void deleteItem(final WorkQueueFrontier frontier,
            final CrawlURI peekItem) throws IOException {
        try {
            final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)
                .getWorkQueues();
            queues.delete(peekItem);
        } catch (DatabaseException e) {
            throw new IOException(e);
        }
    }

    /**
     * Fetch (without removing) an item of this queue, looking it up by the
     * queue's origin key. Database failures and out-of-range results are
     * logged and retried; up to four gets are attempted in total.
     *
     * @param frontier must be a {@link BdbFrontier}
     * @return the item found, or null if every attempt failed
     * @throws IOException declared for the overall contract; database
     * failures inside this method are logged and retried rather than thrown
     */
    protected CrawlURI peekItem(final WorkQueueFrontier frontier)
    throws IOException {
        final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)
            .getWorkQueues();
        DatabaseEntry key = new DatabaseEntry(origin);
        CrawlURI curi = null;
        int tries = 1;
        while(true) {
            try {
                curi = queues.get(key);
            } catch (DatabaseException e) {
                // deliberate best-effort: log and fall through to retry
                LOGGER.log(Level.SEVERE,"peekItem failure; retrying",e);
            }
            
            // ensure CrawlURI, if any,  came from acceptable range: 
            if(!ArchiveUtils.startsWith(key.getData(),origin)) {
                LOGGER.severe(
                    "inconsistency: "+classKey+"("+
                    getPrefixClassKey(origin)+") with " + getCount() + " items gave "
                    + curi +"("+getPrefixClassKey(key.getData()));
                // clear curi to allow retry
                curi = null; 
                // reset key to original origin for retry
                key.setData(origin);
            }
            
            if (curi!=null) {
                // success
                break;
            }
            
            // tries starts at 1, so this gives up after the fourth get
            if (tries>3) {
                LOGGER.severe("no item where expected in queue "+classKey);
                break;
            }
            tries++;
            LOGGER.severe("Trying get #" + Integer.toString(tries)
                    + " in queue " + classKey + " with " + getCount()
                    + " items using key "
                    + getPrefixClassKey(key.getData()));
        }
 
        return curi;
    }

    /**
     * Insert the given item via the frontier's BdbMultipleWorkQueues.
     *
     * @param frontier must be a {@link BdbFrontier}
     * @param curi item to insert
     * @param overwriteIfPresent passed through unchanged to
     * BdbMultipleWorkQueues.put
     * @throws IOException wrapping any DatabaseException
     */
    protected void insertItem(final WorkQueueFrontier frontier,
            final CrawlURI curi, boolean overwriteIfPresent) throws IOException {
        try {
            final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)
                .getWorkQueues();
            queues.put(curi, overwriteIfPresent);
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Inserted into " + getPrefixClassKey(this.origin) +
                    " (count " + Long.toString(getCount())+ "): " +
                        curi.toString());
            }
        } catch (DatabaseException e) {
            throw new IOException(e);
        }
    }
    
    /**
     * Decode the leading portion of a key, up to (not including) its first
     * zero byte, as a UTF-8 string. Used for logging key-prefixes.
     *
     * @param byteArray Byte array holding the key; must not be null.
     * @return UTF-8 decoding of the bytes before the first zero byte, or
     * of the whole array if it contains no zero byte (empty string for an
     * empty array).
     */
    protected static String getPrefixClassKey(final byte [] byteArray) {
        int zeroIndex = 0;
        // bound the scan so an array lacking a zero terminator cannot
        // run off the end and throw from inside a logging statement
        while(zeroIndex < byteArray.length && byteArray[zeroIndex]!=0) {
            zeroIndex++;
        }
        try {
            return new String(byteArray,0,zeroIndex,"UTF-8");
        } catch (UnsupportedEncodingException e) {
            // should be impossible; UTF-8 always available
            LOGGER.log(Level.SEVERE,"UTF-8 encoding unavailable",e);
            return e.getMessage();
        }
    }
    
    // Kryo support
    /**
     * Register this class, and the other classes listed in the body, with
     * the given AutoKryo.
     *
     * @param kryo AutoKryo to register with
     */
    public static void autoregisterTo(AutoKryo kryo) {
        kryo.register(BdbWorkQueue.class);
        kryo.autoregister(FetchStats.class); 
        kryo.autoregister(HashSet.class);
        kryo.autoregister(SimplePrecedenceProvider.class);
        kryo.autoregister(byte[].class);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy