All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.crawler.frontier.BdbMultipleWorkQueues Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.openmbean.ArrayType;
import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.CompositeType;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.OpenType;
import javax.management.openmbean.SimpleType;

import org.apache.commons.collections.Closure;
import org.archive.bdb.KryoBinding;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;

import com.google.common.base.Charsets;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.util.RuntimeExceptionWrapper;


/**
 * A BerkeleyDB-database-backed structure for holding ordered
 * groupings of CrawlURIs. Reading the groupings from specific
 * per-grouping (per-classKey/per-Host) starting points allows
 * this to act as a collection of independent queues. 
 * 
 * 

For how the bdb keys are made, see {@link #calculateInsertKey(CrawlURI)}. * *

TODO: refactor, improve naming. * * @author gojomo */ public class BdbMultipleWorkQueues { @SuppressWarnings("unused") private static final long serialVersionUID = 1L; private static final Logger LOGGER = Logger.getLogger(BdbMultipleWorkQueues.class.getName()); /** Database holding all pending URIs, grouped in virtual queues */ private Database pendingUrisDB = null; /** Supporting bdb serialization of CrawlURIs */ private EntryBinding crawlUriBinding; /** * Create the multi queue in the given environment. * * @throws DatabaseException */ public BdbMultipleWorkQueues(Database db, StoredClassCatalog classCatalog) throws DatabaseException { this.pendingUrisDB = db; crawlUriBinding = new KryoBinding(CrawlURI.class); // new RecyclingSerialBinding(classCatalog, CrawlURI.class); // new BenchmarkingBinding(new EntryBinding[] { // new KryoBinding(CrawlURI.class,true), // new KryoBinding(CrawlURI.class,false), // new RecyclingSerialBinding(classCatalog, CrawlURI.class), // }); } /** * Delete all CrawlURIs matching the given expression. 
* * @param match * @param queue * @param headKey * @return count of deleted items * @throws DatabaseException * @throws DatabaseException */ public long deleteMatchingFromQueue(String match, String queue, DatabaseEntry headKey) throws DatabaseException { long deletedCount = 0; Pattern pattern = Pattern.compile(match); DatabaseEntry key = headKey; DatabaseEntry value = new DatabaseEntry(); Cursor cursor = null; try { cursor = pendingUrisDB.openCursor(null, null); OperationStatus result = cursor.getSearchKeyRange(headKey, value, null); while (result == OperationStatus.SUCCESS) { if(value.getData().length>0) { CrawlURI curi = (CrawlURI) crawlUriBinding .entryToObject(value); if (!curi.getClassKey().equals(queue)) { // rolled into next queue; finished with this queue break; } if (pattern.matcher(curi.toString()).matches()) { cursor.delete(); deletedCount++; } } result = cursor.getNext(key, value, null); } } finally { if (cursor != null) { cursor.close(); } } return deletedCount; } /** * @param m marker or null to start with first entry * @param maxMatches * @return list of matches starting from marker position * @throws DatabaseException */ public CompositeData getFrom( String m, int maxMatches, Pattern pattern, boolean verbose) throws DatabaseException { int matches = 0; ArrayList results = new ArrayList(maxMatches); DatabaseEntry key; if (m == null) { key = getFirstKey(); } else { byte[] marker = m.getBytes(); // = FrontierJMXTypes.fromString(m); key = new DatabaseEntry(marker); } DatabaseEntry value = new DatabaseEntry(); Cursor cursor = null; OperationStatus result = null; try { cursor = pendingUrisDB.openCursor(null,null); result = cursor.getSearchKey(key, value, null); while(matches < maxMatches && result == OperationStatus.SUCCESS) { if(value.getData().length>0) { CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value); if(pattern.matcher(curi.toString()).matches()) { if (verbose) { results.add("[" + curi.getClassKey() + "] " + curi.shortReportLine()); } 
else { results.add(curi.toString()); } matches++; } } result = cursor.getNext(key,value,null); } } finally { if (cursor !=null) { cursor.close(); } } if(result != OperationStatus.SUCCESS) { // end of scan m = null; } else { m = new String(key.getData()); // = FrontierJMXTypes.toString(key.getData()); } String[] arr = results.toArray(new String[results.size()]); CompositeData cd; try { cd = new CompositeDataSupport( /*FrontierJMXTypes.URI_LIST_DATA*/ null, new String[] { "list", "marker" }, new Object[] { arr, m }); } catch (OpenDataException e) { throw new IllegalStateException(e); } return cd; } /** * @return the key to the first item in the database * @throws DatabaseException */ protected DatabaseEntry getFirstKey() throws DatabaseException { DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); Cursor cursor = pendingUrisDB.openCursor(null,null); OperationStatus status = cursor.getNext(key,value,null); cursor.close(); if(status == OperationStatus.SUCCESS) { return key; } return null; } /** * Get the next nearest item after the given key. Relies on * external discipline -- we'll look at the queues count of how many * items it has -- to avoid asking for something from a * range where there are no associated items -- * otherwise could get first item of next 'queue' by mistake. * *

TODO: hold within a queue's range * * @param headKey Key prefix that demarks the beginning of the range * in pendingUrisDB we're interested in. * @return CrawlURI. * @throws DatabaseException */ public CrawlURI get(DatabaseEntry headKey) throws DatabaseException { DatabaseEntry result = new DatabaseEntry(); // From Linda Lee of sleepycat: // "You want to check the status returned from Cursor.getSearchKeyRange // to make sure that you have OperationStatus.SUCCESS. In that case, // you have found a valid data record, and result.getData() // (called by internally by the binding code, in this case) will be // non-null. The other possible status return is // OperationStatus.NOTFOUND, in which case no data record matched // the criteria. " OperationStatus status = getNextNearestItem(headKey, result); CrawlURI retVal = null; if (status != OperationStatus.SUCCESS) { LOGGER.severe("See '1219854 NPE je-2.0 " + "entryToObject...'. OperationStatus " + " was not SUCCESS: " + status + ", headKey " + BdbWorkQueue.getPrefixClassKey(headKey.getData())); return null; } try { retVal = (CrawlURI)crawlUriBinding.entryToObject(result); } catch (ClassCastException cce) { Object obj = crawlUriBinding.entryToObject(result); LOGGER.log(Level.SEVERE, "see [#HER-1283]: deserialized " + obj.getClass() + " has ClassLoader " + obj.getClass().getClassLoader().getClass(), cce); return null; } catch (RuntimeExceptionWrapper rw) { LOGGER.log( Level.SEVERE, "expected object missing in queue " + BdbWorkQueue.getPrefixClassKey(headKey.getData()), rw); return null; } retVal.setHolderKey(headKey); return retVal; } protected OperationStatus getNextNearestItem(DatabaseEntry headKey, DatabaseEntry result) throws DatabaseException { Cursor cursor = null; OperationStatus status; try { cursor = this.pendingUrisDB.openCursor(null, null); // get cap; headKey at this point should always point to // a queue-beginning cap entry (zero-length value) status = cursor.getSearchKey(headKey, result, null); if (status != 
OperationStatus.SUCCESS) { LOGGER.severe("bdb queue cap missing: " + status.toString() + " " + new String(headKey.getData())); return status; } if (result.getData().length > 0) { LOGGER.severe("bdb queue has nonzero size: " + result.getData().length); return OperationStatus.KEYEXIST; } // get next item (real first item of queue) status = cursor.getNext(headKey,result,null); } finally { if(cursor!=null) { cursor.close(); } } return status; } /** * Put the given CrawlURI in at the appropriate place. * * @param curi * @throws DatabaseException */ public void put(CrawlURI curi, boolean overwriteIfPresent) throws DatabaseException { DatabaseEntry insertKey = (DatabaseEntry)curi.getHolderKey(); if (insertKey == null) { insertKey = calculateInsertKey(curi); curi.setHolderKey(insertKey); } DatabaseEntry value = new DatabaseEntry(); crawlUriBinding.objectToEntry(curi, value); // Output tally on avg. size if level is FINE or greater. if (LOGGER.isLoggable(Level.FINE)) { tallyAverageEntrySize(curi, value); } OperationStatus status; if(overwriteIfPresent) { status = pendingUrisDB.put(null, insertKey, value); } else { status = pendingUrisDB.putNoOverwrite(null, insertKey, value); } if (status!=OperationStatus.SUCCESS) { LOGGER.log(Level.SEVERE,"URI enqueueing failed; "+status+ " "+curi, new RuntimeException()); } } private long entryCount = 0; private long entrySizeSum = 0; private int largestEntry = 0; /** * Log average size of database entry. * @param curi CrawlURI this entry is for. * @param value Database entry value. 
*/ private synchronized void tallyAverageEntrySize(CrawlURI curi, DatabaseEntry value) { entryCount++; int length = value.getData().length; entrySizeSum += length; int avg = (int) (entrySizeSum/entryCount); if(entryCount % 1000 == 0) { LOGGER.fine("Average entry size at "+entryCount+": "+avg); } if (length>largestEntry) { largestEntry = length; LOGGER.fine("Largest entry: "+length+" "+curi); if(length>(2*avg)) { LOGGER.fine("excessive?"); } } } /** * Calculate the 'origin' key for a virtual queue of items * with the given classKey. This origin key will be a * prefix of the keys for all items in the queue. * * @param classKey String key to derive origin byte key from * @return a byte array key */ protected static byte[] calculateOriginKey(String classKey) { byte[] classKeyBytes = null; int len = 0; try { classKeyBytes = classKey.getBytes("UTF-8"); len = classKeyBytes.length; } catch (UnsupportedEncodingException e) { // should be impossible; all JVMs must support UTF-8 e.printStackTrace(); } byte[] keyData = new byte[len+1]; System.arraycopy(classKeyBytes,0,keyData,0,len); keyData[len]=0; return keyData; } /** * Calculate the insertKey that places a CrawlURI in the * desired spot. First bytes are always classKey (usu. host) * based -- ensuring grouping by host -- terminated by a zero * byte. Then 8 bytes of data ensuring desired ordering * within that 'queue' are used. The first byte of these 8 is * priority -- allowing 'immediate' and 'soon' items to * sort above regular. Next 1 byte is 'precedence'. Last 6 bytes * are ordinal serial number, ensuring earlier-discovered * URIs sort before later. * * NOTE: Dangers here are: * (1) priorities or precedences over 2^7 (signed byte comparison) * (2) ordinals over 2^48 * * Package access & static for testing purposes. 
* * @param curi * @return a DatabaseEntry key for the CrawlURI */ protected static DatabaseEntry calculateInsertKey(CrawlURI curi) { byte[] classKeyBytes = null; int len = 0; classKeyBytes = curi.getClassKey().getBytes(Charsets.UTF_8); len = classKeyBytes.length; byte[] keyData = new byte[len+9]; System.arraycopy(classKeyBytes,0,keyData,0,len); keyData[len]=0; long ordinalPlus = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL; ordinalPlus = ((long)curi.getSchedulingDirective() << 56) | ordinalPlus; long precedence = Math.min(curi.getPrecedence(), 127); ordinalPlus = (((precedence) & 0xFFL) << 48) | ordinalPlus; ArchiveUtils.longIntoByteArray(ordinalPlus, keyData, len+1); return new DatabaseEntry(keyData); } protected static String insertKeyToString(DatabaseEntry holderKey) { StringBuilder result = new StringBuilder(); byte[] data = holderKey.getData(); int p = findFirstZero(data); result.append(new String(data, 0, p)); java.io.ByteArrayInputStream binp = new java.io.ByteArrayInputStream(data, p + 1, data.length); java.io.DataInputStream dinp = new java.io.DataInputStream(binp); long l = 0; try { l = dinp.readLong(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } result.append(" blah=").append(l); return result.toString(); } private static int findFirstZero(byte[] b) { for (int i = 0; i < b.length; i++) { if (b[i] == 0) { return i; } } return -1; } /** * Delete the given CrawlURI from persistent store. Requires * the key under which it was stored be available. * * @param item * @throws DatabaseException */ public void delete(CrawlURI item) throws DatabaseException { OperationStatus status; DatabaseEntry de = (DatabaseEntry)item.getHolderKey(); status = pendingUrisDB.delete(null, de); if (status != OperationStatus.SUCCESS) { LOGGER.severe("expected item not present: " + item + "(" + (new BigInteger(((DatabaseEntry) item.getHolderKey()) .getData())).toString(16) + ")"); } } /** * Method used by BdbFrontier during checkpointing. *

The backing bdbje database has been marked deferred write so we save * on writes to disk. Means no guarantees disk will have what's in memory * unless a sync is called (Calling sync on the bdbje Environment is not * sufficient). *

Package access only because only Frontiers of this package would ever * need access. * @see Deferred Write Databases */ protected void sync() { if (this.pendingUrisDB == null) { return; } try { this.pendingUrisDB.sync(); } catch (DatabaseException e) { e.printStackTrace(); } } /** * clean up * */ public void close() { /* try { this.pendingUrisDB.close(); } catch (DatabaseException e) { e.printStackTrace(); } */ } /** * Add a dummy 'cap' entry at the given insertion key. Prevents * 'seeks' to queue heads from holding lock on last item of * 'preceding' queue. See: * http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102 * * @param origin key at which to insert the cap */ public void addCap(byte[] origin) { try { pendingUrisDB.put(null, new DatabaseEntry(origin), new DatabaseEntry(new byte[0])); } catch (DatabaseException e) { throw new RuntimeException(e); } } /** * Utility method to perform action for all pending CrawlURI instances. * @param c Closure action to perform * @throws DatabaseException */ protected void forAllPendingDo(Closure c) throws DatabaseException { DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); Cursor cursor = pendingUrisDB.openCursor(null, null); while (cursor.getNext(key, value, null) == OperationStatus.SUCCESS) { if (value.getData().length == 0) { continue; } CrawlURI item = (CrawlURI) crawlUriBinding.entryToObject(value); c.execute(item); } cursor.close(); } /** * Run through all uris in the pending uris database and write them to the writer. 
* @param writer destination writer for writting all the uris * @return number of uris written to the writer */ public long exportPendingUris(PrintWriter writer) { if (this.pendingUrisDB == null) { return -6L; } sync(); DatabaseEntry key = new DatabaseEntry(); DatabaseEntry value = new DatabaseEntry(); long uris = 0L; Cursor cursor = pendingUrisDB.openCursor(null, null); while (cursor.getNext(key, value, null) == OperationStatus.SUCCESS) { if (value.getData().length == 0) { continue; } CrawlURI item = (CrawlURI) crawlUriBinding.entryToObject(value); writer.println(item.toString()); ++uris; } cursor.close(); return uris; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy