All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.crawler.frontier.WorkQueue Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.frontier;
 
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.framework.Frontier;
import org.archive.crawler.frontier.precedence.PrecedenceProvider;
import org.archive.crawler.frontier.precedence.SimplePrecedenceProvider;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchStats;
import org.archive.modules.fetcher.FetchStats.Stage;
import org.archive.util.ArchiveUtils;
import org.archive.util.IdentityCacheable;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ReportUtils;
import org.archive.util.Reporter;

/**
 * A single queue of related URIs to visit, grouped by a classKey
 * (typically "hostname:port" or similar) 
 * 
 * @author gojomo
 * @author Christian Kohlschuetter 
 */
public abstract class WorkQueue implements Frontier.FrontierGroup,
        Serializable, Reporter, Delayed, IdentityCacheable {
    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = -3199666138837266341L;
    /** Logger named after this class. */
    private static final Logger logger =
        Logger.getLogger(WorkQueue.class.getName());

    /** The classKey identifying this queue; fixed at construction. */
    protected final String classKey;

    /** whether queue is active (ready/in-process/snoozed) or on a waiting queue */
    protected boolean active = false;

    /** Total number of stored items */
    protected long count = 0;

    /** Total number of items ever enqueued */
    protected long enqueueCount = 0;

    /** Whether queue is already in lifecycle stage */
    protected boolean isManaged = false;

    /** Time to wake, if snoozed; 0 is treated by the reports as 'no wake time' */
    protected long wakeTime = 0;

    /** assigned precedence */
    protected PrecedenceProvider precedenceProvider = new SimplePrecedenceProvider(1);

    /** Per-session 'budget' controlling activity duration */
    protected int sessionBudget = 0;

    /** Cost of the last item to be charged against queue */
    protected int lastCost = 0;

    /** Total number of items charged against queue; with totalExpenditure
     * can be used to calculate 'average cost'. */
    protected long costCount = 0;

    /** Running tally of total expenditures on this queue */
    protected long totalExpenditure = 0;

    /** Record of expenditures at last activation (session start) */
    protected long expenditureAtLastActivation = 0;

    /** Total to spend on this queue over its lifetime */
    protected long totalBudget = 0;

    /** The next item to be returned; transient, so not serialized */
    transient protected CrawlURI peekItem = null;

    /** Last URI enqueued (its toString() form) */
    protected String lastQueued;

    /** Last URI peeked (its toString() form) */
    protected String lastPeeked;

    /** time of last dequeue (disposition of some URI), in epoch milliseconds */
    protected long lastDequeueTime;

    /** count of errors encountered */
    protected long errorCount = 0;

    /** Substats for all CrawlURIs in this group */
    protected FetchStats substats = new FetchStats();

    /** Retired status of this queue; see setRetired/isRetired */
    protected boolean retired;

    /**
     * Create a queue identified by the given classKey.
     *
     * @param pClassKey value stored as this queue's classKey
     */
    public WorkQueue(final String pClassKey) {
        classKey = pClassKey;
    }

    /**
     * Delete URIs matching the given pattern from this queue and reduce
     * the stored-item count by the number removed.
     *
     * @param frontier Work queues manager.
     * @param match the pattern to match
     * @return count of deleted URIs
     * @throws RuntimeException wrapping any IOException raised while deleting
     */
    public synchronized long deleteMatching(final WorkQueueFrontier frontier, String match) {
        final long removed;
        try {
            removed = deleteMatchingFromQueue(frontier, match);
        } catch (IOException ioe) {
            //FIXME better exception handling
            ioe.printStackTrace();
            throw new RuntimeException(ioe);
        }
        // only reached when the delete succeeded
        count -= removed;
        return removed;
    }

    /**
     * Insert the given CrawlURI (without overwriting) and bump both the
     * stored-item count and the lifetime enqueue count. The counters are
     * left untouched when the insert fails.
     *
     * @param frontier Work queues manager.
     * @param curi CrawlURI to insert.
     * @return the stored-item count after this insertion
     * @throws RuntimeException wrapping any IOException raised while inserting
     */
    protected synchronized long enqueue(final WorkQueueFrontier frontier,
        CrawlURI curi) {
        try {
            insert(frontier, curi, false);
        } catch (IOException ioe) {
            //FIXME better exception handling
            ioe.printStackTrace();
            throw new RuntimeException(ioe);
        }
        enqueueCount++;
        return ++count;
    }

    /**
     * Return the topmost queue item -- and remember it,
     * such that even later higher-priority inserts don't
     * change it.
     *
     * A remembered item is returned as-is. Otherwise, when the queue's
     * count is positive, the item is fetched via {@code peekItem(frontier)}
     * and its string form recorded in {@code lastPeeked}. An IOException
     * during that fetch is logged at SEVERE and not rethrown, so the
     * method then returns null.
     *
     * TODO: evaluate if this is really necessary
     * @param frontier Work queues manager
     *
     * @return topmost queue item, or null
     */
    public synchronized CrawlURI peek(final WorkQueueFrontier frontier) {
        if (peekItem != null || count <= 0) {
            // already remembered, or nothing stored to look at
            return peekItem;
        }
        try {
            peekItem = peekItem(frontier);
        } catch (IOException e) {
            //FIXME better exception handling
            // the logger call records the stack trace; no separate
            // printStackTrace() dump is needed
            logger.log(Level.SEVERE,"peek failure",e);
        }
        if (peekItem != null) {
            lastPeeked = peekItem.toString();
        }
        return peekItem;
    }

    /**
     * Remove the peekItem from the queue, forget the peek, decrement the
     * count and record the current time as the last dequeue time.
     *
     * @param frontier  Work queues manager.
     * @param expected  the item the caller believes is the current peekItem;
     * handed to {@code unpeek}, which asserts the identity
     * @throws RuntimeException wrapping any IOException raised while deleting
     */
    protected synchronized void dequeue(final WorkQueueFrontier frontier, CrawlURI expected) {
        try {
            deleteItem(frontier, peekItem);
        } catch (IOException ioe) {
            //FIXME better exception handling
            ioe.printStackTrace();
            throw new RuntimeException(ioe);
        }
        unpeek(expected);
        count -= 1;
        lastDequeueTime = System.currentTimeMillis();
    }

    /**
     * Set the session 'activity budget' to the given value. Automatically
     * reset continually as new CrawlURIs are enqueued; a direct change
     * here by operator will not persist. Instead, change the
     * 'balanceReplenishAmount' (or overlay its value with a
     * URI/queue-specific value) to affect this value.
     *
     * @param budget new session budget
     */
    protected void setSessionBudget(int budget) {
        sessionBudget = budget;
    }

    /**
     * Return the session budget as last set. This is the configured
     * budget itself, not the remaining balance.
     *
     * @return session budget
     */
    public int getSessionBudget() {
        return sessionBudget;
    }

    /**
     * Begin an 'active' session, which begins when a queue first offers a
     * URI for crawling, and continues until it is deactivated (for example,
     * for session-budget reasons). Does nothing when the queue is already
     * active; otherwise marks it active and snapshots the current total
     * expenditure as the session's starting point.
     */
    public synchronized void considerActive() {
        if (!active) {
            active = true;
            expenditureAtLastActivation = totalExpenditure;
        }
    }
    
    /**
     * Set the total expenditure level allowable before queue is
     * considered inherently 'over-budget'.
     *
     * Automatically reset continually as new CrawlURIs are enqueued; a
     * direct change here by operator will not persist. Instead, change the
     * 'queueTotalBudget' (or overlay its value with a URI/queue-specific
     * value) to affect this value.
     *
     * @param budget new lifetime budget
     */
    protected void setTotalBudget(long budget) {
        totalBudget = budget;
    }

    /**
     * Check whether queue has temporarily (session) exceeded its budget.
     * A session budget of zero or less is never considered exceeded.
     *
     * @return true if the expenditure since the last activation is
     * strictly greater than a positive session budget
     */
    public boolean isOverSessionBudget() {
        if (sessionBudget <= 0) {
            return false;
        }
        final long spentThisSession = totalExpenditure - expenditureAtLastActivation;
        return spentThisSession > sessionBudget;
    }

    /**
     * Check whether queue has permanently (total) exceeded its budget.
     * A negative total budget is never considered exceeded.
     *
     * @return true if the total budget is non-negative and the total
     * expenditure has reached or passed it
     */
    public boolean isOverTotalBudget() {
        if (totalBudget < 0) {
            return false;
        }
        return totalExpenditure >= totalBudget;
    }
    
    /**
     * Return the running tally of all expenditures on this queue.
     *
     * @return total amount expended on this queue
     */
    public long getTotalExpenditure() {
        return this.totalExpenditure;
    }

    /**
     * Charge the given amount against this queue by adding it to the
     * total expenditure. A non-negative amount is recorded as the last
     * cost and increments the count of charged items; a negative amount
     * (a 'refund'/undo) decrements that count and leaves the last cost
     * unchanged.
     *
     * @param amount amount to charge; negative to refund
     */
    public void expend(int amount) {
        totalExpenditure += amount;
        if (amount < 0) {
            costCount--;
            return;
        }
        lastCost = amount;
        costCount++;
    }

    
    /**
     * Note an error and assess an extra penalty. The penalty is added to
     * the total expenditure; the count of charged items is not changed.
     *
     * @param penalty additional amount to charge
     */
    public void noteError(int penalty) {
        errorCount += 1;
        totalExpenditure += penalty;
    }
    
    /**
     * Set the time at which this queue should wake.
     *
     * @param l new wake time; compared against
     * {@code System.currentTimeMillis()} elsewhere in this class
     */
    public void setWakeTime(long l) {
        this.wakeTime = l;
    }

    /**
     * Return the time at which this queue should wake.
     *
     * @return wakeTime
     */
    public long getWakeTime() {
        return this.wakeTime;
    }

    /**
     * Return the identifier of this queue.
     *
     * @return classKey, the 'identifier', for this queue.
     */
    public String getClassKey() {
        return classKey;
    }

    /**
     * Forgive the peek, allowing a subsequent peek to
     * return a different item.
     *
     * @param expected the item the caller believes is currently remembered;
     * checked by identity, and only when assertions are enabled
     */
    public synchronized void unpeek(CrawlURI expected) {
        assert expected == peekItem : "unexpected peekItem";
        this.peekItem = null;
    }

    /**
     * Return the time remaining until this queue's wake time, converted
     * from milliseconds to the requested unit. The value is negative once
     * the wake time has passed.
     *
     * @param unit unit of the returned delay
     * @return remaining delay in the given unit
     * @see java.util.concurrent.Delayed#getDelay(java.util.concurrent.TimeUnit)
     */
    @Override
    public long getDelay(TimeUnit unit) {
        final long remainingMillis = getWakeTime() - System.currentTimeMillis();
        return unit.convert(remainingMillis, TimeUnit.MILLISECONDS);
    }

    /**
     * Order queues by wake time, earliest first, falling back to classKey
     * order for equal wake times. Returns 0 immediately for the very same
     * instance.
     *
     * @param obj the other queue; cast to WorkQueue without a type check
     * @return negative, zero or positive per the ordering above
     */
    @Override
    public final int compareTo(Delayed obj) {
        if (obj == this) {
            return 0; // for exact identity only
        }
        final WorkQueue other = (WorkQueue) obj;
        final long mine = getWakeTime();
        final long theirs = other.getWakeTime();
        if (mine != theirs) {
            return mine > theirs ? 1 : -1;
        }
        // equal wake times: the ordering is arbitrary, but still
        // must be consistent/stable over time
        return classKey.compareTo(other.getClassKey());
    }

    /**
     * Update the given CrawlURI, which should already be present. (This
     * is not checked.) Performs an overwriting insert and leaves the
     * counts untouched.
     *
     * @param frontier Work queues manager.
     * @param curi CrawlURI to update.
     * @throws RuntimeException wrapping any IOException raised while inserting
     */
    protected void update(final WorkQueueFrontier frontier, CrawlURI curi) {
        try {
            insert(frontier, curi, true);
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /**
     * Count of URIs in this queue, read while holding this queue's monitor.
     *
     * @return Returns the count.
     */
    public synchronized long getCount() {
        return count;
    }

    /**
     * Insert the given curi through the subclass hook, then remember its
     * string form as the last URI queued. Nothing is remembered when the
     * hook throws.
     *
     * @param frontier WorkQueueFrontier.
     * @param curi CrawlURI to insert.
     * @param overwriteIfPresent passed through unchanged to the hook
     * @throws IOException if the hook reports a problem
     */
    private void insert(final WorkQueueFrontier frontier, CrawlURI curi,
            boolean overwriteIfPresent)
        throws IOException {
        this.insertItem(frontier, curi, overwriteIfPresent);
        this.lastQueued = curi.toString();
    }

    /**
     * Insert the given curi, whether it is already present or not.
     * Hook for subclasses; called by this class for both enqueue
     * (overwriteIfPresent false) and update (overwriteIfPresent true).
     * 
     * @param frontier WorkQueueFrontier.
     * @param curi CrawlURI to insert.
     * @param overwriteIfPresent flag supplied by the caller; its exact
     * effect is defined by the implementing subclass
     * @throws IOException  if there was a problem while inserting the item
     */
    protected abstract void insertItem(final WorkQueueFrontier frontier,
        CrawlURI curi, boolean overwriteIfPresent) throws IOException;

    /**
     * Delete URIs matching the given pattern from this queue. 
     * Hook for subclasses; the returned count is subtracted from this
     * queue's stored-item count by deleteMatching.
     * 
     * @param frontier WorkQueues manager.
     * @param match  the pattern to match
     * @return count of deleted URIs
     * @throws IOException  if there was a problem while deleting
     */
    protected abstract long deleteMatchingFromQueue(
        final WorkQueueFrontier frontier, final String match)
        throws IOException;

    /**
     * Removes the given item from the queue.
     * 
     * This is only used to remove the first item in the queue,
     * so it is not necessary to implement a random-access queue.
     * 
     * @param frontier  Work queues manager.
     * @param item  the item to remove; dequeue passes the remembered peekItem
     * @throws IOException  if there was a problem while deleting the item
     */
    protected abstract void deleteItem(final WorkQueueFrontier frontier,
        final CrawlURI item) throws IOException;

    /**
     * Returns first item from queue (does not delete)
     * 
     * @param frontier  Work queues manager.
     * @return The peeked item, or null
     * @throws IOException  if there was a problem while peeking
     */
    protected abstract CrawlURI peekItem(final WorkQueueFrontier frontier)
        throws IOException;

    // 
    // Reporter
    //

    /**
     * Build a map summarizing this queue's state, with entries in a fixed
     * insertion order: queueName, precedence, itemCount, enqueueCount,
     * sessionBalance, lastCost, averageCost, lastDequeueTime, wakeTime,
     * totalExpenditure, totalBudget, errorCount, lastPeeked, lastQueued.
     *
     * The two time entries hold a Date, or null when the underlying
     * timestamp is 0.
     *
     * @return newly created map of report values
     */
    @Override
    public synchronized Map shortReportMap() {
        Map<String, Object> map = new LinkedHashMap<String, Object>();

        map.put("queueName", classKey);
        map.put("precedence", getPrecedence());
        map.put("itemCount", count);
        map.put("enqueueCount", enqueueCount);
        map.put("sessionBalance", getSessionBalance());
        map.put("lastCost", lastCost);
        // floating-point division: yields NaN or an infinity while
        // costCount is still 0
        map.put("averageCost", (double) totalExpenditure / costCount);
        map.put("lastDequeueTime",
                lastDequeueTime != 0 ? new Date(lastDequeueTime) : null);
        // the wake time gets its own key, so it no longer overwrites
        // the lastDequeueTime entry
        map.put("wakeTime", wakeTime != 0 ? new Date(wakeTime) : null);
        map.put("totalExpenditure", totalExpenditure);
        map.put("totalBudget", totalBudget);
        map.put("errorCount", errorCount);
        map.put("lastPeeked", lastPeeked);
        map.put("lastQueued", lastQueued);

        return map;
    }

    /**
     * Return what remains of the session budget: the budget minus the
     * expenditure since the last activation. May be negative.
     *
     * @return remaining session balance
     */
    protected long getSessionBalance() {
        final long spentThisSession = totalExpenditure - expenditureAtLastActivation;
        return sessionBudget - spentThisSession;
    }

    /**
     * Write a single space-separated, newline-terminated summary line for
     * this queue. Field order matches {@code shortReportLegend()}.
     *
     * @param writer destination for the line
     */
    @Override
    public synchronized void shortReportLineTo(PrintWriter writer) {
        final String sep = " ";
        // queue name
        writer.print(classKey);
        writer.print(sep);
        // precedence
        writer.print(getPrecedence());
        writer.print(sep);
        // count of items
        writer.print(count);
        writer.print(sep);
        // enqueue count
        writer.print(enqueueCount);
        writer.print(sep);
        // session balance
        writer.print(getSessionBalance());
        writer.print(sep);
        // last cost, followed by the average cost in parentheses
        writer.print(lastCost);
        writer.print("(");
        final double averageCost = (double) totalExpenditure / costCount;
        writer.print(ArchiveUtils.doubleToString(averageCost, 1));
        writer.print(")");
        writer.print(sep);
        // last dequeue time, if any, or '-'
        if (lastDequeueTime == 0) {
            writer.print("-");
        } else {
            writer.print(ArchiveUtils.getLog17Date(lastDequeueTime));
        }
        writer.print(sep);
        // time remaining until wake if a wake time is set, or '-'
        if (wakeTime == 0) {
            writer.print("-");
        } else {
            writer.print(ArchiveUtils.formatMillisecondsToConventional(wakeTime - System.currentTimeMillis()));
        }
        writer.print(sep);
        // total spend / total budget
        writer.print(totalExpenditure);
        writer.print("/");
        writer.print(totalBudget);
        writer.print(sep);
        // error count
        writer.print(errorCount);
        writer.print(sep);
        // last peeked and last queued URIs
        writer.print(lastPeeked);
        writer.print(sep);
        writer.print(lastQueued);
        writer.print("\n");
    }

    /**
     * Return the column legend for the line written by
     * {@code shortReportLineTo}.
     *
     * @return space-separated column names
     */
    @Override
    public String shortReportLegend() {
        return "queue precedence currentSize totalEnqueues sessionBalance "
                + "lastCost (averageCost) lastDequeueTime wakeTime "
                + "totalSpend/totalBudget errorCount lastPeekUri lastQueuedUri";
    }
    
    /**
     * Return this queue's short report line as a String, produced by
     * {@code ReportUtils.shortReportLine}.
     *
     * @return the short report line
     */
    public String shortReportLine() {
        final String line = ReportUtils.shortReportLine(this);
        return line;
    }
    
    /**
     * Write a multi-line report on this queue: name and precedence, item
     * count, time until wake (only when a wake time is set), last
     * enqueued and peeked URIs, expenditure and budget figures, followed
     * by the short reports of the substats and the precedence provider.
     *
     * @param writer destination for the report
     */
    @Override
    public synchronized void reportTo(PrintWriter writer) {
        // header: queue name and precedence
        writer.print("Queue ");
        writer.print(classKey);
        writer.print(" (p");
        writer.print(getPrecedence());
        writer.print(")\n");
        // item count
        writer.print("  ");
        writer.print(count);
        writer.print(" items");
        if (wakeTime != 0) {
            final long untilWake = wakeTime - System.currentTimeMillis();
            writer.print("\n   wakes in: "+ArchiveUtils.formatMillisecondsToConventional(untilWake));
        }
        // most recent URIs seen
        writer.print("\n    last enqueued: ");
        writer.print(lastQueued);
        writer.print("\n      last peeked: ");
        writer.print(lastPeeked);
        writer.print("\n");
        // expenditure versus budgets
        writer.print("   total expended: ");
        writer.print(totalExpenditure);
        writer.print(" (total budget: ");
        writer.print(totalBudget);
        writer.print(")\n");
        writer.print("   active balance: ");
        writer.print(getSessionBalance());
        writer.print("\n   last(avg) cost: ");
        writer.print(lastCost);
        writer.print("(");
        final double averageCost = (double) totalExpenditure / costCount;
        writer.print(ArchiveUtils.doubleToString(averageCost, 1));
        writer.print(")\n   ");
        // substats: legend then values
        writer.print(getSubstats().shortReportLegend());
        writer.print("\n   ");
        writer.print(getSubstats().shortReportLine());
        writer.print("\n   ");
        // precedence provider: legend then values
        writer.print(getPrecedenceProvider().shortReportLegend());
        writer.print("\n   ");
        writer.print(getPrecedenceProvider().shortReportLine());
        writer.print("\n\n");
    }
    
    /**
     * Return the fetch statistics kept for this queue.
     *
     * @return the substats
     */
    public FetchStats getSubstats() {
        return this.substats;
    }

    /**
     * Set the retired status of this queue.
     *
     * @param b new value for retired status
     */
    protected void setRetired(boolean b) {
        retired = b;
    }
    
    /**
     * Return the retired status of this queue.
     *
     * @return the retired status as last set
     */
    public boolean isRetired() {
        return this.retired;
    }

    /**
     * Return the provider that supplies this queue's precedence.
     *
     * @return the precedenceProvider
     */
    public PrecedenceProvider getPrecedenceProvider() {
        return this.precedenceProvider;
    }

    /**
     * Replace the provider that supplies this queue's precedence.
     *
     * @param precedenceProvider the precedenceProvider to set
     */
    public void setPrecedenceProvider(PrecedenceProvider precedenceProvider) {
        final PrecedenceProvider replacement = precedenceProvider;
        this.precedenceProvider = replacement;
    }
    
    /**
     * Return the precedence reported by the current precedence provider.
     *
     * @return the precedence
     */
    public int getPrecedence() {
        final int precedence = precedenceProvider.getPrecedence();
        return precedence;
    }

    /**
     * Tally the given CrawlURI at the given stage, first with this
     * queue's substats and then with its precedence provider.
     *
     * @param curi CrawlURI to tally
     * @param stage stage at which the URI is being tallied
     * @see org.archive.modules.fetcher.FetchStats.HasFetchStats#tally(org.archive.modules.CrawlURI, org.archive.modules.fetcher.FetchStats.Stage)
     */
    public void tally(CrawlURI curi, Stage stage) {
        this.substats.tally(curi, stage);
        this.precedenceProvider.tally(curi, stage);
    }

    /**
     * Update queue state to recognize it has been sent to one of the
     * inactive (by-precedence) queues, waiting for a turn: the queue is
     * marked managed but not active, then flagged dirty.
     */
    public synchronized void noteDeactivated() {
        isManaged = true;
        active = false;
        makeDirty();
    }
    
    /**
     * Update queue state to recognize it has been completely exhausted,
     * and is no longer on any of the ready/inactive queues-of-queues:
     * the queue is marked neither active nor managed, then flagged dirty.
     */
    public synchronized void noteExhausted() {
        isManaged = false;
        active = false;
        makeDirty();
    }

    /**
     * Whether the queue is already in a lifecycle stage --
     * such as ready, in-progress, snoozed -- and thus should
     * not be redundantly inserted to readyClassQueues
     *
     * @return isManaged
     */
    public boolean isManaged() {
        return this.isManaged;
    }
    
    /**
     * Return the default Object string form followed by this queue's
     * classKey in parentheses.
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder(super.toString());
        text.append("(").append(getClassKey()).append(")");
        return text.toString();
    }

    
    //
    // IdentityCacheable support
    //
    /**
     * Cache assigned through setIdentityCache and notified by makeDirty.
     * Transient, so it is not serialized with the queue and stays null
     * until setIdentityCache is called.
     */
    transient private ObjectIdentityCache cache;
    /**
     * Return the key identifying this queue to its identity cache, which
     * is the queue's classKey.
     *
     * @return the classKey
     */
    @Override
    public String getKey() {
        final String key = getClassKey();
        return key;
    }

    /**
     * Tell the identity cache that the entry under this queue's key has
     * changed. No null check is made on the cache.
     */
    @Override
    public void makeDirty() {
        this.cache.dirtyKey(this.getKey());
    }

    /**
     * Remember the identity cache that makeDirty should notify.
     *
     * @param cache the cache to remember
     */
    @Override
    public void setIdentityCache(ObjectIdentityCache cache) {
        final ObjectIdentityCache assigned = cache;
        this.cache = assigned;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy