// Source listing: org.archive.crawler.frontier.precedence.HighestUriQueuePrecedencePolicy (Maven / Gradle / Ivy)
// The newest version.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.frontier.precedence;
import java.io.PrintWriter;
import org.archive.crawler.frontier.WorkQueue;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchStats.Stage;
import org.archive.util.Histotable;
/**
* QueuePrecedencePolicy that sets a uri-queue's precedence to that of the
* highest URI currently enqueued within itself, added to the configured
* base-precedence.
*
* It does this by maintaining a count of the included URIs at each
* URI-precedence, updated on each URI add or remove.
*/
public class HighestUriQueuePrecedencePolicy extends BaseQueuePrecedencePolicy {
    private static final long serialVersionUID = -8652293180921419601L;

    /**
     * Installs a fresh {@link HighestUriPrecedenceProvider}, seeded with this
     * policy's base precedence, on the given queue.
     *
     * @param wq the work queue that receives the new precedence provider
     * @see org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy#installProvider(org.archive.crawler.frontier.WorkQueue)
     */
    @Override
    protected void installProvider(WorkQueue wq) {
        // TODO:SPRINGY ensure proper override context installed for getBasePrecedence() below
        HighestUriPrecedenceProvider provider =
                new HighestUriPrecedenceProvider(getBasePrecedence());
        wq.setPrecedenceProvider(provider);
    }

    /**
     * Helper provider for maintaining the tracked distribution of included
     * URIs and calculating the queue precedence.
     *
     * All reads and writes of the count table are performed while holding
     * the table's own monitor.
     */
    public class HighestUriPrecedenceProvider extends SimplePrecedenceProvider {
        private static final long serialVersionUID = 5545297542888582745L;

        /**
         * Count of currently-enqueued URIs per URI-precedence value.
         *
         * Parameterized with Integer keys: the code below iterates the keys
         * as Integer and the values as Long, which a raw Histotable would
         * not permit.
         */
        protected Histotable<Integer> enqueuedCounts = new Histotable<Integer>();

        /**
         * @param base base precedence passed through to the superclass
         */
        public HighestUriPrecedenceProvider(int base) {
            super(base);
        }

        /**
         * Updates the per-precedence counts for a URI lifecycle event:
         * SCHEDULED adds one at the URI's precedence; SUCCEEDED, DISREGARDED
         * and FAILED subtract one; RETRIED changes nothing.
         *
         * @param curi the URI whose precedence selects the count to adjust
         * @param stage the lifecycle stage being reported
         * @see org.archive.crawler.frontier.precedence.PrecedenceProvider#tally(org.archive.modules.CrawlURI, org.archive.modules.fetcher.FetchStats.Stage)
         */
        @Override
        public void tally(CrawlURI curi, Stage stage) {
            switch (stage) {
            case SCHEDULED:
                // enqueued
                synchronized (enqueuedCounts) {
                    enqueuedCounts.tally(curi.getPrecedence());
                }
                break;
            case SUCCEEDED:
            case DISREGARDED:
            case FAILED:
                // dequeued
                synchronized (enqueuedCounts) {
                    enqueuedCounts.tally(curi.getPrecedence(), -1);
                }
                break;
            case RETRIED:
                // do nothing, already tallied
                break;
            default:
                // any other stage leaves the counts untouched
                break;
            }
        }

        /**
         * Returns the superclass precedence plus the first key of the count
         * table, or plus zero when the table is empty.
         *
         * NOTE(review): presumably the table is sorted ascending, so the
         * first key is the numerically lowest precedence present (what the
         * class comment calls the "highest" URI) -- confirm against
         * Histotable.
         *
         * @return the effective queue precedence
         * @see org.archive.crawler.frontier.precedence.SimplePrecedenceProvider#getPrecedence()
         */
        @Override
        public int getPrecedence() {
            // base plus highest URI still in queue
            synchronized (enqueuedCounts) {
                // primitive int: the value is only used arithmetically
                int delta = enqueuedCounts.isEmpty()
                        ? 0
                        : enqueuedCounts.firstKey();
                return super.getPrecedence() + delta;
            }
        }

        /**
         * Builds the legend as the superclass legend, a colon, and then
         * " p" followed by the key for every precedence in the count table.
         *
         * @return the legend string for the short report
         * @see org.archive.crawler.frontier.precedence.PrecedenceProvider#shortReportLegend()
         */
        @Override
        public String shortReportLegend() {
            StringBuilder sb = new StringBuilder();
            sb.append(super.shortReportLegend());
            sb.append(":");
            synchronized (enqueuedCounts) {
                for (Integer p : enqueuedCounts.keySet()) {
                    sb.append(" p");
                    sb.append(p);
                }
            }
            return sb.toString();
        }

        /**
         * Writes the count for each tracked precedence, separated by single
         * spaces, in the table's value-iteration order.
         *
         * NOTE(review): unlike the legend, this does not emit anything from
         * the superclass report line -- confirm that is intended.
         *
         * @param writer destination for the report line
         */
        @Override
        public void shortReportLineTo(PrintWriter writer) {
            boolean betwixt = false;
            synchronized (enqueuedCounts) {
                for (Long count : enqueuedCounts.values()) {
                    if (betwixt) {
                        writer.print(" ");
                    }
                    writer.print(count);
                    betwixt = true;
                }
            }
        }
    }
}