// org.archive.crawler.frontier.precedence.SuccessCountsQueuePrecedencePolicy Maven / Gradle / Ivy
// The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.frontier.precedence;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Transformer;
import org.archive.crawler.frontier.WorkQueue;
/**
 * QueuePrecedencePolicy that sets a uri-queue's precedence to a configured
 * base value, then lowers its precedence (i.e. raises the numeric precedence
 * value) with each tier of successful URIs completed. Any number of
 * comma-separated tier sizes may be provided, with the last value assumed to
 * repeat indefinitely. For example, with a base precedence of 2 and
 * {@code incrementCounts} of "100,1000", the queue will have a precedence of
 * 2 until 100 URIs are successfully fetched, then a precedence of 3 for the
 * next 1000 URIs successfully fetched, then continue to drop one precedence
 * rank (numeric value +1) for each further 1000 URIs successfully fetched.
 */
public class SuccessCountsQueuePrecedencePolicy extends BaseQueuePrecedencePolicy {
    private static final long serialVersionUID = -4469760728466350850L;

    // TODO: an earlier variant of this policy read the tier sizes from a
    // Key-typed INCREMENT_COUNTS setting (looked up via wq.get(this, ...))
    // instead of the String property below. It was disabled because it did
    // not work; the reason still needs to be determined.

    // Instance initializer: install the default tier sizes.
    {
        setIncrementCounts("100,1000");
    }

    /**
     * Returns the configured tier sizes.
     *
     * @return comma-separated list of success-counts at which precedence is
     *         bumped, as stored under the "incrementCounts" key
     */
    public String getIncrementCounts() {
        return (String) kp.get("incrementCounts");
    }

    /**
     * Sets the tier sizes. The value is stored as given and is only parsed
     * (and validated) when a precedence is calculated.
     *
     * @param counts comma-separated list of success-counts at which
     *        precedence is bumped; the last value repeats indefinitely
     */
    public void setIncrementCounts(String counts) {
        kp.put("incrementCounts", counts);
    }

    /**
     * Calculates the queue's precedence from its count of successful fetches:
     * the base precedence, plus one for every tier boundary the success count
     * has reached.
     *
     * @param wq queue whose fetch-success count determines the precedence
     * @return the base precedence plus the number of completed tiers
     * @throws NumberFormatException if a tier size is not a valid integer
     * @throws IllegalArgumentException if the tier list is null or empty, or
     *         contains a value that is not strictly positive
     */
    @Override
    protected int calculatePrecedence(WorkQueue wq) {
        // FIXME: it's ridiculously inefficient to do this every time,
        // and optimizing will probably require inserting stateful policy
        // helper object into WorkQueue -- expected when URI-precedence is
        // also supported
        int[] tiers = parseIncrementCounts(getIncrementCounts());
        long successes = wq.getSubstats().getFetchSuccesses();

        // Start one below the base: the loop always runs at least once
        // (successes >= 0), which brings a fresh queue to exactly the base.
        int precedence = getBasePrecedence() - 1;
        int tierIndex = 0;
        int increment = tiers[0];
        while (successes >= 0) {
            successes -= increment;
            precedence++;
            // advance to the next tier size; the last one repeats forever
            if (tierIndex < tiers.length - 1) {
                tierIndex++;
                increment = tiers[tierIndex];
            }
        }
        return precedence;
    }

    /**
     * Parses a comma-separated list of tier sizes. All values are parsed
     * eagerly so that a malformed list fails regardless of how many tiers
     * the current success count would actually consume.
     *
     * @param counts comma-separated positive integers; surrounding
     *        whitespace around each value is ignored
     * @return the tier sizes in declaration order; never empty
     * @throws NumberFormatException if a value is not a valid integer
     * @throws IllegalArgumentException if {@code counts} is null, contains
     *         no values, or contains a value that is not strictly positive
     */
    private static int[] parseIncrementCounts(String counts) {
        if (counts == null) {
            throw new IllegalArgumentException("incrementCounts must not be null");
        }
        String[] tokens = counts.split(",");
        if (tokens.length == 0) {
            throw new IllegalArgumentException(
                    "incrementCounts must contain at least one value: '" + counts + "'");
        }
        int[] tiers = new int[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            int value = Integer.parseInt(tokens[i].trim());
            if (value <= 0) {
                // a non-positive tier size would keep the success countdown
                // in calculatePrecedence from ever going negative
                throw new IllegalArgumentException(
                        "incrementCounts values must be positive, got " + value
                        + " in '" + counts + "'");
            }
            tiers[i] = value;
        }
        return tiers;
    }
}