org.archive.crawler.prefetch.Preselector Maven / Gradle / Ivy
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.prefetch;
import static org.archive.modules.fetcher.FetchStatusCodes.*;
import org.archive.crawler.framework.Scoper;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.util.TextUtils;
/**
* If set to recheck the crawl's scope, gives a yes/no on whether
* a CrawlURI should be processed at all. If not, its status
* will be marked OUT_OF_SCOPE and the URI will skip directly
* to the first "postprocessor".
*
*
* @author gojomo
*
*/
public class Preselector extends Scoper {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
{
setRecheckScope(false);
}
public boolean getRecheckScope() {
return (Boolean) kp.get("recheckScope");
}
/**
* Recheck if uri is in scope. This is meaningful if the scope is altered
* during a crawl. URIs are checked against the scope when they are added to
* queues. Setting this value to true forces the URI to be checked against
* the scope when it is coming out of the queue, possibly after the scope
* is altered.
*/
public void setRecheckScope(boolean recheck) {
kp.put("recheckScope",recheck);
}
{
setBlockAll(false);
}
public boolean getBlockAll() {
return (Boolean) kp.get("blockAll");
}
/**
* Block all URIs from being processed. This is most likely to be used in
* overrides to easily reject certain hosts from being processed.
*/
public void setBlockAll(boolean recheck) {
kp.put("blockAll",recheck);
}
{
setBlockByRegex("");
}
public String getBlockByRegex() {
return (String) kp.get("blockByRegex");
}
/**
* Block all URIs matching the regular expression from being processed.
*/
public void setBlockByRegex(String regex) {
kp.put("blockByRegex",regex);
}
{
setAllowByRegex("");
}
public String getAllowByRegex() {
return (String) kp.get("allowByRegex");
}
/**
* Allow only URIs matching the regular expression to be processed.
*/
public void setAllowByRegex(String regex) {
kp.put("allowByRegex",regex);
}
/**
* Constructor.
*/
public Preselector() {
super();
}
@Override
protected boolean shouldProcess(CrawlURI puri) {
return puri instanceof CrawlURI;
}
@Override
protected void innerProcess(CrawlURI puri) {
throw new AssertionError();
}
@Override
protected ProcessResult innerProcessResult(CrawlURI puri) {
CrawlURI curi = (CrawlURI)puri;
// Check if uris should be blocked
if (getBlockAll()) {
curi.setFetchStatus(S_BLOCKED_BY_USER);
return ProcessResult.FINISH;
}
// Check if allowed by regular expression
String regex = getAllowByRegex();
if (regex != null && !regex.equals("")) {
if (!TextUtils.matches(regex, curi.toString())) {
curi.setFetchStatus(S_BLOCKED_BY_USER);
return ProcessResult.FINISH;
}
}
// Check if blocked by regular expression
regex = getBlockByRegex();
if (regex != null && !regex.equals("")) {
if (TextUtils.matches(regex, curi.toString())) {
curi.setFetchStatus(S_BLOCKED_BY_USER);
return ProcessResult.FINISH;
}
}
// Possibly recheck scope
if (getRecheckScope()) {
if (!isInScope(curi)) {
// Scope rejected
curi.setFetchStatus(S_OUT_OF_SCOPE);
return ProcessResult.FINISH;
}
}
return ProcessResult.PROCEED;
}
}