org.archive.modules.deciderules.surt.SurtPrefixedDecideRule Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.deciderules.surt;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.io.ReadSource;
import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.PredicatedDecideRule;
import org.archive.modules.seeds.SeedListener;
import org.archive.modules.seeds.SeedModule;
import org.archive.net.UURI;
import org.archive.spring.ConfigFile;
import org.archive.util.SurtPrefixSet;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.event.ContextStartedEvent;
/**
* Rule applies configured decision to any URIs that, when
* expressed in SURT form, begin with one of the prefixes
* in the configured set.
*
* The set can be filled with SURT prefixes implied or
* listed in the seeds file, or another external file.
*
* The "also-check-via" option to implement "one hop off"
* scoping derives from a contribution by Shifra Raffel
* of the California Digital Library.
*
* @author gojomo
*/
public class SurtPrefixedDecideRule extends PredicatedDecideRule implements
SeedListener, ApplicationListener, Checkpointable,
BeanNameAware {
private static final long serialVersionUID = 3L;
private static final Logger logger =
Logger.getLogger(SurtPrefixedDecideRule.class.getName());
/**
 * Source file from which to infer SURT prefixes. Any URLs in file will be
 * converted to the implied SURT prefix, and literal SURT prefixes may be
 * listed on lines beginning with a '+' character.
 *
 * @return the configured surts source when it is a {@link ConfigFile};
 *         {@code null} when no source is set or the source is some other
 *         kind of {@link ReadSource}
 * @deprecated redundant now that we have
 * {@link SurtPrefixedDecideRule#surtsSource}
 */
@Deprecated
public ConfigFile getSurtsSourceFile() {
    // Read the property once; instanceof is false for null, so an unset
    // source falls through to the null result.
    ReadSource source = getSurtsSource();
    return (source instanceof ConfigFile) ? (ConfigFile) source : null;
}
/**
 * Sets the source of SURT prefixes from a file. Simply delegates to
 * {@link #setSurtsSource(ReadSource)}.
 *
 * @param cp file to use as the surts source
 * @deprecated use {@link #setSurtsSource(ReadSource)} instead
 */
@Deprecated
public void setSurtsSourceFile(ConfigFile cp) {
    setSurtsSource(cp);
}
/**
 * Text from which SURT prefixes are inferred. URLs found in the text are
 * converted to their implied SURT prefix; a line beginning with a '+'
 * character may list a literal SURT prefix. Unset ({@code null}) by
 * default.
 */
protected ReadSource surtsSource;

/**
 * @return the configured source of SURT prefix text, or {@code null} if
 *         none has been set
 */
public ReadSource getSurtsSource() {
    return this.surtsSource;
}

/**
 * @param surtsSource source of SURT prefix text; {@code null} leaves the
 *        rule without an external source
 */
public void setSurtsSource(ReadSource surtsSource) {
    this.surtsSource = surtsSource;
}
/**
 * Whether each seed should also be interpreted as a SURT prefix.
 * Defaults to {@code true}.
 */
protected boolean seedsAsSurtPrefixes = true;

/** @return {@code true} if seeds are also treated as SURT prefixes */
public boolean getSeedsAsSurtPrefixes() {
    return this.seedsAsSurtPrefixes;
}

/**
 * @param seedsAsSurtPrefixes {@code true} to have seeds contribute SURT
 *        prefixes, {@code false} to ignore them
 */
public void setSeedsAsSurtPrefixes(boolean seedsAsSurtPrefixes) {
    this.seedsAsSurtPrefixes = seedsAsSurtPrefixes;
}
/**
 * Dump file that receives the SURT prefixes actually in use; handy when
 * debugging SURTs.
 */
protected ConfigFile surtsDumpFile =
        new ConfigFile("surtsDumpFile", "${launchId}/surts.dump");

/** @return the dump file configuration */
public ConfigFile getSurtsDumpFile() {
    return this.surtsDumpFile;
}

/**
 * Applies the given configuration by merging it into the existing
 * {@link ConfigFile} instance (via {@code merge}) rather than replacing
 * the field.
 *
 * @param cp configuration to merge into the current dump file settings
 */
public void setSurtsDumpFile(ConfigFile cp) {
    surtsDumpFile.merge(cp);
}
/**
 * Whether to also make the configured decision if a URI's 'via' URI (the
 * URI from which it was discovered), in SURT form, begins with any of the
 * established prefixes. For example, this can be used to ACCEPT URIs that
 * are 'one hop off' from URIs fitting the SURT prefixes. Default is false.
 *
 * The value lives under the key "alsoCheckVia" in {@code kp} (declared
 * outside this file); this instance initializer stores the default.
 */
{
    setAlsoCheckVia(false);
}

/** @return the current "alsoCheckVia" setting */
public boolean getAlsoCheckVia() {
    Object stored = kp.get("alsoCheckVia");
    return (Boolean) stored;
}

/**
 * @param checkVia {@code true} to also test a URI's 'via' against the
 *        prefix set
 */
public void setAlsoCheckVia(boolean checkVia) {
    kp.put("alsoCheckVia", checkVia);
}
/** Seed module this rule is attached to; set through {@link #setSeeds}. */
protected SeedModule seeds;

/** @return the seed module last passed to {@link #setSeeds}, possibly null */
public SeedModule getSeeds() {
    return seeds;
}

/**
 * Stores the seed module and, when it is non-null, registers this rule
 * with it as a seed listener.
 *
 * @param seeds seed module to attach to; may be {@code null}
 */
@Autowired
public void setSeeds(SeedModule seeds) {
    this.seeds = seeds;
    if (seeds == null) {
        return;
    }
    // Explicit registration covers the case where this bean was not
    // autowired into the module's listeners (e.g. when declared as an
    // inner bean).
    seeds.addSeedListener(this);
}
/** The SURT prefixes that candidate URIs are tested against. */
protected SurtPrefixSet surtPrefixes = new SurtPrefixSet();

/**
 * No-argument constructor; does nothing beyond the field and instance
 * initializers declared in this class.
 */
public SurtPrefixedDecideRule() {
    super();
}
/**
 * Seed-batch-concluded notification: writes the prefixes currently in use
 * to the dump file by calling {@link #dumpSurtPrefixSet()}.
 */
public void concludedSeedBatch() {
    this.dumpSurtPrefixSet();
}
/**
 * Evaluate whether the given URI is covered by the SURT prefix set.
 * When the "alsoCheckVia" option is on, the URI's via is tested first and
 * a match there is sufficient.
 *
 * @param uri the URI under consideration
 * @return true if the item (or, with alsoCheckVia, its via), as a SURT
 *         form URI, is prefixed by an item in the set
 */
@Override
protected boolean evaluate(CrawlURI uri) {
    // Short-circuiting keeps the original order: via only when enabled,
    // and the URI itself only when the via did not already match.
    boolean viaMatches = getAlsoCheckVia() && innerDecide(uri.getVia());
    return viaMatches || innerDecide(uri.getUURI());
}
/**
 * Tests a single URI against the prefix set.
 *
 * @param uuri URI to test
 * @return {@code false} when {@code SurtPrefixSet.getCandidateSurt}
 *         yields no candidate SURT for the URI; otherwise whether the set
 *         contains a prefix of that candidate
 */
private boolean innerDecide(UURI uuri) {
    final String surt = SurtPrefixSet.getCandidateSurt(uuri);
    return surt != null && surtPrefixes.containsPrefixOf(surt);
}
/**
 * Populates the prefix set; currently a plain delegation to
 * {@link #buildSurtPrefixSet()}.
 */
protected void readPrefixes() {
    this.buildSurtPrefixSet();
}
/**
 * Dump the current prefixes in use to the configured dump file (if any).
 * Nothing is written when the dump file's configured path is empty.
 *
 * @throws RuntimeException wrapping the underlying {@link IOException}
 *         if the dump file cannot be opened, written or closed
 */
protected void dumpSurtPrefixSet() {
    ConfigFile dumpConfig = getSurtsDumpFile();
    if (StringUtils.isEmpty(dumpConfig.getPath())) {
        // no dump file configured: nothing to do
        return;
    }
    File dump = dumpConfig.getFile();
    try {
        FileWriter fw = new FileWriter(dump);
        try {
            surtPrefixes.exportTo(fw);
        } finally {
            // always release the file handle, even if the export failed
            fw.close();
        }
    } catch (IOException e) {
        // Report through the class logger instead of printing the stack
        // trace to stderr, then fail as before with the cause preserved.
        String message = "unable to dump SURT prefixes to " + dump;
        logger.log(Level.SEVERE, message, e);
        throw new RuntimeException(message, e);
    }
}
/**
 * Load prefixes from the configured surts source into the prefix set.
 * Does nothing when no surts source is configured. The reader obtained
 * from the source is always closed, quietly, once the import finishes or
 * fails.
 */
protected void buildSurtPrefixSet() {
    ReadSource source = getSurtsSource();
    if (source == null) {
        return;
    }
    if (logger.isLoggable(Level.FINE)) {
        logger.fine("reading surt prefixes from " + source);
    }
    Reader in = source.obtainReader();
    try {
        // NOTE(review): the 'true' flag presumably asks the set to deduce
        // prefixes from plain URLs as well; confirm against SurtPrefixSet.
        surtPrefixes.importFromMixed(in, true);
    } finally {
        IOUtils.closeQuietly(in);
    }
}
/**
 * Seed notification: when seeds are to be treated as SURT prefixes, adds
 * the prefix derived from the seed's URI to the prefix set; otherwise
 * the seed is ignored.
 *
 * @param curi the seed that was added
 * @see org.archive.modules.seeds.SeedListener#addedSeed(org.archive.modules.CrawlURI)
 */
public void addedSeed(final CrawlURI curi) {
    if (!getSeedsAsSurtPrefixes()) {
        return;
    }
    String prefix = prefixFrom(curi.getURI());
    surtPrefixes.add(prefix);
}
/**
 * Derives the SURT prefix for a plain URI string by delegating to
 * {@code SurtPrefixSet.prefixFromPlainForceHttp}.
 *
 * @param uri plain URI string
 * @return the value returned by {@code prefixFromPlainForceHttp}
 */
protected String prefixFrom(String uri) {
    final String prefix = SurtPrefixSet.prefixFromPlainForceHttp(uri);
    return prefix;
}
/** Line prefix marking an add-SURT directive when the decision is ACCEPT. */
private static final String DEFAULT_ACCEPT_ADD_PREFIX_DIRECTIVE =
        "+";

/** Line prefix marking an add-SURT directive when the decision is REJECT. */
private static final String DEFAULT_REJECT_ADD_PREFIX_DIRECTIVE =
        "-";
/**
 * Consider a nonseed line as a possible SURT prefix directive. If we are
 * ACCEPTing URIs, '+' is the expected add-SURT directive; if we are
 * REJECTing URIs, '-' is.
 *
 * TODO: perhaps, make directive configurable, so many SeedListener
 * SurtPrefixedDecideRules can all listen for directives meant only
 * for them.
 *
 * @param line the nonseed line
 * @return {@code false} if the line does not start with this rule's
 *         directive; otherwise the result of offering the remainder of
 *         the line to the prefix set as an add directive
 * @throws IllegalArgumentException if the configured decision is neither
 *         ACCEPT nor REJECT
 * @see org.archive.modules.seeds.SeedListener#nonseedLine(java.lang.String)
 */
public boolean nonseedLine(String line) {
    final String directive = getEffectiveAddDirective();
    if (!line.startsWith(directive)) {
        // not a line this instance is interested in
        return false;
    }
    String remainder = line.substring(directive.length());
    return surtPrefixes.considerAsAddDirective(remainder);
}
/**
 * Picks the directive prefix that matches the configured decision.
 *
 * @return the ACCEPT directive ("+") or the REJECT directive ("-")
 * @throws IllegalArgumentException if the decision is neither ACCEPT nor
 *         REJECT
 */
private String getEffectiveAddDirective() {
    if (getDecision() != DecideResult.ACCEPT) {
        if (getDecision() != DecideResult.REJECT) {
            throw new IllegalArgumentException("decision must be ACCEPT or REJECT");
        }
        return DEFAULT_REJECT_ADD_PREFIX_DIRECTIVE;
    }
    return DEFAULT_ACCEPT_ADD_PREFIX_DIRECTIVE;
}
/**
 * Reacts to {@link ContextStartedEvent}; every other event is ignored.
 * With a recovery checkpoint set, the prefixes are restored from the
 * "surtPrefixes" JSON array loaded under this bean's name; without one,
 * they are read afresh via {@link #readPrefixes()}.
 *
 * @param event the application event
 * @throws IllegalStateException if the checkpointed JSON cannot be read
 */
@Override
public void onApplicationEvent(ApplicationEvent event) {
    if (!(event instanceof ContextStartedEvent)) {
        return;
    }
    if (recoveryCheckpoint == null) {
        readPrefixes();
        return;
    }
    JSONObject json = recoveryCheckpoint.loadJson(beanName);
    try {
        JSONArray saved = json.getJSONArray("surtPrefixes");
        final int count = saved.length();
        for (int idx = 0; idx < count; idx++) {
            surtPrefixes.add(saved.getString(idx));
        }
    } catch (JSONException e) {
        throw new IllegalStateException(e);
    }
}
// BeanNameAware

/** Bean name; used as the key when saving and loading checkpoint JSON. */
protected String beanName;

/** @param name the name to record for this bean */
public void setBeanName(String name) {
    beanName = name;
}
/**
 * Checkpoint-start hook; this rule has nothing to prepare.
 *
 * @param checkpointInProgress the checkpoint being started (unused)
 */
@Override
public void startCheckpoint(Checkpoint checkpointInProgress) {
    // intentionally empty
}
/**
 * Saves the current prefix set into the checkpoint as JSON, under the key
 * "surtPrefixes", filed under this bean's name.
 *
 * @param checkpointInProgress checkpoint to save into
 * @throws IOException declared by the method signature; presumably raised
 *         by the save itself (confirm against Checkpoint.saveJson)
 * @throws RuntimeException wrapping any {@link JSONException}
 */
@Override
public void doCheckpoint(Checkpoint checkpointInProgress)
        throws IOException {
    final JSONObject state = new JSONObject();
    try {
        state.put("surtPrefixes", surtPrefixes);
        checkpointInProgress.saveJson(beanName, state);
    } catch (JSONException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Checkpoint-finish hook; this rule has nothing to clean up.
 *
 * @param checkpointInProgress the checkpoint being finished (unused)
 */
@Override
public void finishCheckpoint(Checkpoint checkpointInProgress) {
    // intentionally empty
}
/**
 * Checkpoint to recover from, if any. When non-null at context start,
 * prefixes are restored from it instead of being read from the source.
 */
protected Checkpoint recoveryCheckpoint;

/** @param recoveryCheckpoint checkpoint to restore state from */
@Override
public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
    this.recoveryCheckpoint = recoveryCheckpoint;
}
}//EOC