/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
/**
* Extracts links from fetched URIs. This class provides error handling
* for some common issues that occur when parsing document content. You
* almost certainly want to subclass {@link ContentExtractor} instead of
* this class.
*
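* <p>A minimal sketch of a direct subclass, for illustration only: the
* class name is hypothetical, {@code shouldProcess} is inherited from
* {@link org.archive.modules.Processor}, and the constants
* {@code LinkContext.NAVLINK_MISC} and {@code Hop.NAVLINK} are assumed
* from this package:
* <pre>{@code
* public class TrivialExtractor extends Extractor {
*     protected boolean shouldProcess(CrawlURI uri) {
*         return true; // consider every fetched URI
*     }
*     protected void extract(CrawlURI uri) {
*         // Hypothetical: report one fixed outlink for every URI.
*         addOutlink(uri, "http://example.com/", LinkContext.NAVLINK_MISC,
*                 Hop.NAVLINK);
*         numberOfLinksExtracted.incrementAndGet();
*     }
* }
* }</pre>
*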
* @author pjack
*/
public abstract class Extractor extends Processor {
/** Logger. */
private static final Logger LOGGER =
Logger.getLogger(Extractor.class.getName());
protected AtomicLong numberOfLinksExtracted = new AtomicLong(0);
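/**
* Fallback parameters used when no {@link ExtractorParameters} bean is
* wired in: at most 6000 outlinks per URI, no independent extraction,
* and no extraction from 404 responses.
*/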
public static final ExtractorParameters DEFAULT_PARAMETERS =
new ExtractorParameters() {
public int getMaxOutlinks() {
return 6000;
}
public boolean getExtractIndependently() {
return false;
}
public boolean getExtract404s() {
return false;
}
};
protected transient UriErrorLoggerModule loggerModule;
public UriErrorLoggerModule getLoggerModule() {
return this.loggerModule;
}
@Autowired
public void setLoggerModule(UriErrorLoggerModule loggerModule) {
this.loggerModule = loggerModule;
}
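// Instance initializer: start with the built-in defaults; Spring may
// replace them later via the optional autowired setter below.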
{
setExtractorParameters(DEFAULT_PARAMETERS);
}
protected transient ExtractorParameters extractorParameters;
public ExtractorParameters getExtractorParameters() {
return extractorParameters;
}
@Autowired(required=false)
public void setExtractorParameters(ExtractorParameters helper) {
this.extractorParameters = helper;
}
/**
* Processes the given URI. This method just delegates to
* {@link #extract(CrawlURI)}, catching a few usually non-fatal
* problems (NullPointerException, StackOverflowError,
* CoderMalfunctionError) so they can be highlighted in the
* relevant log(s) rather than propagating.
*
* Notably, StackOverflowError is caught here, as it seems to
* happen often when dealing with document parsing APIs.
*
* @param uri the URI to extract links from
*/
protected final void innerProcess(CrawlURI uri)
throws InterruptedException {
try {
extract(uri);
} catch (NullPointerException npe) {
handleException(uri, npe);
} catch (StackOverflowError soe) {
handleException(uri, soe);
} catch (java.nio.charset.CoderMalfunctionError cme) {
// See http://sourceforge.net/tracker/index.php?func=detail&aid=1540222&group_id=73833&atid=539099
handleException(uri, cme);
}
}
private void handleException(CrawlURI uri, Throwable t) {
// both annotate (to highlight in crawl log) & add as local-error
uri.getAnnotations().add("err=" + t.getClass().getName());
uri.getNonFatalFailures().add(t);
// also log as INFO
// TODO: remove as redundant, given nonfatal logging?
LOGGER.log(Level.INFO, "Exception", t);
}
/**
* Extracts links from the given URI. Subclasses should use
* {@link CrawlURI#getRecorder()} to process the content of the
* URI. Any links that are discovered should be added to the
* {@link CrawlURI#getOutLinks()} set, typically via the
* {@code addOutlink} helpers, which handle URI resolution and
* error logging.
*
* @param uri the URI to extract links from
*/
protected abstract void extract(CrawlURI uri);
/**
* Creates and adds an out-link to the CrawlURI with the given
* URI, context, and hop type, logging any URIException instead
* of propagating it.
*
* @param curi the source CrawlURI whose out-links are extended
* @param uri the discovered URI, possibly relative to curi
* @param context the link context in which the URI was found
* @param hop the hop type of the discovered link
*/
protected void addOutlink(CrawlURI curi, String uri, LinkContext context,
Hop hop) {
try {
UURI dest = UURIFactory.getInstance(curi.getUURI(), uri);
CrawlURI link = curi.createCrawlURI(dest, context, hop);
curi.getOutLinks().add(link);
} catch (URIException e) {
logUriError(e, curi.getUURI(), uri);
}
}
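/**
* Creates and adds an out-link for an already-parsed UURI, logging any
* URIException instead of propagating it.
*/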
protected void addOutlink(CrawlURI curi, UURI uuri, LinkContext context,
Hop hop) {
try {
CrawlURI link = curi.createCrawlURI(uuri, context, hop);
curi.getOutLinks().add(link);
} catch (URIException e) {
logUriError(e, curi.getUURI(), uuri.toString());
}
}
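/**
* Logs a problematic URI via the configured {@link UriErrorLoggerModule}.
*
* @param e the URIException raised while handling the outlink
* @param uuri the source URI
* @param l the problematic outlink as found in the document
*/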
public void logUriError(URIException e, UURI uuri,
CharSequence l) {
loggerModule.logUriError(e, uuri, l);
}
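// Checkpointing: persist and restore the extracted-link counter so the
// tally in report() survives crawl checkpoints.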
@Override
protected JSONObject toCheckpointJson() throws JSONException {
JSONObject json = super.toCheckpointJson();
json.put("numberOfLinksExtracted", numberOfLinksExtracted.get());
return json;
}
@Override
protected void fromCheckpointJson(JSONObject json) throws JSONException {
super.fromCheckpointJson(json);
numberOfLinksExtracted.set(json.getLong("numberOfLinksExtracted"));
}
public String report() {
StringBuilder ret = new StringBuilder();
ret.append(super.report());
ret.append("  ").append(numberOfLinksExtracted.get());
ret.append(" links from ").append(getURICount()).append(" CrawlURIs\n");
return ret.toString();
}
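/**
* Adds a discovered outlink, resolving the given (possibly relative) URI
* against the CrawlURI's base URI, unless the max-outlinks limit has
* already been reached.
*/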
public static CrawlURI addRelativeToBase(CrawlURI uri, int max,
String newUri, LinkContext context, Hop hop) throws URIException {
UURI dest = UURIFactory.getInstance(uri.getBaseURI(), newUri);
return add2(uri, max, dest, context, hop);
}
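/**
* Adds a discovered outlink, resolving the given (possibly relative) URI
* against the CrawlURI's via URI; when no via is available, the base URI
* is used instead and the URI is annotated "usedBaseForVia" (once).
*/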
public static CrawlURI addRelativeToVia(CrawlURI uri, int max, String newUri,
LinkContext context, Hop hop) throws URIException {
UURI relTo = uri.getVia();
if (relTo == null) {
if (!uri.getAnnotations().contains("usedBaseForVia")) {
LOGGER.info("no via where expected; using base instead: " + uri);
uri.getAnnotations().add("usedBaseForVia");
}
relTo = uri.getBaseURI();
}
UURI dest = UURIFactory.getInstance(relTo, newUri);
return add2(uri, max, dest, context, hop);
}
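/**
* Adds a discovered outlink, treating the given URI as absolute,
* unless the max-outlinks limit has already been reached.
*/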
public static void add(CrawlURI uri, int max, String newUri,
LinkContext context, Hop hop) throws URIException {
UURI dest = UURIFactory.getInstance(newUri);
add2(uri, max, dest, context, hop);
}
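/**
* Shared helper: creates and adds the CrawlURI if the outlink count is
* still below max; otherwise counts it as a discarded outlink and
* returns null.
*/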
private static CrawlURI add2(CrawlURI curi, int max, UURI dest,
LinkContext context, Hop hop) throws URIException {
if (curi.getOutLinks().size() < max) {
CrawlURI link = curi.createCrawlURI(dest, context, hop);
curi.getOutLinks().add(link);
return link;
} else {
curi.incrementDiscardedOutLinks();
return null;
}
}
}