// org.archive.modules.extractor.ExtractorMultipleRegex Maven / Gradle / Ivy
// Show all versions of heritrix-modules Show documentation
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import groovy.text.SimpleTemplateEngine;
import groovy.text.Template;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.util.TextUtils;
/**
 * An extractor that uses regular expressions to find strings in the fetched
 * content of a URI, and constructs outlink URIs from those strings.
 *
 * <p>
 * The crawl operator configures these parameters:
 *
 * <ul>
 * <li><code>uriRegex</code>: a regular expression to match against the url</li>
 * <li><code>contentRegexes</code>: a map of named regular expressions
 * { name =&gt; regex } to run against the content</li>
 * <li><code>template</code>: the template for constructing the outlinks</li>
 * </ul>
 *
 * <p>
 * The URI is checked against <code>uriRegex</code>. The match is done using
 * {@link Matcher#matches()}, so the full URI string must match, not just a
 * substring. If it does match, then the matching groups are available to the
 * URI-building template as <code>${uriRegex[n]}</code>. If it does not match,
 * processing of the URI is finished and no outlinks are extracted.
 *
 * <p>
 * Then the extractor looks for matches for each of the
 * <code>contentRegexes</code> in the fetched content. If any of the regular
 * expressions produce no matches, processing of the URI is finished and no
 * outlinks are extracted. If at least one match is found for each regular
 * expression, then an outlink is constructed, using the URI-building template,
 * for every combination of matches. The matching groups are available to the
 * template as <code>${name[n]}</code>.
 *
 * <p>
 * Outlinks are constructed using the URI-building <code>template</code>.
 * Variable interpolation using the familiar ${...} syntax is supported. The
 * template is evaluated for each combination of regular expression matches
 * found, and the matching groups are available to the template as
 * <code>${regexName[n]}</code>. An example template might look like:
 * <code>http://example.org/${uriRegex[1]}/foo?bar=${myContentRegex[0]}</code>.
 *
 * <p>
 * The template is evaluated as a Groovy Template, so further capabilities
 * beyond simple variable interpolation are available.
 *
 * @see <a href="http://groovy.codehaus.org/Groovy+Templates">Groovy Templates</a>
 *
 * @author nlevitt
 * @author travis
 */
public class ExtractorMultipleRegex extends Extractor {

    private static final Logger LOGGER =
            Logger.getLogger(ExtractorMultipleRegex.class.getName());

    {
        setUriRegex("");
    }

    /**
     * Regular expression against which to match the URI. If the URI matches,
     * then the matching groups are available to the URI-building template as
     * <code>${uriRegex[n]}</code>. If it does not match, processing of this URI
     * is finished and no outlinks are extracted.
     *
     * @param uriRegex the regular expression the whole URI must match
     */
    public void setUriRegex(String uriRegex) {
        kp.put("uriRegex", uriRegex);
    }

    /**
     * @return the configured URI regular expression (defaults to the empty
     *         string, set by the instance initializer above)
     */
    public String getUriRegex() {
        return (String) kp.get("uriRegex");
    }

    {
        setContentRegexes(new LinkedHashMap<String, String>());
    }

    /**
     * A map of { name =&gt; regex }. The extractor looks for matches for each
     * regular expression in the content of the URI being processed. If any of
     * the regular expressions produce no matches, processing of the URI is
     * finished and no outlinks are extracted. If at least one match is found
     * for each regular expression, then an outlink is constructed for every
     * combination of matches. The matching groups are available to the
     * URI-building template as <code>${name[n]}</code>.
     *
     * @param contentRegexes map from binding name to regular expression
     */
    public void setContentRegexes(Map<String, String> contentRegexes) {
        kp.put("contentRegexes", contentRegexes);
    }

    /**
     * @return the configured { name =&gt; regex } map (defaults to an empty
     *         {@link LinkedHashMap})
     */
    @SuppressWarnings("unchecked") // kp stores untyped values; setContentRegexes is the only writer here
    public Map<String, String> getContentRegexes() {
        return (Map<String, String>) kp.get("contentRegexes");
    }

    {
        setTemplate("");
    }

    /**
     * URI-building template. Provides variable interpolation using the familiar
     * ${...} syntax. The template is evaluated for each combination of regular
     * expression matches found, and the matching groups are available to the
     * template as <code>${regexName[n]}</code>. An example template might look
     * like:
     * <code>http://example.org/${uriRegex[1]}/foo?bar=${myContentRegex[0]}</code>.
     *
     * <p>
     * The template is evaluated as a Groovy Template, so further capabilities
     * beyond simple variable interpolation are available.
     *
     * @param template the Groovy template text used to build each outlink
     * @see <a href="http://groovy.codehaus.org/Groovy+Templates">Groovy Templates</a>
     */
    public void setTemplate(String template) {
        kp.put("template", template);
    }

    /**
     * @return the configured template text (defaults to the empty string)
     */
    public String getTemplate() {
        return (String) kp.get("template");
    }

    /*
     * Cache of groovy templates because they're a little expensive to create.
     * Needs to be a map rather than a single value to handle overrides.
     * Keyed by the template text.
     * XXX confirm Template is thread safe
     */
    protected ConcurrentHashMap<String, Template> groovyTemplates =
            new ConcurrentHashMap<String, Template>();

    /**
     * Returns the compiled Groovy template for the currently configured
     * template text, compiling and caching it on first use.
     *
     * @return the compiled template, or {@code null} if the template text
     *         could not be compiled (the failure is logged at SEVERE and
     *         nothing is cached, so the next call tries again)
     */
    protected Template groovyTemplate() {
        // Read the setting once so the lookup, the compile and the cache key
        // are guaranteed to use the same text.
        String templateText = getTemplate();
        Template groovyTemplate = groovyTemplates.get(templateText);
        if (groovyTemplate == null) {
            try {
                groovyTemplate = new SimpleTemplateEngine().createTemplate(templateText);
                groovyTemplates.put(templateText, groovyTemplate);
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, "problem with groovy template " + templateText, e);
            }
        }
        return groovyTemplate;
    }

    /**
     * Decides whether this extractor should run on the given URI.
     *
     * @param uri the URI being processed
     * @return {@code false} if the content length is zero or negative, or if
     *         the fetch status is {@link FetchStatusCodes#S_NOT_FOUND} and the
     *         extractor parameters do not enable extraction from 404s;
     *         {@code true} otherwise
     */
    @Override
    protected boolean shouldProcess(CrawlURI uri) {
        if (uri.getContentLength() <= 0) {
            return false;
        }
        if (!getExtractorParameters().getExtract404s()
                && uri.getFetchStatus() == FetchStatusCodes.S_NOT_FOUND) {
            return false;
        }
        return true;
    }

    /**
     * All matches of one regular expression: one {@link GroupList} per match,
     * in the order the matches were found.
     */
    protected class MatchList extends LinkedList<GroupList> {
        private static final long serialVersionUID = 1L;

        /**
         * Collects every match of {@code regex} found in {@code cs} using
         * repeated {@link Matcher#find()}.
         *
         * @param regex the regular expression to search for
         * @param cs the text to search
         */
        public MatchList(String regex, CharSequence cs) {
            Matcher matcher = TextUtils.getMatcher(regex, cs);
            while (matcher.find()) {
                add(new GroupList(matcher));
            }
        }

        /**
         * Builds a match list from already-captured matches.
         *
         * @param groupList the matches to include, in order
         */
        public MatchList(GroupList... groupList) {
            for (GroupList x : groupList) {
                add(x);
            }
        }
    }

    /**
     * The groups of a single match: element 0 is the whole match, element n
     * is capturing group n (which is {@code null} if that group did not
     * participate in the match, per {@link MatchResult#group(int)}).
     */
    protected class GroupList extends LinkedList<String> {
        private static final long serialVersionUID = 1L;

        /**
         * Copies groups 0..groupCount() out of the match result, so the list
         * stays valid after the underlying matcher moves on.
         *
         * @param matchResult a match result positioned on a successful match
         */
        public GroupList(MatchResult matchResult) {
            for (int i = 0; i <= matchResult.groupCount(); i++) {
                add(matchResult.group(i));
            }
        }
    }

    /**
     * Matches the URI against {@code uriRegex}, runs every content regex over
     * the fetched content, and adds one outlink per combination of matches.
     * Returns without adding outlinks if the URI does not fully match, if the
     * content cannot be read, if any content regex has no match, or if the
     * template cannot be compiled.
     *
     * @param curi the URI whose content is examined and which receives the
     *        outlinks
     */
    @Override
    public void extract(CrawlURI curi) {
        // uri regex: the whole URI must match, otherwise we're done
        Matcher matcher = TextUtils.getMatcher(getUriRegex(), curi.getURI());
        if (!matcher.matches()) {
            return;
        }

        // { regex name -> list of matches }; insertion order is preserved so
        // the combination numbering in makeBindings() is stable.
        Map<String, MatchList> matchLists = new LinkedHashMap<String, MatchList>();
        matchLists.put("uriRegex", new MatchList(new GroupList(matcher)));

        ReplayCharSequence cs;
        try {
            cs = curi.getRecorder().getContentReplayCharSequence();
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            LOGGER.log(Level.WARNING, "Failed get of replay char sequence in "
                    + Thread.currentThread().getName(), e);
            return;
        }

        // run all the regexes on the content and cache results
        // (a content regex named "uriRegex" replaces the URI match list)
        for (Map.Entry<String, String> entry : getContentRegexes().entrySet()) {
            MatchList matchList = new MatchList(entry.getValue(), cs);
            if (matchList.isEmpty()) {
                return; // no match found for regex, so we can stop now
            }
            matchLists.put(entry.getKey(), matchList);
        }

        // Check the template once up front: groovyTemplate() logs and returns
        // null on a compile failure, and without this check that would be
        // repeated for every combination below.
        if (groovyTemplate() == null) {
            return;
        }

        /*
         * If we have 3 regexes, the first one has 1 match, second has 12
         * matches, third has 3 matches, then we have 36 combinations of
         * matches, thus 36 outlinks to extract.
         */
        // NOTE(review): this product is computed in int and can overflow when
        // several regexes each have very many matches.
        int numOutlinks = 1;
        for (MatchList matchList : matchLists.values()) {
            numOutlinks *= matchList.size();
        }

        String[] regexNames = matchLists.keySet().toArray(new String[0]);
        for (int i = 0; i < numOutlinks; i++) {
            Map<String, Object> bindings = makeBindings(matchLists, regexNames, i);
            buildAndAddOutlink(curi, bindings);
        }
    }

    /**
     * Builds the template bindings for one combination of matches.
     * Bindings are the variables available to populate the template:
     * { String patternName =&gt; List groups }.
     *
     * <p>
     * {@code outlinkIndex} is decoded as a mixed-radix number: the first regex
     * name is the least significant digit, and each digit's radix is the
     * number of matches for that regex. Walking the index from 0 to the
     * product of all match counts minus one therefore visits every
     * combination exactly once.
     *
     * @param matchLists { regex name =&gt; matches }; every list must be non-empty
     * @param regexNames the keys of {@code matchLists}, in a fixed order
     * @param outlinkIndex which combination to build
     * @return { regex name =&gt; the chosen match's group list }
     */
    protected Map<String, Object> makeBindings(Map<String, MatchList> matchLists,
            String[] regexNames, int outlinkIndex) {
        Map<String, Object> bindings = new LinkedHashMap<String, Object>();
        int remaining = outlinkIndex;
        for (String regexName : regexNames) {
            MatchList matchList = matchLists.get(regexName);
            int matchCount = matchList.size();
            bindings.put(regexName, matchList.get(remaining % matchCount));
            remaining /= matchCount;
        }
        return bindings;
    }

    /**
     * Evaluates the template with the given bindings and adds the resulting
     * string to {@code curi} as an outlink, resolved relative to its base.
     * Does nothing if the template could not be compiled; a URI that fails to
     * parse is reported through {@code logUriError}.
     *
     * @param curi the URI receiving the outlink
     * @param bindings the variables made available to the template
     */
    protected void buildAndAddOutlink(CrawlURI curi, Map<String, Object> bindings) {
        Template template = groovyTemplate();
        if (template == null) {
            // groovyTemplate() has already logged the compile failure
            return;
        }
        String outlinkUri = template.make(bindings).toString();
        try {
            addRelativeToBase(curi,
                    getExtractorParameters().getMaxOutlinks(), outlinkUri,
                    HTMLLinkContext.INFERRED_MISC, Hop.INFERRED);
        } catch (URIException e) {
            logUriError(e, curi.getUURI(), outlinkUri);
        }
    }
}