
org.archive.modules.extractor.ExtractorURI Maven / Gradle / Ivy
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.url.LaxURLCodec;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.TextUtils;
/**
* An extractor for finding URIs inside other URIs. Unlike most other
* extractors, this works on URIs discovered by previous extractors. Thus
* it should appear near the end of any set of extractors.
*
* Initially, only finds absolute HTTP(S) URIs in query-string or its
* parameters.
*
* TODO: extend to find URIs in path-info
*
* @author Gordon Mohr
*
**/
public class ExtractorURI extends Extractor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
private static Logger LOGGER =
Logger.getLogger(ExtractorURI.class.getName());
protected static final String ABS_HTTP_URI_PATTERN = "^https?://[^\\s<>]*$";
/**
* Constructor
*/
public ExtractorURI() {
}
@Override
protected boolean shouldProcess(CrawlURI uri) {
return true;
}
/**
* Perform usual extraction on a CrawlURI
*
* @param curi Crawl URI to process.
*/
@Override
public void extract(CrawlURI curi) {
for (CrawlURI link : curi.getOutLinks()) {
extractLink(curi, link);
}
}
/**
* Consider a single Link for internal URIs
*
* @param curi CrawlURI to add discoveries to
* @param wref Link to examine for internal URIs
*/
protected void extractLink(CrawlURI curi, CrawlURI wref) {
UURI source = null;
try {
source = UURIFactory.getInstance(wref.getURI());
} catch (URIException e) {
LOGGER.log(Level.FINE,"bad URI",e);
}
if(source == null) {
// shouldn't happen
return;
}
List found = extractQueryStringLinks(source);
for (String uri : found) {
try {
UURI dest = UURIFactory.getInstance(uri);
LinkContext lc = LinkContext.SPECULATIVE_MISC;
Hop hop = Hop.SPECULATIVE;
addOutlink(curi, dest, lc, hop);
numberOfLinksExtracted.incrementAndGet();
} catch (URIException e) {
LOGGER.log(Level.FINE, "bad URI", e);
}
}
// TODO: consider path URIs too
}
/**
* Look for URIs inside the supplied UURI.
*
* Static for ease of testing or outside use.
*
* @param source UURI to example
* @return List of discovered String URIs.
*/
protected static List extractQueryStringLinks(UURI source) {
List results = new ArrayList();
String decodedQuery;
try {
decodedQuery = source.getQuery();
} catch (URIException e1) {
// shouldn't happen
return results;
}
if(decodedQuery==null) {
return results;
}
// check if full query-string appears to be http(s) URI
Matcher m = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN,decodedQuery);
if(m.matches()) {
TextUtils.recycleMatcher(m);
results.add(decodedQuery);
}
// split into params, see if any param value is http(s) URI
String rawQuery = new String(source.getRawQuery());
String[] params = rawQuery.split("&");
for (String param : params) {
String[] keyVal = param.split("=");
if(keyVal.length==2) {
String candidate;
try {
candidate = LaxURLCodec.DEFAULT.decode(keyVal[1]);
} catch (DecoderException e) {
continue;
}
// TODO: use other non-UTF8 codecs when appropriate
m.reset(candidate);
if(m.matches()) {
results.add(candidate);
}
}
}
return results;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy