
org.archive.modules.extractor.ExtractorJS Maven / Gradle / Ivy
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import static org.archive.modules.extractor.Hop.SPECULATIVE;
import static org.archive.modules.extractor.LinkContext.JS_MISC;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.exception.NestableRuntimeException;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;
/**
* Processes Javascript files for strings that are likely to be
* crawlable URIs.
*
* NOTE: This processor may open a ReplayCharSequence from the
* CrawlURI's Recorder, without closing that ReplayCharSequence, to allow
* reuse by later processors in sequence. In the usual (Heritrix) case, a
* call after all processing to the Recorder's endReplays() method ensures
* timely close of any reused ReplayCharSequences. Reuse of this processor
* elsewhere should ensure a similar cleanup call to Recorder.endReplays()
* occurs.
*
* TODO: Replace with a system for actually executing Javascript in a
* browser-workalike DOM, such as via HtmlUnit or remote-controlled
* browser engines.
*
* @author gojomo
* @author nlevitt
*/
public class ExtractorJS extends ContentExtractor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
private static Logger LOGGER =
Logger.getLogger(ExtractorJS.class.getName());
// finds strings in Javascript
// (areas between paired ' or " characters, possibly backslash-quoted
// on the ends, but not in the middle)
protected static final String JAVASCRIPT_STRING_EXTRACTOR =
"(\\\\{0,8}+(?:['\"]|u002[27]))([^'\"]{0,"+UURI.MAX_URL_LENGTH+"})(?:\\1)";
// GROUPS:
// (G1) ' or " with optional leading backslashes
// (G2) whitespace-free string delimited on boths ends by G1
protected long numberOfCURIsHandled = 0;
protected boolean shouldExtract(CrawlURI uri) {
String contentType = uri.getContentType();
if (contentType == null) {
return false;
}
// If the content-type indicates js, we should process it.
if (contentType.indexOf("javascript") >= 0) {
return true;
}
if (contentType.indexOf("jscript") >= 0) {
return true;
}
if (contentType.indexOf("ecmascript") >= 0) {
return true;
}
if (contentType.startsWith("application/json")) {
return true;
}
// If the filename indicates js, we should process it.
if (uri.toString().toLowerCase().endsWith(".js")) {
return true;
}
// If the viaContext indicates a script, we should process it.
LinkContext context = uri.getViaContext();
if (context == null) {
return false;
}
String s = context.toString().toLowerCase();
return s.startsWith("script");
}
@Override
protected boolean innerExtract(CrawlURI curi) {
this.numberOfCURIsHandled++;
ReplayCharSequence cs = null;
try {
cs = curi.getRecorder().getContentReplayCharSequence();
try {
numberOfLinksExtracted.addAndGet(considerStrings(curi, cs));
} catch (StackOverflowError e) {
DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
}
// Set flag to indicate that link extraction is completed.
return true;
} catch (IOException e) {
curi.getNonFatalFailures().add(e);
}
return false;
}
protected long considerStrings(CrawlURI curi, CharSequence cs) {
return considerStrings(this, curi, cs, true);
}
public long considerStrings(Extractor ext,
CrawlURI curi, CharSequence cs) {
return considerStrings(ext, curi, cs, false);
}
public long considerStrings(Extractor ext,
CrawlURI curi, CharSequence cs, boolean handlingJSFile) {
long foundLinks = 0;
Matcher strings =
TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
int startIndex = 0;
while (strings.find(startIndex)) {
CharSequence subsequence =
cs.subSequence(strings.start(2), strings.end(2));
if (UriUtils.isPossibleUri(subsequence)) {
if (considerString(ext, curi, handlingJSFile, subsequence.toString())) {
foundLinks++;
}
}
startIndex = strings.end(1);
}
TextUtils.recycleMatcher(strings);
return foundLinks;
}
protected boolean considerString(Extractor ext, CrawlURI curi,
boolean handlingJSFile, String candidate) {
try {
candidate = StringEscapeUtils.unescapeJavaScript(candidate);
} catch (NestableRuntimeException e) {
LOGGER.log(Level.WARNING, "problem unescaping some javascript", e);
}
candidate = UriUtils.speculativeFixup(candidate, curi.getUURI());
if (UriUtils.isVeryLikelyUri(candidate)) {
try {
int max = ext.getExtractorParameters().getMaxOutlinks();
if (handlingJSFile) {
addRelativeToVia(curi, max, candidate, JS_MISC,
SPECULATIVE);
return true;
} else {
addRelativeToBase(curi, max, candidate, JS_MISC,
SPECULATIVE);
return true;
}
} catch (URIException e) {
ext.logUriError(e, curi.getUURI(), candidate);
}
}
return false;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy