
// com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver (source listing header: Maven / Gradle / Ivy)
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.spider;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.conn.HttpHostConnectException;
import org.htmlcleaner.ContentNode;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.SearchLibException.WrongStatusCodeException;
import com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSImportRule;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSProperty;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSRule;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSStyleRule;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlCleanerParser;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.ThreadUtils.RecursiveTracker;
import com.jaeksoft.searchlib.util.ThreadUtils.RecursiveTracker.RecursiveEntry;
public class HtmlArchiver {
private final BrowserDriver> browserDriver;
private final File filesDir;
private final File indexFile;
private final File sourceFile;
private final Map fileCountMap;
private final Map urlFileMap;
private final URL pageUrl;
private final HttpDownloader downloader;
private final RecursiveTracker recursiveSecurity;
private URL baseUrl;
public HtmlArchiver(BrowserDriver> browserDriver, File parentDir,
HttpDownloader httpDownloader, URL url) {
this.browserDriver = browserDriver;
filesDir = new File(parentDir, "files");
indexFile = new File(parentDir, "index.html");
sourceFile = new File(parentDir, "source.html");
this.pageUrl = url;
this.baseUrl = url;
this.downloader = httpDownloader;
fileCountMap = new TreeMap();
urlFileMap = new TreeMap();
filesDir.mkdir();
recursiveSecurity = new RecursiveTracker(20);
}
final private static String buildFileName(String baseName,
String extension, Integer fileCount) {
if (baseName.length() > 160)
baseName = baseName.substring(0, 160);
if (extension.length() > 32)
extension = extension.substring(0, 32);
StringBuilder sb = new StringBuilder(baseName);
if (fileCount != null && fileCount > 0) {
sb.append('_');
sb.append(fileCount);
}
if (extension != null && extension.length() > 0) {
sb.append('.');
sb.append(extension);
}
return sb.toString();
}
final private String getLocalPath(URL parentUrl, String fileName) {
if (parentUrl == null
|| urlFileMap.get(parentUrl.toExternalForm()) != null)
return fileName;
StringBuilder sb = new StringBuilder("./");
sb.append(filesDir.getName());
sb.append('/');
sb.append(fileName);
return sb.toString();
}
final public File getLocalFile(String fileName) {
return new File(filesDir, fileName);
}
final public String getUrlFileName(String src) {
if (urlFileMap == null)
return null;
URL objectURL = LinkUtils.getLink(pageUrl, src, null, false);
String url = objectURL == null ? src : objectURL.toExternalForm();
return urlFileMap.get(url);
}
final public File getAndRegisterDestFile(String urlString, String baseName,
String extension) {
String fileName = buildFileName(baseName, extension, null);
Integer fileCount = fileCountMap.get(fileName);
fileCount = fileCount == null ? new Integer(0) : fileCount + 1;
fileCountMap.put(fileName, fileCount);
fileName = buildFileName(baseName, extension, fileCount);
if (urlString != null)
urlFileMap.put(urlString, fileName);
return new File(filesDir, fileName);
}
final private String downloadObject(URL parentUrl, String src,
String contentType) throws ClientProtocolException,
IllegalStateException, IOException, SearchLibException,
URISyntaxException {
RecursiveEntry recursiveEntry = recursiveSecurity.enter();
if (recursiveEntry == null) {
Logging.warn("Max recursion reached - " + recursiveSecurity
+ " src: " + src + " url: " + parentUrl);
return src;
}
try {
src = StringEscapeUtils.unescapeXml(src);
URL objectURL = LinkUtils.getLink(parentUrl, src, null, false);
if (objectURL == null)
return src;
if (objectURL.equals(pageUrl)) {
return "index.html";
}
String urlString = objectURL.toExternalForm();
String fileName = urlFileMap.get(urlString);
if (fileName != null)
return getLocalPath(parentUrl, fileName);
DownloadItem downloadItem = null;
try {
downloadItem = downloader.get(objectURL.toURI(), null);
} catch (IOException e) {
Logging.warn("IO Exception on " + objectURL.toURI(), e);
return src;
}
fileName = downloadItem.getFileName();
if (fileName == null || fileName.length() == 0)
return src;
downloadItem.checkNoErrorRange(200, 300);
String baseName = FilenameUtils.getBaseName(fileName);
String extension = FilenameUtils.getExtension(fileName);
if (contentType == null)
contentType = downloadItem.getContentBaseType();
if ("text/html".equalsIgnoreCase(contentType))
extension = "html";
else if ("text/javascript".equalsIgnoreCase(contentType))
extension = "js";
else if ("text/css".equalsIgnoreCase(contentType))
extension = "css";
else if ("application/x-shockwave-flash"
.equalsIgnoreCase(contentType))
extension = "swf";
else if ("image/png".equalsIgnoreCase(contentType))
extension = "png";
else if ("image/gif".equalsIgnoreCase(contentType))
extension = "gif";
else if ("image/jpeg".equalsIgnoreCase(contentType))
extension = "jpg";
else if ("image/jpg".equalsIgnoreCase(contentType))
extension = "jpg";
File destFile = getAndRegisterDestFile(urlString, baseName,
extension);
if ("css".equals(extension)) {
String cssContent = downloadItem.getContentAsString();
StringBuffer sb = checkCSSContent(objectURL, cssContent);
if (sb != null && sb.length() > 0)
cssContent = sb.toString();
FileUtils.write(destFile, cssContent);
} else
downloadItem.writeToFile(destFile);
return getLocalPath(parentUrl, destFile.getName());
} catch (HttpHostConnectException e) {
Logging.warn(e);
return src;
} catch (UnknownHostException e) {
Logging.warn(e);
return src;
} catch (WrongStatusCodeException e) {
Logging.warn(e);
return src;
} finally {
recursiveEntry.release();
}
}
final private boolean handleCssProperty(URL objectUrl, CSSProperty property)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException {
if (property == null)
return false;
String oldValue = property.getValue();
if (oldValue == null)
return false;
Matcher matcher = NaiveCSSParser.findUrl(oldValue);
if (!matcher.find())
return false;
String url = matcher.group(1);
if (url == null || url.length() == 0)
return false;
String newSrc = downloadObject(objectUrl, url, null);
if (newSrc == null)
return false;
property.setValue(NaiveCSSParser.replaceUrl(oldValue, matcher, newSrc));
return true;
}
final private boolean handleCssStyle(URL objectUrl, CSSStyleRule rule)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException {
boolean change = false;
for (CSSProperty property : rule.getProperties()) {
if (handleCssProperty(objectUrl, property))
change = true;
}
return change;
}
final private StringBuffer checkCSSContent(URL objectUrl, String css)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException {
StringWriter sw = null;
PrintWriter pw = null;
try {
NaiveCSSParser cssParser = new NaiveCSSParser();
Collection rules = cssParser.parseStyleSheet(css);
if (rules == null)
return null;
if (rules.size() == 0)
return null;
sw = new StringWriter();
pw = new PrintWriter(sw);
for (CSSRule rule : rules) {
if (rule instanceof CSSStyleRule) {
handleCssStyle(objectUrl, (CSSStyleRule) rule);
} else if (rule instanceof CSSImportRule) {
CSSImportRule importRule = (CSSImportRule) rule;
String newSrc = downloadObject(objectUrl,
importRule.getHref(), "text/css");
importRule.setHref(newSrc);
}
}
cssParser.write(pw);
return sw.getBuffer();
} catch (IOException e) {
Logging.warn("CSS ISSUE", e);
return null;
} finally {
IOUtils.close(pw, sw);
}
}
final private void checkStyleCSS(TagNode node)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException {
if (!("style".equalsIgnoreCase(node.getName())))
return;
String attr = node.getAttributeByName("type");
if (!StringUtils.isEmpty(attr) && !"text/css".equalsIgnoreCase(attr))
return;
attr = node.getAttributeByName("media");
if (!StringUtils.isEmpty(attr) && !"screen".equalsIgnoreCase(attr)
&& !"all".equalsIgnoreCase(attr))
return;
StringBuilder builder = (StringBuilder) node.getText();
if (builder == null)
return;
String content = builder.toString();
String newContent = StringEscapeUtils.unescapeXml(content);
StringBuffer sb = checkCSSContent(baseUrl, newContent);
if (sb != null)
newContent = sb.toString();
if (newContent.equals(content))
return;
node.removeAllChildren();
node.addChild(new ContentNode(newContent));
}
final private void checkStyleAttribute(TagNode node)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException {
String style = node.getAttributeByName("style");
if (style == null)
return;
if (style.length() == 0)
return;
NaiveCSSParser cssParser = new NaiveCSSParser();
CSSStyleRule cssStyle = cssParser.parseStyleAttribute(style);
if (!handleCssStyle(baseUrl, cssStyle))
return;
node.addAttribute("style", cssStyle.getPropertyString());
}
final boolean hasAncestorId(String[] ids, TagNode node) {
if (node == null)
return false;
String nodeId = node.getAttributeByName("id");
if (nodeId != null)
for (String id : ids)
if (id.equalsIgnoreCase(nodeId))
return true;
return hasAncestorId(ids, node.getParent());
}
final boolean hasAncestorXPath(Set xpathSelectorSet, TagNode node) {
if (node == null)
return false;
if (xpathSelectorSet.contains(node))
return true;
return hasAncestorXPath(xpathSelectorSet, node.getParent());
}
final private void checkScriptContent(TagNode node,
Set disableScriptNodeSet) {
if (!("script".equalsIgnoreCase(node.getName())))
return;
if (disableScriptNodeSet != null
&& hasAncestorXPath(disableScriptNodeSet, node)) {
node.removeFromTree();
return;
}
StringBuilder builder = (StringBuilder) node.getText();
if (builder == null)
return;
String content = builder.toString();
if (content == null)
return;
String newContent = StringEscapeUtils.unescapeXml(content);
if (newContent.equals(content))
return;
node.removeAllChildren();
node.addChild(new ContentNode(newContent));
}
final private String downloadIframe(URL parentUrl, TagNode node,
Map iframeNodeMap) throws IOException,
ParserConfigurationException, SAXException, IllegalStateException,
SearchLibException, URISyntaxException {
if (iframeNodeMap == null) {
Logging.warn("Unable to download IFRAME (no iframeNodeNap) " + node);
return null;
}
WebElement webElement = iframeNodeMap.get(node);
if (webElement == null) {
Logging.warn("Issue when finding IFRAME for " + node);
return null;
}
URL oldBaseUrl = baseUrl;
String src = node.getAttributeByName("src");
baseUrl = LinkUtils.getLink(parentUrl, src, null, false);
String urlFileMapKey = null;
if (baseUrl != null
&& !urlFileMap.containsKey(baseUrl.toExternalForm()))
urlFileMapKey = baseUrl.toExternalForm();
else
urlFileMapKey = Integer.toString(node.hashCode());
File destFile = getAndRegisterDestFile(urlFileMapKey, "iframe", "html");
browserDriver.switchToFrame(webElement);
String frameSource = browserDriver.getSourceCode();
HtmlCleanerParser htmlCleanerParser = new HtmlCleanerParser();
htmlCleanerParser.init(frameSource);
recursiveArchive(htmlCleanerParser.getTagNode(), null, iframeNodeMap);
htmlCleanerParser.writeHtmlToFile(destFile);
baseUrl = oldBaseUrl;
browserDriver.switchToMain();
return getLocalPath(parentUrl, destFile.getName());
}
final private boolean downloadObjectIframe(TagNode node,
Map iframeNodeMap)
throws IllegalStateException, IOException,
ParserConfigurationException, SAXException, SearchLibException,
URISyntaxException {
if (!"iframe".equalsIgnoreCase(node.getName()))
return false;
String src = downloadIframe(baseUrl, node, iframeNodeMap);
if (src != null)
node.addAttribute("src", src);
return true;
}
final private boolean downloadObjectSrc(TagNode node)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException {
String src = node.getAttributeByName("src");
if (src == null)
return false;
src = downloadObject(baseUrl, src, null);
if (src != null)
node.addAttribute("src", src);
return true;
}
final private boolean downloadObjectLink(TagNode node)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException,
ParserConfigurationException, SAXException {
String src = node.getAttributeByName("href");
if (src == null)
return false;
String type = node.getAttributeByName("type");
if (type == null && node.getName().equalsIgnoreCase("script"))
type = "text/javascript";
if (type == null
&& node.getName().equalsIgnoreCase("link")
&& "stylesheet"
.equalsIgnoreCase(node.getAttributeByName("rel")))
type = "text/css";
if (type == null)
return false;
src = downloadObject(baseUrl, src, type);
if (src != null)
node.addAttribute("href", src);
return true;
}
final private void checkBaseHref(TagNode node) {
if (node == null)
return;
if (!"base".equalsIgnoreCase(node.getName()))
return;
String href = node.getAttributeByName("href");
if (href != null) {
try {
baseUrl = new URL(href);
} catch (MalformedURLException e) {
Logging.warn(e);
return;
}
}
node.removeFromTree();
}
final private void recursiveArchive(TagNode node,
Set disableScriptNodeSet,
Map iframeNodeMap)
throws ClientProtocolException, IllegalStateException, IOException,
SearchLibException, URISyntaxException,
ParserConfigurationException, SAXException {
if (node == null)
return;
checkBaseHref(node);
if (!downloadObjectIframe(node, iframeNodeMap))
if (!downloadObjectSrc(node))
downloadObjectLink(node);
checkStyleCSS(node);
checkScriptContent(node, disableScriptNodeSet);
checkStyleAttribute(node);
TagNode[] nodes = node.getChildTags();
if (nodes == null)
return;
for (TagNode n : nodes)
recursiveArchive(n, disableScriptNodeSet, iframeNodeMap);
}
final public void archive(BrowserDriver> browserDriver,
Set xPathDisableScriptSet) throws IOException,
ParserConfigurationException, SAXException, IllegalStateException,
SearchLibException, URISyntaxException, XPatherException {
String pageSource = browserDriver.getSourceCode();
HtmlCleanerParser htmlCleanerParser = new HtmlCleanerParser();
htmlCleanerParser.init(pageSource);
// Find iframe
Set iframeWebElementSet = new HashSet();
browserDriver.locateBy(By.tagName("iframe"), iframeWebElementSet, true);
Map iframeNodeMap = null;
if (iframeWebElementSet != null && iframeWebElementSet.size() > 0) {
iframeNodeMap = new HashMap();
Set tagNodeSet = new HashSet();
for (WebElement webElement : iframeWebElementSet) {
String xPath = browserDriver.getXPath(webElement, true);
if (xPath == null)
continue;
if (htmlCleanerParser.xpath(xPath, tagNodeSet) == 0) {
Logging.warn("DisableScript not found using XPath: "
+ xPath);
continue;
}
for (TagNode tagNode : tagNodeSet)
iframeNodeMap.put(tagNode, webElement);
tagNodeSet.clear();
}
}
// Find node that need to be disabled
Set disableScriptNodeSet = null;
if (xPathDisableScriptSet != null && xPathDisableScriptSet.size() > 0) {
disableScriptNodeSet = new HashSet();
for (String xPath : xPathDisableScriptSet)
if (htmlCleanerParser.xpath(xPath, disableScriptNodeSet) == 0)
Logging.warn("DisableScript not found using XPath: "
+ xPath);
}
recursiveArchive(htmlCleanerParser.getTagNode(), disableScriptNodeSet,
iframeNodeMap);
htmlCleanerParser.writeHtmlToFile(indexFile);
String charset = htmlCleanerParser.findCharset();
if (charset == null)
FileUtils.write(sourceFile, pageSource);
else
FileUtils.write(sourceFile, pageSource, charset);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy