All downloads are free. The search and download functionality uses the official Maven repository.

com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.spider;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.conn.HttpHostConnectException;
import org.htmlcleaner.ContentNode;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.SearchLibException.WrongStatusCodeException;
import com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSImportRule;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSProperty;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSRule;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSStyleRule;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlCleanerParser;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.ThreadUtils.RecursiveTracker;
import com.jaeksoft.searchlib.util.ThreadUtils.RecursiveTracker.RecursiveEntry;

public class HtmlArchiver {

	/** Browser driver used to switch into frames and read their source. */
	private final BrowserDriver browserDriver;
	/** Directory (named "files") that receives every downloaded resource. */
	private final File filesDir;
	/** Destination of the rewritten page ("index.html"). */
	private final File indexFile;
	/** Destination of the untouched page source ("source.html"). */
	private final File sourceFile;
	/** Counter-less file name -> last counter handed out for that name. */
	private final Map<String, Integer> fileCountMap;
	/** Registration key (usually the absolute URL) -> local file name. */
	private final Map<String, String> urlFileMap;
	/** URL of the page being archived. */
	private final URL pageUrl;
	/** Downloader used to fetch the linked resources. */
	private final HttpDownloader downloader;
	/** Guard entered by every download; a null entry aborts the download. */
	private final RecursiveTracker recursiveSecurity;
	/**
	 * URL against which relative links are currently resolved. Starts as the
	 * page URL and is changed while handling base tags and iframes.
	 */
	private URL baseUrl;

	/**
	 * Creates an archiver that stores its output below the given directory.
	 * 
	 * @param browserDriver
	 *            the browser driver used to read iframe content
	 * @param parentDir
	 *            the directory receiving index.html, source.html and the
	 *            "files" sub-directory
	 * @param httpDownloader
	 *            the downloader used to fetch linked resources
	 * @param url
	 *            the URL of the page to archive
	 */
	public HtmlArchiver(BrowserDriver browserDriver, File parentDir,
			HttpDownloader httpDownloader, URL url) {
		this.browserDriver = browserDriver;
		filesDir = new File(parentDir, "files");
		indexFile = new File(parentDir, "index.html");
		sourceFile = new File(parentDir, "source.html");
		this.pageUrl = url;
		this.baseUrl = url;
		this.downloader = httpDownloader;
		fileCountMap = new TreeMap<String, Integer>();
		urlFileMap = new TreeMap<String, String>();
		// Only the last path element is created: parentDir must already exist
		filesDir.mkdir();
		recursiveSecurity = new RecursiveTracker(20);
	}

	/**
	 * Builds a local file name from a base name, an optional counter and an
	 * optional extension, e.g. {@code name_2.css}.
	 * 
	 * @param baseName
	 *            the base name; truncated to 160 characters, a null value is
	 *            treated as an empty name
	 * @param extension
	 *            the extension without the dot; truncated to 32 characters,
	 *            null or empty means no extension is appended
	 * @param fileCount
	 *            the counter; appended as {@code _n} only when greater than
	 *            zero, null means no counter
	 * @return the assembled file name
	 */
	final private static String buildFileName(String baseName,
			String extension, Integer fileCount) {
		// Null is tolerated for both parts: the extension was already
		// null-checked further down, so the length checks must agree with it.
		if (baseName == null)
			baseName = "";
		else if (baseName.length() > 160)
			baseName = baseName.substring(0, 160);
		if (extension != null && extension.length() > 32)
			extension = extension.substring(0, 32);
		StringBuilder sb = new StringBuilder(baseName);
		if (fileCount != null && fileCount > 0) {
			sb.append('_');
			sb.append(fileCount);
		}
		if (extension != null && extension.length() > 0) {
			sb.append('.');
			sb.append(extension);
		}
		return sb.toString();
	}

	/**
	 * Returns the path to use when referencing an archived file from the
	 * document identified by parentUrl.
	 * 
	 * When there is no parent, or when the parent is itself registered in
	 * urlFileMap, the bare file name is returned. Otherwise the name is
	 * prefixed with "./" and the name of the files directory.
	 * 
	 * @param parentUrl
	 *            the URL of the referencing document, may be null
	 * @param fileName
	 *            the local file name of the referenced resource
	 * @return the relative path to write into the referencing document
	 */
	final private String getLocalPath(URL parentUrl, String fileName) {
		final boolean parentIsArchived = parentUrl != null
				&& urlFileMap.get(parentUrl.toExternalForm()) != null;
		if (parentUrl == null || parentIsArchived)
			return fileName;
		return "./" + filesDir.getName() + '/' + fileName;
	}

	/**
	 * Returns the file with the given name inside the files directory.
	 * 
	 * @param fileName
	 *            the name of the file
	 * @return a File located in the files directory (existence not checked)
	 */
	final public File getLocalFile(String fileName) {
		final File localFile = new File(filesDir, fileName);
		return localFile;
	}

	/**
	 * Looks up the local file name registered for a link found in the page.
	 * 
	 * The link is resolved against the page URL; when it cannot be resolved
	 * the raw value is used as the lookup key.
	 * 
	 * @param src
	 *            the link as written in the page
	 * @return the registered file name, or null when there is none
	 */
	final public String getUrlFileName(String src) {
		if (urlFileMap == null)
			return null;
		final URL resolved = LinkUtils.getLink(pageUrl, src, null, false);
		final String key;
		if (resolved != null)
			key = resolved.toExternalForm();
		else
			key = src;
		return urlFileMap.get(key);
	}

	/**
	 * Reserves a destination file in the files directory and optionally
	 * registers it for the given key.
	 * 
	 * The first request for a given base name and extension gets the plain
	 * name; each following request gets an incremented {@code _n} suffix.
	 * 
	 * @param urlString
	 *            the key (usually the absolute URL) to register the file
	 *            under, or null to skip registration
	 * @param baseName
	 *            the base name of the file
	 * @param extension
	 *            the extension of the file, without the dot
	 * @return the destination file (not created by this method)
	 */
	final public File getAndRegisterDestFile(String urlString, String baseName,
			String extension) {
		// The counter is tracked per counter-less name
		String plainName = buildFileName(baseName, extension, null);
		Integer previousCount = fileCountMap.get(plainName);
		// Integer.valueOf replaces the deprecated Integer(int) constructor
		Integer fileCount = previousCount == null ? Integer.valueOf(0)
				: Integer.valueOf(previousCount + 1);
		fileCountMap.put(plainName, fileCount);
		String fileName = buildFileName(baseName, extension, fileCount);
		if (urlString != null)
			urlFileMap.put(urlString, fileName);
		return new File(filesDir, fileName);
	}

	/**
	 * Downloads the resource referenced by src and returns the value that
	 * should replace src in the referencing document.
	 * 
	 * The original src is returned unchanged when the recursion guard refuses
	 * the call, the link cannot be resolved, the download fails, the download
	 * has no file name, or the status code check fails. A link to the page
	 * itself is replaced by "index.html". A resource that was already
	 * registered is not downloaded again. CSS content is passed through
	 * checkCSSContent before being written.
	 * 
	 * @param parentUrl
	 *            the URL of the referencing document
	 * @param src
	 *            the link as written in the referencing document
	 * @param contentType
	 *            the expected content type, or null to use the one reported
	 *            by the download
	 * @return the local path of the archived resource, or the original src
	 */
	final private String downloadObject(URL parentUrl, String src,
			String contentType) throws ClientProtocolException,
			IllegalStateException, IOException, SearchLibException,
			URISyntaxException {
		final RecursiveEntry guard = recursiveSecurity.enter();
		if (guard == null) {
			Logging.warn("Max recursion reached - " + recursiveSecurity
					+ " src: " + src + " url: " + parentUrl);
			return src;
		}
		try {
			src = StringEscapeUtils.unescapeXml(src);
			final URL objectURL = LinkUtils.getLink(parentUrl, src, null,
					false);
			if (objectURL == null)
				return src;
			if (objectURL.equals(pageUrl))
				return "index.html";

			// Already archived: just point to the existing file
			final String urlString = objectURL.toExternalForm();
			final String knownName = urlFileMap.get(urlString);
			if (knownName != null)
				return getLocalPath(parentUrl, knownName);

			DownloadItem downloadItem;
			try {
				downloadItem = downloader.get(objectURL.toURI(), null);
			} catch (IOException e) {
				Logging.warn("IO Exception on " + objectURL.toURI(), e);
				return src;
			}
			final String remoteName = downloadItem.getFileName();
			if (remoteName == null || remoteName.length() == 0)
				return src;
			downloadItem.checkNoErrorRange(200, 300);

			final String baseName = FilenameUtils.getBaseName(remoteName);
			final String remoteExtension = FilenameUtils
					.getExtension(remoteName);
			if (contentType == null)
				contentType = downloadItem.getContentBaseType();
			final String extension = extensionForContentType(contentType,
					remoteExtension);

			final File destFile = getAndRegisterDestFile(urlString, baseName,
					extension);
			if (!"css".equals(extension)) {
				downloadItem.writeToFile(destFile);
			} else {
				// Stylesheets are rewritten so their own links point to
				// archived copies
				String cssContent = downloadItem.getContentAsString();
				final StringBuffer rewritten = checkCSSContent(objectURL,
						cssContent);
				if (rewritten != null && rewritten.length() > 0)
					cssContent = rewritten.toString();
				FileUtils.write(destFile, cssContent);
			}
			return getLocalPath(parentUrl, destFile.getName());
		} catch (HttpHostConnectException e) {
			Logging.warn(e);
			return src;
		} catch (UnknownHostException e) {
			Logging.warn(e);
			return src;
		} catch (WrongStatusCodeException e) {
			Logging.warn(e);
			return src;
		} finally {
			guard.release();
		}
	}

	/**
	 * Maps a content type to the file extension used in the archive.
	 * 
	 * @param contentType
	 *            the content type, compared case-insensitively; may be null
	 * @param fallback
	 *            the extension returned for any other content type
	 * @return the extension for a known content type, otherwise fallback
	 */
	private static String extensionForContentType(String contentType,
			String fallback) {
		if ("text/html".equalsIgnoreCase(contentType))
			return "html";
		if ("text/javascript".equalsIgnoreCase(contentType))
			return "js";
		if ("text/css".equalsIgnoreCase(contentType))
			return "css";
		if ("application/x-shockwave-flash".equalsIgnoreCase(contentType))
			return "swf";
		if ("image/png".equalsIgnoreCase(contentType))
			return "png";
		if ("image/gif".equalsIgnoreCase(contentType))
			return "gif";
		if ("image/jpeg".equalsIgnoreCase(contentType)
				|| "image/jpg".equalsIgnoreCase(contentType))
			return "jpg";
		return fallback;
	}

	/**
	 * Archives the resource referenced by the first URL found in a CSS
	 * property value and rewrites the value to point to the archived copy.
	 * 
	 * @param objectUrl
	 *            the URL of the document holding the property
	 * @param property
	 *            the CSS property, may be null
	 * @return true when the property value has been rewritten
	 */
	final private boolean handleCssProperty(URL objectUrl, CSSProperty property)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException {
		final String oldValue = property == null ? null : property.getValue();
		if (oldValue == null)
			return false;

		final Matcher urlMatcher = NaiveCSSParser.findUrl(oldValue);
		if (!urlMatcher.find())
			return false;
		final String url = urlMatcher.group(1);
		if (url == null || url.length() == 0)
			return false;

		final String newSrc = downloadObject(objectUrl, url, null);
		if (newSrc != null) {
			property.setValue(NaiveCSSParser.replaceUrl(oldValue, urlMatcher,
					newSrc));
			return true;
		}
		return false;
	}

	/**
	 * Applies handleCssProperty to every property of a style rule.
	 * 
	 * @param objectUrl
	 *            the URL of the document holding the rule
	 * @param rule
	 *            the style rule whose properties are checked
	 * @return true when at least one property has been rewritten
	 */
	final private boolean handleCssStyle(URL objectUrl, CSSStyleRule rule)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException {
		boolean changed = false;
		for (CSSProperty cssProperty : rule.getProperties()) {
			// Non short-circuit: every property must be processed
			changed |= handleCssProperty(objectUrl, cssProperty);
		}
		return changed;
	}

	/**
	 * Parses a stylesheet, archives the resources it references (property
	 * URLs and imported stylesheets) and returns the rewritten stylesheet.
	 * 
	 * @param objectUrl
	 *            the URL of the document holding the stylesheet, used to
	 *            resolve the links it contains
	 * @param css
	 *            the stylesheet source
	 * @return the rewritten stylesheet, or null when the parser returned no
	 *         rule or when an IOException occurred (it is logged, not
	 *         rethrown)
	 */
	final private StringBuffer checkCSSContent(URL objectUrl, String css)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException {
		StringWriter sw = null;
		PrintWriter pw = null;

		try {
			NaiveCSSParser cssParser = new NaiveCSSParser();
			// The element type is required by the typed for-each loop below
			Collection<CSSRule> rules = cssParser.parseStyleSheet(css);
			if (rules == null || rules.isEmpty())
				return null;
			sw = new StringWriter();
			pw = new PrintWriter(sw);
			for (CSSRule rule : rules) {
				if (rule instanceof CSSStyleRule) {
					handleCssStyle(objectUrl, (CSSStyleRule) rule);
				} else if (rule instanceof CSSImportRule) {
					// Imported stylesheets are archived as CSS whatever the
					// content type reported by the server
					CSSImportRule importRule = (CSSImportRule) rule;
					String newSrc = downloadObject(objectUrl,
							importRule.getHref(), "text/css");
					importRule.setHref(newSrc);
				}
			}
			cssParser.write(pw);
			// Make sure everything reached the StringWriter before reading it
			pw.flush();
			return sw.getBuffer();
		} catch (IOException e) {
			Logging.warn("CSS ISSUE", e);
			return null;
		} finally {
			IOUtils.close(pw, sw);
		}
	}

	/**
	 * Rewrites the content of a style element so the resources it references
	 * point to archived copies.
	 * 
	 * Nothing is done when the node is not a style element, when its type
	 * attribute is set to something other than "text/css", or when its media
	 * attribute is set to something other than "screen" or "all".
	 * 
	 * @param node
	 *            the node to check
	 */
	final private void checkStyleCSS(TagNode node)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException {
		if (!("style".equalsIgnoreCase(node.getName())))
			return;
		String attr = node.getAttributeByName("type");
		if (!StringUtils.isEmpty(attr) && !"text/css".equalsIgnoreCase(attr))
			return;
		attr = node.getAttributeByName("media");
		if (!StringUtils.isEmpty(attr) && !"screen".equalsIgnoreCase(attr)
				&& !"all".equalsIgnoreCase(attr))
			return;
		// Only toString() is needed: no downcast to a concrete buffer class,
		// which would fail if the parser returned another CharSequence
		CharSequence text = node.getText();
		if (text == null)
			return;
		String content = text.toString();
		String newContent = StringEscapeUtils.unescapeXml(content);
		StringBuffer sb = checkCSSContent(baseUrl, newContent);
		if (sb != null)
			newContent = sb.toString();
		if (newContent.equals(content))
			return;
		node.removeAllChildren();
		node.addChild(new ContentNode(newContent));
	}

	/**
	 * Rewrites the style attribute of a node when one of its properties
	 * references a resource that has been archived.
	 * 
	 * @param node
	 *            the node whose style attribute is checked
	 */
	final private void checkStyleAttribute(TagNode node)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException {
		final String styleValue = node.getAttributeByName("style");
		if (styleValue == null || styleValue.length() == 0)
			return;

		final CSSStyleRule parsedStyle = new NaiveCSSParser()
				.parseStyleAttribute(styleValue);
		// The attribute is only replaced when a property has changed
		if (handleCssStyle(baseUrl, parsedStyle))
			node.addAttribute("style", parsedStyle.getPropertyString());
	}

	/**
	 * Tells whether the node, or one of its ancestors, has an id attribute
	 * matching (case-insensitively) one of the given ids.
	 * 
	 * @param ids
	 *            the ids to look for
	 * @param node
	 *            the node the search starts from, may be null
	 * @return true when a matching id is found on the way up to the root
	 */
	final boolean hasAncestorId(String[] ids, TagNode node) {
		for (TagNode current = node; current != null; current = current
				.getParent()) {
			final String currentId = current.getAttributeByName("id");
			if (currentId == null)
				continue;
			for (String candidate : ids)
				if (candidate.equalsIgnoreCase(currentId))
					return true;
		}
		return false;
	}

	/**
	 * Tells whether the node, or one of its ancestors, belongs to the given
	 * set of nodes.
	 * 
	 * @param xpathSelectorSet
	 *            the set of selected nodes
	 * @param node
	 *            the node the search starts from, may be null
	 * @return true when the node or one of its ancestors is in the set
	 */
	final boolean hasAncestorXPath(Set xpathSelectorSet, TagNode node) {
		TagNode current = node;
		while (current != null) {
			if (xpathSelectorSet.contains(current))
				return true;
			current = current.getParent();
		}
		return false;
	}

	/**
	 * Handles a script element: removes it from the tree when it, or one of
	 * its ancestors, belongs to the disabled set; otherwise replaces its
	 * content by the XML-unescaped version when the two differ.
	 * 
	 * @param node
	 *            the node to check; ignored unless it is a script element
	 * @param disableScriptNodeSet
	 *            the nodes below which scripts are removed, may be null
	 */
	final private void checkScriptContent(TagNode node,
			Set disableScriptNodeSet) {
		if (!("script".equalsIgnoreCase(node.getName())))
			return;
		if (disableScriptNodeSet != null
				&& hasAncestorXPath(disableScriptNodeSet, node)) {
			node.removeFromTree();
			return;
		}
		// Only toString() is needed: no downcast to a concrete buffer class,
		// which would fail if the parser returned another CharSequence
		CharSequence text = node.getText();
		if (text == null)
			return;
		String content = text.toString();
		String newContent = StringEscapeUtils.unescapeXml(content);
		if (newContent.equals(content))
			return;
		node.removeAllChildren();
		node.addChild(new ContentNode(newContent));
	}

	/**
	 * Archives the content of an iframe as a separate HTML file and returns
	 * the local path to use as the new src of the iframe.
	 * 
	 * The frame source is read through the browser driver, not downloaded.
	 * While the frame is processed, baseUrl is set to the frame URL; the
	 * previous base URL and the main browsing context are always restored,
	 * even when archiving the frame fails.
	 * 
	 * @param parentUrl
	 *            the URL of the document holding the iframe
	 * @param node
	 *            the iframe node
	 * @param iframeNodeMap
	 *            the browser element of each iframe node, may be null
	 * @return the local path of the archived frame, or null when the browser
	 *         element of the iframe is not known
	 */
	final private String downloadIframe(URL parentUrl, TagNode node,
			Map<TagNode, WebElement> iframeNodeMap) throws IOException,
			ParserConfigurationException, SAXException, IllegalStateException,
			SearchLibException, URISyntaxException {
		if (iframeNodeMap == null) {
			Logging.warn("Unable to download IFRAME (no iframeNodeNap) " + node);
			return null;
		}
		WebElement webElement = iframeNodeMap.get(node);
		if (webElement == null) {
			Logging.warn("Issue when finding IFRAME for " + node);
			return null;
		}
		URL oldBaseUrl = baseUrl;
		try {
			String src = node.getAttributeByName("src");
			baseUrl = LinkUtils.getLink(parentUrl, src, null, false);
			// Register under the frame URL when it is usable and not taken
			// yet, otherwise fall back to a key derived from the node
			String urlFileMapKey;
			if (baseUrl != null
					&& !urlFileMap.containsKey(baseUrl.toExternalForm()))
				urlFileMapKey = baseUrl.toExternalForm();
			else
				urlFileMapKey = Integer.toString(node.hashCode());
			File destFile = getAndRegisterDestFile(urlFileMapKey, "iframe",
					"html");
			browserDriver.switchToFrame(webElement);
			try {
				String frameSource = browserDriver.getSourceCode();
				HtmlCleanerParser htmlCleanerParser = new HtmlCleanerParser();
				htmlCleanerParser.init(frameSource);
				recursiveArchive(htmlCleanerParser.getTagNode(), null,
						iframeNodeMap);
				htmlCleanerParser.writeHtmlToFile(destFile);
			} finally {
				// Never leave the driver inside the frame
				browserDriver.switchToMain();
			}
			return getLocalPath(parentUrl, destFile.getName());
		} finally {
			// Links of the parent document must not be resolved against the
			// frame URL
			baseUrl = oldBaseUrl;
		}
	}

	/**
	 * Archives the node when it is an iframe and updates its src attribute.
	 * 
	 * @param node
	 *            the node to check
	 * @param iframeNodeMap
	 *            the browser element of each iframe node, may be null
	 * @return true when the node is an iframe, even if it could not be
	 *         archived; false otherwise
	 */
	final private boolean downloadObjectIframe(TagNode node,
			Map iframeNodeMap)
			throws IllegalStateException, IOException,
			ParserConfigurationException, SAXException, SearchLibException,
			URISyntaxException {
		final boolean isIframe = "iframe".equalsIgnoreCase(node.getName());
		if (isIframe) {
			final String localSrc = downloadIframe(baseUrl, node,
					iframeNodeMap);
			if (localSrc != null)
				node.addAttribute("src", localSrc);
		}
		return isIframe;
	}

	/**
	 * Archives the resource referenced by the src attribute of the node and
	 * updates the attribute.
	 * 
	 * @param node
	 *            the node to check
	 * @return true when the node has a src attribute; false otherwise
	 */
	final private boolean downloadObjectSrc(TagNode node)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException {
		final String originalSrc = node.getAttributeByName("src");
		if (originalSrc == null)
			return false;
		final String localSrc = downloadObject(baseUrl, originalSrc, null);
		if (localSrc != null)
			node.addAttribute("src", localSrc);
		return true;
	}

	/**
	 * Archives the resource referenced by the href attribute of the node and
	 * updates the attribute.
	 * 
	 * The content type is taken from the type attribute. When it is missing,
	 * "text/javascript" is assumed for a script element and "text/css" for a
	 * link element whose rel is "stylesheet". Any other node without a type
	 * is left untouched.
	 * 
	 * @param node
	 *            the node to check
	 * @return true when a download has been attempted; false otherwise
	 */
	final private boolean downloadObjectLink(TagNode node)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException,
			ParserConfigurationException, SAXException {
		final String href = node.getAttributeByName("href");
		if (href == null)
			return false;

		String contentType = node.getAttributeByName("type");
		if (contentType == null) {
			final String tagName = node.getName();
			if (tagName.equalsIgnoreCase("script"))
				contentType = "text/javascript";
			else if (tagName.equalsIgnoreCase("link")
					&& "stylesheet".equalsIgnoreCase(node
							.getAttributeByName("rel")))
				contentType = "text/css";
			else
				return false;
		}

		final String localHref = downloadObject(baseUrl, href, contentType);
		if (localHref != null)
			node.addAttribute("href", localHref);
		return true;
	}

	/**
	 * Handles a base element: uses its href as the new base URL and removes
	 * the element from the tree, so the archived page resolves its links
	 * relative to its own location.
	 * 
	 * The href is resolved against the current base URL, which allows a
	 * relative value. When the href is still malformed, a warning is logged
	 * and the element is left in place.
	 * 
	 * @param node
	 *            the node to check, may be null
	 */
	final private void checkBaseHref(TagNode node) {
		if (node == null)
			return;
		if (!"base".equalsIgnoreCase(node.getName()))
			return;
		String href = node.getAttributeByName("href");
		if (href != null) {
			try {
				// An absolute href ignores the context; a null context
				// behaves like the single-argument constructor
				baseUrl = new URL(baseUrl, href);
			} catch (MalformedURLException e) {
				Logging.warn(e);
				return;
			}
		}
		node.removeFromTree();
	}

	/**
	 * Archives a node and then, recursively, its child tags.
	 * 
	 * For each node: the base element is handled first, then the node is
	 * processed as an iframe, or else through its src attribute, or else
	 * through its href attribute. Finally the style element content, the
	 * script content and the style attribute are checked.
	 * 
	 * @param node
	 *            the node to archive, may be null
	 * @param disableScriptNodeSet
	 *            the nodes below which scripts are removed, may be null
	 * @param iframeNodeMap
	 *            the browser element of each iframe node, may be null
	 */
	final private void recursiveArchive(TagNode node,
			Set disableScriptNodeSet,
			Map iframeNodeMap)
			throws ClientProtocolException, IllegalStateException, IOException,
			SearchLibException, URISyntaxException,
			ParserConfigurationException, SAXException {
		if (node == null)
			return;
		checkBaseHref(node);

		// Only the first handler that accepts the node is applied
		boolean handled = downloadObjectIframe(node, iframeNodeMap);
		if (!handled)
			handled = downloadObjectSrc(node);
		if (!handled)
			downloadObjectLink(node);

		checkStyleCSS(node);
		checkScriptContent(node, disableScriptNodeSet);
		checkStyleAttribute(node);

		final TagNode[] children = node.getChildTags();
		if (children != null) {
			for (TagNode child : children)
				recursiveArchive(child, disableScriptNodeSet, iframeNodeMap);
		}
	}

	/**
	 * Archives the page currently loaded in the given browser driver.
	 * 
	 * The page source is parsed, the iframes located by the browser are
	 * matched with the parsed nodes using their XPath, the nodes selected by
	 * the given XPath expressions are collected so their scripts can be
	 * removed, and the whole tree is archived. The rewritten page is written
	 * to the index file and the untouched source to the source file, using
	 * the charset found by the parser when there is one.
	 * 
	 * NOTE(review): the browserDriver parameter hides the field of the same
	 * name; iframe content is read through the field (see downloadIframe) -
	 * confirm that both are expected to be the same driver.
	 * 
	 * @param browserDriver
	 *            the driver holding the loaded page
	 * @param xPathDisableScriptSet
	 *            XPath expressions selecting the nodes below which scripts
	 *            are removed, may be null or empty
	 */
	final public void archive(BrowserDriver browserDriver,
			Set<String> xPathDisableScriptSet) throws IOException,
			ParserConfigurationException, SAXException, IllegalStateException,
			SearchLibException, URISyntaxException, XPatherException {
		String pageSource = browserDriver.getSourceCode();
		HtmlCleanerParser htmlCleanerParser = new HtmlCleanerParser();
		htmlCleanerParser.init(pageSource);
		// Find iframe
		Set<WebElement> iframeWebElementSet = new HashSet<WebElement>();
		browserDriver.locateBy(By.tagName("iframe"), iframeWebElementSet, true);
		Map<TagNode, WebElement> iframeNodeMap = null;
		if (!iframeWebElementSet.isEmpty()) {
			iframeNodeMap = new HashMap<TagNode, WebElement>();
			// Reused for each iframe, hence the clear() at the end of the loop
			Set<TagNode> tagNodeSet = new HashSet<TagNode>();
			for (WebElement webElement : iframeWebElementSet) {
				String xPath = browserDriver.getXPath(webElement, true);
				if (xPath == null)
					continue;
				if (htmlCleanerParser.xpath(xPath, tagNodeSet) == 0) {
					Logging.warn("IFRAME not found using XPath: " + xPath);
					continue;
				}
				for (TagNode tagNode : tagNodeSet)
					iframeNodeMap.put(tagNode, webElement);
				tagNodeSet.clear();
			}
		}
		// Find node that need to be disabled
		Set<TagNode> disableScriptNodeSet = null;
		if (xPathDisableScriptSet != null && xPathDisableScriptSet.size() > 0) {
			disableScriptNodeSet = new HashSet<TagNode>();
			for (String xPath : xPathDisableScriptSet)
				if (htmlCleanerParser.xpath(xPath, disableScriptNodeSet) == 0)
					Logging.warn("DisableScript not found using XPath: "
							+ xPath);
		}
		recursiveArchive(htmlCleanerParser.getTagNode(), disableScriptNodeSet,
				iframeNodeMap);
		htmlCleanerParser.writeHtmlToFile(indexFile);
		String charset = htmlCleanerParser.findCharset();
		if (charset == null)
			FileUtils.write(sourceFile, pageSource);
		else
			FileUtils.write(sourceFile, pageSource, charset);

	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy