All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.crawler.web.spider.ClickCapture Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.spider;

import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import javax.imageio.ImageIO;
import javax.xml.bind.annotation.XmlTransient;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver;
import com.jaeksoft.searchlib.script.ScriptCommandContext;
import com.jaeksoft.searchlib.script.commands.Selectors;
import com.jaeksoft.searchlib.util.FileUtils;
import com.jaeksoft.searchlib.util.ImageUtils;
import com.jaeksoft.searchlib.util.StringUtils;

@JsonInclude(Include.NON_NULL)
public final class ClickCapture implements Comparable {

	private final Selectors.Selector selector;
	private final Collection webElements;
	private final Rectangle firstElementBox;

	public String anchorHref = null;
	public String finalUrl = null;
	public String embedSrc = null;
	public String imgSrc = null;
	public String filename = null;
	public String file_md5 = null;
	public String file_phash = null;

	public ClickCapture(BrowserDriver browserDriver,
			Selectors.Selector selector, Collection webElements) {
		this.selector = selector;
		this.webElements = webElements;
		Rectangle box = null;
		if (webElements != null) {
			for (WebElement webElement : webElements) {
				Rectangle r = browserDriver.getRectangle(webElement);
				if (r.width > 0 && r.height > 0) {
					box = r;
					break;
				}
			}
		}
		firstElementBox = box;
	}

	private boolean isEmpty() {
		return anchorHref == null && finalUrl == null && imgSrc == null
				&& embedSrc == null;
	}

	@Override
	public int compareTo(ClickCapture o) {
		int c;
		if ((c = StringUtils.compareNullString(anchorHref, o.anchorHref)) != 0)
			return c;
		if ((c = StringUtils.compareNullString(embedSrc, o.embedSrc)) != 0)
			return c;
		if ((c = StringUtils.compareNullString(imgSrc, o.imgSrc)) != 0)
			return c;
		return 0;
	}

	@JsonIgnore
	@XmlTransient
	private String sql(String sql) {
		sql = StringUtils.replace(
				sql,
				"{custom}",
				selector.custom == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(selector.custom));
		sql = StringUtils.replace(
				sql,
				"{anchor_href}",
				anchorHref == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(anchorHref));
		sql = StringUtils.replace(
				sql,
				"{final_url}",
				finalUrl == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(finalUrl));
		sql = StringUtils.replace(
				sql,
				"{embed_src}",
				embedSrc == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(embedSrc));
		sql = StringUtils.replace(
				sql,
				"{img_src}",
				imgSrc == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(imgSrc));
		sql = StringUtils.replace(
				sql,
				"{filename}",
				filename == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(filename));
		sql = StringUtils.replace(
				sql,
				"{file_md5}",
				file_md5 == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(file_md5));
		sql = StringUtils.replace(
				sql,
				"{file_phash}",
				file_phash == null ? StringUtils.EMPTY : StringEscapeUtils
						.escapeEcmaScript(file_phash));
		return sql;
	}

	private static String performClickGetUrl(BrowserDriver browserDriver,
			String url) throws IOException, SearchLibException {
		if (url == null)
			return null;
		try {
			browserDriver.openNewWindow();
			browserDriver.get(url);
			return browserDriver.getCurrentUrl();
		} catch (org.openqa.selenium.TimeoutException e) {
			Logging.warn(e);
			return null;
		}
	}

	private boolean locateAimgClickCapture(WebElement aElement) {
		if (!aElement.isDisplayed())
			return false;
		String ahref = aElement.getAttribute("href");
		List imgElements = aElement.findElements(By
				.cssSelector("img"));
		if (imgElements == null)
			return false;
		for (WebElement imgElement : imgElements) {
			if (!imgElement.isDisplayed())
				continue;
			imgSrc = imgElement.getAttribute("src");
		}
		anchorHref = ahref;
		return true;
	}

	private boolean locateAimgClickCapture(List aElements)
			throws SearchLibException, IOException {
		if (aElements == null)
			return false;
		for (WebElement aElement : aElements)
			if (locateAimgClickCapture(aElement))
				return true;
		return false;
	}

	private boolean locateEmbedClickCapture(List embedElements)
			throws SearchLibException, IOException {
		if (embedElements == null)
			return false;
		for (WebElement embedElement : embedElements) {
			if (!embedElement.isDisplayed())
				continue;
			embedSrc = embedElement.getAttribute("src");
			String flashVars = embedElement.getAttribute("flashvars");
			if (!StringUtils.isEmpty(flashVars)) {
				try {
					URI uri = new URI(embedSrc);
					flashVars = uri.getQuery();
				} catch (URISyntaxException e) {
					Logging.warn(e);
				}
			}
			String[] params = StringUtils.split(flashVars, '&');
			Map paramMap = new TreeMap();
			if (params != null) {
				for (String param : params) {
					String[] keyValue = StringUtils.split(param, '=');
					if (keyValue != null && keyValue.length == 2)
						paramMap.put(keyValue[0].toLowerCase(),
								URLDecoder.decode(keyValue[1], "UTF-8"));
				}
			}
			if (selector.flashVarsLink != null)
				anchorHref = paramMap.get(selector.flashVarsLink);
			return true;
		}
		return false;
	}

	private boolean locateElement(BrowserDriver browserDriver,
			WebElement webElement) throws SearchLibException, IOException {
		By by = By.cssSelector("a");
		List aElements = webElement == null ? browserDriver
				.locateBy(by) : webElement.findElements(by);
		if (locateAimgClickCapture(aElements))
			return true;
		by = By.cssSelector("embed");
		List embedElements = webElement == null ? browserDriver
				.locateBy(by) : webElement.findElements(by);
		if (locateEmbedClickCapture(embedElements))
			return true;
		by = By.cssSelector("object > object");
		List objectElements = webElement == null ? browserDriver
				.locateBy(by) : webElement.findElements(by);
		if (locateEmbedClickCapture(objectElements))
			return true;
		by = By.tagName("iframe");
		List iFrameElements = webElement == null ? browserDriver
				.locateBy(by) : webElement.findElements(by);
		if (locateIFrame(browserDriver, iFrameElements))
			return true;
		return false;
	}

	private boolean locateIFrame(BrowserDriver browserDriver,
			List iFrameElements) throws SearchLibException,
			IOException {
		try {
			if (CollectionUtils.isEmpty(iFrameElements))
				return false;
			for (WebElement frameWebElement : iFrameElements) {
				if (!frameWebElement.isDisplayed())
					continue;
				browserDriver.switchToFrame(frameWebElement);
				if (locateElement(browserDriver, null))
					return true;
			}
			return false;
		} finally {
			browserDriver.switchToMain();
		}
	}

	private void locate(BrowserDriver browserDriver) {
		try {
			if (CollectionUtils.isEmpty(webElements))
				return;
			for (WebElement webElement : webElements) {
				if ("img".equalsIgnoreCase(webElement.getTagName())) {
					webElement = browserDriver.getParent("a", webElement);
					if (webElement != null)
						if (locateAimgClickCapture(webElement))
							return;
				}
				if (locateElement(browserDriver, webElement))
					return;
			}
		} catch (Exception e) {
			Logging.warn(e);
		}
	}

	/**
	 * Try to locate a/img object/object or object/embed items
	 * 
	 * @param browserDriver
	 * @param clickCaptures
	 */
	public static void locate(BrowserDriver browserDriver,
			Collection clickCaptures) {
		for (ClickCapture clickCapture : clickCaptures)
			clickCapture.locate(browserDriver);
	}

	private void click(BrowserDriver browserDriver,
			HtmlArchiver htmlArchiver, BufferedImage screenshot) {
		try {
			filename = null;
			if (isEmpty()) {
				if (CollectionUtils.isEmpty(webElements))
					return;
				if (firstElementBox == null || htmlArchiver == null)
					return;
				if (firstElementBox.width == 0 || firstElementBox.height == 0)
					throw new SearchLibException(
							"Box height or width is null: " + selector);
				BufferedImage image = ImageUtils.getSubImage(screenshot,
						firstElementBox);
				File imageFile = htmlArchiver.getAndRegisterDestFile(null,
						"clickCapture", "png");
				ImageIO.write(image, "png", imageFile);
				filename = imageFile.getName();
				imgSrc = imageFile.getName();
			} else {
				if (htmlArchiver != null) {
					if (imgSrc != null)
						filename = htmlArchiver.getUrlFileName(imgSrc);
					else if (embedSrc != null)
						filename = htmlArchiver.getUrlFileName(embedSrc);
				}
			}
			if (filename != null) {
				File file = htmlArchiver.getLocalFile(filename);
				if (file.exists()) {
					file_md5 = FileUtils.computeMd5(file);
					if (imgSrc != null)
						file_phash = ImageUtils.computePHash(file);
				}
			}
			finalUrl = performClickGetUrl(browserDriver, anchorHref);
		} catch (Exception e) {
			Logging.warn(e);
		}
	}

	/**
	 * Collect the final URL
	 * 
	 * @param browserDriver
	 * @param results
	 * @param htmlArchiver
	 * @throws SearchLibException
	 * @throws IOException
	 */
	public static void click(BrowserDriver browserDriver,
			Collection clickCaptures, HtmlArchiver htmlArchiver,
			BufferedImage screenshot) throws SearchLibException, IOException {
		String window = browserDriver.getWindow();
		try {
			for (ClickCapture clickCapture : clickCaptures)
				clickCapture.click(browserDriver, htmlArchiver, screenshot);
		} finally {
			if (window != null)
				browserDriver.switchToWindow(window);
		}
	}

	/**
	 * 
	 * @param context
	 * @param clickCaptureSql
	 * @param clickCaptures
	 */
	public static void sql(ScriptCommandContext context,
			String clickCaptureSql, Collection clickCaptures) {
		for (ClickCapture clickCapture : clickCaptures) {
			String sql = clickCapture.sql(clickCaptureSql);
			try {
				context.executeSqlUpdate(sql);
			} catch (Exception e) {
				Logging.warn(e);
			}
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy