com.jaeksoft.searchlib.crawler.web.spider.ClickCapture Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.spider;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import javax.imageio.ImageIO;
import javax.xml.bind.annotation.XmlTransient;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver;
import com.jaeksoft.searchlib.script.ScriptCommandContext;
import com.jaeksoft.searchlib.script.commands.Selectors;
import com.jaeksoft.searchlib.util.FileUtils;
import com.jaeksoft.searchlib.util.ImageUtils;
import com.jaeksoft.searchlib.util.StringUtils;
@JsonInclude(Include.NON_NULL)
public final class ClickCapture implements Comparable {
private final Selectors.Selector selector;
private final Collection webElements;
private final Rectangle firstElementBox;
public String anchorHref = null;
public String finalUrl = null;
public String embedSrc = null;
public String imgSrc = null;
public String filename = null;
public String file_md5 = null;
public String file_phash = null;
public ClickCapture(BrowserDriver> browserDriver,
Selectors.Selector selector, Collection webElements) {
this.selector = selector;
this.webElements = webElements;
Rectangle box = null;
if (webElements != null) {
for (WebElement webElement : webElements) {
Rectangle r = browserDriver.getRectangle(webElement);
if (r.width > 0 && r.height > 0) {
box = r;
break;
}
}
}
firstElementBox = box;
}
private boolean isEmpty() {
return anchorHref == null && finalUrl == null && imgSrc == null
&& embedSrc == null;
}
@Override
public int compareTo(ClickCapture o) {
int c;
if ((c = StringUtils.compareNullString(anchorHref, o.anchorHref)) != 0)
return c;
if ((c = StringUtils.compareNullString(embedSrc, o.embedSrc)) != 0)
return c;
if ((c = StringUtils.compareNullString(imgSrc, o.imgSrc)) != 0)
return c;
return 0;
}
@JsonIgnore
@XmlTransient
private String sql(String sql) {
sql = StringUtils.replace(
sql,
"{custom}",
selector.custom == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(selector.custom));
sql = StringUtils.replace(
sql,
"{anchor_href}",
anchorHref == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(anchorHref));
sql = StringUtils.replace(
sql,
"{final_url}",
finalUrl == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(finalUrl));
sql = StringUtils.replace(
sql,
"{embed_src}",
embedSrc == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(embedSrc));
sql = StringUtils.replace(
sql,
"{img_src}",
imgSrc == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(imgSrc));
sql = StringUtils.replace(
sql,
"{filename}",
filename == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(filename));
sql = StringUtils.replace(
sql,
"{file_md5}",
file_md5 == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(file_md5));
sql = StringUtils.replace(
sql,
"{file_phash}",
file_phash == null ? StringUtils.EMPTY : StringEscapeUtils
.escapeEcmaScript(file_phash));
return sql;
}
private static String performClickGetUrl(BrowserDriver> browserDriver,
String url) throws IOException, SearchLibException {
if (url == null)
return null;
try {
browserDriver.openNewWindow();
browserDriver.get(url);
return browserDriver.getCurrentUrl();
} catch (org.openqa.selenium.TimeoutException e) {
Logging.warn(e);
return null;
}
}
private boolean locateAimgClickCapture(WebElement aElement) {
if (!aElement.isDisplayed())
return false;
String ahref = aElement.getAttribute("href");
List imgElements = aElement.findElements(By
.cssSelector("img"));
if (imgElements == null)
return false;
for (WebElement imgElement : imgElements) {
if (!imgElement.isDisplayed())
continue;
imgSrc = imgElement.getAttribute("src");
}
anchorHref = ahref;
return true;
}
private boolean locateAimgClickCapture(List aElements)
throws SearchLibException, IOException {
if (aElements == null)
return false;
for (WebElement aElement : aElements)
if (locateAimgClickCapture(aElement))
return true;
return false;
}
private boolean locateEmbedClickCapture(List embedElements)
throws SearchLibException, IOException {
if (embedElements == null)
return false;
for (WebElement embedElement : embedElements) {
if (!embedElement.isDisplayed())
continue;
embedSrc = embedElement.getAttribute("src");
String flashVars = embedElement.getAttribute("flashvars");
if (!StringUtils.isEmpty(flashVars)) {
try {
URI uri = new URI(embedSrc);
flashVars = uri.getQuery();
} catch (URISyntaxException e) {
Logging.warn(e);
}
}
String[] params = StringUtils.split(flashVars, '&');
Map paramMap = new TreeMap();
if (params != null) {
for (String param : params) {
String[] keyValue = StringUtils.split(param, '=');
if (keyValue != null && keyValue.length == 2)
paramMap.put(keyValue[0].toLowerCase(),
URLDecoder.decode(keyValue[1], "UTF-8"));
}
}
if (selector.flashVarsLink != null)
anchorHref = paramMap.get(selector.flashVarsLink);
return true;
}
return false;
}
private boolean locateElement(BrowserDriver> browserDriver,
WebElement webElement) throws SearchLibException, IOException {
By by = By.cssSelector("a");
List aElements = webElement == null ? browserDriver
.locateBy(by) : webElement.findElements(by);
if (locateAimgClickCapture(aElements))
return true;
by = By.cssSelector("embed");
List embedElements = webElement == null ? browserDriver
.locateBy(by) : webElement.findElements(by);
if (locateEmbedClickCapture(embedElements))
return true;
by = By.cssSelector("object > object");
List objectElements = webElement == null ? browserDriver
.locateBy(by) : webElement.findElements(by);
if (locateEmbedClickCapture(objectElements))
return true;
by = By.tagName("iframe");
List iFrameElements = webElement == null ? browserDriver
.locateBy(by) : webElement.findElements(by);
if (locateIFrame(browserDriver, iFrameElements))
return true;
return false;
}
private boolean locateIFrame(BrowserDriver> browserDriver,
List iFrameElements) throws SearchLibException,
IOException {
try {
if (CollectionUtils.isEmpty(iFrameElements))
return false;
for (WebElement frameWebElement : iFrameElements) {
if (!frameWebElement.isDisplayed())
continue;
browserDriver.switchToFrame(frameWebElement);
if (locateElement(browserDriver, null))
return true;
}
return false;
} finally {
browserDriver.switchToMain();
}
}
private void locate(BrowserDriver> browserDriver) {
try {
if (CollectionUtils.isEmpty(webElements))
return;
for (WebElement webElement : webElements) {
if ("img".equalsIgnoreCase(webElement.getTagName())) {
webElement = browserDriver.getParent("a", webElement);
if (webElement != null)
if (locateAimgClickCapture(webElement))
return;
}
if (locateElement(browserDriver, webElement))
return;
}
} catch (Exception e) {
Logging.warn(e);
}
}
/**
* Try to locate a/img object/object or object/embed items
*
* @param browserDriver
* @param clickCaptures
*/
public static void locate(BrowserDriver> browserDriver,
Collection clickCaptures) {
for (ClickCapture clickCapture : clickCaptures)
clickCapture.locate(browserDriver);
}
private void click(BrowserDriver> browserDriver,
HtmlArchiver htmlArchiver, BufferedImage screenshot) {
try {
filename = null;
if (isEmpty()) {
if (CollectionUtils.isEmpty(webElements))
return;
if (firstElementBox == null || htmlArchiver == null)
return;
if (firstElementBox.width == 0 || firstElementBox.height == 0)
throw new SearchLibException(
"Box height or width is null: " + selector);
BufferedImage image = ImageUtils.getSubImage(screenshot,
firstElementBox);
File imageFile = htmlArchiver.getAndRegisterDestFile(null,
"clickCapture", "png");
ImageIO.write(image, "png", imageFile);
filename = imageFile.getName();
imgSrc = imageFile.getName();
} else {
if (htmlArchiver != null) {
if (imgSrc != null)
filename = htmlArchiver.getUrlFileName(imgSrc);
else if (embedSrc != null)
filename = htmlArchiver.getUrlFileName(embedSrc);
}
}
if (filename != null) {
File file = htmlArchiver.getLocalFile(filename);
if (file.exists()) {
file_md5 = FileUtils.computeMd5(file);
if (imgSrc != null)
file_phash = ImageUtils.computePHash(file);
}
}
finalUrl = performClickGetUrl(browserDriver, anchorHref);
} catch (Exception e) {
Logging.warn(e);
}
}
/**
* Collect the final URL
*
* @param browserDriver
* @param results
* @param htmlArchiver
* @throws SearchLibException
* @throws IOException
*/
public static void click(BrowserDriver> browserDriver,
Collection clickCaptures, HtmlArchiver htmlArchiver,
BufferedImage screenshot) throws SearchLibException, IOException {
String window = browserDriver.getWindow();
try {
for (ClickCapture clickCapture : clickCaptures)
clickCapture.click(browserDriver, htmlArchiver, screenshot);
} finally {
if (window != null)
browserDriver.switchToWindow(window);
}
}
/**
*
* @param context
* @param clickCaptureSql
* @param clickCaptures
*/
public static void sql(ScriptCommandContext context,
String clickCaptureSql, Collection clickCaptures) {
for (ClickCapture clickCapture : clickCaptures) {
String sql = clickCapture.sql(clickCaptureSql);
try {
context.executeSqlUpdate(sql);
} catch (Exception e) {
Logging.warn(e);
}
}
}
}