
com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine. Using the web user interface,
the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily
integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and
Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.browser;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.imageio.ImageIO;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.htmlcleaner.XPatherException;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriver.Timeouts;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.interactions.Action;
import org.openqa.selenium.interactions.Actions;
import org.xml.sax.SAXException;
import com.google.common.base.Charsets;
import com.google.common.io.Resources;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.database.CookieItem;
import com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.script.commands.Selectors.Selector;
import com.jaeksoft.searchlib.util.IOUtils;
public abstract class BrowserDriver implements Closeable {
/** Driver kind supplied at construction time; exposed through getType(). */
protected final BrowserDriverEnum type;
/**
 * The wrapped driver. Assigned from initialize() in the constructor and reset to
 * null by close().
 * NOTE(review): T is not declared on the visible class header; presumably a type
 * parameter bounded by WebDriver was lost from this copy. Confirm against the
 * original source.
 */
protected T driver = null;
/**
 * Stores the driver kind and immediately creates the wrapped driver by calling
 * initialize().
 *
 * @param type
 *            the driver kind later returned by getType()
 */
protected BrowserDriver(BrowserDriverEnum type) {
	this.type = type;
	this.driver = initialize();
}
/**
 * Creates the driver instance that this object wraps. It is invoked from the
 * constructor, so it runs before any field initializer of the implementing
 * subclass has executed.
 *
 * @return the instance stored in the driver field
 */
protected abstract T initialize();
/**
 * Quits the wrapped driver and drops the reference to it. Does nothing when no
 * driver is currently held.
 */
@Override
public void close() throws IOException {
	if (driver != null) {
		driver.quit();
		driver = null;
	}
}
/**
 * Asks the wrapped driver to load the given URL.
 *
 * @param sUrl
 *            the address to load
 */
public final void get(String sUrl) {
	this.driver.get(sUrl);
}
/**
 * @return the driver kind given to the constructor
 */
public BrowserDriverEnum getType() {
	return this.type;
}
/**
 * Executes a script through the wrapped driver.
 *
 * @param javascript
 *            the script source handed to executeScript
 * @param faultTolerant
 *            when true, any failure is logged as a warning and null is
 *            returned instead of an exception being thrown
 * @param objects
 *            the arguments forwarded to the script
 * @return the value produced by executeScript, or null after a tolerated
 *         failure
 * @throws IOException
 *             if the driver cannot execute scripts and faultTolerant is false
 * @throws SearchLibException
 *             wrapping any other failure when faultTolerant is false
 */
public Object javascript(String javascript, boolean faultTolerant, Object... objects)
		throws IOException, SearchLibException {
	// Capability check first: an unsupported driver is reported as an IOException.
	if (!(driver instanceof JavascriptExecutor)) {
		IOException unsupported = new IOException("The Web driver does not support javascript execution");
		if (!faultTolerant) {
			throw unsupported;
		}
		Logging.warn(unsupported);
		return null;
	}
	try {
		return ((JavascriptExecutor) driver).executeScript(javascript, objects);
	} catch (Exception failure) {
		if (!faultTolerant) {
			throw new SearchLibException(failure);
		}
		Logging.warn(failure);
		return null;
	}
}
/**
 * Runs document.getElementsByTagName in the page for the given tag name, passed
 * to the script as arguments[0], and returns the script result cast to a list.
 * The call is delegated to javascript(), so the result is null when the script
 * failed and faultTolerant is true.
 *
 * NOTE(review): the generic arguments of List appear to have been lost in this
 * copy of the file (a stray '>' remains after each List). The element type must
 * be restored from the original source before this method can compile.
 *
 * @param tag
 *            the tag name to look up
 * @param faultTolerant
 *            forwarded to javascript()
 */
public List> getElementByTag(String tag, boolean faultTolerant) throws IOException, SearchLibException {
List> result = (List>) javascript("return document.getElementsByTagName(arguments[0])", faultTolerant, tag);
return result;
}
/**
 * Reads the innerHTML of the first body element by running a script in the page.
 *
 * @return the markup inside the body element
 * @throws IOException
 *             if the driver cannot execute scripts
 * @throws SearchLibException
 *             if the script execution fails
 */
public String getJavascriptInnerHtml() throws IOException, SearchLibException {
	// The script is executed as a function body, so the value must be returned
	// explicitly; without "return" the call always yields null.
	return (String) javascript("return document.getElementsByTagName('body')[0].innerHTML", false);
}
/** Cached content of get_xpath.js with every line trimmed and joined; null until first use. */
private static String XPATH_SCRIPT = null;

/**
 * Returns the XPath extraction script, loading it from the class path on the
 * first call and serving the cached copy afterwards. Each line of the resource
 * is trimmed and the lines are concatenated without a separator.
 *
 * @return the script source as a single line
 * @throws IOException
 *             if the resource cannot be read
 */
private static synchronized String getXPath() throws IOException {
	if (XPATH_SCRIPT != null) {
		return XPATH_SCRIPT;
	}
	// Resolve through the class rather than a bare class loader: Class lookups
	// accept the leading '/' as an absolute name, whereas the single-argument
	// Resources.getResource goes through ClassLoader.getResource, for which a
	// leading '/' does not resolve with standard class loaders.
	URL url = Resources.getResource(BrowserDriver.class,
			"/com/jaeksoft/searchlib/crawler/web/browser/get_xpath.js");
	String content = Resources.toString(url, Charsets.UTF_8);
	StringBuilder sb = new StringBuilder();
	BufferedReader br = new BufferedReader(new StringReader(content));
	try {
		String line;
		while ((line = br.readLine()) != null) {
			sb.append(line.trim());
		}
	} finally {
		br.close();
	}
	XPATH_SCRIPT = sb.toString();
	return XPATH_SCRIPT;
}
/**
 * Computes the XPath of an element by running the XPath script against it.
 * A warning is logged when no path could be obtained.
 *
 * @param webElement
 *            the element passed to the script
 * @param faultTolerant
 *            forwarded to javascript()
 * @return the script result as a string, possibly null
 */
public String getXPath(WebElement webElement, boolean faultTolerant) throws IOException, SearchLibException {
	final Object scriptResult = javascript(getXPath(), faultTolerant, webElement);
	final String xPath = (String) scriptResult;
	if (xPath == null) {
		Logging.warn("XPATH extraction failed on " + webElement);
	}
	return xPath;
}
/**
 * Captures a screenshot through the wrapped driver and decodes it as an image.
 *
 * @return the value returned by ImageIO.read for the screenshot bytes
 * @throws IOException
 *             if the driver cannot take screenshots or the bytes cannot be read
 */
public final BufferedImage getScreenshot() throws IOException {
	if (driver instanceof TakesScreenshot) {
		byte[] imageBytes = ((TakesScreenshot) driver).getScreenshotAs(OutputType.BYTES);
		ByteArrayInputStream input = new ByteArrayInputStream(imageBytes);
		return ImageIO.read(input);
	}
	throw new IOException("This browser driver does not support screenshot");
}
/**
 * Builds the bounding box of an element from its location and size.
 *
 * @param element
 *            the element to measure, may be null
 * @return the bounding box, or null when element is null
 */
public final Rectangle getRectangle(WebElement element) {
	if (element == null) {
		return null;
	}
	// Query the driver once per property so both coordinates (and both
	// dimensions) come from the same reading instead of two separate calls.
	org.openqa.selenium.Point location = element.getLocation();
	Dimension size = element.getSize();
	return new Rectangle(location.x, location.y, size.width, size.height);
}
/**
 * @return the page source reported by the wrapped driver
 */
public String getSourceCode() throws IOException, SearchLibException {
	return this.driver.getPageSource();
}
/**
 * Loads the given URL, then returns the page source reported by the driver.
 *
 * @param sUrl
 *            the address to load
 * @return the page source after loading
 */
public final String getSourceCode(String sUrl) {
	driver.get(sUrl);
	return driver.getPageSource();
}
/**
 * Returns the text of the body element of the current page.
 *
 * @return the body text, or null when no body element is found
 */
public final String getJavascriptBody() {
	String bodyText;
	try {
		WebElement body = driver.findElement(By.tagName("body"));
		bodyText = body.getText();
	} catch (NoSuchElementException e) {
		bodyText = null;
	}
	return bodyText;
}
/**
 * @return the title reported by the wrapped driver for the current page
 */
public final String getTitle() {
	return this.driver.getTitle();
}
/**
 * Loads the given URL, then returns the title reported by the driver.
 *
 * @param sUrl
 *            the address to load
 * @return the page title after loading
 */
public final String getTitle(String sUrl) {
	driver.get(sUrl);
	return driver.getTitle();
}
/**
 * Resizes the browser window managed by the wrapped driver.
 *
 * @param width
 *            the new window width
 * @param height
 *            the new window height
 */
public final void setSize(int width, int height) throws SearchLibException {
	Dimension target = new Dimension(width, height);
	driver.manage().window().setSize(target);
}
/**
 * Applies the page-load and script timeouts, both expressed in seconds.
 *
 * @param pageLoad
 *            page-load timeout in seconds; null leaves the current value
 *            untouched
 * @param script
 *            script timeout in seconds; null leaves the current value
 *            untouched
 */
public final void setTimeouts(Integer pageLoad, Integer script) {
	Timeouts timeOuts = driver.manage().timeouts();
	// The parameters are boxed, so guard against null instead of failing with
	// a NullPointerException during unboxing.
	if (pageLoad != null) {
		timeOuts.pageLoadTimeout(pageLoad.longValue(), TimeUnit.SECONDS);
	}
	if (script != null) {
		timeOuts.setScriptTimeout(script.longValue(), TimeUnit.SECONDS);
	}
}
/**
 * Finds every element of the current page matching the locator.
 *
 * @param by
 *            the locator
 * @return the elements returned by the driver
 */
public final List<WebElement> locateBy(By by) throws SearchLibException {
	return driver.findElements(by);
}
/**
 * Finds the elements matching the locator and appends them to the given
 * collection.
 *
 * @param by
 *            the locator
 * @param elements
 *            the collection receiving the matches
 * @param faultTolerant
 *            when true, a failure is logged as a warning and 0 is returned
 * @return the number of elements found, 0 when none or after a tolerated
 *         failure
 * @throws SearchLibException
 *             if the lookup fails and faultTolerant is false
 */
public final int locateBy(By by, Collection<? super WebElement> elements, boolean faultTolerant)
		throws SearchLibException {
	try {
		List<WebElement> list = driver.findElements(by);
		if (list == null) {
			return 0;
		}
		elements.addAll(list);
		return list.size();
	} catch (Exception e) {
		if (!faultTolerant) {
			// Keep the root cause attached. initCause is used because only the
			// single-argument constructors are visible from this file.
			SearchLibException failure = new SearchLibException("Web element location failed: " + by);
			failure.initCause(e);
			throw failure;
		}
		Logging.warn(e);
		return 0;
	}
}
/**
 * Finds the elements matching the locator below the given element.
 *
 * @param originElement
 *            the element to search from, may be null
 * @param by
 *            the locator
 * @param faultTolerant
 *            when true, a failure is logged as a warning and null is returned
 * @return the matching elements; null when originElement is null or after a
 *         tolerated failure
 * @throws SearchLibException
 *             if the lookup fails and faultTolerant is false
 */
public final List<WebElement> locateBy(WebElement originElement, By by, boolean faultTolerant)
		throws SearchLibException {
	if (originElement == null) {
		return null;
	}
	try {
		return originElement.findElements(by);
	} catch (Exception e) {
		if (!faultTolerant) {
			// Keep the root cause attached. initCause is used because only the
			// single-argument constructors are visible from this file.
			SearchLibException failure = new SearchLibException("Web element location failed: " + by);
			failure.initCause(e);
			throw failure;
		}
		Logging.warn(e);
		return null;
	}
}
/**
 * Archives the page currently loaded in the driver.
 * <p>
 * For every selector flagged with disableScript, the matching elements are
 * located (failures tolerated) and their XPath is computed. The resulting set of
 * XPath strings is handed to the archiver together with this driver.
 *
 * @param httpDownloader
 *            passed to the HtmlArchiver constructor
 * @param parentDirectory
 *            passed to the HtmlArchiver constructor
 * @param selectors
 *            the selectors to inspect, may be null
 * @return the archiver after archive() has run
 */
public final HtmlArchiver saveArchive(HttpDownloader httpDownloader, File parentDirectory,
		Collection<? extends Selector> selectors)
		throws ClientProtocolException, IllegalStateException, IOException, SearchLibException,
		URISyntaxException, SAXException, ParserConfigurationException, ClassCastException,
		ClassNotFoundException, InstantiationException, IllegalAccessException, XPatherException {
	URL currentURL = new URL(driver.getCurrentUrl());
	HtmlArchiver archiver = new HtmlArchiver(this, parentDirectory, httpDownloader, currentURL);

	// Collect the elements whose scripts must be disabled.
	Set<WebElement> disableScriptWebElements = new HashSet<WebElement>();
	if (selectors != null) {
		for (Selector selector : selectors) {
			if (selector.disableScript) {
				locateBy(selector.getBy(), disableScriptWebElements, true);
			}
		}
	}

	// Translate them to XPath; elements whose path could not be computed are skipped.
	Set<String> xPathDisableScriptSet = new HashSet<String>();
	for (WebElement webElement : disableScriptWebElements) {
		String xPath = getXPath(webElement, true);
		if (xPath != null) {
			xPathDisableScriptSet.add(xPath);
		}
	}

	archiver.archive(this, xPathDisableScriptSet);
	return archiver;
}
/**
 * @return the handle of the window the driver is currently focused on
 */
public final String getWindow() {
	return this.driver.getWindowHandle();
}
/**
 * Moves the driver focus to the window identified by the given handle.
 *
 * @param window
 *            the window handle
 */
public final void switchToWindow(String window) {
	this.driver.switchTo().window(window);
}
/**
 * Moves the driver focus into the given frame element.
 *
 * @param frameWebelement
 *            the frame element to enter
 */
public final void switchToFrame(WebElement frameWebelement) {
	this.driver.switchTo().frame(frameWebelement);
}
/**
 * Moves the driver focus back to the default content of the page.
 */
public final void switchToMain() {
	this.driver.switchTo().defaultContent();
}
/**
 * Writes the source of a frame to "source.html" inside the capture directory.
 * The directory is created when it does not exist. The driver focus is always
 * moved back to the default content once the frame has been entered.
 *
 * @param frameWebelement
 *            the frame whose source is captured
 * @param captureDirectory
 *            the directory receiving source.html
 * @throws IOException
 *             if the source cannot be written
 * @throws SearchLibException
 *             if the source cannot be obtained
 */
public final void getFrameSource(WebElement frameWebelement, File captureDirectory)
		throws IOException, SearchLibException {
	if (!captureDirectory.exists()) {
		captureDirectory.mkdir();
	}
	File sourceFile = new File(captureDirectory, "source.html");
	switchToFrame(frameWebelement);
	try {
		FileUtils.write(sourceFile, getSourceCode());
	} finally {
		// Restore the focus even when reading or writing the source fails, so
		// later calls do not keep operating inside the frame.
		switchToMain();
	}
}
/**
 * Clicks on the given element through an Actions sequence: the pointer is first
 * moved to the element, then the click is performed on it.
 *
 * @param element
 *            the element to click
 */
public void click(WebElement element) {
	new Actions(driver).moveToElement(element).click(element).build().perform();
}
/**
 * Moves the driver focus to the last window handle, in the iteration order of
 * the handles reported by the driver.
 */
public void switchToLastWindow() {
	String window = null;
	// Typed iteration: the handles are strings, so no raw Iterator is needed.
	for (String handle : driver.getWindowHandles()) {
		window = handle;
	}
	driver.switchTo().window(window);
}
/**
 * Opens a new window by running window.open() in the page, then moves the focus
 * to the last window handle.
 *
 * @throws IOException
 *             if the driver cannot execute scripts
 * @throws SearchLibException
 *             if the script execution fails
 */
public void openNewWindow() throws IOException, SearchLibException {
	final boolean tolerateFailure = false;
	javascript("window.open()", tolerateFailure);
	switchToLastWindow();
}
/**
 * Closes the window the driver is currently focused on.
 */
public void closeWindow() {
	this.driver.close();
}
/**
 * @return the URL reported by the wrapped driver for the current page
 */
public String getCurrentUrl() {
	return this.driver.getCurrentUrl();
}
/**
 * Converts the cookies held by the driver into CookieItem instances. Name,
 * value, domain, expiry date, path and secure flag are copied.
 *
 * @return one item per driver cookie, or null when the driver holds no cookie
 */
public List<CookieItem> getCookies() {
	Set<Cookie> cookies = driver.manage().getCookies();
	if (CollectionUtils.isEmpty(cookies)) {
		// Existing contract: callers receive null rather than an empty list.
		return null;
	}
	List<CookieItem> cookieList = new ArrayList<CookieItem>(cookies.size());
	for (Cookie cookie : cookies) {
		BasicClientCookie basicCookie = new BasicClientCookie(cookie.getName(), cookie.getValue());
		basicCookie.setDomain(cookie.getDomain());
		basicCookie.setExpiryDate(cookie.getExpiry());
		basicCookie.setPath(cookie.getPath());
		basicCookie.setSecure(cookie.isSecure());
		cookieList.add(new CookieItem(basicCookie));
	}
	return cookieList;
}
/**
 * Looks up an ancestor of the given element.
 * <p>
 * The direct parent is resolved with the XPath "..". When tagName is null, or
 * matches the parent's tag name ignoring case, that parent is returned;
 * otherwise the search continues from the parent.
 *
 * @param tagName
 *            the tag name to look for, or null to accept the direct parent
 * @param element
 *            the element to start from
 * @return the matching ancestor, or null when the lookup ends with a
 *         NoSuchElementException (logged as a warning)
 */
public WebElement getParent(String tagName, WebElement element) {
	final WebElement parent;
	try {
		parent = element.findElement(By.xpath(".."));
		if (parent == null) {
			return null;
		}
		boolean accepted = tagName == null || tagName.equalsIgnoreCase(parent.getTagName());
		return accepted ? parent : getParent(tagName, parent);
	} catch (NoSuchElementException notFound) {
		Logging.warn(notFound);
		return null;
	}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy