All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine. Using its web user interface, its crawlers (web, file, database, ...) and its REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.browser;

import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import javax.imageio.ImageIO;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.htmlcleaner.XPatherException;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriver.Timeouts;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.interactions.Action;
import org.openqa.selenium.interactions.Actions;
import org.xml.sax.SAXException;

import com.google.common.base.Charsets;
import com.google.common.io.Resources;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.database.CookieItem;
import com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.script.commands.Selectors.Selector;
import com.jaeksoft.searchlib.util.IOUtils;

/**
 * Base class wrapping a Selenium {@link WebDriver} instance.
 * <p>
 * The wrapped driver is created by {@link #initialize()} during construction
 * and released by {@link #close()}. The other methods delegate to the driver
 * for navigation, element location, javascript execution, screenshots, window
 * and frame switching, and cookie extraction.
 * 
 * @param <T>
 *            the concrete Selenium driver type returned by
 *            {@link #initialize()}
 */
public abstract class BrowserDriver<T extends WebDriver> implements Closeable {

	protected final BrowserDriverEnum type;
	protected T driver = null;

	/**
	 * Stores the driver type and creates the underlying driver by calling
	 * {@link #initialize()}.
	 * <p>
	 * Note that {@link #initialize()} runs before the subclass constructor
	 * body, so implementations must not rely on subclass fields being set.
	 * 
	 * @param type
	 *            the kind of browser driver, as returned by {@link #getType()}
	 */
	protected BrowserDriver(BrowserDriverEnum type) {
		this.type = type;
		driver = initialize();
	}

	/**
	 * Creates the concrete Selenium driver. Called once from the constructor.
	 * 
	 * @return the driver instance to wrap
	 */
	protected abstract T initialize();

	/**
	 * Quits the underlying driver and forgets it. Calling this method again
	 * after a successful close does nothing.
	 */
	@Override
	public void close() throws IOException {
		if (driver == null)
			return;
		driver.quit();
		driver = null;
	}

	/**
	 * Loads the given URL in the browser.
	 * 
	 * @param sUrl
	 *            the URL to load
	 */
	final public void get(String sUrl) {
		driver.get(sUrl);
	}

	/**
	 * @return the driver type given at construction time
	 */
	public BrowserDriverEnum getType() {
		return type;
	}

	/**
	 * Executes a javascript snippet in the current page.
	 * 
	 * @param javascript
	 *            the script to execute
	 * @param faultTolerant
	 *            if true, any failure is logged as a warning and null is
	 *            returned instead of throwing
	 * @param objects
	 *            the arguments passed to the script
	 * @return the value returned by the script, or null if the execution
	 *         failed in fault tolerant mode
	 * @throws IOException
	 *             if the driver does not implement {@link JavascriptExecutor}
	 *             (non fault tolerant mode only)
	 * @throws SearchLibException
	 *             wrapping any other failure (non fault tolerant mode only)
	 */
	public Object javascript(String javascript, boolean faultTolerant, Object... objects)
			throws IOException, SearchLibException {
		try {
			if (!(driver instanceof JavascriptExecutor))
				throw new IOException("The Web driver does not support javascript execution");
			JavascriptExecutor js = (JavascriptExecutor) driver;
			return js.executeScript(javascript, objects);
		} catch (IOException e) {
			if (!faultTolerant)
				throw e;
			Logging.warn(e);
		} catch (Exception e) {
			if (!faultTolerant)
				throw new SearchLibException(e);
			Logging.warn(e);
		}
		return null;
	}

	/**
	 * Retrieves the elements having the given tag name by calling
	 * <code>document.getElementsByTagName</code> in the page.
	 * 
	 * @param tag
	 *            the tag name
	 * @param faultTolerant
	 *            see {@link #javascript(String, boolean, Object...)}
	 * @return the value returned by the script, or null if the script failed
	 *         in fault tolerant mode
	 */
	@SuppressWarnings("unchecked")
	public List<WebElement> getElementByTag(String tag, boolean faultTolerant) throws IOException, SearchLibException {
		// The cast is unchecked: the element type is whatever the driver
		// maps the javascript result to.
		return (List<WebElement>) javascript("return document.getElementsByTagName(arguments[0])", faultTolerant, tag);
	}

	/**
	 * Reads the innerHTML of the first body element using javascript.
	 * 
	 * @return the value returned by the script
	 */
	public String getJavascriptInnerHtml() throws IOException, SearchLibException {
		// The script must use "return": executeScript only hands back a value
		// that the script explicitly returns.
		return (String) javascript("return document.getElementsByTagName('body')[0].innerHTML", false);
	}

	/** Lazily loaded content of get_xpath.js, guarded by the class lock. */
	private static String XPATH_SCRIPT = null;

	/**
	 * Loads (once) the get_xpath.js resource. Every line is trimmed and the
	 * lines are concatenated without any separator.
	 * 
	 * @return the script content
	 * @throws IOException
	 *             if the resource cannot be read
	 */
	private static synchronized String getXPath() throws IOException {
		if (XPATH_SCRIPT != null)
			return XPATH_SCRIPT;
		// Resolve through the class: Class.getResource accepts an absolute
		// name with a leading slash, which a bare ClassLoader lookup does not
		// handle portably.
		URL url = Resources.getResource(BrowserDriver.class,
				"/com/jaeksoft/searchlib/crawler/web/browser/get_xpath.js");
		String content = Resources.toString(url, Charsets.UTF_8);
		BufferedReader br = new BufferedReader(new StringReader(content));
		StringBuilder sb = new StringBuilder();
		try {
			String line;
			while ((line = br.readLine()) != null)
				sb.append(line.trim());
		} finally {
			br.close();
		}
		XPATH_SCRIPT = sb.toString();
		return XPATH_SCRIPT;
	}

	/**
	 * Computes the XPath of an element by running the get_xpath.js script
	 * with the element as argument. A warning is logged when the script
	 * yields null.
	 * 
	 * @param webElement
	 *            the element to locate
	 * @param faultTolerant
	 *            see {@link #javascript(String, boolean, Object...)}
	 * @return the value returned by the script, possibly null
	 */
	public String getXPath(WebElement webElement, boolean faultTolerant) throws IOException, SearchLibException {
		String xPath = (String) javascript(getXPath(), faultTolerant, webElement);
		if (xPath == null)
			Logging.warn("XPATH extraction failed on " + webElement);
		return xPath;
	}

	/**
	 * Takes a screenshot and decodes it as an image.
	 * 
	 * @return the decoded image, as returned by
	 *         {@link ImageIO#read(java.io.InputStream)}
	 * @throws IOException
	 *             if the driver does not implement {@link TakesScreenshot} or
	 *             if the image cannot be decoded
	 */
	final public BufferedImage getScreenshot() throws IOException {
		if (!(driver instanceof TakesScreenshot))
			throw new IOException("This browser driver does not support screenshot");
		TakesScreenshot takesScreenshot = (TakesScreenshot) driver;
		byte[] data = takesScreenshot.getScreenshotAs(OutputType.BYTES);
		return ImageIO.read(new ByteArrayInputStream(data));
	}

	/**
	 * Builds a rectangle from the location and size of an element.
	 * 
	 * @param element
	 *            the element, may be null
	 * @return the bounding box, or null if the element is null
	 */
	final public Rectangle getRectangle(WebElement element) {
		if (element == null)
			return null;
		// Query the driver once per property so that both coordinates (and
		// both dimensions) come from the same answer.
		org.openqa.selenium.Point location = element.getLocation();
		Dimension size = element.getSize();
		return new Rectangle(location.x, location.y, size.width, size.height);
	}

	/**
	 * @return the source of the current page as given by the driver
	 */
	public String getSourceCode() throws IOException, SearchLibException {
		return driver.getPageSource();
	}

	/**
	 * Loads the given URL, then returns the page source.
	 * 
	 * @param sUrl
	 *            the URL to load
	 * @return the source of the loaded page as given by the driver
	 */
	final public String getSourceCode(String sUrl) {
		get(sUrl);
		return driver.getPageSource();
	}

	/**
	 * @return the text of the body element, or null if the page has no body
	 *         element
	 */
	final public String getJavascriptBody() {
		try {
			return driver.findElement(By.tagName("body")).getText();
		} catch (NoSuchElementException e) {
			return null;
		}
	}

	/**
	 * @return the title of the current page
	 */
	final public String getTitle() {
		return driver.getTitle();
	}

	/**
	 * Loads the given URL, then returns the page title.
	 * 
	 * @param sUrl
	 *            the URL to load
	 * @return the title of the loaded page
	 */
	final public String getTitle(String sUrl) {
		get(sUrl);
		return driver.getTitle();
	}

	/**
	 * Resizes the browser window.
	 * 
	 * @param width
	 *            the new width
	 * @param height
	 *            the new height
	 */
	final public void setSize(int width, int height) throws SearchLibException {
		driver.manage().window().setSize(new Dimension(width, height));
	}

	/**
	 * Sets the page load and script timeouts, expressed in seconds.
	 * 
	 * @param pageLoad
	 *            the page load timeout, or null to leave it unchanged
	 * @param script
	 *            the script timeout, or null to leave it unchanged
	 */
	final public void setTimeouts(Integer pageLoad, Integer script) {
		Timeouts timeOuts = driver.manage().timeouts();
		// Guard against null: unboxing a null Integer would throw a
		// NullPointerException.
		if (pageLoad != null)
			timeOuts.pageLoadTimeout(pageLoad, TimeUnit.SECONDS);
		if (script != null)
			timeOuts.setScriptTimeout(script, TimeUnit.SECONDS);
	}

	/**
	 * Finds all the elements of the current page matching the locator.
	 * 
	 * @param by
	 *            the locator
	 * @return the matching elements as returned by the driver
	 */
	final public List<WebElement> locateBy(By by) throws SearchLibException {
		return driver.findElements(by);
	}

	/**
	 * Finds all the elements of the current page matching the locator and
	 * adds them to the given collection.
	 * 
	 * @param by
	 *            the locator
	 * @param elements
	 *            the collection receiving the matching elements
	 * @param faultTolerant
	 *            if true, a failure is logged as a warning and 0 is returned
	 * @return the number of elements found
	 * @throws SearchLibException
	 *             if the location failed (non fault tolerant mode only)
	 */
	final public int locateBy(By by, Collection<WebElement> elements, boolean faultTolerant)
			throws SearchLibException {
		try {
			List<WebElement> list = driver.findElements(by);
			if (list == null)
				return 0;
			elements.addAll(list);
			return list.size();
		} catch (Exception e) {
			// NOTE(review): the original exception is not chained to the one
			// thrown below; chain it if SearchLibException provides a
			// (message, cause) constructor.
			if (!faultTolerant)
				throw new SearchLibException("Web element location failed: " + by);
			Logging.warn(e);
			return 0;
		}
	}

	/**
	 * Finds the elements matching the locator, searching from the given
	 * element.
	 * 
	 * @param originElement
	 *            the element to search from, may be null
	 * @param by
	 *            the locator
	 * @param faultTolerant
	 *            if true, a failure is logged as a warning and null is
	 *            returned
	 * @return the matching elements, or null if the origin element is null
	 *         or if the location failed in fault tolerant mode
	 * @throws SearchLibException
	 *             if the location failed (non fault tolerant mode only)
	 */
	public final List<WebElement> locateBy(WebElement originElement, By by, boolean faultTolerant)
			throws SearchLibException {
		try {
			if (originElement == null)
				return null;
			return originElement.findElements(by);
		} catch (Exception e) {
			// NOTE(review): same remark as above, the cause is dropped.
			if (!faultTolerant)
				throw new SearchLibException("Web element location failed: " + by);
			Logging.warn(e);
			return null;
		}
	}

	/**
	 * Archives the current page using an {@link HtmlArchiver}.
	 * <p>
	 * The elements matched by the selectors flagged with
	 * <code>disableScript</code> are located (in fault tolerant mode), their
	 * XPath is computed, and the resulting set of XPath is handed over to the
	 * archiver.
	 * 
	 * @param httpDownloader
	 *            the downloader passed to the archiver
	 * @param parentDirectory
	 *            the directory passed to the archiver
	 * @param selectors
	 *            the selectors to inspect, may be null
	 * @return the archiver, after its archive method has been called
	 */
	final public HtmlArchiver saveArchive(HttpDownloader httpDownloader, File parentDirectory,
			Collection<Selector> selectors)
					throws ClientProtocolException, IllegalStateException, IOException, SearchLibException,
					URISyntaxException, SAXException, ParserConfigurationException, ClassCastException,
					ClassNotFoundException, InstantiationException, IllegalAccessException, XPatherException {

		URL currentURL = new URL(driver.getCurrentUrl());
		HtmlArchiver archiver = new HtmlArchiver(this, parentDirectory, httpDownloader, currentURL);

		// Collect the elements for which scripts must be disabled
		Set<WebElement> disableScriptWebElements = new HashSet<WebElement>();
		if (selectors != null)
			for (Selector selector : selectors)
				if (selector.disableScript)
					locateBy(selector.getBy(), disableScriptWebElements, true);

		// Translate them into XPath expressions, skipping failed extractions
		Set<String> xPathDisableScriptSet = new HashSet<String>();
		for (WebElement webElement : disableScriptWebElements) {
			String xPath = getXPath(webElement, true);
			if (xPath != null)
				xPathDisableScriptSet.add(xPath);
		}

		archiver.archive(this, xPathDisableScriptSet);
		return archiver;
	}

	/**
	 * @return the handle of the current window
	 */
	final public String getWindow() {
		return driver.getWindowHandle();
	}

	/**
	 * Switches the driver to the window identified by the given handle.
	 * 
	 * @param window
	 *            the window handle
	 */
	final public void switchToWindow(String window) {
		driver.switchTo().window(window);
	}

	/**
	 * Switches the driver to the given frame element.
	 * 
	 * @param frameWebelement
	 *            the frame element
	 */
	final public void switchToFrame(WebElement frameWebelement) {
		driver.switchTo().frame(frameWebelement);
	}

	/**
	 * Switches the driver back to the default content.
	 */
	final public void switchToMain() {
		driver.switchTo().defaultContent();
	}

	/**
	 * Writes the source code of a frame into a file named "source.html" in
	 * the given directory. The directory is created if it does not exist.
	 * The driver is switched back to the default content afterwards, even
	 * when reading or writing the source fails.
	 * 
	 * @param frameWebelement
	 *            the frame element
	 * @param captureDirectory
	 *            the directory receiving the file
	 */
	final public void getFrameSource(WebElement frameWebelement, File captureDirectory)
			throws IOException, SearchLibException {
		if (!captureDirectory.exists())
			captureDirectory.mkdir();
		File sourceFile = new File(captureDirectory, "source.html");
		switchToFrame(frameWebelement);
		try {
			FileUtils.write(sourceFile, getSourceCode());
		} finally {
			// Never leave the driver inside the frame
			switchToMain();
		}
	}

	/**
	 * Click on the given WebElement using Actions: the pointer is moved to
	 * the element, then the element is clicked.
	 * 
	 * @param element
	 *            the element to click on
	 */
	public void click(WebElement element) {
		Actions builder = new Actions(driver);
		Action click = builder.moveToElement(element).click(element).build();
		click.perform();
	}

	/**
	 * Switches the driver to the last window handle returned by the driver's
	 * window handle set.
	 */
	public void switchToLastWindow() {
		String window = null;
		for (String handle : driver.getWindowHandles())
			window = handle;
		driver.switchTo().window(window);
	}

	/**
	 * Opens a new window using javascript, then switches to the last window.
	 */
	public void openNewWindow() throws IOException, SearchLibException {
		javascript("window.open()", false);
		switchToLastWindow();
	}

	/**
	 * Closes the current window.
	 */
	public void closeWindow() {
		driver.close();
	}

	/**
	 * @return the URL currently loaded in the browser
	 */
	public String getCurrentUrl() {
		return driver.getCurrentUrl();
	}

	/**
	 * Converts the cookies of the browser into {@link CookieItem} instances.
	 * Name, value, domain, expiry date, path and secure flag are copied.
	 * 
	 * @return the cookies, or null if the browser has no cookie
	 */
	public List<CookieItem> getCookies() {
		Set<Cookie> cookies = driver.manage().getCookies();
		if (CollectionUtils.isEmpty(cookies))
			return null;
		List<CookieItem> cookieList = new ArrayList<CookieItem>(cookies.size());
		for (Cookie cookie : cookies) {
			BasicClientCookie basicCookie = new BasicClientCookie(cookie.getName(), cookie.getValue());
			basicCookie.setDomain(cookie.getDomain());
			basicCookie.setExpiryDate(cookie.getExpiry());
			basicCookie.setPath(cookie.getPath());
			basicCookie.setSecure(cookie.isSecure());
			cookieList.add(new CookieItem(basicCookie));
		}
		return cookieList;
	}

	/**
	 * Walks up the ancestors of an element (using the ".." XPath) until one
	 * with the given tag name is found.
	 * 
	 * @param tagName
	 *            the tag name to look for (case insensitive), or null to
	 *            return the direct parent
	 * @param element
	 *            the starting element
	 * @return the matching ancestor, or null if a parent lookup returned
	 *         null or raised a {@link NoSuchElementException} (which is
	 *         logged as a warning)
	 */
	public WebElement getParent(String tagName, WebElement element) {
		try {
			WebElement current = element;
			while (true) {
				WebElement parent = current.findElement(By.xpath(".."));
				if (parent == null)
					return null;
				if (tagName == null || tagName.equalsIgnoreCase(parent.getTagName()))
					return parent;
				current = parent;
			}
		} catch (NoSuchElementException e) {
			Logging.warn(e);
			return null;
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy