// net.ruippeixotog.scalascraper.browser.HtmlUnitBrowser.scala (newest published version)
package net.ruippeixotog.scalascraper.browser
import java.io.{File, InputStream}
import java.net.URL
import java.nio.charset.Charset
import java.util.UUID

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.http.HttpStatus
import org.htmlunit._
import org.htmlunit.html._
import org.htmlunit.html.parser.neko.HtmlUnitNekoHtmlParser
import org.htmlunit.util.{NameValuePair, StringUtils}
import org.htmlunit.util.UrlUtils

import net.ruippeixotog.scalascraper.browser.HtmlUnitBrowser._
import net.ruippeixotog.scalascraper.model._
import net.ruippeixotog.scalascraper.util._
/** A [[Browser]] implementation based on [[https://www.htmlunit.org HtmlUnit]], a GUI-less browser for Java
  * programs. `HtmlUnitBrowser` thoroughly simulates a web browser, executing JavaScript code in the pages besides
  * parsing and modelling their HTML content. It supports several compatibility modes, allowing it to emulate
  * browsers such as Internet Explorer.
  *
  * Both the [[net.ruippeixotog.scalascraper.model.Document]] and the
  * [[net.ruippeixotog.scalascraper.model.Element]] instances obtained from `HtmlUnitBrowser` can be mutated in the
  * background. JavaScript code can at any time change attributes and the content of elements, reflected both in
  * queries to `Document` and on previously stored references to `Element`s. The `Document` instance will always
  * represent the current page in the browser's "window". This means the `Document`'s `location` value can change,
  * together with its root element, in the event of client-side page refreshes or redirections. However, `Element`
  * instances belong to a fixed DOM tree and they stop being meaningful as soon as they are removed from the DOM or
  * a client-side page reload occurs.
  *
  * @param browserType
  *   the browser type and version to simulate
  * @param proxy
  *   an optional proxy configuration to use
  */
class HtmlUnitBrowser(browserType: BrowserVersion = BrowserVersion.CHROME, proxy: Option[ProxyConfig] = None)
    extends Browser {

  type DocumentType = HtmlUnitDocument

  /** The HtmlUnit client backing this browser. It is created on first use and configured through
    * `defaultClientSettings`.
    */
  lazy val underlying: WebClient = {
    val c = new WebClient(browserType)
    defaultClientSettings(c)
    c
  }

  /** The user agent string of the simulated browser version. */
  def userAgent = browserType.getUserAgent

  /** Loads the given request into a freshly opened window and returns the document bound to that window.
    *
    * @param req
    *   the request to execute
    * @return
    *   a document tracking the window in which the request was loaded
    */
  def exec(req: WebRequest): HtmlUnitDocument = {
    val window = newWindow()
    underlying.getPage(window, req)
    HtmlUnitDocument(window)
  }

  /** Retrieves the page at `url` with a GET request. */
  def get(url: String): HtmlUnitDocument =
    exec(newRequest(new URL(url)))

  /** Submits `form` to `url` with a POST request, sending each map entry as a request parameter. */
  def post(url: String, form: Map[String, String]): HtmlUnitDocument = {
    val req = newRequest(new URL(url), HttpMethod.POST)
    req.setRequestParameters(form.map { case (k, v) => new NameValuePair(k, v) }.toBuffer.asJava)
    exec(req)
  }

  /** Loads a local file through a `file://` URL built from its absolute path, using `charset` as the request
    * charset.
    */
  def parseFile(file: File, charset: String): HtmlUnitDocument = {
    val req = newRequest(new URL(s"file://${file.getAbsolutePath}"), HttpMethod.GET, Some(charset))
    exec(req)
  }

  /** Builds a page from an in-memory HTML string, located at `about:blank`, in a new window. */
  def parseString(html: String): HtmlUnitDocument = {
    val response = new StringWebResponse(html, UrlUtils.URL_ABOUT_BLANK)
    val window = newWindow()
    new DefaultPageCreator().createPage(response, window)
    HtmlUnitDocument(window)
  }

  /** Builds a page from the contents of `inputStream`, decoded with `charset`. The stream is handed to `using`
    * for the duration of the parsing.
    */
  def parseInputStream(inputStream: InputStream, charset: String): HtmlUnitDocument = {
    using(inputStream) { _ =>
      val response = new WebResponse(
        newWebResponseData(inputStream, charset),
        newRequest(UrlUtils.URL_ABOUT_BLANK, charset = Some(charset)),
        0 // load time: the content was not fetched over the network
      )
      val window = newWindow()
      new DefaultPageCreator().createPage(response, window)
      HtmlUnitDocument(window)
    }
  }

  /** Returns the cookies the client holds for `url` as a name-to-value map. */
  def cookies(url: String) =
    underlying.getCookies(new URL(url)).asScala.map { c => c.getName -> c.getValue }.toMap

  /** Removes every cookie held by the client's cookie manager. */
  def clearCookies() = underlying.getCookieManager.clearCookies()

  /** Closes all windows opened in this browser.
    */
  def closeAll() = underlying.close()

  /** Creates a new browser with the same browser type that uses the given proxy.
    *
    * @param proxy
    *   the proxy host, port and type to use
    */
  def withProxy(proxy: Proxy): HtmlUnitBrowser = {
    // SOCKS proxies carry no scheme; they are flagged through the `isSocks` argument instead
    val (scheme, isSocks) = proxy.proxyType match {
      case Proxy.HTTP => ("http", false)
      case Proxy.SOCKS => (null, true)
    }
    val newProxyConf = new ProxyConfig(proxy.host, proxy.port, scheme, isSocks)
    new HtmlUnitBrowser(browserType, Some(newProxyConf))
  }

  /** Applies the default settings to a newly created client. Subclasses can override this to customize it. */
  protected[this] def defaultClientSettings(client: WebClient): Unit = {
    // NOTE(review): only the proxy setting is visible here; confirm no other client options are expected
    proxy.foreach { proxy => client.getOptions.setProxyConfig(proxy) }
  }

  /** Applies the default headers to a newly created request. Subclasses can override this to customize it. */
  protected[this] def defaultRequestSettings(req: WebRequest): Unit = {
    req.setAdditionalHeader("Accept", "text/html,application/xhtml+xml,application/xml")
    req.setAdditionalHeader("Accept-Charset", "utf-8")
  }

  /** Reads `inputStream` fully and wraps its bytes in a successful (HTTP 200) HTML response with the given
    * charset declared in its `Content-Type` header.
    */
  private[this] def newWebResponseData(inputStream: InputStream, charset: String): WebResponseData = {
    val bytes = IOUtils.toByteArray(inputStream)
    val compiledHeaders = List(new NameValuePair("Content-Type", "text/html; charset=" + charset))
    new WebResponseData(bytes, HttpStatus.SC_OK, "OK", compiledHeaders.asJava)
  }

  /** Creates a request for `url` with the optional charset and the default request settings applied. */
  private[this] def newRequest(url: URL, method: HttpMethod = HttpMethod.GET, charset: Option[String] = None) = {
    val req = new WebRequest(url, method)
    charset.map(Charset.forName).foreach(req.setCharset)
    defaultRequestSettings(req)
    req
  }

  /** Opens a new, uniquely named window, holding the client's monitor while doing so. */
  private[this] def newWindow(): WebWindow =
    underlying.synchronized {
      underlying.openTargetWindow(underlying.getCurrentWindow, null, UUID.randomUUID().toString)
    }
}
/** Factory methods and the HtmlUnit-backed model classes used by [[HtmlUnitBrowser]]. */
object HtmlUnitBrowser {

  /** Creates a browser with the default settings, typed as a plain [[Browser]]. */
  def apply(): Browser = new HtmlUnitBrowser()

  /** Creates a browser with the default settings, keeping the concrete `HtmlUnitBrowser` type. */
  def typed(): HtmlUnitBrowser = new HtmlUnitBrowser()

  /** An [[net.ruippeixotog.scalascraper.model.Element]] view over an HtmlUnit `DomElement`.
    *
    * @param underlying
    *   the wrapped HtmlUnit element
    */
  case class HtmlUnitElement(underlying: DomElement) extends Element {
    type ThisType = HtmlUnitElement

    def tagName = underlying.getTagName

    /** The parent element, or `None` when the parent node is absent or is not an element. */
    def parent = Option(underlying.getParentNode).collect { case elem: DomElement => HtmlUnitElement(elem) }

    /** The direct child elements of this element. */
    def children = underlying.getChildElements.asScala.map(HtmlUnitElement.apply)

    /** The sibling elements of this element in document order, excluding the element itself. */
    def siblings = {
      // `.tail` drops the element itself; each walk stops at the first missing (null) sibling
      val previousSiblings = Stream.iterate(underlying)(_.getPreviousElementSibling).tail.takeWhile(_ != null)
      val nextSiblings = Stream.iterate(underlying)(_.getNextElementSibling).tail.takeWhile(_ != null)
      (previousSiblings.reverse ++ nextSiblings).map(HtmlUnitElement.apply)
    }

    /** The child nodes that `HtmlUnitNode` can represent (elements and text nodes). */
    def childNodes = underlying.getChildNodes.asScala.flatMap(HtmlUnitNode.apply)

    /** The sibling nodes (elements and text nodes) in document order, excluding this element. */
    def siblingNodes = {
      val previousSiblings = Stream.iterate[DomNode](underlying)(_.getPreviousSibling).tail.takeWhile(_ != null)
      val nextSiblings = Stream.iterate[DomNode](underlying)(_.getNextSibling).tail.takeWhile(_ != null)
      (previousSiblings.reverse ++ nextSiblings).flatMap(HtmlUnitNode.apply)
    }

    /** An immutable snapshot of the attribute names and values of this element. */
    def attrs = underlying.getAttributesMap.asScala.mapValues(_.getValue).toMap

    def hasAttr(name: String) =
      underlying.hasAttribute(name) &&
        (underlying.getAttribute(name) ne DomElement.ATTRIBUTE_NOT_DEFINED)

    /** The value of the attribute `name`.
      *
      * @throws NoSuchElementException
      *   if HtmlUnit reports the attribute as not defined
      */
    def attr(name: String) = {
      val v = underlying.getAttribute(name)
      // reference comparison: HtmlUnit signals a missing attribute with this sentinel instance
      if (v ne DomElement.ATTRIBUTE_NOT_DEFINED) v else throw new NoSuchElementException
    }

    def text = underlying.getTextContent.trim

    /** The concatenated text of the direct text children only, without the text of nested elements. */
    def ownText =
      underlying.getChildren.asScala.collect { case node: DomText => node.getWholeText }.mkString

    /** The markup of all child nodes: elements are rendered with `outerHtml`, text nodes verbatim and any other
      * node through its trimmed XML representation.
      */
    def innerHtml =
      underlying.getChildNodes.asScala.map {
        case node: DomElement => HtmlUnitElement(node).outerHtml
        case node: DomText => node.getWholeText
        case node => node.asXml.trim
      }.mkString

    /** The markup of this element: its tag with XML-escaped attribute values wrapped around `innerHtml`. */
    def outerHtml = {
      val a = attrs.map { case (k, v) => s"""$k="${StringUtils.escapeXmlAttributeValue(v)}"""" }
      val attrsStr = if (a.isEmpty) "" else a.mkString(" ", " ", "")
      s"<$tagName$attrsStr>$innerHtml</$tagName>"
    }

    private[this] def selectUnderlying(cssQuery: String): Iterator[HtmlUnitElement] =
      underlying.querySelectorAll(cssQuery).iterator.asScala.collect { case elem: DomElement => HtmlUnitElement(elem) }

    def select(cssQuery: String) = ElementQuery(cssQuery, this, selectUnderlying)
  }

  /** Conversion of HtmlUnit DOM nodes into [[net.ruippeixotog.scalascraper.model.Node]] instances. */
  object HtmlUnitNode {

    /** Wraps elements and text nodes; any other kind of node yields `None`. */
    def apply(underlying: DomNode): Option[Node] =
      underlying match {
        case elem: DomElement => Some(ElementNode(HtmlUnitElement(elem)))
        case textNode: DomText => Some(TextNode(textNode.getWholeText))
        case _ => None
      }
  }

  /** A [[net.ruippeixotog.scalascraper.model.Document]] that follows the page currently enclosed by a window.
    *
    * @param window
    *   the HtmlUnit window whose current page this document represents
    */
  case class HtmlUnitDocument(window: WebWindow) extends Document {
    type ElementType = HtmlUnitElement

    // cached page; refreshed by `underlying` when the window navigates to a different URL
    private[this] var _underlying: SgmlPage = _

    /** The page currently enclosed by the window. The cached page is replaced when nothing is cached yet or when
      * the window's page URL differs from the cached one. A `TextPage` is converted by creating a page from its
      * content; page types other than `SgmlPage` and `TextPage` are not handled by the match.
      */
    def underlying: SgmlPage = {
      if (_underlying == null || window.getEnclosedPage.getUrl != _underlying.getUrl) {
        _underlying = window.getEnclosedPage match {
          case page: SgmlPage => page
          case page: TextPage =>
            val response = new StringWebResponse(page.getContent, page.getUrl)
            new DefaultPageCreator().createPage(response, window).asInstanceOf[SgmlPage]
        }
      }
      _underlying
    }

    def location = underlying.getUrl.toString

    def root = HtmlUnitElement(underlying.getDocumentElement)

    /** The title of the page, or an empty string when the page is not an `HtmlPage`. */
    override def title =
      underlying match {
        case page: HtmlPage => page.getTitleText
        case _ => ""
      }

    def toHtml = root.outerHtml
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy