All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.ruippeixotog.scalascraper.browser.HtmlUnitBrowser.scala Maven / Gradle / Ivy

The newest version!
package net.ruippeixotog.scalascraper.browser

import java.io.{File, InputStream}
import java.net.URL
import java.nio.charset.Charset
import java.util.UUID

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.http.HttpStatus
import org.htmlunit._
import org.htmlunit.html._
import org.htmlunit.html.parser.neko.HtmlUnitNekoHtmlParser
import org.htmlunit.util.{NameValuePair, StringUtils}
import org.htmlunit.util.UrlUtils

import net.ruippeixotog.scalascraper.browser.HtmlUnitBrowser._
import net.ruippeixotog.scalascraper.model._
import net.ruippeixotog.scalascraper.util._

/** A [[Browser]] implementation based on [[http://htmlunit.sourceforge.net HtmlUnit]], a GUI-less browser for Java
  * programs. `HtmlUnitBrowser` simulates thoroughly a web browser, executing JavaScript code in the pages besides
  * parsing and modelling its HTML content. It supports several compatibility modes, allowing it to emulate browsers
  * such as Internet Explorer.
  *
  * Both the [[net.ruippeixotog.scalascraper.model.Document]] and the [[net.ruippeixotog.scalascraper.model.Element]]
  * instances obtained from `HtmlUnitBrowser` can be mutated in the background. JavaScript code can at any time change
  * attributes and the content of elements, reflected both in queries to `Document` and on previously stored references
  * to `Element`s. The `Document` instance will always represent the current page in the browser's "window". This means
  * the `Document`'s `location` value can change, together with its root element, in the event of client-side page
  * refreshes or redirections. However, `Element` instances belong to a fixed DOM tree and they stop being meaningful as
  * soon as they are removed from the DOM or a client-side page reload occurs.
  *
  * @param browserType
  *   the browser type and version to simulate
  * @param proxy
  *   an optional proxy configuration to use
  */
class HtmlUnitBrowser(browserType: BrowserVersion = BrowserVersion.CHROME, proxy: Option[ProxyConfig] = None)
    extends Browser {
  type DocumentType = HtmlUnitDocument

  lazy val underlying: WebClient = {
    val c = new WebClient(browserType)
    defaultClientSettings(c)
    c
  }

  def userAgent = browserType.getUserAgent

  def exec(req: WebRequest): HtmlUnitDocument = {
    val window = newWindow()
    underlying.getPage(window, req)
    HtmlUnitDocument(window)
  }

  def get(url: String): HtmlUnitDocument =
    exec(newRequest(new URL(url)))

  def post(url: String, form: Map[String, String]): HtmlUnitDocument = {
    val req = newRequest(new URL(url), HttpMethod.POST)
    req.setRequestParameters(form.map { case (k, v) => new NameValuePair(k, v) }.toBuffer.asJava)
    exec(req)
  }

  def parseFile(file: File, charset: String): HtmlUnitDocument = {
    val req = newRequest(new URL(s"file://${file.getAbsolutePath}"), HttpMethod.GET)
    req.setCharset(Charset.forName(charset))
    exec(req)
  }

  def parseString(html: String): HtmlUnitDocument = {
    val response = new StringWebResponse(html, UrlUtils.URL_ABOUT_BLANK)
    val window = newWindow()
    new DefaultPageCreator().createPage(response, window)
    HtmlUnitDocument(window)
  }

  def parseInputStream(inputStream: InputStream, charset: String): HtmlUnitDocument = {
    using(inputStream) { _ =>
      val response = new WebResponse(
        newWebResponseData(inputStream, charset),
        newRequest(UrlUtils.URL_ABOUT_BLANK, charset = Some(charset)),
        0
      )
      val window = newWindow()
      new DefaultPageCreator().createPage(response, window)
      HtmlUnitDocument(window)
    }
  }

  def cookies(url: String) =
    underlying.getCookies(new URL(url)).asScala.map { c => c.getName -> c.getValue }.toMap

  def clearCookies() = underlying.getCookieManager.clearCookies()

  /** Closes all windows opened in this browser.
    */
  def closeAll() = underlying.close()

  def withProxy(proxy: Proxy): HtmlUnitBrowser = {
    val (scheme, isSocks) = proxy.proxyType match {
      case Proxy.HTTP => ("http", false)
      case Proxy.SOCKS => (null, true)
    }
    val newProxyConf = new ProxyConfig(proxy.host, proxy.port, scheme, isSocks)
    new HtmlUnitBrowser(browserType, Some(newProxyConf))
  }

  protected[this] def defaultClientSettings(client: WebClient): Unit = {
    client.getOptions.setCssEnabled(false)
    client.getOptions.setThrowExceptionOnScriptError(false)
    proxy.foreach { proxy => client.getOptions.setProxyConfig(proxy) }
  }

  protected[this] def defaultRequestSettings(req: WebRequest): Unit = {
    req.setAdditionalHeader("Accept", "text/html,application/xhtml+xml,application/xml")
    req.setAdditionalHeader("Accept-Charset", "utf-8")
  }

  private[this] def newWebResponseData(inputStream: InputStream, charset: String): WebResponseData = {
    val bytes = IOUtils.toByteArray(inputStream)
    val compiledHeaders = List(new NameValuePair("Content-Type", "text/html; charset=" + charset))
    new WebResponseData(bytes, HttpStatus.SC_OK, "OK", compiledHeaders.asJava)
  }

  private[this] def newRequest(url: URL, method: HttpMethod = HttpMethod.GET, charset: Option[String] = None) = {
    val req = new WebRequest(url, method)
    charset.map(Charset.forName).foreach(req.setCharset)
    defaultRequestSettings(req)
    req
  }

  private[this] def newWindow(): WebWindow =
    underlying.synchronized {
      underlying.openTargetWindow(underlying.getCurrentWindow, null, UUID.randomUUID().toString)
    }
}

object HtmlUnitBrowser {
  def apply(): Browser = new HtmlUnitBrowser()

  def typed(): HtmlUnitBrowser = new HtmlUnitBrowser()

  case class HtmlUnitElement(underlying: DomElement) extends Element {
    type ThisType = HtmlUnitElement

    def tagName = underlying.getTagName

    def parent = Option(underlying.getParentNode).collect { case elem: DomElement => HtmlUnitElement(elem) }

    def children = underlying.getChildElements.asScala.map(HtmlUnitElement.apply)

    def siblings = {
      val previousSiblings = Stream.iterate(underlying)(_.getPreviousElementSibling).tail.takeWhile(_ != null)
      val nextSiblings = Stream.iterate(underlying)(_.getNextElementSibling).tail.takeWhile(_ != null)
      (previousSiblings.reverse ++ nextSiblings).map(HtmlUnitElement.apply)
    }

    def childNodes = underlying.getChildNodes.asScala.flatMap(HtmlUnitNode.apply)

    def siblingNodes = {
      val previousSiblings = Stream.iterate[DomNode](underlying)(_.getPreviousSibling).tail.takeWhile(_ != null)
      val nextSiblings = Stream.iterate[DomNode](underlying)(_.getNextSibling).tail.takeWhile(_ != null)
      (previousSiblings.reverse ++ nextSiblings).flatMap(HtmlUnitNode.apply)
    }

    def attrs = underlying.getAttributesMap.asScala.mapValues(_.getValue).toMap

    def hasAttr(name: String) =
      underlying.hasAttribute(name) &&
        (underlying.getAttribute(name) ne DomElement.ATTRIBUTE_NOT_DEFINED)

    def attr(name: String) = {
      val v = underlying.getAttribute(name)
      if (v ne DomElement.ATTRIBUTE_NOT_DEFINED) v else throw new NoSuchElementException
    }

    def text = underlying.getTextContent.trim

    def ownText =
      underlying.getChildren.asScala.collect { case node: DomText => node.getWholeText }.mkString

    def innerHtml =
      underlying.getChildNodes.iterator.asScala.map {
        case node: DomElement => HtmlUnitElement(node).outerHtml
        case node: DomText => node.getWholeText
        case node => node.asXml.trim
      }.mkString

    def outerHtml = {
      val a = attrs.map { case (k, v) => s"""$k="${StringUtils.escapeXmlAttributeValue(v)}"""" }
      val attrsStr = if (a.isEmpty) "" else a.mkString(" ", " ", "")

      s"<$tagName$attrsStr>$innerHtml"
    }

    private[this] def selectUnderlying(cssQuery: String): Iterator[HtmlUnitElement] =
      underlying.querySelectorAll(cssQuery).iterator.asScala.collect { case elem: DomElement => HtmlUnitElement(elem) }

    def select(cssQuery: String) = ElementQuery(cssQuery, this, selectUnderlying)
  }

  object HtmlUnitNode {
    def apply(underlying: DomNode): Option[Node] =
      underlying match {
        case elem: DomElement => Some(ElementNode(HtmlUnitElement(elem)))
        case textNode: DomText => Some(TextNode(textNode.getWholeText))
        case _ => None
      }
  }

  case class HtmlUnitDocument(window: WebWindow) extends Document {
    type ElementType = HtmlUnitElement

    private[this] var _underlying: SgmlPage = _

    def underlying: SgmlPage = {
      if (_underlying == null || window.getEnclosedPage.getUrl != _underlying.getUrl) {
        _underlying = window.getEnclosedPage match {
          case page: SgmlPage => page
          case page: TextPage =>
            val response = new StringWebResponse(page.getContent, page.getUrl)
            new DefaultPageCreator().createPage(response, window).asInstanceOf[SgmlPage]
        }
      }
      _underlying
    }

    def location = underlying.getUrl.toString

    def root = HtmlUnitElement(underlying.getDocumentElement)

    override def title =
      underlying match {
        case page: HtmlPage => page.getTitleText
        case _ => ""
      }

    def toHtml = root.outerHtml
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy