All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.ruippeixotog.scalascraper.scraper.ContentExtractors.scala Maven / Gradle / Ivy

The newest version!
package net.ruippeixotog.scalascraper.scraper

import scala.util.Try

import net.ruippeixotog.scalascraper.model.{Element, ElementQuery}

/** An object containing `HtmlExtractor` instances for extracting primitive data such as text, elements or attributes,
  * as well as more complex information such as form data. Because they perform little to no navigation through the
  * document, they are typically preceded by a CSS query defining the location in the HTML document of the data to be
  * retrieved.
  */
object ContentExtractors {
  /** Implicit view that allows a plain function from `ElementQuery[E]` to `A` to be used wherever an
    * `HtmlExtractor[E, A]` is expected. The function is simply wrapped through `HtmlExtractor(f)`.
    *
    * It is private to this object, so it only affects the extractor definitions in this file.
    *
    * @param f
    *   the function computing the extracted value from the matched elements
    * @return
    *   the `HtmlExtractor` built from `f`
    */
  @inline private[this] implicit def funcToExtractor[E <: Element, A](
      f: ElementQuery[E] => A
  ): HtmlExtractor[E, A] = {
    val wrapped: HtmlExtractor[E, A] = HtmlExtractor(f)
    wrapped
  }

  /** Extracts the first of the matched elements.
    *
    * The result is obtained with a plain `head` call on the query; what happens when nothing was matched is whatever
    * `head` does on the underlying `ElementQuery` (presumably a failure - confirm against `ElementQuery`).
    */
  val element: HtmlExtractor[Element, Element] = matched => matched.head

  /** Extracts the matched elements as they are: the `ElementQuery` received is returned unchanged.
    */
  val elements: HtmlExtractor[Element, ElementQuery[Element]] = (matched: ElementQuery[Element]) => matched

  /** Extracts every matched element, collected into a `List` via `toList`.
    */
  val elementList: HtmlExtractor[Element, List[Element]] = matched => matched.toList

  /** Polymorphic variant of the first-element extractor: it extracts the first matched element while keeping the
    * concrete element type `E` of the query in the result type, instead of widening it to `Element`.
    */
  val pElement = new PolyHtmlExtractor {
    // The output type is the element type itself.
    type Out[E] = E

    def apply[E <: Element]: HtmlExtractor[E, E] = { matched =>
      matched.head
    }
  }

  /** Polymorphic variant of the identity extractor: it returns the matched `ElementQuery` unchanged while keeping the
    * concrete element type `E` of the query in the result type.
    */
  val pElements = new PolyHtmlExtractor {
    // The output type is a query over the same concrete element type.
    type Out[E <: Element] = ElementQuery[E]

    def apply[E <: Element]: HtmlExtractor[E, ElementQuery[E]] = { (matched: ElementQuery[E]) =>
      matched
    }
  }

  /** Polymorphic variant of the element-list extractor: it collects the matched elements into a `List` while keeping
    * the concrete element type `E` of the query in the result type.
    */
  val pElementList = new PolyHtmlExtractor {
    // The output type is a list of the same concrete element type.
    type Out[E] = List[E]

    def apply[E <: Element]: HtmlExtractor[E, List[E]] = { matched =>
      matched.toList
    }
  }

  /** Extracts the text of the first matched element, i.e. `text` of the query's `head`.
    */
  val text: HtmlExtractor[Element, String] = { matched =>
    val first = matched.head
    first.text
  }

  /** Extracts an iterable holding the text of each matched element.
    *
    * The iterable is meant to be lazy; whether it actually is depends on how the underlying `ElementQuery` implements
    * `map`.
    */
  val texts: HtmlExtractor[Element, Iterable[String]] = { matched =>
    for (el <- matched) yield el.text
  }

  /** Extracts the text of all matched elements as a single string. The individual texts are concatenated with
    * `mkString`, without any separator between them.
    */
  val allText: HtmlExtractor[Element, String] = { matched =>
    val pieces = for (el <- matched) yield el.text
    pieces.mkString
  }

  /** Builds an extractor that reads an attribute from the first matched element.
    *
    * @param attr
    *   the name of the attribute to read
    * @return
    *   an extractor yielding the value of `attr` on the first matched element.
    */
  def attr(attr: String): HtmlExtractor[Element, String] = { matched =>
    val first = matched.head
    first.attr(attr)
  }

  /** Builds an extractor that reads an attribute from every matched element.
    *
    * The resulting iterable is meant to be lazy; whether it actually is depends on how the underlying `ElementQuery`
    * implements `map`.
    *
    * @param attr
    *   the name of the attribute to read
    * @return
    *   an extractor yielding an iterable with the value of `attr` on each matched element.
    */
  def attrs(attr: String): HtmlExtractor[Element, Iterable[String]] = { matched =>
    for (el <- matched) yield el.attr(attr)
  }

  /** An extractor for the form data present in the matched elements.
    */
  // TODO add support for