All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.ruippeixotog.scalascraper.scraper.HtmlExtractor.scala Maven / Gradle / Ivy

The newest version!
package net.ruippeixotog.scalascraper.scraper

import scalaz.Monad

import net.ruippeixotog.scalascraper.model.{Element, ElementQuery}

/** An object able to extract content from [[net.ruippeixotog.scalascraper.model.ElementQuery]] instances.
  *
  * An `HtmlExtractor` is also a plain function from `ElementQuery[E]` to `A`: applying it is the same as calling
  * `extract`.
  *
  * @tparam E
  *   the type of the elements needed by this `HtmlExtractor`
  * @tparam A
  *   the type of the extracted content
  */
trait HtmlExtractor[-E <: Element, +A] extends (ElementQuery[E] => A) {

  /** Extracts content from an `ElementQuery`.
    *
    * @param q
    *   the element query from which content is to be extracted
    * @return
    *   the extracted content.
    */
  def extract(q: ElementQuery[E]): A

  /** Function application for this extractor, delegating to `extract`.
    *
    * It is `final`, so implementations customize behaviour only through `extract`.
    *
    * @param q
    *   the element query from which content is to be extracted
    * @return
    *   the result of `extract(q)`.
    */
  @inline final def apply(q: ElementQuery[E]): A = extract(q)

  /** Maps a function over the extracted content of this extractor.
    *
    * @param f
    *   the function to map over this extractor
    * @tparam B
    *   the output type of the function
    * @return
    *   an `HtmlExtractor` returning the contents extracted by this extractor mapped by `f`.
    */
  def map[B](f: A => B): HtmlExtractor[E, B] =
    HtmlExtractor { q => f(extract(q)) }

  /** Applies a CSS query to `ElementQuery` inputs before passing them to this extractor.
    *
    * @param cssQuery
    *   the CSS query to apply to input `ElementQuery` instances
    * @return
    *   an `HtmlExtractor` returning the contents extracted by this extractor after `cssQuery` is applied to the input
    *   queries.
    */
  def mapQuery(cssQuery: String): HtmlExtractor[E, A] =
    HtmlExtractor { q => extract(q.select(cssQuery)) }
}

/** Companion of the `HtmlExtractor` trait: factory methods for building extractors.
  */
object HtmlExtractor extends HtmlExtractorInstances {

  /** Wraps a plain function as an `HtmlExtractor`.
    *
    * @param f
    *   the function that the resulting extractor invokes on each `ElementQuery` it is given
    * @tparam E
    *   the type of the elements needed by the `HtmlExtractor`
    * @tparam A
    *   the type of the extracted content
    * @return
    *   an `HtmlExtractor` whose `extract` delegates to `f`.
    */
  def apply[E <: Element, A](f: ElementQuery[E] => A): HtmlExtractor[E, A] =
    new HtmlExtractor[E, A] {
      override def extract(q: ElementQuery[E]): A = f.apply(q)
    }

  /** Builds an `HtmlExtractor` whose result is the input query narrowed by a CSS query.
    *
    * @param cssQuery
    *   the CSS query passed to `select` on each input `ElementQuery`
    * @tparam E
    *   the type of the elements needed by the `HtmlExtractor`
    * @return
    *   an `HtmlExtractor` returning the `ElementQuery` obtained by selecting `cssQuery` on its input.
    */
  def forQuery[E <: Element](cssQuery: String): HtmlExtractor[E, ElementQuery[E]] =
    HtmlExtractor[E, ElementQuery[E]](query => query.select(cssQuery))
}

/** Type class instances for `HtmlExtractor`.
  */
trait HtmlExtractorInstances {

  /** A scalaz `Monad` instance for `HtmlExtractor` with the element type fixed to `E`.
    *
    * Every extractor combined through this instance receives the same input `ElementQuery`.
    *
    * @tparam E
    *   the type of the elements needed by the extractors
    * @return
    *   a `Monad` over the extracted content type of `HtmlExtractor[E, *]`.
    */
  implicit def extractorMonad[E <: Element]: Monad[({ type t[A] = HtmlExtractor[E, A] })#t] =
    new Monad[({ type t[A] = HtmlExtractor[E, A] })#t] {

      /** Lifts a value into an extractor that ignores its input. `a` is by-name and is evaluated again on every call
        * to `extract`.
        */
      def point[A](a: => A): HtmlExtractor[E, A] =
        new HtmlExtractor[E, A] {
          def extract(q: ElementQuery[E]): A = a
        }

      /** Runs `fa` on the input query, passes its result to `f`, and runs the extractor returned by `f` on that same
        * input query.
        */
      def bind[A, B](fa: HtmlExtractor[E, A])(f: A => HtmlExtractor[E, B]): HtmlExtractor[E, B] =
        new HtmlExtractor[E, B] {
          def extract(q: ElementQuery[E]): B = f(fa.extract(q)).extract(q)
        }

      /** Delegates to `HtmlExtractor.map` rather than using the default derived from `bind` and `point`. */
      override def map[A, B](fa: HtmlExtractor[E, A])(f: A => B): HtmlExtractor[E, B] = fa.map(f)
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy