net.ruippeixotog.scalascraper.scraper.ContentExtractors.scala Maven / Gradle / Ivy
The newest version!
package net.ruippeixotog.scalascraper.scraper
import scala.util.Try
import net.ruippeixotog.scalascraper.model.{Element, ElementQuery}
/** An object containing `HtmlExtractor` instances for extracting primitive data such as text, elements or attributes,
* as well as more complex information such as form data. Because they do perform little to no navigation through the
* document, they are typically preceded by a CSS query defining the location in the HTML document of the data to be
* retrieved.
*/
object ContentExtractors {
/** Wraps a plain function over an `ElementQuery` into an `HtmlExtractor`.
 *
 * It is declared implicit so that, inside this object, a function of type `ElementQuery[E] => A` can be supplied
 * where an `HtmlExtractor[E, A]` is expected.
 *
 * @param f
 *   the function to wrap
 * @return
 *   the extractor built from `f` by `HtmlExtractor.apply`
 */
@inline private[this] implicit def funcToExtractor[E <: Element, A](f: ElementQuery[E] => A): HtmlExtractor[E, A] =
  HtmlExtractor.apply(f)
/** Extracts the first of the matched elements, obtained by calling `head` on the query.
 *
 * NOTE(review): presumably fails when the query matched nothing -- confirm against `ElementQuery.head`.
 */
val element: HtmlExtractor[Element, Element] =
  (query: ElementQuery[Element]) => query.head
/** Extracts the matched elements as they are: the incoming `ElementQuery` is returned unchanged.
 */
val elements: HtmlExtractor[Element, ElementQuery[Element]] =
  (query: ElementQuery[Element]) => query
/** Extracts all matched elements, collected into a `List` through `toList`.
 */
val elementList: HtmlExtractor[Element, List[Element]] =
  (query: ElementQuery[Element]) => query.toList
/** Type-preserving variant of the first-element extractor: for any `E <: Element`, `apply[E]` yields an extractor
 * that returns the first matched element typed as `E`.
 *
 * NOTE(review): presumably fails when the query matched nothing -- confirm against `ElementQuery.head`.
 */
val pElement = new PolyHtmlExtractor {
  type Out[E] = E

  def apply[E <: Element]: HtmlExtractor[E, E] =
    (query: ElementQuery[E]) => query.head
}
/** Type-preserving variant of the query extractor: for any `E <: Element`, `apply[E]` yields an extractor that
 * returns the incoming `ElementQuery[E]` unchanged.
 */
val pElements = new PolyHtmlExtractor {
  type Out[E <: Element] = ElementQuery[E]

  def apply[E <: Element]: HtmlExtractor[E, ElementQuery[E]] =
    (query: ElementQuery[E]) => query
}
/** Type-preserving variant of the element-list extractor: for any `E <: Element`, `apply[E]` yields an extractor
 * that collects the matched elements into a `List[E]`.
 */
val pElementList = new PolyHtmlExtractor {
  type Out[E] = List[E]

  def apply[E <: Element]: HtmlExtractor[E, List[E]] =
    (query: ElementQuery[E]) => query.toList
}
/** Extracts the `text` of the first matched element.
 *
 * NOTE(review): presumably fails when the query matched nothing -- confirm against `ElementQuery.head`.
 */
val text: HtmlExtractor[Element, String] =
  (query: ElementQuery[Element]) => query.head.text
/** Extracts an iterable holding the `text` of every matched element, in the order produced by mapping over the
 * query.
 *
 * NOTE(review): previously documented as lazy; whether evaluation is actually deferred depends on
 * `ElementQuery.map` -- confirm.
 */
val texts: HtmlExtractor[Element, Iterable[String]] =
  (query: ElementQuery[Element]) => for (elem <- query) yield elem.text
/** Extracts the text of all matched elements as a single string: each element's `text` is taken and the pieces are
 * joined with `mkString`, i.e. without any separator.
 */
val allText: HtmlExtractor[Element, String] = { (query: ElementQuery[Element]) =>
  val pieces = query.map(elem => elem.text)
  pieces.mkString
}
/** Builds an extractor that reads one attribute from the first matched element.
 *
 * NOTE(review): presumably fails when the query matched nothing; behavior for a missing attribute is decided by
 * `Element.attr` -- confirm both.
 *
 * @param attr
 *   the name of the attribute to read
 * @return
 *   an extractor yielding the value of `attr` on the first matched element.
 */
def attr(attr: String): HtmlExtractor[Element, String] =
  (query: ElementQuery[Element]) => query.head.attr(attr)
/** Builds an extractor that reads one attribute from every matched element.
 *
 * NOTE(review): previously documented as lazy; whether evaluation is actually deferred depends on
 * `ElementQuery.map` -- confirm.
 *
 * @param attr
 *   the name of the attribute to read
 * @return
 *   an extractor yielding an iterable with the value of `attr` on each matched element.
 */
def attrs(attr: String): HtmlExtractor[Element, Iterable[String]] =
  (query: ElementQuery[Element]) => for (elem <- query) yield elem.attr(attr)
/** An extractor for the form data present in the matched elements.
*/
// TODO add support for
© 2015 - 2025 Weber Informatics LLC | Privacy Policy