All Downloads are FREE. Search and download functionalities are using the official Maven repository.

it.skrape.core.Parser.kt Maven / Gradle / Ivy

Go to download

A Kotlin-based testing/scraping/parsing library providing the ability to analyze and extract data from HTML (server & client-side rendered). It places particular emphasis on ease of use and a high level of readability by providing an intuitive DSL. First and foremost it aims to be a testing lib, but it can also be used to scrape websites in a convenient fashion.

There is a newer version: 1.3.0-alpha.2
Show newest version
package it.skrape.core

import it.skrape.SkrapeItDsl
import it.skrape.fetcher.BrowserFetcher
import it.skrape.fetcher.Result
import it.skrape.selects.Doc
import org.intellij.lang.annotations.Language
import org.jsoup.nodes.Document
import java.io.File
import java.nio.charset.Charset
import org.jsoup.parser.Parser.parse as jSoupParser

/**
 * Turns an HTML string into a [Doc] using jsoup, optionally rendering it
 * through [BrowserFetcher] first.
 *
 * @property html markup to parse
 * @property charset charset associated with the markup; [parse] itself does not read it
 * @property jsExecution when `true`, [html] is passed to [BrowserFetcher.render] before parsing
 * @property baseUri base URI handed to the jsoup parser
 */
internal class Parser(
    var html: String,
    val charset: Charset,
    val jsExecution: Boolean,
    val baseUri: String
) {

    /**
     * Parses [html] into a [Doc].
     *
     * @throws MissingDependencyException if [jsExecution] is enabled but the
     * browser-fetcher classes cannot be loaded
     */
    fun parse(): Doc {
        // Decide which markup to hand to jsoup; the dependency check must run before rendering.
        val markup = when {
            jsExecution -> {
                ensureBrowserFetcherAvailable()
                BrowserFetcher.render(html)
            }
            else -> html
        }
        return jSoupParser(markup, baseUri).toDocWrapper()
    }

    /** Fails with [MissingDependencyException] when the browser-fetcher class cannot be loaded. */
    private fun ensureBrowserFetcherAvailable() {
        val fetcherClassName = "it.skrape.fetcher.BrowserFetcherKt"
        try {
            Class.forName(fetcherClassName)
        } catch (notFound: ClassNotFoundException) {
            throw MissingDependencyException("you need to add browser-fetcher dependency to exec JS")
        }
    }

    /** Wraps a jsoup [Document] in a [Doc]. */
    private fun Document.toDocWrapper(): Doc = Doc(this)

    /** Raised when JS execution is requested but the browser-fetcher dependency is absent. */
    class MissingDependencyException(message: String = "") : Exception(message)
}

/**
 * Read and parse HTML from a String, then evaluate [init] against the parsed document.
 *
 * @param T type of the value produced by [init]
 * @param html represents a html snippet
 * @param charset forwarded to the parser; defaults to UTF-8
 * @param jsExecution defaults to false; when true the snippet is rendered via the browser fetcher before parsing
 * @param baseUri defaults to empty String
 * @param init block invoked with the parsed [Doc] as receiver
 * @return whatever [init] returns
 */
public fun <T> htmlDocument(
    @Language("HTML") html: String,
    charset: Charset = Charsets.UTF_8,
    jsExecution: Boolean = false,
    baseUri: String = "",
    init: Doc.() -> T
): T = htmlDocument(html, charset, jsExecution, baseUri).init()


/**
 * Read and parse a html file from local file-system, then evaluate [init]
 * against the parsed document.
 *
 * @param T type of the value produced by [init]
 * @param file file whose text content is parsed
 * @param charset used to decode the file; defaults to UTF-8
 * @param jsExecution defaults to false
 * @param baseUri defaults to empty String
 * @param init block invoked with the parsed [Doc] as receiver
 * @return whatever [init] returns
 */
public fun <T> htmlDocument(
    file: File,
    charset: Charset = Charsets.UTF_8,
    jsExecution: Boolean = false,
    baseUri: String = "",
    init: Doc.() -> T
): T = htmlDocument(file.readText(charset), charset, jsExecution, baseUri).init()

/**
 * Parse an HTML snippet into a [Doc].
 *
 * @param html represents a html snippet
 * @param charset forwarded to the parser; defaults to UTF-8
 * @param jsExecution defaults to false
 * @param baseUri defaults to empty String
 * @return the parsed document
 */
@SkrapeItDsl
public fun htmlDocument(
    @Language("HTML") html: String,
    charset: Charset = Charsets.UTF_8,
    jsExecution: Boolean = false,
    baseUri: String = ""
): Doc {
    val parser = Parser(
        html = html,
        charset = charset,
        jsExecution = jsExecution,
        baseUri = baseUri
    )
    return parser.parse()
}

/**
 * Read a html file from local file-system and parse it into a [Doc].
 *
 * @param file file whose text content is parsed
 * @param charset used to decode the file; defaults to UTF-8
 * @param jsExecution defaults to false
 * @param baseUri defaults to empty String
 * @return the parsed document
 */
public fun htmlDocument(
    file: File,
    charset: Charset = Charsets.UTF_8,
    jsExecution: Boolean = false,
    baseUri: String = ""
): Doc {
    val markup = file.readText(charset)
    return htmlDocument(
        html = markup,
        charset = charset,
        jsExecution = jsExecution,
        baseUri = baseUri
    )
}

/**
 * The response body of this [Result] parsed into a [Doc], using the result's
 * base URI. The body is parsed on every access.
 */
public val Result.document: Doc
    get() = htmlDocument(html = responseBody, baseUri = baseUri)

/**
 * Parse the response body of this [Result] (using its base URI) and evaluate
 * [init] against the parsed document.
 *
 * @param T type of the value produced by [init]
 * @param init block invoked with the parsed [Doc] as receiver
 * @return whatever [init] returns
 */
public fun <T> Result.htmlDocument(init: Doc.() -> T): T = htmlDocument(html = responseBody, baseUri = baseUri).init()




© 2015 - 2025 Weber Informatics LLC | Privacy Policy