net.ruippeixotog.scalascraper.browser.Browser.scala Maven / Gradle / Ivy
The newest version!
package net.ruippeixotog.scalascraper.browser
import java.io.{File, InputStream}
import net.ruippeixotog.scalascraper.model.Document
/** A client able to retrieve and parse HTML pages from the web and from local resources.
*
* An implementation of `Browser` can fetch pages via HTTP GET or POST requests, parse the downloaded page and return a
* [[net.ruippeixotog.scalascraper.model.Document]] instance, which can be queried via the scraper DSL or using its
* methods.
*
* Different [[net.ruippeixotog.scalascraper.browser.Browser]] implementations can embed pages with different runtime
* behavior. For example, some browsers may limit themselves to parse the HTML content inside the page without
* executing any scripts inside, while others may run JavaScript and allow for `Document` instances with dynamic
* content. The documentation of each implementation should be read for more information on the semantics of its
* `Document` and [[net.ruippeixotog.scalascraper.model.Element]] implementations.
*/
trait Browser {
/** The concrete type of documents created by this browser.
*/
type DocumentType <: Document
/** The user agent used by this browser to retrieve HTML pages from the web.
*/
def userAgent: String
/** Retrieves and parses a web page using a GET request.
*
* @param url
* the URL of the page to retrieve
* @return
* a `Document` containing the retrieved web page.
*/
def get(url: String): DocumentType
/** Submits a form via a POST request and parses the resulting page.
*
* @param url
* the URL of the page to retrieve
* @param form
* a map containing the form fields to submit with their respective values
* @return
* a `Document` containing the resulting web page.
*/
def post(url: String, form: Map[String, String]): DocumentType
/** Parses a local HTML file with a specified charset.
*
* @param file
* the HTML file to parse
* @param charset
* the charset of the file
* @return
* a `Document` containing the parsed web page.
*/
def parseFile(file: File, charset: String): DocumentType
/** Parses a local HTML file encoded in UTF-8.
*
* @param file
* the HTML file to parse
* @return
* a `Document` containing the parsed web page.
*/
def parseFile(file: File): DocumentType = parseFile(file, "UTF-8")
/** Parses a local HTML file with a specified charset.
*
* @param path
* the path in the local filesystem where the HTML file is located
* @param charset
* the charset of the file
* @return
* a `Document` containing the parsed web page.
*/
def parseFile(path: String, charset: String): DocumentType = parseFile(new File(path), charset)
/** Parses a local HTML file encoded in UTF-8.
*
* @param path
* the path in the local filesystem where the HTML file is located
* @return
* a `Document` containing the parsed web page.
*/
def parseFile(path: String): DocumentType = parseFile(new File(path), "UTF-8")
/** Parses a resource with a specified charset.
*
* @param name
* the name of the resource to parse
* @param charset
* the charset of the resource
* @return
* a `Document` containing the parsed web page.
*/
def parseResource(name: String, charset: String = "UTF-8"): DocumentType =
parseInputStream(getClass.getResourceAsStream(name), charset)
/** Parses an input stream with its content in a specified charset. The provided input stream is always closed before
* this method returns or throws an exception.
*
* @param inputStream
* the input stream to parse
* @param charset
* the charset of the input stream content
* @return
* a `Document` containing the parsed web page.
*/
def parseInputStream(inputStream: InputStream, charset: String = "UTF-8"): DocumentType
/** Parses an HTML string.
*
* @param html
* the HTML string to parse
* @return
* a `Document` containing the parsed web page.
*/
def parseString(html: String): DocumentType
/** Returns the current set of cookies stored in this browser for a given URL.
*
* @param url
* the URL whose stored cookies are to be returned
* @return
* a mapping of cookie names to their respective values.
*/
def cookies(url: String): Map[String, String]
/** Clears the cookie store of this browser.
*/
def clearCookies(): Unit
/** Returns a new browser that uses the provided proxy for all connections.
*/
def withProxy(proxy: Proxy): Browser
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy