All downloads are free. The search and download functionality uses the official Maven repository.

leap.lang.jsoup.Jsoup Maven / Gradle / Ivy

The newest version!
package leap.lang.jsoup;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import leap.lang.jsoup.Connection;
import leap.lang.jsoup.HttpStatusException;
import leap.lang.jsoup.UnsupportedMimeTypeException;
import leap.lang.jsoup.helper.DataUtil;
import leap.lang.jsoup.helper.HttpConnection;
import leap.lang.jsoup.nodes.Document;
import leap.lang.jsoup.parser.HtmlParseMode;
import leap.lang.jsoup.parser.Parser;

/**
 The core public access point to the jsoup functionality. This is a static utility class: it cannot be instantiated,
 and every method delegates to {@link Parser}, {@link DataUtil} or {@link HttpConnection}.

 @author Jonathan Hedley */
public final class Jsoup {
    // Static utility class; never instantiated.
    private Jsoup() {}

    /**
     Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.

     @param html    HTML to parse
     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
     before the HTML declares a {@code <base href>} tag.
     @return sane HTML
     */
    public static Document parse(String html, String baseUri) {
        return Parser.parse(html, baseUri);
    }

    /**
     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
     (non-HTML) parser.

     @param html    HTML to parse
     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
     before the HTML declares a {@code <base href>} tag.
     @param parser  alternate {@link Parser#xmlParser() parser} to use.
     @return sane HTML
     */
    public static Document parse(String html, String baseUri, Parser parser) {
        return parser.parseInput(html, baseUri);
    }

    /**
     Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
     {@code <base href>} tag.

     @param html HTML to parse
     @return sane HTML

     @see #parse(String, String)
     */
    public static Document parse(String html) {
        return Parser.parse(html, "");
    }

    /**
     Parse HTML into a Document using the given parse mode. As no base URI is specified (an empty base URI is passed
     to the parser), absolute URL detection relies on the HTML including a {@code <base href>} tag.

     @param html      HTML to parse
     @param parseMode parse mode handed unchanged to {@code Parser.parse(html, "", parseMode)}; see
     {@link HtmlParseMode} for the available modes.
     @return sane HTML

     @see #parse(String)
     */
    public static Document parse(String html, HtmlParseMode parseMode) {
        return Parser.parse(html, "", parseMode);
    }

    /**
     Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
     <p>
     Use examples:
     <ul>
      <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
      <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
     </ul>

     @param url URL to connect to. The protocol must be {@code http} or {@code https}.
     @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then
     execute.
     */
    public static Connection connect(String url) {
        return HttpConnection.connect(url);
    }

    /**
     Parse the contents of a file as HTML.

     @param in          file to load HTML from
     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from
     {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do).
     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
     @return sane HTML

     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
     */
    public static Document parse(File in, String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.

     @param in          file to load HTML from
     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from
     {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do).
     @return sane HTML

     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
     @see #parse(File, String, String)
     */
    public static Document parse(File in, String charsetName) throws IOException {
        // The file's absolute path doubles as the base URI.
        return DataUtil.load(in, charsetName, in.getAbsolutePath());
    }

    /**
     Read an input stream, and parse it to a Document.

     @param in          input stream to read. Make sure to close it after parsing.
     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from
     {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do).
     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
     @return sane HTML

     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
     (non-HTML) parser.

     @param in          input stream to read. Make sure to close it after parsing.
     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from
     {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do).
     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
     @param parser      alternate {@link Parser#xmlParser() parser} to use.
     @return sane HTML

     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
        return DataUtil.load(in, charsetName, baseUri, parser);
    }

    /**
     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.

     @param bodyHtml body HTML fragment
     @param baseUri  URL to resolve relative URLs against.
     @return sane HTML document

     @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        return Parser.parseBodyFragment(bodyHtml, baseUri);
    }

    /**
     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. An empty base URI is
     passed to the parser.

     @param bodyHtml body HTML fragment
     @return sane HTML document

     @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml) {
        return Parser.parseBodyFragment(bodyHtml, "");
    }

    /**
     Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
     <p>
     The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to
     {@code UTF-8}.

     @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
     @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
     @return The parsed HTML.

     @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed
     @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored
     @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored
     @throws java.net.SocketTimeoutException if the connection times out
     @throws IOException if a connection or read error occurs

     @see #connect(String)
     */
    public static Document parse(URL url, int timeoutMillis) throws IOException {
        Connection con = HttpConnection.connect(url);
        con.timeout(timeoutMillis);
        return con.get();
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy