// org.jsoup.nodes.Document Maven / Gradle / Ivy
package org.jsoup.nodes;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.jsoup.select.Evaluator;
import org.jsoup.select.Selector;
import org.jspecify.annotations.Nullable;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.List;
import static org.jsoup.parser.Parser.NamespaceHtml;
/**
A HTML Document.
@author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {
// Connection this document was fetched from; null when there is none, in which case connection() returns a new session.
private @Nullable Connection connection; // the connection this doc was fetched from, if any
// Output settings for this document; starts as a default-constructed OutputSettings.
private OutputSettings outputSettings = new OutputSettings();
// Assigned an HTML parser by the constructor.
private Parser parser; // the parser used to parse this document
// Quirks mode of this document; starts as noQuirks.
private QuirksMode quirksMode = QuirksMode.noQuirks;
// Set once in the constructor from the base URI, and returned by location().
private final String location;
// Starts false. NOTE(review): presumably controls whether a charset meta element is kept in sync; usage is not visible here, confirm.
private boolean updateMetaCharset = false;
/**
 Construct a new, empty Document whose root node is placed in the given namespace. The base URI is also recorded as
 the document's location, and an HTML parser is installed as the default parser.
 @param namespace the namespace of this Document's root node.
 @param baseUri base URI of the document; also stored as its location
 @see org.jsoup.Jsoup#parse
 @see #createShell
 */
public Document(String namespace, String baseUri) {
    super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
    parser = Parser.htmlParser(); // default only; the field is not final and can be replaced
    location = baseUri;
}
/**
 Create a new, empty Document in the HTML namespace. Delegates to the two-argument constructor, passing
 {@code NamespaceHtml} as the namespace.
 @param baseUri base URI of document
 @see org.jsoup.Jsoup#parse
 @see #Document(String namespace, String baseUri)
 */
public Document(String baseUri) {
this(NamespaceHtml, baseUri);
}
/**
 Build an empty but structurally complete document: an {@code html} element containing a {@code head} and a
 {@code body}, suitable for adding more elements to.
 @param baseUri base URI of the document; checked with {@code Validate.notNull}
 @return document with html, head, and body elements.
 */
public static Document createShell(String baseUri) {
    Validate.notNull(baseUri);

    final Document shell = new Document(baseUri);
    // NOTE(review): stores back whatever parser() returns; looks like a no-op, but parser() is not visible here - confirm before removing.
    shell.parser = shell.parser();

    final Element root = shell.appendElement("html");
    root.appendElement("head");
    root.appendElement("body");
    return shell;
}
/**
 * Get the URL this Document was parsed from. When the initial URL redirected, this is the final URL that the
 * document was actually served from. If the location is unknown (e.g. the document was parsed from a String),
 * an empty string is returned.
 * @return the location recorded when this Document was constructed
 */
public String location() {
    return this.location;
}
/**
 Get the Connection (Request/Response) that was used to fetch this document. If there is none, a new default
 session is created via {@code Jsoup.newSession()} and returned instead. Useful for continuing a session while
 preserving settings, cookies, etc.
 @return the Connection (session) associated with this Document, or an empty one otherwise.
 @see Connection#newRequest()
 */
public Connection connection() {
    return connection != null ? connection : Jsoup.newSession();
}
/**
 * Returns this Document's doctype. Only the leading run of leaf child nodes is examined; the search stops at the
 * first child that is not a leaf node.
 * @return document type, or null if not set
 */
public @Nullable DocumentType documentType() {
    for (Node child : childNodes) {
        if (child instanceof DocumentType)
            return (DocumentType) child;
        // leaf nodes (comments, text, processing instructions etc) are skipped; anything else ends the scan
        if (!(child instanceof LeafNode))
            return null;
    }
    return null;
    // todo - add a set document type?
}
/**
 Locate the root {@code html} element among this document's element children, appending a new one when none is
 present.
 @return the root HTML element.
 */
private Element htmlEl() {
    for (Element child = firstElementChild(); child != null; child = child.nextElementSibling()) {
        if (child.nameIs("html"))
            return child;
    }
    // no html element among the children: create it
    return appendElement("html");
}
/**
 Get this document's {@code head} element.
 <p>Note the side-effect: when this Document has no HTML structure yet, it is created (the {@code html} element
 if missing, then a {@code head} prepended to it). To avoid that, use {@code #selectFirst("head")} instead.</p>
 @return {@code head} element.
 */
public Element head() {
    final Element root = htmlEl();
    for (Element child = root.firstElementChild(); child != null; child = child.nextElementSibling()) {
        if (child.nameIs("head"))
            return child;
    }
    // not found: insert the head as the first child of html
    return root.prependElement("head");
}
/**
Get this document's {@code
} or {@code