All downloads are free. The search and download functionality uses the official Maven repository.

org.jsoup.nodes.Document Maven / Gradle / Ivy

There is a newer version: 1.18.3
Show newest version
package org.jsoup.nodes;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.jsoup.select.Evaluator;
import org.jsoup.select.Selector;
import org.jspecify.annotations.Nullable;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.List;

/**
 An HTML Document.

 @author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {
    private @Nullable Connection connection; // the connection this doc was fetched from, if any
    private OutputSettings outputSettings = new OutputSettings(); // serialization settings; replaceable via outputSettings(OutputSettings)
    private Parser parser; // the parser used to parse this document
    private QuirksMode quirksMode = QuirksMode.noQuirks; // starts as noQuirks; changed via quirksMode(QuirksMode)
    private final String location; // assigned once from the constructor's baseUri; returned by location()
    private boolean updateMetaCharset = false; // gate checked by ensureMetaCharsetElement(); off by default

    /**
     Construct an empty Document whose root node lives in the given namespace.
     @param namespace namespace to assign to this Document's root node
     @param baseUri base URI of the document; also recorded as the document's location
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String namespace, String baseUri) {
        super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
        parser = Parser.htmlParser(); // starting default; may be swapped later through parser(Parser)
        location = baseUri;
    }

    /**
     Create a new, empty Document, in the HTML namespace. Delegates to
     {@link #Document(String, String)}, passing {@link Parser#NamespaceHtml} as the namespace.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #Document(String namespace, String baseUri)
     */
    public Document(String baseUri) {
        this(Parser.NamespaceHtml, baseUri);
    }

    /**
     Create a valid, empty shell of a document, suitable for adding more elements to.
     <p>The returned document keeps the default parser assigned by the constructor.</p>
     @param baseUri baseUri of document; checked with {@code Validate.notNull}
     @return document with html, head, and body elements.
     */
    public static Document createShell(String baseUri) {
        Validate.notNull(baseUri);

        // The constructor already installs the default parser, so no re-assignment is needed here.
        Document doc = new Document(baseUri);
        Element html = doc.appendElement("html");
        html.appendElement("head");
        html.appendElement("body");

        return doc;
    }

    /**
     * Get the URL this Document was parsed from. If the starting URL is a redirect,
     * this will return the final URL from which the document was served from.
     * 

Will return an empty string if the location is unknown (e.g. if parsed from a String). * @return location */ public String location() { return location; } /** Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new default Connection object. This can be used to continue a session, preserving settings and cookies, etc. @return the Connection (session) associated with this Document, or an empty one otherwise. @see Connection#newRequest() */ public Connection connection() { if (connection == null) return Jsoup.newSession(); else return connection; } /** * Returns this Document's doctype. * @return document type, or null if not set */ public @Nullable DocumentType documentType() { for (Node node : childNodes) { if (node instanceof DocumentType) return (DocumentType) node; else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc break; } return null; // todo - add a set document type? } /** Find the root HTML element, or create it if it doesn't exist. @return the root HTML element. */ private Element htmlEl() { Element el = firstElementChild(); while (el != null) { if (el.normalName().equals("html")) return el; el = el.nextElementSibling(); } return appendElement("html"); } /** Get this document's {@code head} element.

As a side-effect, if this Document does not already have a HTML structure, it will be created. If you do not want that, use {@code #selectFirst("head")} instead. @return {@code head} element. */ public Element head() { final Element html = htmlEl(); Element el = html.firstElementChild(); while (el != null) { if (el.normalName().equals("head")) return el; el = el.nextElementSibling(); } return html.prependElement("head"); } /** Get this document's {@code } or {@code } element.

As a side-effect, if this Document does not already have a HTML structure, it will be created with a {@code } element. If you do not want that, use {@code #selectFirst("body")} instead. @return {@code body} element for documents with a {@code }, a new {@code } element if the document had no contents, or the outermost {@code element} for frameset documents. */ public Element body() { final Element html = htmlEl(); Element el = html.firstElementChild(); while (el != null) { if ("body".equals(el.normalName()) || "frameset".equals(el.normalName())) return el; el = el.nextElementSibling(); } return html.appendElement("body"); } /** Get each of the {@code

} elements contained in this document. @return a List of FormElement objects, which will be empty if there are none. @see Elements#forms() @see FormElement#elements() @since 1.15.4 */ public List forms() { return select("form").forms(); } /** Selects the first {@link FormElement} in this document that matches the query. If none match, throws an {@link IllegalArgumentException}. @param cssQuery a {@link Selector} CSS query @return the first matching {@code } element @throws IllegalArgumentException if no match is found @since 1.15.4 */ public FormElement expectForm(String cssQuery) { Elements els = select(cssQuery); for (Element el : els) { if (el instanceof FormElement) return (FormElement) el; } Validate.fail("No form elements matched the query '%s' in the document.", cssQuery); return null; // (not really) } /** Get the string contents of the document's {@code title} element. @return Trimmed title, or empty string if none set. */ public String title() { // title is a preserve whitespace tag (for document output), but normalised here Element titleEl = head().selectFirst(titleEval); return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; } private static final Evaluator titleEval = new Evaluator.Tag("title"); /** Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if not present @param title string to set as title */ public void title(String title) { Validate.notNull(title); Element titleEl = head().selectFirst(titleEval); if (titleEl == null) // add to head titleEl = head().appendElement("title"); titleEl.text(title); } /** Create a new Element, with this document's base uri. Does not make the new element a child of this document. @param tagName element tag name (e.g. 
{@code a}) @return new element */ public Element createElement(String tagName) { return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); } @Override public String outerHtml() { return super.html(); // no outer wrapper tag } /** Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. @param text unencoded text @return this document */ @Override public Element text(String text) { body().text(text); // overridden to not nuke doc structure return this; } @Override public String nodeName() { return "#document"; } /** * Sets the charset used in this document. This method is equivalent * to {@link OutputSettings#charset(java.nio.charset.Charset) * OutputSettings.charset(Charset)} but in addition it updates the * charset / encoding element within the document. * *

This enables * {@link #updateMetaCharsetElement(boolean) meta charset update}.

* *

If there's no element with charset / encoding information yet it will * be created. Obsolete charset / encoding definitions are removed!

* *

Elements used:

* *
    *
  • Html: <meta charset="CHARSET">
  • *
  • Xml: <?xml version="1.0" encoding="CHARSET">
  • *
* * @param charset Charset * * @see #updateMetaCharsetElement(boolean) * @see OutputSettings#charset(java.nio.charset.Charset) */ public void charset(Charset charset) { updateMetaCharsetElement(true); outputSettings.charset(charset); ensureMetaCharsetElement(); } /** * Returns the charset used in this document. This method is equivalent * to {@link OutputSettings#charset()}. * * @return Current Charset * * @see OutputSettings#charset() */ public Charset charset() { return outputSettings.charset(); } /** * Sets whether the element with charset information in this document is * updated on changes through {@link #charset(java.nio.charset.Charset) * Document.charset(Charset)} or not. * *

If set to false (default) there are no elements * modified.

* * @param update If true the element updated on charset * changes, false if not * * @see #charset(java.nio.charset.Charset) */ public void updateMetaCharsetElement(boolean update) { this.updateMetaCharset = update; } /** * Returns whether the element with charset information in this document is * updated on changes through {@link #charset(java.nio.charset.Charset) * Document.charset(Charset)} or not. * * @return Returns true if the element is updated on charset * changes, false if not */ public boolean updateMetaCharsetElement() { return updateMetaCharset; } @Override public Document clone() { Document clone = (Document) super.clone(); clone.outputSettings = this.outputSettings.clone(); return clone; } @Override public Document shallowClone() { Document clone = new Document(this.tag().namespace(), baseUri()); if (attributes != null) clone.attributes = attributes.clone(); clone.outputSettings = this.outputSettings.clone(); return clone; } /** * Ensures a meta charset (html) or xml declaration (xml) with the current * encoding used. This only applies with * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to * true, otherwise this method does nothing. * *
    *
  • An existing element gets updated with the current charset
  • *
  • If there's no element yet it will be inserted
  • *
  • Obsolete elements are removed
  • *
* *

Elements used:

* *
    *
  • Html: <meta charset="CHARSET">
  • *
  • Xml: <?xml version="1.0" encoding="CHARSET">
  • *
*/ private void ensureMetaCharsetElement() { if (updateMetaCharset) { OutputSettings.Syntax syntax = outputSettings().syntax(); if (syntax == OutputSettings.Syntax.html) { Element metaCharset = selectFirst("meta[charset]"); if (metaCharset != null) { metaCharset.attr("charset", charset().displayName()); } else { head().appendElement("meta").attr("charset", charset().displayName()); } select("meta[name=charset]").remove(); // Remove obsolete elements } else if (syntax == OutputSettings.Syntax.xml) { Node node = ensureChildNodes().get(0); if (node instanceof XmlDeclaration) { XmlDeclaration decl = (XmlDeclaration) node; if (decl.name().equals("xml")) { decl.attr("encoding", charset().displayName()); if (decl.hasAttr("version")) decl.attr("version", "1.0"); } else { decl = new XmlDeclaration("xml", false); decl.attr("version", "1.0"); decl.attr("encoding", charset().displayName()); prependChild(decl); } } else { XmlDeclaration decl = new XmlDeclaration("xml", false); decl.attr("version", "1.0"); decl.attr("encoding", charset().displayName()); prependChild(decl); } } } } /** * A Document's output settings control the form of the text() and html() methods. */ public static class OutputSettings implements Cloneable { /** * The output serialization syntax. 
*/ public enum Syntax {html, xml} private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; private Charset charset; Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8 private final ThreadLocal encoderThreadLocal = new ThreadLocal<>(); // initialized by start of OuterHtmlVisitor private boolean prettyPrint = true; private boolean outline = false; private int indentAmount = 1; private int maxPaddingWidth = 30; private Syntax syntax = Syntax.html; public OutputSettings() { charset(DataUtil.UTF_8); } /** * Get the document's current HTML escape mode: base, which provides a limited set of named HTML * entities and escapes other characters as numbered entities for maximum compatibility; or extended, * which uses the complete set of HTML named entities. *

* The default escape mode is base. * @return the document's current escape mode */ public Entities.EscapeMode escapeMode() { return escapeMode; } /** * Set the document's escape mode, which determines how characters are escaped when the output character set * does not support a given character:- using either a named or a numbered escape. * @param escapeMode the new escape mode to use * @return the document's output settings, for chaining */ public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { this.escapeMode = escapeMode; return this; } /** * Get the document's current output charset, which is used to control which characters are escaped when * generating HTML (via the html() methods), and which are kept intact. *

* Where possible (when parsing from a URL or File), the document's output charset is automatically set to the * input charset. Otherwise, it defaults to UTF-8. * @return the document's current charset. */ public Charset charset() { return charset; } /** * Update the document's output charset. * @param charset the new charset to use. * @return the document's output settings, for chaining */ public OutputSettings charset(Charset charset) { this.charset = charset; coreCharset = Entities.CoreCharset.byName(charset.name()); return this; } /** * Update the document's output charset. * @param charset the new charset (by name) to use. * @return the document's output settings, for chaining */ public OutputSettings charset(String charset) { charset(Charset.forName(charset)); return this; } CharsetEncoder prepareEncoder() { // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads CharsetEncoder encoder = charset.newEncoder(); encoderThreadLocal.set(encoder); return encoder; } CharsetEncoder encoder() { CharsetEncoder encoder = encoderThreadLocal.get(); return encoder != null ? encoder : prepareEncoder(); } /** * Get the document's current output syntax. * @return current syntax */ public Syntax syntax() { return syntax; } /** * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or * {@code xml}, with self-closing tags. *

When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.

* @param syntax serialization syntax * @return the document's output settings, for chaining */ public OutputSettings syntax(Syntax syntax) { this.syntax = syntax; if (syntax == Syntax.xml) this.escapeMode(Entities.EscapeMode.xhtml); return this; } /** * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format * the output, and the output will generally look like the input. * @return if pretty printing is enabled. */ public boolean prettyPrint() { return prettyPrint; } /** * Enable or disable pretty printing. * @param pretty new pretty print setting * @return this, for chaining */ public OutputSettings prettyPrint(boolean pretty) { prettyPrint = pretty; return this; } /** * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider * all tags as block. * @return if outline mode is enabled. */ public boolean outline() { return outline; } /** * Enable or disable HTML outline mode. * @param outlineMode new outline setting * @return this, for chaining */ public OutputSettings outline(boolean outlineMode) { outline = outlineMode; return this; } /** * Get the current tag indent amount, used when pretty printing. * @return the current indent amount */ public int indentAmount() { return indentAmount; } /** * Set the indent amount for pretty printing * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0. * @return this, for chaining */ public OutputSettings indentAmount(int indentAmount) { Validate.isTrue(indentAmount >= 0); this.indentAmount = indentAmount; return this; } /** * Get the current max padding amount, used when pretty printing * so very deeply nested nodes don't get insane padding amounts. * @return the current indent amount */ public int maxPaddingWidth() { return maxPaddingWidth; } /** * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. 
* @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1. * Default is 30 and -1 means unlimited. * @return this, for chaining */ public OutputSettings maxPaddingWidth(int maxPaddingWidth) { Validate.isTrue(maxPaddingWidth >= -1); this.maxPaddingWidth = maxPaddingWidth; return this; } @Override public OutputSettings clone() { OutputSettings clone; try { clone = (OutputSettings) super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.charset(charset.name()); // new charset, coreCharset, and charset encoder clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle return clone; } } /** * Get the document's current output settings. * @return the document's current output settings. */ public OutputSettings outputSettings() { return outputSettings; } /** * Set the document's output settings. * @param outputSettings new output settings. * @return this document, for chaining. */ public Document outputSettings(OutputSettings outputSettings) { Validate.notNull(outputSettings); this.outputSettings = outputSettings; return this; } public enum QuirksMode { noQuirks, quirks, limitedQuirks } public QuirksMode quirksMode() { return quirksMode; } public Document quirksMode(QuirksMode quirksMode) { this.quirksMode = quirksMode; return this; } /** * Get the parser that was used to parse this document. * @return the parser */ public Parser parser() { return parser; } /** * Set the parser used to create this document. This parser is then used when further parsing within this document * is required. * @param parser the configured parser to use when further parsing is required for this document. * @return this document, for chaining. */ public Document parser(Parser parser) { this.parser = parser; return this; } /** Set the Connection used to fetch this document. 
This Connection is used as a session object when further requests are made (e.g. when a form is submitted). @param connection to set @return this document, for chaining @see Connection#newRequest() @since 1.14.1 */ public Document connection(Connection connection) { Validate.notNull(connection); this.connection = connection; return this; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy