All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jsoup.nodes.Node Maven / Gradle / Ivy

There is a newer version: 2024.q3.1
Show newest version
package org.jsoup.nodes;

import org.jsoup.SerializationException;
import org.jsoup.helper.Consumer;
import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 The base, abstract Node model. Elements, Documents, Comments etc are all Node instances.

 @author Jonathan Hedley, [email protected] */
public abstract class Node implements Cloneable {
    static final List EmptyNodes = Collections.emptyList();
    static final String EmptyString = "";
    @Nullable Node parentNode; // Nodes don't always have parents
    int siblingIndex;

    /**
     * Default constructor. Doesn't setup base uri, children, or attributes; use with caution.
     */
    protected Node() {
    }

    /**
     Get the node name of this node. Use for debugging purposes and not logic switching (for that, use instanceof).
     @return node name
     */
    public abstract String nodeName();

    /**
     * Check if this Node has an actual Attributes object.
     */
    protected abstract boolean hasAttributes();

    /**
     Checks if this node has a parent. Nodes won't have parents if (e.g.) they are newly created and not added as a child
     to an existing node, or if they are a {@link #shallowClone()}. In such cases, {@link #parent()} will return {@code null}.
     @return if this node has a parent.
     */
    public boolean hasParent() {
        return parentNode != null;
    }

    /**
     * Get an attribute's value by its key. Case insensitive
     * 

* To get an absolute URL from an attribute that may be a relative URL, prefix the key with abs, * which is a shortcut to the {@link #absUrl} method. *

* E.g.: *
String url = a.attr("abs:href");
* * @param attributeKey The attribute key. * @return The attribute, or empty string if not present (to avoid nulls). * @see #attributes() * @see #hasAttr(String) * @see #absUrl(String) */ public String attr(String attributeKey) { Validate.notNull(attributeKey); if (!hasAttributes()) return EmptyString; String val = attributes().getIgnoreCase(attributeKey); if (val.length() > 0) return val; else if (attributeKey.startsWith("abs:")) return absUrl(attributeKey.substring("abs:".length())); else return ""; } /** * Get all of the element's attributes. * @return attributes (which implements iterable, in same order as presented in original HTML). */ public abstract Attributes attributes(); /** Get the number of attributes that this Node has. @return the number of attributes @since 1.14.2 */ public int attributesSize() { // added so that we can test how many attributes exist without implicitly creating the Attributes object return hasAttributes() ? attributes().size() : 0; } /** * Set an attribute (key=value). If the attribute already exists, it is replaced. The attribute key comparison is * case insensitive. The key will be set with case sensitivity as set in the parser settings. * @param attributeKey The attribute key. * @param attributeValue The attribute value. * @return this (for chaining) */ public Node attr(String attributeKey, String attributeValue) { attributeKey = NodeUtils.parser(this).settings().normalizeAttribute(attributeKey); attributes().putIgnoreCase(attributeKey, attributeValue); return this; } /** * Test if this Node has an attribute. Case insensitive. * @param attributeKey The attribute key to check. * @return true if the attribute exists, false if not. */ public boolean hasAttr(String attributeKey) { Validate.notNull(attributeKey); if (!hasAttributes()) return false; if (attributeKey.startsWith("abs:")) { String key = attributeKey.substring("abs:".length()); if (attributes().hasKeyIgnoreCase(key) && !absUrl(key).isEmpty()) return true; } return attributes().hasKeyIgnoreCase(attributeKey); } /** * Remove an attribute from this node. * @param attributeKey The attribute to remove. * @return this (for chaining) */ public Node removeAttr(String attributeKey) { Validate.notNull(attributeKey); if (hasAttributes()) attributes().removeIgnoreCase(attributeKey); return this; } /** * Clear (remove) all of the attributes in this node. * @return this, for chaining */ public Node clearAttributes() { if (hasAttributes()) { Iterator it = attributes().iterator(); while (it.hasNext()) { it.next(); it.remove(); } } return this; } /** Get the base URI that applies to this node. Will return an empty string if not defined. Used to make relative links absolute. @return base URI @see #absUrl */ public abstract String baseUri(); /** * Set the baseUri for just this node (not its descendants), if this Node tracks base URIs. * @param baseUri new URI */ protected abstract void doSetBaseUri(String baseUri); /** Update the base URI of this node and all of its descendants. @param baseUri base URI to set */ public void setBaseUri(final String baseUri) { Validate.notNull(baseUri); doSetBaseUri(baseUri); } /** * Get an absolute URL from a URL attribute that may be relative (such as an <a href> or * <img src>). *

* E.g.: String absUrl = linkEl.absUrl("href"); *

*

* If the attribute value is already absolute (i.e. it starts with a protocol, like * http:// or https:// etc), and it successfully parses as a URL, the attribute is * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made * absolute using that. *

*

* As an alternate, you can use the {@link #attr} method with the abs: prefix, e.g.: * String absUrl = linkEl.attr("abs:href"); *

* * @param attributeKey The attribute key * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or * could not be made successfully into a URL. * @see #attr * @see java.net.URL#URL(java.net.URL, String) */ public String absUrl(String attributeKey) { Validate.notEmpty(attributeKey); if (!(hasAttributes() && attributes().hasKeyIgnoreCase(attributeKey))) // not using hasAttr, so that we don't recurse down hasAttr->absUrl return ""; return StringUtil.resolve(baseUri(), attributes().getIgnoreCase(attributeKey)); } protected abstract List ensureChildNodes(); /** Get a child node by its 0-based index. @param index index of child node @return the child node at this index. Throws a {@code IndexOutOfBoundsException} if the index is out of bounds. */ public Node childNode(int index) { return ensureChildNodes().get(index); } /** Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes themselves can be manipulated. @return list of children. If no children, returns an empty list. */ public List childNodes() { if (childNodeSize() == 0) return EmptyNodes; List children = ensureChildNodes(); List rewrap = new ArrayList<>(children.size()); // wrapped so that looping and moving will not throw a CME as the source changes rewrap.addAll(children); return Collections.unmodifiableList(rewrap); } /** * Returns a deep copy of this node's children. Changes made to these nodes will not be reflected in the original * nodes * @return a deep copy of this node's children */ public List childNodesCopy() { final List nodes = ensureChildNodes(); final ArrayList children = new ArrayList<>(nodes.size()); for (Node node : nodes) { children.add(node.clone()); } return children; } /** * Get the number of child nodes that this node holds. * @return the number of child nodes that this node holds. */ public abstract int childNodeSize(); protected Node[] childNodesAsArray() { return ensureChildNodes().toArray(new Node[0]); } /** * Delete all this node's children. * @return this node, for chaining */ public abstract Node empty(); /** Gets this node's parent node. @return parent node; or null if no parent. @see #hasParent() */ public @Nullable Node parent() { return parentNode; } /** Gets this node's parent node. Not overridable by extending classes, so useful if you really just need the Node type. @return parent node; or null if no parent. */ public @Nullable final Node parentNode() { return parentNode; } /** * Get this node's root node; that is, its topmost ancestor. If this node is the top ancestor, returns {@code this}. * @return topmost ancestor. */ public Node root() { Node node = this; while (node.parentNode != null) node = node.parentNode; return node; } /** * Gets the Document associated with this Node. * @return the Document associated with this Node, or null if there is no such Document. */ public @Nullable Document ownerDocument() { Node root = root(); return (root instanceof Document) ? (Document) root : null; } /** * Remove (delete) this node from the DOM tree. If this node has children, they are also removed. */ public void remove() { Validate.notNull(parentNode); parentNode.removeChild(this); } /** * Insert the specified HTML into the DOM before this node (as a preceding sibling). * @param html HTML to add before this node * @return this node, for chaining * @see #after(String) */ public Node before(String html) { addSiblingHtml(siblingIndex, html); return this; } /** * Insert the specified node into the DOM before this node (as a preceding sibling). * @param node to add before this node * @return this node, for chaining * @see #after(Node) */ public Node before(Node node) { Validate.notNull(node); Validate.notNull(parentNode); parentNode.addChildren(siblingIndex, node); return this; } /** * Insert the specified HTML into the DOM after this node (as a following sibling). * @param html HTML to add after this node * @return this node, for chaining * @see #before(String) */ public Node after(String html) { addSiblingHtml(siblingIndex + 1, html); return this; } /** * Insert the specified node into the DOM after this node (as a following sibling). * @param node to add after this node * @return this node, for chaining * @see #before(Node) */ public Node after(Node node) { Validate.notNull(node); Validate.notNull(parentNode); parentNode.addChildren(siblingIndex + 1, node); return this; } private void addSiblingHtml(int index, String html) { Validate.notNull(html); Validate.notNull(parentNode); Element context = parent() instanceof Element ? (Element) parent() : null; List nodes = NodeUtils.parser(this).parseFragmentInput(html, context, baseUri()); parentNode.addChildren(index, nodes.toArray(new Node[0])); } /** Wrap the supplied HTML around this node. @param html HTML to wrap around this node, e.g. {@code
}. Can be arbitrarily deep. If the input HTML does not parse to a result starting with an Element, this will be a no-op. @return this node, for chaining. */ public Node wrap(String html) { Validate.notEmpty(html); // Parse context - parent (because wrapping), this, or null Element context = parentNode != null && parentNode instanceof Element ? (Element) parentNode : this instanceof Element ? (Element) this : null; List wrapChildren = NodeUtils.parser(this).parseFragmentInput(html, context, baseUri()); Node wrapNode = wrapChildren.get(0); if (!(wrapNode instanceof Element)) // nothing to wrap with; noop return this; Element wrap = (Element) wrapNode; Element deepest = getDeepChild(wrap); if (parentNode != null) parentNode.replaceChild(this, wrap); deepest.addChildren(this); // side effect of tricking wrapChildren to lose first // remainder (unbalanced wrap, like

-- The

is remainder if (wrapChildren.size() > 0) { //noinspection ForLoopReplaceableByForEach (beacause it allocates an Iterator which is wasteful here) for (int i = 0; i < wrapChildren.size(); i++) { Node remainder = wrapChildren.get(i); // if no parent, this could be the wrap node, so skip if (wrap == remainder) continue; if (remainder.parentNode != null) remainder.parentNode.removeChild(remainder); wrap.after(remainder); } } return this; } /** * Removes this node from the DOM, and moves its children up into the node's parent. This has the effect of dropping * the node but keeping its children. *

* For example, with the input html: *

*

{@code

One Two Three
}

* Calling {@code element.unwrap()} on the {@code span} element will result in the html: *

{@code

One Two Three
}

* and the {@code "Two "} {@link TextNode} being returned. * * @return the first child of this node, after the node has been unwrapped. @{code Null} if the node had no children. * @see #remove() * @see #wrap(String) */ public @Nullable Node unwrap() { Validate.notNull(parentNode); Node firstChild = firstChild(); parentNode.addChildren(siblingIndex, this.childNodesAsArray()); this.remove(); return firstChild; } private Element getDeepChild(Element el) { List children = el.children(); if (children.size() > 0) return getDeepChild(children.get(0)); else return el; } void nodelistChanged() { // Element overrides this to clear its shadow children elements } /** * Replace this node in the DOM with the supplied node. * @param in the node that will will replace the existing node. */ public void replaceWith(Node in) { Validate.notNull(in); Validate.notNull(parentNode); parentNode.replaceChild(this, in); } protected void setParentNode(Node parentNode) { Validate.notNull(parentNode); if (this.parentNode != null) this.parentNode.removeChild(this); this.parentNode = parentNode; } protected void replaceChild(Node out, Node in) { Validate.isTrue(out.parentNode == this); Validate.notNull(in); if (in.parentNode != null) in.parentNode.removeChild(in); final int index = out.siblingIndex; ensureChildNodes().set(index, in); in.parentNode = this; in.setSiblingIndex(index); out.parentNode = null; } protected void removeChild(Node out) { Validate.isTrue(out.parentNode == this); final int index = out.siblingIndex; ensureChildNodes().remove(index); reindexChildren(index); out.parentNode = null; } protected void addChildren(Node... children) { //most used. short circuit addChildren(int), which hits reindex children and array copy final List nodes = ensureChildNodes(); for (Node child: children) { reparentChild(child); nodes.add(child); child.setSiblingIndex(nodes.size()-1); } } protected void addChildren(int index, Node... children) { Validate.notNull(children); if (children.length == 0) { return; } final List nodes = ensureChildNodes(); // fast path - if used as a wrap (index=0, children = child[0].parent.children - do inplace final Node firstParent = children[0].parent(); if (firstParent != null && firstParent.childNodeSize() == children.length) { boolean sameList = true; final List firstParentNodes = firstParent.ensureChildNodes(); // identity check contents to see if same int i = children.length; while (i-- > 0) { if (children[i] != firstParentNodes.get(i)) { sameList = false; break; } } if (sameList) { // moving, so OK to empty firstParent and short-circuit boolean wasEmpty = childNodeSize() == 0; firstParent.empty(); nodes.addAll(index, Arrays.asList(children)); i = children.length; while (i-- > 0) { children[i].parentNode = this; } if (!(wasEmpty && children[0].siblingIndex == 0)) // skip reindexing if we just moved reindexChildren(index); return; } } Validate.noNullElements(children); for (Node child : children) { reparentChild(child); } nodes.addAll(index, Arrays.asList(children)); reindexChildren(index); } protected void reparentChild(Node child) { child.setParentNode(this); } private void reindexChildren(int start) { final int size = childNodeSize(); if (size == 0) return; final List childNodes = ensureChildNodes(); for (int i = start; i < size; i++) { childNodes.get(i).setSiblingIndex(i); } } /** Retrieves this node's sibling nodes. Similar to {@link #childNodes() node.parent.childNodes()}, but does not include this node (a node is not a sibling of itself). @return node siblings. If the node has no parent, returns an empty list. */ public List siblingNodes() { if (parentNode == null) return Collections.emptyList(); List nodes = parentNode.ensureChildNodes(); List siblings = new ArrayList<>(nodes.size() - 1); for (Node node: nodes) if (node != this) siblings.add(node); return siblings; } /** Get this node's next sibling. @return next sibling, or @{code null} if this is the last sibling */ public @Nullable Node nextSibling() { if (parentNode == null) return null; // root final List siblings = parentNode.ensureChildNodes(); final int index = siblingIndex+1; if (siblings.size() > index) return siblings.get(index); else return null; } /** Get this node's previous sibling. @return the previous sibling, or @{code null} if this is the first sibling */ public @Nullable Node previousSibling() { if (parentNode == null) return null; // root if (siblingIndex > 0) return parentNode.ensureChildNodes().get(siblingIndex-1); else return null; } /** * Get the list index of this node in its node sibling list. E.g. if this is the first node * sibling, returns 0. * @return position in node sibling list * @see org.jsoup.nodes.Element#elementSiblingIndex() */ public int siblingIndex() { return siblingIndex; } protected void setSiblingIndex(int siblingIndex) { this.siblingIndex = siblingIndex; } /** Gets the first child node of this node, or {@code null} if there is none. This could be any Node type, such as an Element, TextNode, Comment, etc. Use {@link Element#firstElementChild()} to get the first Element child. @return the first child node, or null if there are no children. @see Element#firstElementChild() @see #lastChild() @since 1.15.2 */ public @Nullable Node firstChild() { if (childNodeSize() == 0) return null; return ensureChildNodes().get(0); } /** Gets the last child node of this node, or {@code null} if there is none. @return the last child node, or null if there are no children. @see Element#lastElementChild() @see #firstChild() @since 1.15.2 */ public @Nullable Node lastChild() { final int size = childNodeSize(); if (size == 0) return null; List children = ensureChildNodes(); return children.get(size - 1); } /** * Perform a depth-first traversal through this node and its descendants. * @param nodeVisitor the visitor callbacks to perform on each node * @return this node, for chaining */ public Node traverse(NodeVisitor nodeVisitor) { Validate.notNull(nodeVisitor); NodeTraversor.traverse(nodeVisitor, this); return this; } /** Perform the supplied action on this Node and each of its descendants, during a depth-first traversal. Nodes may be inspected, changed, added, replaced, or removed. @param action the function to perform on the node @return this Node, for chaining @see Element#forEach(Consumer) */ public Node forEachNode(Consumer action) { Validate.notNull(action); NodeTraversor.traverse((node, depth) -> action.accept(node), this); return this; } /** * Perform a depth-first filtering through this node and its descendants. * @param nodeFilter the filter callbacks to perform on each node * @return this node, for chaining */ public Node filter(NodeFilter nodeFilter) { Validate.notNull(nodeFilter); NodeTraversor.filter(nodeFilter, this); return this; } /** Get the outer HTML of this node. For example, on a {@code p} element, may return {@code

Para

}. @return outer HTML @see Element#html() @see Element#text() */ public String outerHtml() { StringBuilder accum = StringUtil.borrowBuilder(); outerHtml(accum); return StringUtil.releaseBuilder(accum); } protected void outerHtml(Appendable accum) { NodeTraversor.traverse(new OuterHtmlVisitor(accum, NodeUtils.outputSettings(this)), this); } /** Get the outer HTML of this node. @param accum accumulator to place HTML into @throws IOException if appending to the given accumulator fails. */ abstract void outerHtmlHead(final Appendable accum, int depth, final Document.OutputSettings out) throws IOException; abstract void outerHtmlTail(final Appendable accum, int depth, final Document.OutputSettings out) throws IOException; /** * Write this node and its children to the given {@link Appendable}. * * @param appendable the {@link Appendable} to write to. * @return the supplied {@link Appendable}, for chaining. */ public T html(T appendable) { outerHtml(appendable); return appendable; } /** Get the source range (start and end positions) in the original input source that this node was parsed from. Position tracking must be enabled prior to parsing the content. For an Element, this will be the positions of the start tag. @return the range for the start of the node. @see org.jsoup.parser.Parser#setTrackPosition(boolean) @see Element#endSourceRange() @since 1.15.2 */ public Range sourceRange() { return Range.of(this, true); } /** * Gets this node's outer HTML. * @return outer HTML. * @see #outerHtml() */ public String toString() { return outerHtml(); } protected void indent(Appendable accum, int depth, Document.OutputSettings out) throws IOException { accum.append('\n').append(StringUtil.padding(depth * out.indentAmount(), out.maxPaddingWidth())); } /** * Check if this node is the same instance of another (object identity test). *

For an node value equality check, see {@link #hasSameValue(Object)}

* @param o other object to compare to * @return true if the content of this node is the same as the other * @see Node#hasSameValue(Object) */ @Override public boolean equals(@Nullable Object o) { // implemented just so that javadoc is clear this is an identity test return this == o; } /** Provides a hashCode for this Node, based on it's object identity. Changes to the Node's content will not impact the result. @return an object identity based hashcode for this Node */ @Override public int hashCode() { // implemented so that javadoc and scanners are clear this is an identity test return super.hashCode(); } /** * Check if this node is has the same content as another node. A node is considered the same if its name, attributes and content match the * other node; particularly its position in the tree does not influence its similarity. * @param o other object to compare to * @return true if the content of this node is the same as the other */ public boolean hasSameValue(@Nullable Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; return this.outerHtml().equals(((Node) o).outerHtml()); } /** * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or * parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the * original node. *

* The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}. * @return a stand-alone cloned node, including clones of any children * @see #shallowClone() */ @SuppressWarnings("MethodDoesntCallSuperMethod") // because it does call super.clone in doClone - analysis just isn't following @Override public Node clone() { Node thisClone = doClone(null); // splits for orphan // Queue up nodes that need their children cloned (BFS). final LinkedList nodesToProcess = new LinkedList<>(); nodesToProcess.add(thisClone); while (!nodesToProcess.isEmpty()) { Node currParent = nodesToProcess.remove(); final int size = currParent.childNodeSize(); for (int i = 0; i < size; i++) { final List childNodes = currParent.ensureChildNodes(); Node childClone = childNodes.get(i).doClone(currParent); childNodes.set(i, childClone); nodesToProcess.add(childClone); } } return thisClone; } /** * Create a stand-alone, shallow copy of this node. None of its children (if any) will be cloned, and it will have * no parent or sibling nodes. * @return a single independent copy of this node * @see #clone() */ public Node shallowClone() { return doClone(null); } /* * Return a clone of the node using the given parent (which can be null). * Not a deep copy of children. */ protected Node doClone(@Nullable Node parent) { Node clone; try { clone = (Node) super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.parentNode = parent; // can be null, to create an orphan split clone.siblingIndex = parent == null ? 0 : siblingIndex; // if not keeping the parent, shallowClone the ownerDocument to preserve its settings if (parent == null && !(this instanceof Document)) { Document doc = ownerDocument(); if (doc != null) { Document docClone = doc.shallowClone(); clone.parentNode = docClone; docClone.ensureChildNodes().add(clone); } } return clone; } private static class OuterHtmlVisitor implements NodeVisitor { private final Appendable accum; private final Document.OutputSettings out; OuterHtmlVisitor(Appendable accum, Document.OutputSettings out) { this.accum = accum; this.out = out; out.prepareEncoder(); } public void head(Node node, int depth) { try { node.outerHtmlHead(accum, depth, out); } catch (IOException exception) { throw new SerializationException(exception); } } public void tail(Node node, int depth) { if (!node.nodeName().equals("#text")) { // saves a void hit. try { node.outerHtmlTail(accum, depth, out); } catch (IOException exception) { throw new SerializationException(exception); } } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy