org.htmlunit.cyberneko.html.dom.HTMLCollectionImpl Maven / Gradle / Ivy
/*
* Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
* Copyright (c) 2017-2024 Ronald Brill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.htmlunit.cyberneko.html.dom;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLAnchorElement;
import org.w3c.dom.html.HTMLAppletElement;
import org.w3c.dom.html.HTMLAreaElement;
import org.w3c.dom.html.HTMLCollection;
import org.w3c.dom.html.HTMLElement;
import org.w3c.dom.html.HTMLFormElement;
import org.w3c.dom.html.HTMLImageElement;
import org.w3c.dom.html.HTMLObjectElement;
import org.w3c.dom.html.HTMLOptionElement;
import org.w3c.dom.html.HTMLTableCellElement;
import org.w3c.dom.html.HTMLTableRowElement;
import org.w3c.dom.html.HTMLTableSectionElement;
/**
* Implements {@link org.w3c.dom.html.HTMLCollection} to traverse any named
* elements on a {@link org.w3c.dom.html.HTMLDocument}. The elements type to
* look for is identified in the constructor by code. This collection is not
* optimized for traversing large trees.
*
* The collection has to meet two requirements: it has to be live, and it has
* to traverse depth first and always return results in that order. As such,
* using an object container (such as {@link java.util.Vector}) is expensive on
* insert/remove operations. Instead, the collection has been implemented using
* three traversing functions. As a result, operations on large documents will
* result in traversal of the entire document tree and consume a considerable
* amount of time.
*
* Note that synchronization on the traversed document cannot be achieved.
* The document itself cannot be locked, and locking each traversed node is
* likely to lead to a dead lock condition. Therefore, there is a chance of the
* document being changed as results are fetched; in all likelihood, the results
* might be out dated, but not erroneous.
*
*
* @author Assaf Arkin
* @see org.w3c.dom.html.HTMLCollection
*/
class HTMLCollectionImpl implements HTMLCollection {
/**
* Request collection of all anchors in document: <A> elements that
* have a name
attribute.
*/
static final short ANCHOR = 1;
/**
* Request collection of all forms in document: <FORM> elements.
*/
static final short FORM = 2;
/**
* Request collection of all images in document: <IMG> elements.
*/
static final short IMAGE = 3;
/**
* Request collection of all Applets in document: <APPLET> and
* <OBJECT> elements (<OBJECT> must contain an Applet).
*/
static final short APPLET = 4;
/**
* Request collection of all links in document: <A> and <AREA>
* elements (must have a href
attribute).
*/
static final short LINK = 5;
/**
* Request collection of all options in selection: <OPTION> elements in
* <SELECT> or <OPTGROUP>.
*/
static final short OPTION = 6;
/**
* Request collection of all rows in table: <TR> elements in table or
* table section.
*/
static final short ROW = 7;
/**
* Request collection of all form elements: <INPUT>, <BUTTON>,
* <SELECT>, and <TEXTAREA> elements inside form <FORM>.
*/
static final short ELEMENT = 8;
/**
* Request collection of all areas in map: <AREA> element in <MAP>
* (non recursive).
*/
static final short AREA = -1;
/**
* Request collection of all table bodies in table: <TBODY> element in
* table <TABLE> (non recursive).
*/
static final short TBODY = -2;
/**
* Request collection of all cells in row: <TD> and <TH>
* elements in <TR> (non recursive).
*/
static final short CELL = -3;
/**
* Indicates what this collection is looking for. Holds one of the enumerated
* values and used by {@link #collectionMatch}. Set by the constructor and
* determine the collection's use for its life time.
*/
private final short lookingFor_;
/**
* This is the top level element underneath which the collection exists.
*/
private final Element topLevel_;
/**
* Construct a new collection that retrieves element of the specific type
* (lookingFor
) from the specific document portion
* (topLevel
).
*
* @param topLevel The element underneath which the collection exists
* @param lookingFor Code indicating what elements to look for
*/
HTMLCollectionImpl(final HTMLElement topLevel, final short lookingFor) {
if (topLevel == null) {
throw new NullPointerException("HTM011 Argument 'topLevel' is null.");
}
topLevel_ = topLevel;
lookingFor_ = lookingFor;
}
/**
* Returns the length of the collection. This method might traverse the
* entire document tree.
*
* @return Length of the collection
*/
@Override
public final int getLength() {
// Call recursive function on top-level element.
return getLength(topLevel_);
}
/**
* Retrieves the indexed node from the collection. Nodes are numbered in
* tree order - depth-first traversal order. This method might traverse
* the entire document tree.
*
* @param index The index of the node to return
* @return The specified node or null if no such node found
*/
@Override
public final Node item(final int index) {
if (index < 0) {
throw new IllegalArgumentException("HTM012 Argument 'index' is negative.");
}
// Call recursive function on top-level element.
return item(topLevel_, new CollectionIndex(index));
}
/**
* Retrieves the named node from the collection. The name is matched case
* sensitive against the id attribute of each element in the
* collection, returning the first match. The tree is traversed in
* depth-first order. This method might traverse the entire document tree.
*
* @param name The name of the node to return
* @return The specified node or null if no such node found
*/
@Override
public final Node namedItem(final String name) {
if (name == null) {
throw new NullPointerException("HTM013 Argument 'name' is null.");
}
// Call recursive function on top-level element.
return namedItem(topLevel_, name);
}
/**
* Recursive function returns the number of elements of a particular type
* that exist under the top level element. This is a recursive function
* and the top level element is passed along.
*
* @param topLevel Top level element from which to scan
* @return Number of elements
*/
private int getLength(final Element topLevel) {
int length;
Node node;
synchronized (topLevel) {
// Always count from zero and traverse all the childs of the
// current element in the order they appear.
length = 0;
node = topLevel.getFirstChild();
while (node != null) {
// If a particular node is an element (could be HTML or XML),
// do two things: if it's the one we're looking for, count
// another matched element; at any rate, traverse it's
// children as well.
if (node instanceof Element) {
if (collectionMatch((Element) node, null)) {
++length;
}
else if (recurse()) {
length += getLength((Element) node);
}
}
node = node.getNextSibling();
}
}
return length;
}
/**
* Recursive function returns the numbered element of a particular type
* that exist under the top level element. This is a recursive function
* and the top level element is passed along.
*
* Note that this function must call itself with an index and get back both
* the element (if one was found) and the new index which is decremeneted
* for any like element found. Since integers are only passed by value,
* this function makes use of a separate class ({@link CollectionIndex})
* to hold that index.
*
* @param topLevel Top level element from which to scan
* @param index The index of the item to retreive
* @return Number of elements
* @see CollectionIndex
*/
private Node item(final Element topLevel, final CollectionIndex index) {
Node node;
Node result;
synchronized (topLevel) {
// Traverse all the childs of the current element in the order
// they appear. Count from the index backwards until you reach
// matching element with an index of zero. Return that element.
node = topLevel.getFirstChild();
while (node != null) {
// If a particular node is an element (could be HTML or XML),
// do two things: if it's the one we're looking for, decrease
// the index and if zero, return this node; at any rate,
// traverse it's children as well.
if (node instanceof Element) {
if (collectionMatch((Element) node, null)) {
if (index.isZero()) {
return node;
}
index.decrement();
}
else if (recurse()) {
result = item((Element) node, index);
if (result != null) {
return result;
}
}
}
node = node.getNextSibling();
}
}
return null;
}
/**
* Recursive function returns an element of a particular type with the
* specified name (id attribute).
*
* @param topLevel Top level element from which to scan
* @param name The named element to look for
* @return The first named element found
*/
private Node namedItem(final Element topLevel, final String name) {
Node node;
Node result;
synchronized (topLevel) {
// Traverse all the childs of the current element in the order
// they appear.
node = topLevel.getFirstChild();
while (node != null) {
// If a particular node is an element (could be HTML or XML),
// do two things: if it's the one we're looking for, and the
// name (id attribute) attribute is the one we're looking for,
// return this element; otherwise, traverse it's children.
if (node instanceof Element) {
if (collectionMatch((Element) node, name)) {
return node;
}
else if (recurse()) {
result = namedItem((Element) node, name);
if (result != null) {
return result;
}
}
}
node = node.getNextSibling();
}
return node;
}
}
/**
* Returns true if scanning methods should iterate through the collection.
* When looking for elements in the document, recursing is needed to traverse
* the full document tree. When looking inside a specific element (e.g. for a
* cell inside a row), recursing can lead to erroneous results.
*
* @return True if methods should recurse to traverse entire tree
*/
protected boolean recurse() {
return lookingFor_ > 0;
}
/**
* Determines if current element matches based on what we're looking for.
* The element is passed along with an optional identifier name. If the
* element is the one we're looking for, return true. If the name is also
* specified, the name must match the id
attribute
* (match name
first for anchors).
*
* @param elem The current element
* @param name The identifier name or null
* @return The element matches what we're looking for
*/
protected boolean collectionMatch(final Element elem, final String name) {
boolean match;
synchronized (elem) {
// Begin with no matching. Depending on what we're looking for,
// attempt to match based on the element type. This is the quickest
// way to match involving only a cast. Do the expensive string
// comparison later on.
match = false;
switch (lookingFor_) {
case ANCHOR:
// Anchor is an element with a 'name' attribute. Otherwise, it's
// just a link.
match = (elem instanceof HTMLAnchorElement) && elem.getAttribute("name").length() > 0;
break;
case FORM:
// Any