// All downloads are free. The search and download functionality uses the official Maven repository.

// org.htmlunit.cyberneko.html.dom.HTMLCollectionImpl (Maven / Gradle / Ivy)

/*
 * Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
 * Copyright (c) 2017-2024 Ronald Brill
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.htmlunit.cyberneko.html.dom;

import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLAnchorElement;
import org.w3c.dom.html.HTMLAppletElement;
import org.w3c.dom.html.HTMLAreaElement;
import org.w3c.dom.html.HTMLCollection;
import org.w3c.dom.html.HTMLElement;
import org.w3c.dom.html.HTMLFormElement;
import org.w3c.dom.html.HTMLImageElement;
import org.w3c.dom.html.HTMLObjectElement;
import org.w3c.dom.html.HTMLOptionElement;
import org.w3c.dom.html.HTMLTableCellElement;
import org.w3c.dom.html.HTMLTableRowElement;
import org.w3c.dom.html.HTMLTableSectionElement;

/**
 * Implements {@link org.w3c.dom.html.HTMLCollection} to traverse any named
 * elements on a {@link org.w3c.dom.html.HTMLDocument}. The elements type to
 * look for is identified in the constructor by code. This collection is not
 * optimized for traversing large trees.
 * 

* The collection has to meet two requirements: it has to be live, and it has * to traverse depth first and always return results in that order. As such, * using an object container (such as {@link java.util.Vector}) is expensive on * insert/remove operations. Instead, the collection has been implemented using * three traversing functions. As a result, operations on large documents will * result in traversal of the entire document tree and consume a considerable * amount of time. *

* Note that synchronization on the traversed document cannot be achieved. * The document itself cannot be locked, and locking each traversed node is * likely to lead to a dead lock condition. Therefore, there is a chance of the * document being changed as results are fetched; in all likelihood, the results * might be out dated, but not erroneous. *

* * @author Assaf Arkin * @see org.w3c.dom.html.HTMLCollection */ class HTMLCollectionImpl implements HTMLCollection { /** * Request collection of all anchors in document: <A> elements that * have a name attribute. */ static final short ANCHOR = 1; /** * Request collection of all forms in document: <FORM> elements. */ static final short FORM = 2; /** * Request collection of all images in document: <IMG> elements. */ static final short IMAGE = 3; /** * Request collection of all Applets in document: <APPLET> and * <OBJECT> elements (<OBJECT> must contain an Applet). */ static final short APPLET = 4; /** * Request collection of all links in document: <A> and <AREA> * elements (must have a href attribute). */ static final short LINK = 5; /** * Request collection of all options in selection: <OPTION> elements in * <SELECT> or <OPTGROUP>. */ static final short OPTION = 6; /** * Request collection of all rows in table: <TR> elements in table or * table section. */ static final short ROW = 7; /** * Request collection of all form elements: <INPUT>, <BUTTON>, * <SELECT>, and <TEXTAREA> elements inside form <FORM>. */ static final short ELEMENT = 8; /** * Request collection of all areas in map: <AREA> element in <MAP> * (non recursive). */ static final short AREA = -1; /** * Request collection of all table bodies in table: <TBODY> element in * table <TABLE> (non recursive). */ static final short TBODY = -2; /** * Request collection of all cells in row: <TD> and <TH> * elements in <TR> (non recursive). */ static final short CELL = -3; /** * Indicates what this collection is looking for. Holds one of the enumerated * values and used by {@link #collectionMatch}. Set by the constructor and * determine the collection's use for its life time. */ private final short lookingFor_; /** * This is the top level element underneath which the collection exists. 
*/ private final Element topLevel_; /** * Construct a new collection that retrieves element of the specific type * (lookingFor) from the specific document portion * (topLevel). * * @param topLevel The element underneath which the collection exists * @param lookingFor Code indicating what elements to look for */ HTMLCollectionImpl(final HTMLElement topLevel, final short lookingFor) { if (topLevel == null) { throw new NullPointerException("HTM011 Argument 'topLevel' is null."); } topLevel_ = topLevel; lookingFor_ = lookingFor; } /** * Returns the length of the collection. This method might traverse the * entire document tree. * * @return Length of the collection */ @Override public final int getLength() { // Call recursive function on top-level element. return getLength(topLevel_); } /** * Retrieves the indexed node from the collection. Nodes are numbered in * tree order - depth-first traversal order. This method might traverse * the entire document tree. * * @param index The index of the node to return * @return The specified node or null if no such node found */ @Override public final Node item(final int index) { if (index < 0) { throw new IllegalArgumentException("HTM012 Argument 'index' is negative."); } // Call recursive function on top-level element. return item(topLevel_, new CollectionIndex(index)); } /** * Retrieves the named node from the collection. The name is matched case * sensitive against the id attribute of each element in the * collection, returning the first match. The tree is traversed in * depth-first order. This method might traverse the entire document tree. * * @param name The name of the node to return * @return The specified node or null if no such node found */ @Override public final Node namedItem(final String name) { if (name == null) { throw new NullPointerException("HTM013 Argument 'name' is null."); } // Call recursive function on top-level element. 
return namedItem(topLevel_, name); } /** * Recursive function returns the number of elements of a particular type * that exist under the top level element. This is a recursive function * and the top level element is passed along. * * @param topLevel Top level element from which to scan * @return Number of elements */ private int getLength(final Element topLevel) { int length; Node node; synchronized (topLevel) { // Always count from zero and traverse all the childs of the // current element in the order they appear. length = 0; node = topLevel.getFirstChild(); while (node != null) { // If a particular node is an element (could be HTML or XML), // do two things: if it's the one we're looking for, count // another matched element; at any rate, traverse it's // children as well. if (node instanceof Element) { if (collectionMatch((Element) node, null)) { ++length; } else if (recurse()) { length += getLength((Element) node); } } node = node.getNextSibling(); } } return length; } /** * Recursive function returns the numbered element of a particular type * that exist under the top level element. This is a recursive function * and the top level element is passed along. *

* Note that this function must call itself with an index and get back both * the element (if one was found) and the new index which is decremeneted * for any like element found. Since integers are only passed by value, * this function makes use of a separate class ({@link CollectionIndex}) * to hold that index. * * @param topLevel Top level element from which to scan * @param index The index of the item to retreive * @return Number of elements * @see CollectionIndex */ private Node item(final Element topLevel, final CollectionIndex index) { Node node; Node result; synchronized (topLevel) { // Traverse all the childs of the current element in the order // they appear. Count from the index backwards until you reach // matching element with an index of zero. Return that element. node = topLevel.getFirstChild(); while (node != null) { // If a particular node is an element (could be HTML or XML), // do two things: if it's the one we're looking for, decrease // the index and if zero, return this node; at any rate, // traverse it's children as well. if (node instanceof Element) { if (collectionMatch((Element) node, null)) { if (index.isZero()) { return node; } index.decrement(); } else if (recurse()) { result = item((Element) node, index); if (result != null) { return result; } } } node = node.getNextSibling(); } } return null; } /** * Recursive function returns an element of a particular type with the * specified name (id attribute). * * @param topLevel Top level element from which to scan * @param name The named element to look for * @return The first named element found */ private Node namedItem(final Element topLevel, final String name) { Node node; Node result; synchronized (topLevel) { // Traverse all the childs of the current element in the order // they appear. 
node = topLevel.getFirstChild(); while (node != null) { // If a particular node is an element (could be HTML or XML), // do two things: if it's the one we're looking for, and the // name (id attribute) attribute is the one we're looking for, // return this element; otherwise, traverse it's children. if (node instanceof Element) { if (collectionMatch((Element) node, name)) { return node; } else if (recurse()) { result = namedItem((Element) node, name); if (result != null) { return result; } } } node = node.getNextSibling(); } return node; } } /** * Returns true if scanning methods should iterate through the collection. * When looking for elements in the document, recursing is needed to traverse * the full document tree. When looking inside a specific element (e.g. for a * cell inside a row), recursing can lead to erroneous results. * * @return True if methods should recurse to traverse entire tree */ protected boolean recurse() { return lookingFor_ > 0; } /** * Determines if current element matches based on what we're looking for. * The element is passed along with an optional identifier name. If the * element is the one we're looking for, return true. If the name is also * specified, the name must match the id attribute * (match name first for anchors). * * @param elem The current element * @param name The identifier name or null * @return The element matches what we're looking for */ protected boolean collectionMatch(final Element elem, final String name) { boolean match; synchronized (elem) { // Begin with no matching. Depending on what we're looking for, // attempt to match based on the element type. This is the quickest // way to match involving only a cast. Do the expensive string // comparison later on. match = false; switch (lookingFor_) { case ANCHOR: // Anchor is an element with a 'name' attribute. Otherwise, it's // just a link. match = (elem instanceof HTMLAnchorElement) && elem.getAttribute("name").length() > 0; break; case FORM: // Any

element. match = elem instanceof HTMLFormElement; break; case IMAGE: // Any element. elements with images are not returned. match = elem instanceof HTMLImageElement; break; case APPLET: // Any element, and any element which represents an // Applet. This is determined by 'codetype' attribute being // 'application/java' or 'classid' attribute starting with 'java:'. match = (elem instanceof HTMLAppletElement) || (elem instanceof HTMLObjectElement && ("application/java".equals(elem.getAttribute("codetype")) || elem.getAttribute("classid").startsWith("java:"))); break; case ELEMENT: // All form elements implement HTMLFormControl for easy identification. match = elem instanceof HTMLFormControl; break; case LINK: // Any element, and any elements with an 'href' attribute. match = (elem instanceof HTMLAnchorElement || elem instanceof HTMLAreaElement) && elem.getAttribute("href").length() > 0; break; case AREA: // Any element. match = elem instanceof HTMLAreaElement; break; case OPTION: // Any elements for backward compatibility. if (match && name != null) { // If an anchor and 'name' attribute matches, return true. Otherwise, // try 'id' attribute. if (elem instanceof HTMLAnchorElement && name.equals(elem.getAttribute("name"))) { return true; } match = name.equals(elem.getAttribute("id")); } } return match; } } /** * {@link HTMLCollectionImpl#item(int)} must traverse down the tree and decrement the * index until it matches an element who's index is zero. Since integers are * passed by value, this class servers to pass the index into each recursion * by reference. It encompasses all the operations that need be performed on * the index, although direct access is possible. *

* * @see HTMLCollectionImpl#item(int) */ class CollectionIndex { /** * Holds the actual value that is passed by reference using this class. */ private int index_; /** * Decrements the index by one. */ void decrement() { --index_; } /** * Returns true if index is zero (or negative). * * @return True if index is zero */ boolean isZero() { return index_ <= 0; } /** * Constructs a new index with the specified initial value. The index will * then be decremeneted until it reaches zero. * * @param index The initial value */ CollectionIndex(final int index) { index_ = index; } }