org.htmlunit.cyberneko.html.dom.HTMLCollectionImpl Maven / Gradle / Ivy

Go to download
/*
 * Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
 * Copyright (c) 2017-2024 Ronald Brill
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.htmlunit.cyberneko.html.dom;

import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLAnchorElement;
import org.w3c.dom.html.HTMLAppletElement;
import org.w3c.dom.html.HTMLAreaElement;
import org.w3c.dom.html.HTMLCollection;
import org.w3c.dom.html.HTMLElement;
import org.w3c.dom.html.HTMLFormElement;
import org.w3c.dom.html.HTMLImageElement;
import org.w3c.dom.html.HTMLObjectElement;
import org.w3c.dom.html.HTMLOptionElement;
import org.w3c.dom.html.HTMLTableCellElement;
import org.w3c.dom.html.HTMLTableRowElement;
import org.w3c.dom.html.HTMLTableSectionElement;

/**
 * Implements {@link org.w3c.dom.html.HTMLCollection} to traverse any named
 * elements on a {@link org.w3c.dom.html.HTMLDocument}. The element type to
 * look for is identified in the constructor by code. This collection is not
 * optimized for traversing large trees.
 * <p>
 * The collection has to meet two requirements: it has to be live, and it has
 * to traverse depth first and always return results in that order. As such,
 * using an object container (such as {@link java.util.Vector}) is expensive on
 * insert/remove operations. Instead, the collection has been implemented using
 * three traversing functions. As a result, operations on large documents will
 * result in traversal of the entire document tree and consume a considerable
 * amount of time.
 * <p>
 * Note that synchronization on the traversed document cannot be achieved.
 * The document itself cannot be locked, and locking each traversed node is
 * likely to lead to a deadlock condition. Therefore, there is a chance of the
 * document being changed as results are fetched; in all likelihood, the results
 * might be outdated, but not erroneous.
 *
 * @author Assaf Arkin
 * @see org.w3c.dom.html.HTMLCollection
 */
class HTMLCollectionImpl implements HTMLCollection {

    /**
     * Request collection of all anchors in document: {@code <A>} elements that
     * have a name attribute.
     */
    static final short        ANCHOR = 1;

    /**
     * Request collection of all forms in document: {@code <FORM>} elements.
     */
    static final short        FORM = 2;

    /**
     * Request collection of all images in document: {@code <IMG>} elements.
     */
    static final short        IMAGE = 3;

    /**
     * Request collection of all applets in document: {@code <APPLET>} and
     * {@code <OBJECT>} elements ({@code <OBJECT>} must contain an applet).
     */
    static final short        APPLET = 4;

    /**
     * Request collection of all links in document: {@code <A>} and
     * {@code <AREA>} elements (must have a href attribute).
     */
    static final short        LINK = 5;

    /**
     * Request collection of all options in selection: {@code <OPTION>}
     * elements in {@code <SELECT>} or {@code <OPTGROUP>}.
     */
    static final short        OPTION = 6;

    /**
     * Request collection of all rows in table: {@code <TR>} elements in table
     * or table section.
     */
    static final short        ROW = 7;

    /**
     * Request collection of all form elements: {@code <INPUT>},
     * {@code <BUTTON>}, {@code <SELECT>}, and {@code <TEXTAREA>} elements
     * inside form {@code <FORM>}.
     */
    static final short        ELEMENT = 8;

    /**
     * Request collection of all areas in map: {@code <AREA>} element in
     * {@code <MAP>} (non recursive). The code is negative, so
     * {@link #recurse()} returns false for it.
     */
    static final short        AREA = -1;

    /**
     * Request collection of all table bodies in table: {@code <TBODY>} element
     * in table {@code <TABLE>} (non recursive). The code is negative, so
     * {@link #recurse()} returns false for it.
     */
    static final short        TBODY = -2;

    /**
     * Request collection of all cells in row: {@code <TD>} and {@code <TH>}
     * elements in {@code <TR>} (non recursive). The code is negative, so
     * {@link #recurse()} returns false for it.
     */
    static final short        CELL = -3;

    /**
     * Indicates what this collection is looking for. Holds one of the codes
     * declared above and is used by {@link #collectionMatch}. Set by the
     * constructor; it determines the collection's use for its lifetime.
     */
    private final short lookingFor_;

    /**
     * The top level element underneath which the collection exists. All
     * traversals start at this element's first child.
     */
    private final Element topLevel_;

    /**
     * Creates a collection that retrieves elements of the requested kind
     * ({@code lookingFor}) from the given portion of the document
     * ({@code topLevel}).
     *
     * @param topLevel the element underneath which the collection exists;
     *        must not be null
     * @param lookingFor code indicating what elements to look for
     * @throws NullPointerException if {@code topLevel} is null
     */
    HTMLCollectionImpl(final HTMLElement topLevel, final short lookingFor) {
        // Reject a missing root up front; every traversal starts from it.
        if (null == topLevel) {
            throw new NullPointerException("HTM011 Argument 'topLevel' is null.");
        }
        lookingFor_ = lookingFor;
        topLevel_ = topLevel;
    }

    /**
     * Returns the number of elements in the collection. The count is computed
     * on every call by scanning from the top level element, so this method
     * might traverse the entire document tree.
     *
     * @return the length of the collection
     */
    @Override
    public final int getLength() {
        // Delegate to the recursive counter, starting at the collection root.
        final int total = getLength(topLevel_);
        return total;
    }

    /**
     * Returns the node at the given position in the collection. Nodes are
     * numbered in tree order (depth-first traversal order). This method might
     * traverse the entire document tree.
     *
     * @param index the zero-based index of the node to return
     * @return the node at that position, or null if no such node was found
     * @throws IllegalArgumentException if {@code index} is negative
     */
    @Override
    public final Node item(final int index) {
        if (index >= 0) {
            // Wrap the index in a mutable counter so the recursive search can
            // count it down across nested calls.
            return item(topLevel_, new CollectionIndex(index));
        }
        throw new IllegalArgumentException("HTM012 Argument 'index' is negative.");
    }

    /**
     * Returns the first node in the collection that carries the given name.
     * The name is matched case sensitive against the id attribute of each
     * element in the collection. The tree is searched in depth-first order,
     * so this method might traverse the entire document tree.
     *
     * @param name the name of the node to return; must not be null
     * @return the matching node, or null if no such node was found
     * @throws NullPointerException if {@code name} is null
     */
    @Override
    public final Node namedItem(final String name) {
        if (name != null) {
            // Start the recursive search at the collection root.
            return namedItem(topLevel_, name);
        }
        throw new NullPointerException("HTM013 Argument 'name' is null.");
    }

    /**
     * Counts the elements below {@code topLevel} that belong to this
     * collection. The element is locked (via {@code synchronized}) while its
     * children are scanned. Calls itself for child elements that do not match,
     * provided {@link #recurse()} allows it.
     *
     * @param topLevel the element whose children are scanned
     * @return the number of matching elements found below {@code topLevel}
     */
    private int getLength(final Element topLevel) {
        synchronized (topLevel) {
            int count = 0;
            // Walk the direct children in document order.
            for (Node child = topLevel.getFirstChild(); child != null; child = child.getNextSibling()) {
                // Only elements (HTML or XML) can be part of the collection.
                if (!(child instanceof Element)) {
                    continue;
                }
                final Element elem = (Element) child;
                if (collectionMatch(elem, null)) {
                    // A match is counted once; its subtree is not descended into.
                    count++;
                }
                else if (recurse()) {
                    // Not a match: look for matches further down this branch.
                    count += getLength(elem);
                }
            }
            return count;
        }
    }

    /**
     * Finds the element at a given position among the matching elements
     * below {@code topLevel}. The element is locked (via {@code synchronized})
     * while its children are scanned.
     * <p>
     * The remaining position is carried in a {@link CollectionIndex} rather
     * than an {@code int}: nested calls have to report back both the element
     * (if one was found) and how many matches they skipped, and integers are
     * only passed by value. Every matching element that is passed over
     * decrements the shared index; the match seen when the index is zero is
     * the result.
     *
     * @param topLevel the element whose children are scanned
     * @param index the number of matches still to skip; updated in place
     * @return the matching element at the requested position, or null if
     *         this subtree does not contain it
     * @see CollectionIndex
     */
    private Node item(final Element topLevel, final CollectionIndex index) {
        synchronized (topLevel) {
            // Walk the direct children in document order.
            for (Node child = topLevel.getFirstChild(); child != null; child = child.getNextSibling()) {
                // Only elements (HTML or XML) can be part of the collection.
                if (!(child instanceof Element)) {
                    continue;
                }
                final Element elem = (Element) child;
                if (collectionMatch(elem, null)) {
                    if (index.isZero()) {
                        return child;
                    }
                    // One more match skipped; its subtree is not descended into.
                    index.decrement();
                    continue;
                }
                if (recurse()) {
                    // Not a match: keep counting down inside this branch.
                    final Node found = item(elem, index);
                    if (found != null) {
                        return found;
                    }
                }
            }
        }
        return null;
    }

    /**
     * Searches below {@code topLevel} for the first element that belongs to
     * this collection and carries the given name (id attribute). The element
     * is locked (via {@code synchronized}) while its children are scanned.
     *
     * @param topLevel the element whose children are scanned
     * @param name the name to look for
     * @return the first matching element, or null if this subtree has none
     */
    private  Node namedItem(final Element topLevel, final String name) {
        synchronized (topLevel) {
            // Walk the direct children in document order.
            for (Node child = topLevel.getFirstChild(); child != null; child = child.getNextSibling()) {
                // Only elements (HTML or XML) can be part of the collection.
                if (!(child instanceof Element)) {
                    continue;
                }
                final Element elem = (Element) child;
                if (collectionMatch(elem, name)) {
                    return child;
                }
                if (recurse()) {
                    // Not a match: search further down this branch.
                    final Node found = namedItem(elem, name);
                    if (found != null) {
                        return found;
                    }
                }
            }
        }
        // Every child was examined without finding a match.
        return null;
    }

    /**
     * Tells the scanning methods whether they should descend into child
     * elements. When looking for elements in the document, recursing is needed
     * to traverse the full document tree. When looking inside a specific
     * element (e.g. for a cell inside a row), recursing can lead to erroneous
     * results.
     *
     * @return true if the collection code is positive, meaning the scanning
     *         methods should recurse to traverse the entire tree
     */
    protected boolean recurse() {
        // Positive codes recurse; negative codes stay on the direct children.
        final boolean wholeTree = lookingFor_ > 0;
        return wholeTree;
    }

    /**
     * Determines if current element matches based on what we're looking for.
     * The element is passed along with an optional identifier name. If the
     * element is the one we're looking for, return true. If the name is also
     * specified, the name must match the id attribute
     * (match name first for anchors).
     *
     * @param elem The current element
     * @param name The identifier name or null
     * @return The element matches what we're looking for
     */
    protected boolean collectionMatch(final Element elem, final String name) {
        boolean    match;

        synchronized (elem) {
            // Begin with no matching. Depending on what we're looking for,
            // attempt to match based on the element type. This is the quickest
            // way to match involving only a cast. Do the expensive string
            // comparison later on.
            match = false;
            switch (lookingFor_) {
                case ANCHOR:
                // Anchor is an <A> element with a 'name' attribute. Otherwise, it's
                // just a link.
                    match = (elem instanceof HTMLAnchorElement) && elem.getAttribute("name").length() > 0;
                    break;
                case FORM:
                    // Any <FORM> element.
                    match = elem instanceof HTMLFormElement;
                    break;
                case IMAGE:
                    // Any <IMG> element.