org.htmlparser.nodes.AbstractNode Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of htmllexer Show documentation
HTML Lexer is the low level lexical analyzer.
The newest version!
// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Somik Raha
//
// Revision Control Information
//
// $URL: https://htmlparser.svn.sourceforge.net/svnroot/htmlparser/tags/HTMLParserProject-2.1/lexer/src/main/java/org/htmlparser/nodes/AbstractNode.java $
// $Author: niveshkalra $
// $Date: 2009-12-20 00:17:20 +0100 (Sun, 20 Dec 2009) $
// $Revision: 55 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
//   http://opensource.org/licenses/cpl1.0.php

package org.htmlparser.nodes;

import java.io.Serializable;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.NodeVisitor;

/**
 * The abstract base class for all types of nodes (tags, text, remarks).
 * This class provides basic functionality to hold the {@link Page}, the
 * starting and ending position in the page, the parent and the list of
 * {@link NodeList children}.
 */
public abstract class AbstractNode implements Node, Serializable
{
    /**
     * The page this node came from.
     */
    protected Page mPage;

    /**
     * The starting offset of this node within the page.
     */
    protected int nodeBegin;

    /**
     * The ending offset of this node within the page.
     */
    protected int nodeEnd;

    /**
     * The parent of this node, or <code>null</code> if none has been set.
     */
    protected Node parent;

    /**
     * The children of this node, or <code>null</code> if none have been set.
     */
    protected NodeList children;

    /**
     * Create an abstract node with the page positions given.
     * Remember the page and start &amp; end cursor positions.
     * The node starts out with no parent and no children.
     * @param page The page this tag was read from.
     * @param start The starting offset of this node within the page.
     * @param end The ending offset of this node within the page.
     */
    public AbstractNode (Page page, int start, int end)
    {
        mPage = page;
        nodeBegin = start;
        nodeEnd = end;
        parent = null;
        children = null;
    }

    /**
     * Clone this object.
     * Exposes java.lang.Object clone as a public method. The copy is the
     * shallow copy produced by <code>Object.clone()</code>, so the page,
     * parent and children references are shared with the original.
     * @return A clone of this object.
     * @exception CloneNotSupportedException This shouldn't be thrown since
     * the {@link Node} interface extends Cloneable.
     */
    public Object clone () throws CloneNotSupportedException
    {
        return (super.clone ());
    }

    /**
     * Returns a string representation of the node.
     * It allows a simple string transformation
     * of a web page, regardless of node type.
     * <p>
     * Typical application code (for extracting only the text from a web page)
     * would then be simplified to:
     * <pre>
     * Node node;
     * for (Enumeration e = parser.elements (); e.hasMoreElements (); )
     * {
     *     node = (Node)e.nextElement();
     *     System.out.println (node.toPlainTextString ());
     *     // or do whatever processing you wish with the plain text string
     * }
     * </pre>
     * @return The 'browser' content of this node.
     */
    public abstract String toPlainTextString ();

    /**
     * Return the HTML for this node.
     * This should be the sequence of characters that were encountered by
     * the parser that caused this node to be created. Where this breaks down is
     * where broken nodes (tags and remarks) have been encountered and fixed.
     * Applications reproducing html can use this method on nodes which are to
     * be used or transferred as they were received or created.
     * Equivalent to calling {@link #toHtml(boolean) toHtml (false)}.
     * @return The sequence of characters that would cause this node
     * to be returned by the parser or lexer.
     */
    public String toHtml ()
    {
        return (toHtml (false));
    }

    /**
     * Return the HTML for this node.
     * This should be the exact sequence of characters that were encountered by
     * the parser that caused this node to be created. Where this breaks down is
     * where broken nodes (tags and remarks) have been encountered and fixed.
     * Applications reproducing html can use this method on nodes which are to
     * be used or transferred as they were received or created.
     * @param verbatim If true return as close to the original
     * page text as possible.
     * @return The (exact) sequence of characters that would cause this node
     * to be returned by the parser or lexer.
     */
    public abstract String toHtml (boolean verbatim);

    /**
     * Return a string representation of the node.
     * Subclasses must define this method, and this is typically to be used
     * in the manner
     * <pre>
     * System.out.println(node)
     * </pre>
     * @return A textual representation of the node suitable for debugging
     */
    public abstract String toString ();

    /**
     * Collect this node and its child nodes (if-applicable) into the
     * collectionList parameter, provided the node satisfies the filtering
     * criteria.
     * <p>
     * This mechanism allows powerful filtering code to be written very easily,
     * without bothering about collection of embedded tags separately.
     * e.g. when we try to get all the links on a page, it is not possible to
     * get it at the top-level, as many tags (like form tags), can contain
     * links embedded in them. We could get the links out by checking if the
     * current node is a {@link org.htmlparser.tags.CompositeTag}, and going
     * through its children. So this method provides a convenient way to do
     * this.
     * <p>
     * Using collectInto(), programs get a lot shorter. Now, the code to
     * extract all links from a page would look like:
     * <pre>
     * NodeList collectionList = new NodeList();
     * NodeFilter filter = new TagNameFilter ("A");
     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
     *      e.nextNode().collectInto(collectionList, filter);
     * </pre>
     * Thus, collectionList will hold all the link nodes, irrespective of how
     * deep the links are embedded.
     * <p>
     * Another way to accomplish the same objective is:
     * <pre>
     * NodeList collectionList = new NodeList();
     * NodeFilter filter = new TagClassFilter (LinkTag.class);
     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
     *      e.nextNode().collectInto(collectionList, filter);
     * </pre>
     * This is slightly less specific because the LinkTag class may be
     * registered for more than one node name, e.g. &lt;LINK&gt; tags too.
     * <p>
     * This base implementation only considers this node itself; it does not
     * descend into {@link #getChildren children}.
     * @param list The node list to collect acceptable nodes into.
     * @param filter The filter to determine which nodes are retained.
     */
    public void collectInto (NodeList list, NodeFilter filter)
    {
        if (filter.accept (this))
            list.add (this);
    }

    /**
     * Get the page this node came from.
     * @return The page that supplied this node.
     */
    public Page getPage ()
    {
        return (mPage);
    }

    /**
     * Set the page this node came from.
     * @param page The page that supplied this node.
     */
    public void setPage (Page page)
    {
        mPage = page;
    }

    /**
     * Gets the starting position of the node.
     * @return The start position.
     */
    public int getStartPosition ()
    {
        return (nodeBegin);
    }

    /**
     * Sets the starting position of the node.
     * @param position The new start position.
     */
    public void setStartPosition (int position)
    {
        nodeBegin = position;
    }

    /**
     * Gets the ending position of the node.
     * @return The end position.
     */
    public int getEndPosition ()
    {
        return (nodeEnd);
    }

    /**
     * Sets the ending position of the node.
     * @param position The new end position.
     */
    public void setEndPosition (int position)
    {
        nodeEnd = position;
    }

    /**
     * Visit this node.
     * @param visitor The visitor that is visiting this node.
     */
    public abstract void accept (NodeVisitor visitor);

    /**
     * Get the parent of this node.
     * This will always return null when parsing without scanners,
     * i.e. if semantic parsing was not performed.
     * The object returned from this method can be safely cast to a CompositeTag.
     * @return The parent of this node, if it's been set, null otherwise.
     */
    public Node getParent ()
    {
        return (parent);
    }

    /**
     * Sets the parent of this node.
     * A request to make this node its own parent is silently ignored and
     * leaves the current parent unchanged.
     * @param node The node that contains this node. Must be a CompositeTag.
     */
    public void setParent (Node node)
    {
        // See bug:
        // https://sourceforge.net/tracker/?func=detail&aid=1755537&group_id=24399&atid=381399
        // A node must never become its own parent, so such a request is dropped.
        if (this != node)
            parent = node;
    }

    /**
     * Get the children of this node.
     * @return The list of children contained by this node, if it's been set, null otherwise.
     */
    public NodeList getChildren ()
    {
        return (children);
    }

    /**
     * Set the children of this node.
     * The nodes are copied into a fresh list owned by this node; any entry
     * that is this node itself is skipped. Passing <code>null</code> clears
     * the children, so that {@link #getChildren} returns <code>null</code>
     * again.
     * @param children The new list of children this node contains,
     * or <code>null</code> to remove all children.
     */
    public void setChildren (NodeList children)
    {
        if (children == null)
        {
            // Clear the list instead of failing on the iteration below.
            this.children = null;
            return;
        }

        // See bug:
        // https://sourceforge.net/tracker/?func=detail&aid=1755537&group_id=24399&atid=381399
        // A node must never be its own child, so it is filtered out of the copy.
        NodeList accepted = new NodeList ();
        for (SimpleNodeIterator it = children.elements (); it.hasMoreNodes (); )
        {
            Node candidate = it.nextNode ();
            if (this != candidate)
                accepted.add (candidate);
        }
        this.children = accepted;
    }

    /**
     * Get the first child of this node.
     * @return The first child in the list of children contained by this node,
     * null otherwise.
     */
    public Node getFirstChild ()
    {
        if (children == null || children.size () == 0)
            return null;
        return children.elementAt (0);
    }

    /**
     * Get the last child of this node.
     * @return The last child in the list of children contained by this node,
     * null otherwise.
     */
    public Node getLastChild ()
    {
        if (children == null)
            return null;
        int numChildren = children.size ();
        if (numChildren == 0)
            return null;
        return children.elementAt (numChildren - 1);
    }

    /**
     * Get the previous sibling to this node.
     * @return The previous sibling to this node if one exists,
     * null otherwise.
     */
    public Node getPreviousSibling ()
    {
        NodeList siblings = siblingsOrNull ();
        if (siblings == null)
            return null;
        int position = positionIn (siblings);
        if (position < 1) // not found, or already the first child
            return null;
        return siblings.elementAt (position - 1);
    }

    /**
     * Get the next sibling to this node.
     * @return The next sibling to this node if one exists,
     * null otherwise.
     */
    public Node getNextSibling ()
    {
        NodeList siblings = siblingsOrNull ();
        if (siblings == null)
            return null;
        int position = positionIn (siblings);
        if (position == -1) // this should actually be an error
            return null;
        if (position == siblings.size () - 1) // already the last child
            return null;
        return siblings.elementAt (position + 1);
    }

    /**
     * Fetch the child list of this node's parent.
     * @return The parent's children, or null if this node has no parent
     * or the parent reports no children.
     */
    private NodeList siblingsOrNull ()
    {
        Node parentNode = this.getParent ();
        if (parentNode == null) // root node
            return null;
        return parentNode.getChildren ();
    }

    /**
     * Find this node in the given list, comparing by identity.
     * @param siblings The list to search; must not be null.
     * @return The index of the first entry that is this very object,
     * or -1 if the list does not contain it.
     */
    private int positionIn (NodeList siblings)
    {
        int count = siblings.size ();
        for (int i = 0; i < count; i++)
            if (siblings.elementAt (i) == this)
                return (i);
        return (-1);
    }

    /**
     * Returns the text of the node.
     * @return The text of this node. The default is null.
     */
    public String getText ()
    {
        return null;
    }

    /**
     * Sets the string contents of the node.
     * The default implementation ignores the text.
     * @param text The new text for the node.
     */
    public void setText (String text)
    {
    }

    /**
     * Perform the meaning of this tag.
     * The default action is to do nothing.
     * @exception ParserException Not used. Provides for subclasses
     * that may want to indicate an exceptional condition.
     */
    public void doSemanticAction ()
        throws
            ParserException
    {
    }
}