// org.htmlparser.nodes.AbstractNode -- Maven / Gradle / Ivy
// Artifact: bboss-htmlparser (all versions and documentation are listed on the artifact page)
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/AbstractNode.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/10 23:20:44 $
// $Revision: 1.4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.nodes;
import java.io.Serializable;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
/**
 * Abstract base class shared by every kind of node (tags, text, remarks).
 * <p>
 * It keeps track of the {@link Page} a node was read from, the start and end
 * positions of the node within that page, the parent node and the
 * {@link NodeList list of children}. Rendering ({@link #toHtml()},
 * {@link #toPlainTextString()}, {@link #toString()}) and visiting
 * ({@link #accept(NodeVisitor)}) are left to the concrete subclasses.
 */
public abstract class AbstractNode implements Node, Serializable {

    /**
     * Flag read by {@link #isResource()} and written by
     * {@link #setResource(boolean)}; starts out {@code false}.
     * NOTE(review): the exact meaning of "resource" is not documented in
     * this class -- confirm against the callers that set it.
     */
    protected boolean isresource = false;

    /** The page this node was read from. */
    protected Page mPage;

    /** Position at which this node begins. */
    protected int nodeBegin;

    /** Position at which this node ends. */
    protected int nodeEnd;

    /** The node containing this node, or {@code null} if none has been set. */
    protected Node parent;

    /** The nodes contained by this node, or {@code null} if none have been set. */
    protected NodeList children;

    /**
     * Builds a node that remembers its page and its start and end cursor
     * positions. The parent and the child list both start out as
     * {@code null}.
     *
     * @param page  The page this node was read from.
     * @param start The starting offset of this node within the page.
     * @param end   The ending offset of this node within the page.
     */
    public AbstractNode(Page page, int start, int end) {
        this.mPage = page;
        this.nodeBegin = start;
        this.nodeEnd = end;
        this.parent = null;
        this.children = null;
    }

    /**
     * Makes {@link Object#clone()} publicly available for nodes.
     * This implementation only delegates to {@code Object.clone()}.
     *
     * @return A copy of this node as produced by {@code Object.clone()}.
     * @exception CloneNotSupportedException Declared for subclasses; not
     * expected here since, according to the original documentation, the
     * {@link Node} interface extends {@code Cloneable}.
     */
    public Object clone() throws CloneNotSupportedException {
        return super.clone();
    }

    /**
     * Produces the plain text ("browser" content) of this node.
     * This permits a simple text extraction of a web page regardless of the
     * node types involved, for example:
     * <pre>
     * Node node;
     * for (Enumeration e = parser.elements (); e.hasMoreElements (); )
     * {
     *     node = (Node)e.nextElement();
     *     System.out.println (node.toPlainTextString ());
     *     // or do whatever processing you wish with the plain text string
     * }
     * </pre>
     *
     * @return The 'browser' content of this node.
     */
    public abstract String toPlainTextString();

    /**
     * Produces the HTML that generated this node.
     * Applications that reproduce pages (with or without modifications) can
     * call this on nodes that are to be passed on as they were received.
     *
     * @return The HTML code for this node.
     */
    public abstract String toHtml();

    /**
     * Produces a debugging representation of this node, typically used as
     * {@code System.out.println(node)}. Subclasses must supply it.
     *
     * @return A textual representation of the node suitable for debugging.
     */
    public abstract String toString();

    /**
     * Adds this node to {@code list} when {@code filter} accepts it.
     * <p>
     * This base implementation tests only this node itself; it does not look
     * at {@link #getChildren() children}. Per the original documentation,
     * the overall mechanism is meant to gather matching nodes however deeply
     * they are embedded (e.g. links inside form tags), which would otherwise
     * require checking for a {@link org.htmlparser.tags.CompositeTag} and
     * walking its children by hand.
     * <p>
     * Extracting all links from a page then becomes:
     * <pre>
     * NodeList collectionList = new NodeList();
     * NodeFilter filter = new TagNameFilter ("A");
     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
     *     e.nextNode().collectInto(collectionList, filter);
     * </pre>
     * or, slightly less specifically (the LinkTag class may be registered
     * for more than one node name, e.g. &lt;LINK&gt; tags too):
     * <pre>
     * NodeList collectionList = new NodeList();
     * NodeFilter filter = new TagClassFilter (LinkTag.class);
     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
     *     e.nextNode().collectInto(collectionList, filter);
     * </pre>
     *
     * @param list   The node list to collect acceptable nodes into.
     * @param filter The filter to determine which nodes are retained.
     */
    public void collectInto(NodeList list, NodeFilter filter) {
        if (!filter.accept(this)) {
            return;
        }
        list.add(this);
    }

    /**
     * Returns the page this node came from.
     *
     * @return The page that supplied this node.
     */
    public Page getPage() {
        return this.mPage;
    }

    /**
     * Reports the current value of the resource flag.
     *
     * @return The value last given to {@link #setResource(boolean)}, or
     * {@code false} if it was never called.
     */
    public boolean isResource() {
        return this.isresource;
    }

    /**
     * Stores a new value for the resource flag.
     *
     * @param isresource The new flag value.
     */
    public void setResource(boolean isresource) {
        this.isresource = isresource;
    }

    /**
     * Replaces the page this node came from.
     *
     * @param page The page that supplied this node.
     */
    public void setPage(Page page) {
        this.mPage = page;
    }

    /**
     * Returns the starting position of the node.
     *
     * @return The start position.
     */
    public int getStartPosition() {
        return this.nodeBegin;
    }

    /**
     * Replaces the starting position of the node.
     *
     * @param position The new start position.
     */
    public void setStartPosition(int position) {
        this.nodeBegin = position;
    }

    /**
     * Returns the ending position of the node.
     *
     * @return The end position.
     */
    public int getEndPosition() {
        return this.nodeEnd;
    }

    /**
     * Replaces the ending position of the node.
     *
     * @param position The new end position.
     */
    public void setEndPosition(int position) {
        this.nodeEnd = position;
    }

    /**
     * Lets the given visitor visit this node.
     *
     * @param visitor The visitor that is visiting this node.
     */
    public abstract void accept(NodeVisitor visitor);

    /**
     * Returns the parent of this node.
     * According to the original documentation this is always {@code null}
     * when parsing without scanners (i.e. without semantic parsing), and a
     * non-null result can be cast to a CompositeTag.
     *
     * @return The parent of this node if it has been set, {@code null}
     * otherwise.
     */
    public Node getParent() {
        return this.parent;
    }

    /**
     * Replaces the parent of this node.
     *
     * @param node The node that contains this node. The original
     * documentation requires it to be a CompositeTag; this method does not
     * check that.
     */
    public void setParent(Node node) {
        this.parent = node;
    }

    /**
     * Returns the children of this node.
     *
     * @return The list of children contained by this node if it has been
     * set, {@code null} otherwise.
     */
    public NodeList getChildren() {
        return this.children;
    }

    /**
     * Replaces the children of this node.
     *
     * @param children The new list of children this node contains.
     */
    public void setChildren(NodeList children) {
        this.children = children;
    }

    /**
     * Returns the text of the node.
     *
     * @return The text of this node. This default implementation always
     * returns {@code null}.
     */
    public String getText() {
        return null;
    }

    /**
     * Sets the string contents of the node.
     * This default implementation ignores the argument and does nothing.
     *
     * @param text The new text for the node.
     */
    public void setText(String text) {
        // Intentionally empty: the base class holds no text of its own.
    }

    /**
     * Performs the meaning of this tag.
     * The default action is to do nothing.
     *
     * @exception ParserException Never thrown here; declared so that
     * subclasses may signal an exceptional condition.
     */
    public void doSemanticAction() throws ParserException {
        // Intentionally empty: no semantic action by default.
    }
}