All downloads are free. The search and download functionality uses the official Maven repository.

org.htmlparser.PrototypicalNodeFactory Maven / Gradle / Ivy

// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/PrototypicalNodeFactory.java,v $
// $Author: derrickoswald $
// $Date: 2005/04/24 17:48:27 $
// $Revision: 1.15 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.List;

import org.htmlparser.lexer.Page;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.AppletTag;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.Bullet;
import org.htmlparser.tags.BulletList;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.DoctypeTag;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.FrameSetTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.Html;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.JspTag;
import org.htmlparser.tags.LabelTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.ObjectTag;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.ResourceTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TextareaTag;
import org.htmlparser.tags.TitleTag;

/**
 * A node factory based on the prototype pattern.
 * This factory uses the prototype pattern to generate new nodes.
 * These are cloned as needed to form new {@link Text}, {@link Remark} and
 * {@link Tag} nodes.

*

Text and remark nodes are generated from prototypes accessed * via the {@link #setTextPrototype(Text) textPrototype} and * {@link #setRemarkPrototype(Remark) remarkPrototype} properties respectively. * Tag nodes are generated as follows: *

Prototype tags, in the form of undifferentiated tags, are held in a hash * table. On a request for a tag, the attributes are examined for the name * of the tag to be created. If a prototype of that name has been registered * (exists in the hash table), it is cloned and the clone is given the * characteristics ({@link Attribute Attributes}, start and end position) * of the requested tag.

*

In the case that no tag has been registered under that name, * a generic tag is created from the prototype acessed via the * {@link #setTagPrototype(Tag) tagPrototype} property.

*

The hash table of registered tags can be automatically populated with * all the know tags from the {@link org.htmlparser.tags} package when * the factory is constructed, or it can start out empty and be populated * explicitly.

*

Here is an example of how to override all text issued from * {@link org.htmlparser.nodes.TextNode#toPlainTextString() * Text.toPlainTextString()}, * in this case decoding (converting character references), * which illustrates the use of setting the text prototype: *

 * PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
 * factory.setTextPrototype (
 *     // create a inner class that is a subclass of TextNode
 *     new TextNode () {
 *         public String toPlainTextString()
 *         {
 *             String original = super.toPlainTextString ();
 *             return (org.htmlparser.util.Translate.decode (original));
 *         }
 *     });
 * Parser parser = new Parser ();
 * parser.setNodeFactory (factory);
 * 

*

Here is an example of using a custom link tag, in this case just * printing the URL, which illustrates registering a tag: *

 *
 * class PrintingLinkTag extends LinkTag
 * {
 *     public void doSemanticAction ()
 *         throws
 *             ParserException
 *     {
 *         System.out.println (getLink ());
 *     }
 * }
 * PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
 * factory.registerTag (new PrintingLinkTag ());
 * Parser parser = new Parser ();
 * parser.setNodeFactory (factory);
 * 

*/ public class PrototypicalNodeFactory implements Serializable, NodeFactory { /** * The prototypical text node. */ protected Text mText; /** * The prototypical remark node. */ protected Remark mRemark; /** * The prototypical tag node. */ protected Tag mTag; /** * The list of tags to return. * The list is keyed by tag name. */ protected Map mBlastocyst; /** * Create a new factory with all tags registered. * Equivalent to * {@link #PrototypicalNodeFactory() PrototypicalNodeFactory(false)}. */ public PrototypicalNodeFactory () { this (false); } /** * Create a new factory. * @param empty If true, creates an empty factory, * otherwise create a new factory with all tags registered. */ public PrototypicalNodeFactory (boolean empty) { clear (); mText = new TextNode (null, 0, 0); mRemark = new RemarkNode (null, 0, 0); mTag = new TagNode (null, 0, 0, null); if (!empty) registerTags (); } /** * Create a new factory with the given tag as the only registered tag. * @param tag The single tag to register in the otherwise empty factory. */ public PrototypicalNodeFactory (Tag tag) { this (true); registerTag (tag); } /** * Create a new factory with the given tags registered. * @param tags The tags to register in the otherwise empty factory. */ public PrototypicalNodeFactory (Tag[] tags) { this (true); for (int i = 0; i < tags.length; i++) registerTag (tags[i]); } /** * Adds a tag to the registry. * @param id The name under which to register the tag. * For proper operation, the id should be uppercase so it * will be matched by a Map lookup. * @param tag The tag to be returned from a {@link #createTagNode} call. * @return The tag previously registered with that id if any, * or null if none. */ public Tag put (String id, Tag tag) { return ((Tag)mBlastocyst.put (id, tag)); } /** * Gets a tag from the registry. * @param id The name of the tag to return. * @return The tag registered under the id name, * or null if none. 
*/ public Tag get (String id) { return ((Tag)mBlastocyst.get (id)); } /** * Remove a tag from the registry. * @param id The name of the tag to remove. * @return The tag that was registered with that id, * or null if none. */ public Tag remove (String id) { return ((Tag)mBlastocyst.remove (id)); } /** * Clean out the registry. */ public void clear () { mBlastocyst = new HashMap (); } /** * Get the list of tag names. * @return The names of the tags currently registered. */ public Set getTagNames () { return (mBlastocyst.keySet ()); } /** * Register a tag. * Registers the given tag under every {@link Tag#getIds() id} that the * tag has (i.e. all names returned by {@link Tag#getIds() tag.getIds()}. *

For proper operation, the ids are converted to uppercase so * they will be matched by a Map lookup. * @param tag The tag to register. */ public void registerTag (Tag tag) { String[] ids; ids = tag.getIds (); for (int i = 0; i < ids.length; i++) put (ids[i].toUpperCase (Locale.ENGLISH), tag); } /** * Unregister a tag. * Unregisters the given tag from every {@link Tag#getIds() id} the tag has. *

The ids are converted to uppercase to undo the operation * of registerTag. * @param tag The tag to unregister. */ public void unregisterTag (Tag tag) { String[] ids; ids = tag.getIds (); for (int i = 0; i < ids.length; i++) remove (ids[i].toUpperCase (Locale.ENGLISH)); } /** * Register all known tags in the tag package. * Registers tags from the {@link org.htmlparser.tags tag package} by * calling {@link #registerTag(Tag) registerTag()}. * @return 'this' nodefactory as a convenience. */ public PrototypicalNodeFactory registerTags () { registerTag (new AppletTag ()); registerTag (new BaseHrefTag ()); registerTag (new Bullet ()); registerTag (new BulletList ()); registerTag (new DoctypeTag ()); registerTag (new FormTag ()); registerTag (new FrameSetTag ()); registerTag (new FrameTag ()); registerTag (new ImageTag ()); registerTag (new InputTag ()); registerTag (new JspTag ()); registerTag (new LabelTag ()); registerTag (new LinkTag ()); registerTag (new MetaTag ()); registerTag (new ObjectTag ()); registerTag (new OptionTag ()); registerTag (new ScriptTag ()); registerTag (new SelectTag ()); registerTag (new StyleTag ()); registerTag (new TableColumn ()); registerTag (new TableHeader ()); registerTag (new TableRow ()); registerTag (new TableTag ()); registerTag (new TextareaTag ()); registerTag (new TitleTag ()); registerTag (new Div ()); registerTag (new Span ()); registerTag (new BodyTag ()); registerTag (new HeadTag ()); registerTag (new Html ()); registerTag (new ResourceTag ()); return (this); } /** * Get the object that is cloned to generate text nodes. * @return The prototype for {@link Text} nodes. */ public Text getTextPrototype () { return (mText); } /** * Set the object to be used to generate text nodes. * @param text The prototype for {@link Text} nodes. * If null the prototype is set to the default * ({@link TextNode}). 
*/ public void setTextPrototype (Text text) { if (null == text) mText = new TextNode (null, 0, 0); else mText = text; } /** * Get the object that is cloned to generate remark nodes. * @return The prototype for {@link Remark} nodes. */ public Remark getRemarkPrototype () { return (mRemark); } /** * Set the object to be used to generate remark nodes. * @param remark The prototype for {@link Remark} nodes. * If null the prototype is set to the default * ({@link RemarkNode}). */ public void setRemarkPrototype (Remark remark) { if (null == remark) mRemark = new RemarkNode (null, 0, 0); else mRemark = remark; } /** * Get the object that is cloned to generate tag nodes. * Clones of this object are returned from {@link #createTagNode} when no * specific tag is found in the list of registered tags. * @return The prototype for {@link Tag} nodes. */ public Tag getTagPrototype () { return (mTag); } /** * Set the object to be used to generate tag nodes. * Clones of this object are returned from {@link #createTagNode} when no * specific tag is found in the list of registered tags. * @param tag The prototype for {@link Tag} nodes. * If null the prototype is set to the default * ({@link TagNode}). */ public void setTagPrototype (Tag tag) { if (null == tag) mTag = new TagNode (null, 0, 0, null); else mTag = tag; } // // NodeFactory interface // /** * Create a new string node. * @param page The page the node is on. * @param start The beginning position of the string. * @param end The ending position of the string. * @return A text node comprising the indicated characters from the page. */ public Text createStringNode (Page page, int start, int end) { Text ret; try { ret = (Text)(getTextPrototype ().clone ()); ret.setPage (page); ret.setStartPosition (start); ret.setEndPosition (end); } catch (CloneNotSupportedException cnse) { ret = new TextNode (page, start, end); } return (ret); } /** * Create a new remark node. * @param page The page the node is on. 
* @param start The beginning position of the remark. * @param end The ending positiong of the remark. * @return A remark node comprising the indicated characters from the page. */ public Remark createRemarkNode (Page page, int start, int end) { Remark ret; try { ret = (Remark)(getRemarkPrototype ().clone ()); ret.setPage (page); ret.setStartPosition (start); ret.setEndPosition (end); } catch (CloneNotSupportedException cnse) { ret = new RemarkNode (page, start, end); } return (ret); } /** * Create a new tag node. * Note that the attributes List contains at least one element, * which is the tag name (standalone attribute) at position zero. * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. * @param page The page the node is on. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. * @return A tag node comprising the indicated characters from the page. 
*/ public Tag createTagNode (Page page, int start, int end, List attributes) { Attribute attribute; String id; Tag prototype; Tag ret; ret = null; if (0 != attributes.size ()) { attribute = (Attribute)attributes.get (0); id = attribute.getName (); if (null != id) { try { id = id.toUpperCase (Locale.ENGLISH); if (!id.startsWith ("/")) { if (id.endsWith ("/")) id = id.substring (0, id.length () - 1); prototype = (Tag)mBlastocyst.get (id); if (null != prototype) { ret = (Tag)prototype.clone (); ret.setPage (page); ret.setStartPosition (start); ret.setEndPosition (end); ret.setAttributesEx (attributes); } } } catch (CloneNotSupportedException cnse) { // default to creating a generic one } } } if (null == ret) { // generate a generic node try { ret = (Tag)getTagPrototype ().clone (); ret.setPage (page); ret.setStartPosition (start); ret.setEndPosition (end); ret.setAttributesEx (attributes); } catch (CloneNotSupportedException cnse) { ret = new TagNode (page, start, end, attributes); } } return (ret); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy