// HTMLParser Library - A java-based parser for HTML
// http://htmlparser.org
// Copyright (C) 2006 Somik Raha
//
// Revision Control Information
//
// $URL: https://htmlparser.svn.sourceforge.net/svnroot/htmlparser/tags/HTMLParserProject-2.1/parser/src/main/java/org/htmlparser/Parser.java $
// $Author: derrickoswald $
// $Date: 2006-09-18 03:02:25 +0200 (Mon, 18 Sep 2006) $
// $Revision: 8 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the Common Public License; either
// version 1.0 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// Common Public License for more details.
//
// You should have received a copy of the Common Public License
// along with this library; if not, the license is available from
// the Open Source Initiative (OSI) website:
// http://opensource.org/licenses/cpl1.0.php
package org.htmlparser;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URLConnection;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.http.ConnectionMonitor;
import org.htmlparser.http.HttpHeader;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;
/**
* The main parser class.
* This is the primary class of the HTML Parser library. It provides
* constructors that take a {@link #Parser(String) String},
* a {@link #Parser(URLConnection) URLConnection}, or a
* {@link #Parser(Lexer) Lexer}. In the case of a String,
* a check is made to see if the first non-whitespace character is a <, in
* which case it is assumed to be HTML. Otherwise an
* attempt is made to open it as a URL, and if that fails it assumes it is a
* local disk file. If you want to parse a String after using the
* {@link #Parser() no-args} constructor, use
* {@link #setInputHTML setInputHTML()}, or you can use {@link #createParser}.
* The Parser provides access to the contents of the
* page, via a {@link #elements() NodeIterator}, a
* {@link #parse(NodeFilter) NodeList} or a
* {@link #visitAllNodesWith NodeVisitor}.
*
* Typical usage of the parser is:
*
*
* Parser parser = new Parser ("http://whatever");
* NodeList list = parser.parse (null);
* // do something with your list of nodes.
*
*
* What types of nodes and what can be done with them is dependent on the
* setup, but in general a node can be converted back to HTML and its
* children (enclosed nodes) and parent can be obtained, because nodes are
* nested. See the {@link Node} interface.
* For example, if the URL contains:
*
* {@.html
* <html>
* <head>
* <title>Mondays -- What a bad idea.</title>
* </head>
* <body>
* Most people have a pathological hatred of Mondays...
* </body>
* </html>}
*
* and the example code above is used, the list contains only one element, the
* {@.html <html>} node. This node is a {@link org.htmlparser.tags tag},
* which is an object of class
* {@link org.htmlparser.tags.Html Html} if the default {@link NodeFactory}
* (a {@link PrototypicalNodeFactory}) is used.
* To get at further content, the children of the top
* level nodes must be examined. When digging through a node list one must be
* conscious of the possibility of whitespace between nodes, e.g. in the example
* above:
*
*
* Node node = list.elementAt (0);
* NodeList sublist = node.getChildren ();
* System.out.println (sublist.size ());
*
*
* would print out 5, not 2, because there are newlines after {@.html <html>},
* {@.html </head>} and {@.html </body>} that are children of the HTML node,
* besides the {@.html <head>} and {@.html <body>} nodes.
* Because processing nodes is so common, two interfaces are provided to
* ease this task, {@link org.htmlparser.filters filters}
* and {@link org.htmlparser.visitors visitors}.
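* For example, to collect just the TITLE tags with a filter
* (a sketch; the URL is a placeholder):
*
* Parser parser = new Parser ("http://whatever");
* NodeList titles = parser.extractAllNodesThatMatch (new TagNameFilter ("TITLE"));
* if (0 != titles.size ())
*     System.out.println (titles.elementAt (0).toPlainTextString ());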
*/
public class Parser
implements
Serializable,
ConnectionMonitor
{
// Please don't change the formatting of the version variables below.
// This is done so as to facilitate ant script processing.
/**
* The floating point version number ({@value}).
*/
public static final double
VERSION_NUMBER = 2.0
;
/**
* The type of version ({@value}).
*/
public static final String
VERSION_TYPE = "Release Build"
;
/**
* The date of the version ({@value}).
*/
public static final String
VERSION_DATE = "Sep 17, 2006"
;
// End of formatting
/**
* The display version ({@value}).
*/
public static final String VERSION_STRING =
"" + VERSION_NUMBER
+ " (" + VERSION_TYPE + " " + VERSION_DATE + ")";
/**
* Feedback object.
*/
protected ParserFeedback mFeedback;
/**
* The html lexer associated with this parser.
*/
protected Lexer mLexer;
/**
* A quiet message sink.
* Use this for no feedback.
*/
public static final ParserFeedback DEVNULL =
new DefaultParserFeedback (DefaultParserFeedback.QUIET);
/**
* A verbose message sink.
* Use this for output on System.out.
*/
public static final ParserFeedback STDOUT = new DefaultParserFeedback ();
static
{
getConnectionManager ().getDefaultRequestProperties ().put (
"User-Agent", "HTMLParser/" + getVersionNumber ());
}
//
// Static methods
//
/**
* Return the version string of this parser.
* @return A string of the form:
*
* "[floating point number] ([build-type] [build-date])"
*
*/
public static String getVersion ()
{
return (VERSION_STRING);
}
/**
* Return the version number of this parser.
* @return A floating point number, the whole number part is the major
* version, and the fractional part is the minor version.
*/
public static double getVersionNumber ()
{
return (VERSION_NUMBER);
}
/**
* Get the connection manager all Parsers use.
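* For example, to enable cookie and redirection handling before fetching a page
* (a sketch using the same settings exercised in {@link #main main}):
*
* Parser.getConnectionManager ().setCookieProcessingEnabled (true);
* Parser.getConnectionManager ().setRedirectionProcessingEnabled (true);
*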
* @return The connection manager.
* @see #setConnectionManager
*/
public static ConnectionManager getConnectionManager ()
{
return (Page.getConnectionManager ());
}
/**
* Set the connection manager all Parsers use.
* @param manager The new connection manager.
* @see #getConnectionManager
*/
public static void setConnectionManager (ConnectionManager manager)
{
Page.setConnectionManager (manager);
}
/**
* Creates the parser on an input string.
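* For example (a sketch; the markup and character set are placeholders):
*
* Parser parser = Parser.createParser ("<b>Hello</b>", "UTF-8");
* NodeList list = parser.parse (null);
*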
* @param html The string containing HTML.
* @param charset Optional. The character set encoding that will
* be reported by {@link #getEncoding}. If charset is null
* the default character set is used.
* @return A parser with the html string as input.
* @exception IllegalArgumentException if html is null.
*/
public static Parser createParser (String html, String charset)
{
Parser ret;
if (null == html)
throw new IllegalArgumentException ("html cannot be null");
ret = new Parser (new Lexer (new Page (html, charset)));
return (ret);
}
//
// Constructors
//
/**
* Zero argument constructor.
* The parser is in a safe but useless state parsing an empty string.
* Set the lexer or connection using {@link #setLexer}
* or {@link #setConnection}.
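* For example (a sketch; the URL is a placeholder):
*
* Parser parser = new Parser ();
* parser.setConnection (Parser.getConnectionManager ().openConnection ("http://whatever"));
* NodeList list = parser.parse (null);
*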
* @see #setLexer(Lexer)
* @see #setConnection(URLConnection)
*/
public Parser ()
{
this (new Lexer (new Page ("")), DEVNULL);
}
/**
* Construct a parser using the provided lexer and feedback object.
* This would be used to create a parser for special cases where the
* normal creation of a lexer on a URLConnection needs to be customized.
* @param lexer The lexer to draw characters from.
* @param fb The object to use when information,
* warning and error messages are produced. If null no feedback
* is provided.
*/
public Parser (Lexer lexer, ParserFeedback fb)
{
setFeedback (fb);
setLexer (lexer);
setNodeFactory (new PrototypicalNodeFactory ());
}
/**
* Constructor for custom HTTP access.
* This would be used to create a parser for a URLConnection that needs
* a special setup or negotiation conditioning beyond what is available
* from the {@link #getConnectionManager ConnectionManager}.
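* For example (a sketch; the URL and request property are placeholders):
*
* URLConnection connection = new URL ("http://whatever").openConnection ();
* connection.setRequestProperty ("Referer", "http://whatever/");
* Parser parser = new Parser (connection, Parser.STDOUT);
*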
* @param connection A fully conditioned connection. The connect()
* method will be called so it need not be connected yet.
* @param fb The object to use for message communication.
* @throws ParserException If the creation of the underlying Lexer
* cannot be performed.
*/
public Parser (URLConnection connection, ParserFeedback fb)
throws
ParserException
{
this (new Lexer (connection), fb);
}
/**
* Creates a Parser object with the location of the resource (URL or file).
* You would typically create a DefaultParserFeedback object and pass it in.
* it in.
* @see #Parser(URLConnection,ParserFeedback)
* @param resource Either a URL, a filename or a string of HTML.
* The string is considered HTML if the first non-whitespace character
* is a <. The use of a url or file is autodetected by first attempting
* to open the resource as a URL, if that fails it is assumed to be a file
* name.
* A standard HTTP GET is performed to read the content of the URL.
* @param feedback The ParserFeedback object to use when information,
* warning and error messages are produced. If null no feedback
* is provided.
* @throws ParserException If the URL is invalid.
*/
public Parser (String resource, ParserFeedback feedback)
throws
ParserException
{
setFeedback (feedback);
setResource (resource);
setNodeFactory (new PrototypicalNodeFactory ());
}
/**
* Creates a Parser object with the location of the resource (URL or file).
* A DefaultHTMLParserFeedback object is used for feedback.
* @param resource Either HTML, a URL or a filename (autodetects).
* @throws ParserException If the resource argument does not resolve
* to a valid page or file.
* @see #Parser(String,ParserFeedback)
*/
public Parser (String resource) throws ParserException
{
this (resource, STDOUT);
}
/**
* Construct a parser using the provided lexer.
* A feedback object printing to {@link #STDOUT System.out} is used.
* This would be used to create a parser for special cases where the
* normal creation of a lexer on a URLConnection needs to be customized.
* @param lexer The lexer to draw characters from.
*/
public Parser (Lexer lexer)
{
this (lexer, STDOUT);
}
/**
* Construct a parser using the provided URLConnection.
* This would be used to create a parser for a URLConnection that needs
* a special setup or negotiation conditioning beyond what is available
* from the {@link #getConnectionManager ConnectionManager}.
* A feedback object printing to {@link #STDOUT System.out} is used.
* @see #Parser(URLConnection,ParserFeedback)
* @param connection A fully conditioned connection. The connect()
* method will be called so it need not be connected yet.
* @throws ParserException If the creation of the underlying Lexer
* cannot be performed.
*/
public Parser (URLConnection connection) throws ParserException
{
this (connection, STDOUT);
}
//
// Bean patterns
//
/**
* Set the html, a url, or a file.
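* For example (a sketch; both arguments are placeholders):
*
* parser.setResource ("http://whatever");  // opened as a URL
* parser.setResource ("<b>bold text</b>"); // parsed as literal HTML
*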
* @param resource The resource to use.
* @exception IllegalArgumentException if resource is null.
* @exception ParserException if a problem occurs in connecting.
*/
public void setResource (String resource)
throws
ParserException
{
int length;
boolean html;
char ch;
if (null == resource)
throw new IllegalArgumentException ("resource cannot be null");
length = resource.length ();
html = false;
for (int i = 0; i < length; i++)
{
ch = resource.charAt (i);
if (!Character.isWhitespace (ch))
{
if ('<' == ch)
html = true;
break;
}
}
if (html)
setLexer (new Lexer (new Page (resource)));
else
setLexer (new Lexer (getConnectionManager ().openConnection (resource)));
}
/**
* Set the connection for this parser.
* This method creates a new Lexer reading from the connection.
* @param connection A fully conditioned connection. The connect()
* method will be called so it need not be connected yet.
* @exception ParserException if the character set specified in the
* HTTP header is not supported, or an i/o exception occurs creating the
* lexer.
* @see #setLexer
* @see #getConnection
* @exception IllegalArgumentException if connection is null.
* @exception ParserException if a problem occurs in connecting.
*/
public void setConnection (URLConnection connection)
throws
ParserException
{
if (null == connection)
throw new IllegalArgumentException ("connection cannot be null");
setLexer (new Lexer (connection));
}
/**
* Return the current connection.
* @return The connection either created by the parser or passed into this
* parser via {@link #setConnection}.
* @see #setConnection(URLConnection)
*/
public URLConnection getConnection ()
{
return (getLexer ().getPage ().getConnection ());
}
/**
* Set the URL for this parser.
* This method creates a new Lexer reading from the given URL.
* Trying to set the url to null or an empty string is a no-op.
* @param url The new URL for the parser.
* @throws ParserException If the url is invalid or creation of the
* underlying Lexer cannot be performed.
* @exception ParserException if a problem occurs in connecting.
* @see #getURL
*/
public void setURL (String url)
throws
ParserException
{
if ((null != url) && !"".equals (url))
setConnection (getConnectionManager ().openConnection (url));
}
/**
* Return the current URL being parsed.
* @return The current url. This is the URL for the current page.
* A string passed into the constructor or set via setURL may be altered,
* for example, a file name may be modified to be a URL.
* @see Page#getUrl
* @see #setURL
*/
public String getURL ()
{
return (getLexer ().getPage ().getUrl ());
}
/**
* Set the encoding for the page this parser is reading from.
* @param encoding The new character set to use.
* @throws ParserException If the encoding change causes characters that
* have already been consumed to differ from the characters that would
* have been seen had the new encoding been in force.
* @see org.htmlparser.util.EncodingChangeException
* @see #getEncoding
*/
public void setEncoding (String encoding)
throws
ParserException
{
getLexer ().getPage ().setEncoding (encoding);
}
/**
* Get the encoding for the page this parser is reading from.
* This item is set from the HTTP header but may be overridden by meta
* tags in the head, so this may change after the head has been parsed.
* @return The encoding currently in force.
* @see #setEncoding
*/
public String getEncoding ()
{
return (getLexer ().getPage ().getEncoding ());
}
/**
* Set the lexer for this parser.
* The current NodeFactory is transferred to (set on) the given lexer,
* since the lexer owns the node factory object.
* It does not adjust the feedback object.
* @param lexer The lexer object to use.
* @see #setNodeFactory
* @see #getLexer
* @exception IllegalArgumentException if lexer is null.
*/
public void setLexer (Lexer lexer)
{
NodeFactory factory;
String type;
if (null == lexer)
throw new IllegalArgumentException ("lexer cannot be null");
// move a node factory that's been set to the new lexer
factory = null;
if (null != getLexer ())
factory = getLexer ().getNodeFactory ();
if (null != factory)
lexer.setNodeFactory (factory);
mLexer = lexer;
// warn about content that's not likely text
type = mLexer.getPage ().getContentType ();
if (type != null && !type.startsWith ("text"))
getFeedback ().warning (
"URL "
+ mLexer.getPage ().getUrl ()
+ " does not contain text");
}
/**
* Returns the lexer associated with the parser.
* @return The current lexer.
* @see #setLexer
*/
public Lexer getLexer ()
{
return (mLexer);
}
/**
* Get the current node factory.
* @return The current lexer's node factory.
* @see #setNodeFactory
*/
public NodeFactory getNodeFactory ()
{
return (getLexer ().getNodeFactory ());
}
/**
* Set the current node factory.
* @param factory The new node factory for the current lexer.
* @see #getNodeFactory
* @exception IllegalArgumentException if factory is null.
*/
public void setNodeFactory (NodeFactory factory)
{
if (null == factory)
throw new IllegalArgumentException ("node factory cannot be null");
getLexer ().setNodeFactory (factory);
}
/**
* Sets the feedback object used in scanning.
* @param fb The new feedback object to use. If this is null a
* {@link #DEVNULL silent feedback object} is used.
* @see #getFeedback
*/
public void setFeedback (ParserFeedback fb)
{
if (null == fb)
mFeedback = DEVNULL;
else
mFeedback = fb;
}
/**
* Returns the current feedback object.
* @return The feedback object currently being used.
* @see #setFeedback
*/
public ParserFeedback getFeedback()
{
return (mFeedback);
}
//
// Public methods
//
/**
* Reset the parser to start from the beginning again.
* This assumes support for a reset from the underlying
* {@link org.htmlparser.lexer.Source} object.
* This is cheaper (in terms of time) than resetting the URL, i.e.
*
* parser.setURL (parser.getURL ());
*
* because the page is not refetched from the internet.
* Note: the nodes returned on the second parse are new
* nodes and not the same nodes returned on the first parse. If you
* want the same nodes for re-use, collect them in a NodeList with
* {@link #parse(NodeFilter) parse(null)} and operate on the NodeList.
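* For example (a sketch walking the same page twice; the URL is a placeholder):
*
* Parser parser = new Parser ("http://whatever");
* NodeList first = parser.parse (null);
* parser.reset ();
* NodeList second = parser.parse (null); // new node objects, same content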
*/
public void reset ()
{
getLexer ().reset ();
}
/**
* Returns an iterator (enumeration) over the html nodes.
* {@link org.htmlparser.nodes Nodes} can be of three main types:
*
* - {@link org.htmlparser.nodes.TagNode TagNode}
* - {@link org.htmlparser.nodes.TextNode TextNode}
* - {@link org.htmlparser.nodes.RemarkNode RemarkNode}
*
* In general, when parsing with an iterator or processing a NodeList,
* you will need to use recursion. For example:
*
*
* void processMyNodes (Node node)
* {
* if (node instanceof TextNode)
* {
* // downcast to TextNode
* TextNode text = (TextNode)node;
* // do whatever processing you want with the text
* System.out.println (text.getText ());
* }
* if (node instanceof RemarkNode)
* {
* // downcast to RemarkNode
* RemarkNode remark = (RemarkNode)node;
* // do whatever processing you want with the comment
* }
* else if (node instanceof TagNode)
* {
* // downcast to TagNode
* TagNode tag = (TagNode)node;
* // do whatever processing you want with the tag itself
* // ...
* // process recursively (nodes within nodes) via getChildren()
* NodeList nl = tag.getChildren ();
* if (null != nl)
* for (NodeIterator i = nl.elements (); i.hasMoreElements (); )
* processMyNodes (i.nextNode ());
* }
* }
*
* Parser parser = new Parser ("http://www.yahoo.com");
* for (NodeIterator i = parser.elements (); i.hasMoreElements (); )
* processMyNodes (i.nextNode ());
*
*
* @throws ParserException If a parsing error occurs.
* @return An iterator over the top level nodes (usually {@.html <html>}).
*/
public NodeIterator elements () throws ParserException
{
return (new IteratorImpl (getLexer (), getFeedback ()));
}
/**
* Parse the given resource, using the filter provided.
* This can be used to extract information from specific nodes.
* When used with a null filter it returns an
* entire page which can then be modified and converted back to HTML
* (Note: the synthesis use-case is not handled very well; the parser
* is more often used to extract information from a web page).
* For example, to replace the entire contents of the HEAD with a
* single TITLE tag you could do this:
*
* NodeList nl = parser.parse (null); // here is your two node list
* NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"));
* if (heads.size () > 0) // there may not be a HEAD tag
* {
* Head head = (Head)heads.elementAt (0); // there should be only one
* head.removeAll (); // clean out the contents
* Tag title = new TitleTag ();
* title.setTagName ("title");
* title.setChildren (new NodeList (new TextNode ("The New Title")));
* Tag title_end = new TitleTag ();
* title_end.setTagName ("/title");
* title.setEndTag (title_end);
* head.add (title);
* }
* System.out.println (nl.toHtml ()); // output the modified HTML
*
* @return The list of matching nodes (for a null
* filter this is all the top level nodes).
* @param filter The filter to apply to the parsed nodes,
* or null to retrieve all the top level nodes.
* @throws ParserException If a parsing error occurs.
*/
public NodeList parse (NodeFilter filter) throws ParserException
{
NodeIterator e;
Node node;
NodeList ret;
ret = new NodeList ();
for (e = elements (); e.hasMoreNodes (); )
{
node = e.nextNode ();
if (null != filter)
node.collectInto (ret, filter);
else
ret.add (node);
}
return (ret);
}
/**
* Apply the given visitor to the current page.
* The visitor is passed to the accept() method of each node
* in the page in a depth first traversal. The visitor
* beginParsing() method is called prior to processing the
* page and finishedParsing() is called after the processing.
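* For example (a sketch assuming the usual visitTag() callback on NodeVisitor):
*
* parser.visitAllNodesWith (new NodeVisitor ()
* {
*     public void visitTag (Tag tag)
*     {
*         System.out.println (tag.getTagName ());
*     }
* });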
* @param visitor The visitor to visit all nodes with.
* @throws ParserException If a parse error occurs while traversing
* the page with the visitor.
*/
public void visitAllNodesWith (NodeVisitor visitor) throws ParserException
{
Node node;
visitor.beginParsing();
for (NodeIterator e = elements(); e.hasMoreNodes(); )
{
node = e.nextNode();
node.accept(visitor);
}
visitor.finishedParsing();
}
/**
* Initializes the parser with the given input HTML String.
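* For example (a sketch; the markup is a placeholder):
*
* Parser parser = new Parser ();
* parser.setInputHTML ("<html><body>Hello</body></html>");
* NodeList list = parser.parse (null);
*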
* @param inputHTML the input HTML that is to be parsed.
* @throws ParserException If an error occurs in setting up the
* underlying Lexer.
* @exception IllegalArgumentException if inputHTML is null.
*/
public void setInputHTML (String inputHTML)
throws
ParserException
{
if (null == inputHTML)
throw new IllegalArgumentException ("html cannot be null");
if (!"".equals (inputHTML))
setLexer (new Lexer (new Page (inputHTML)));
}
/**
* Extract all nodes matching the given filter.
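* For example (a sketch; assumes org.htmlparser.tags.LinkTag for the class filter):
*
* NodeList links = parser.extractAllNodesThatMatch (new NodeClassFilter (LinkTag.class));
*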
* @see Node#collectInto(NodeList, NodeFilter)
* @param filter The filter to be applied to the nodes.
* @throws ParserException If a parse error occurs.
* @return A list of nodes matching the filter criteria,
* i.e. for which the filter's accept method
* returned true.
*/
public NodeList extractAllNodesThatMatch (NodeFilter filter)
throws
ParserException
{
NodeIterator e;
NodeList ret;
ret = new NodeList ();
for (e = elements (); e.hasMoreNodes (); )
e.nextNode ().collectInto (ret, filter);
return (ret);
}
//
// ConnectionMonitor interface
//
/**
* Called just prior to calling connect.
* Part of the ConnectionMonitor interface, this implementation just
* sends the request header to the feedback object if any.
* @param connection The connection which is about to be connected.
* @throws ParserException Not used.
* @see ConnectionMonitor#preConnect
*/
public void preConnect (HttpURLConnection connection)
throws
ParserException
{
getFeedback ().info (HttpHeader.getRequestHeader (connection));
}
/**
* Called just after calling connect.
* Part of the ConnectionMonitor interface, this implementation just
* sends the response header to the feedback object if any.
* @param connection The connection that was just connected.
* @throws ParserException Not used.
* @see ConnectionMonitor#postConnect
*/
public void postConnect (HttpURLConnection connection)
throws
ParserException
{
getFeedback ().info (HttpHeader.getResponseHeader (connection));
}
/**
* The main program, which can be executed from the command line.
* @param args A URL or file name to parse, and an optional tag name to be
* used as a filter.
*/
public static void main (String [] args)
{
Parser parser;
NodeFilter filter;
if (args.length < 1 || args[0].equals ("-help"))
{
System.out.println ("HTML Parser v" + getVersion () + "\n");
System.out.println ();
System.out.println ("Syntax : java -jar htmlparser.jar"
+ " <url> [type]");
System.out.println (" <url> the URL or file to be parsed");
System.out.println (" type the node type, for example:");
System.out.println (" A - Show only the link tags");
System.out.println (" IMG - Show only the image tags");
System.out.println (" TITLE - Show only the title tag");
System.out.println ();
System.out.println ("Example : java -jar htmlparser.jar"
+ " http://www.yahoo.com");
System.out.println ();
}
else
try
{
parser = new Parser ();
if (1 < args.length)
filter = new TagNameFilter (args[1]);
else
{
filter = null;
// for a simple dump, use more verbose settings
parser.setFeedback (Parser.STDOUT);
getConnectionManager ().setMonitor (parser);
}
getConnectionManager ().setRedirectionProcessingEnabled (true);
getConnectionManager ().setCookieProcessingEnabled (true);
parser.setResource (args[0]);
System.out.println (parser.parse (filter));
}
catch (ParserException e)
{
e.printStackTrace ();
}
}
}