// org.htmlparser.sax.XMLReader -- Maven / Gradle / Ivy
// (artifact: bboss-htmlparser; "Show all versions" / "Show documentation" are page links captured with this listing)
// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/XMLReader.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/13 10:44:15 $
// $Revision: 1.3 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.sax;
import java.io.IOException;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.NamespaceSupport;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
/**
 * SAX parser.
 * Generates callbacks on the {@link ContentHandler} based on encountered nodes.
 * <br><em>Preliminary.</em>
 * <pre>
 * org.xml.sax.XMLReader reader = org.xml.sax.helpers.XMLReaderFactory.createXMLReader ("org.htmlparser.sax.XMLReader");
 * org.xml.sax.ContentHandler content = new MyContentHandler ();
 * reader.setContentHandler (content);
 * org.xml.sax.ErrorHandler errors = new MyErrorHandler ();
 * reader.setErrorHandler (errors);
 * reader.parse ("http://cbc.ca");
 * </pre>
 */
public class XMLReader
implements
org.xml.sax.XMLReader
{
/**
 * Determines if namespace handling is on.
 * All XMLReaders are required to recognize the feature names:
 * <ul>
 * <li><code>http://xml.org/sax/features/namespaces</code> -
 * a value of "true" indicates namespace URIs and unprefixed
 * local names for element and attribute names will be available</li>
 * <li><code>http://xml.org/sax/features/namespace-prefixes</code> -
 * a value of "true" indicates that XML qualified names (with
 * prefixes) and attributes (including xmlns* attributes) will
 * be available.</li>
 * </ul>
 * This field backs the first of the two features. When it is
 * <code>false</code>, {@link #doSAX} reports empty strings for the
 * namespace URI and local name of every element.
 */
protected boolean mNameSpaces; // namespaces
/**
 * Determines if namespace prefix handling is on.
 * Backs the <code>http://xml.org/sax/features/namespace-prefixes</code>
 * feature; when <code>true</code>, {@link #doSAX} passes the tag name as
 * the raw (qualified) element name.
 * @see #mNameSpaces
 */
protected boolean mNameSpacePrefixes; // namespace-prefixes
/**
 * The registered entity resolver.
 * Not implemented: the value is only stored and handed back by the
 * accessor pair, nothing in this class consults it.
 */
protected EntityResolver mEntityResolver;
/**
 * The registered DTD handler.
 * Not implemented: the value is only stored and handed back by the
 * accessor pair, nothing in this class calls it.
 */
protected DTDHandler mDTDHandler;
/**
 * The content callback object.
 * Both <code>parse</code> methods do nothing while this is <code>null</code>.
 */
protected ContentHandler mContentHandler;
/**
 * The error handler object.
 * May be <code>null</code>, in which case fatal errors are not reported.
 */
protected ErrorHandler mErrorHandler;
/**
 * The underlying htmlparser {@link Parser}.
 * Assigned by each call to a <code>parse</code> method.
 */
protected Parser mParser;
/**
 * Namespace utility object.
 * Set up in the constructor with the empty (default) prefix declared.
 */
protected NamespaceSupport mSupport;
/**
 * Qualified name parts.
 * A three element scratch array: [0] namespace URI, [1] local name,
 * [2] raw (qualified) name. It is overwritten for every element reported.
 */
protected String mParts[];
/**
* Create an SAX parser.
*/
public XMLReader ()
{
mNameSpaces = true;
mNameSpacePrefixes = false;
mEntityResolver = null;
mDTDHandler = null;
mContentHandler = null;
mErrorHandler = null;
mSupport = new NamespaceSupport ();
mSupport.pushContext ();
mSupport.declarePrefix ("", "http://www.w3.org/TR/REC-html40");
// todo:
// xmlns:html='http://www.w3.org/TR/REC-html40'
// or xmlns:html='http://www.w3.org/1999/xhtml'
mParts = new String[3];
}
////////////////////////////////////////////////////////////////////
// Configuration.
////////////////////////////////////////////////////////////////////
/**
 * Look up the value of a feature flag.
 *
 * <p>The feature name is any fully-qualified URI. This reader recognizes
 * exactly two names:</p>
 * <ul>
 * <li><code>http://xml.org/sax/features/namespaces</code></li>
 * <li><code>http://xml.org/sax/features/namespace-prefixes</code></li>
 * </ul>
 * <p>Any other name is reported as not recognized.</p>
 *
 * <p>Typical usage is something like this:</p>
 * <pre>
 * XMLReader r = new MySAXDriver();
 *
 * // try to activate validation
 * try {
 *   r.setFeature("http://xml.org/sax/features/validation", true);
 * } catch (SAXException e) {
 *   System.err.println("Cannot activate validation.");
 * }
 *
 * // register event handlers
 * r.setContentHandler(new MyContentHandler());
 * r.setErrorHandler(new MyErrorHandler());
 *
 * // parse the first document
 * try {
 *   r.parse("http://www.foo.com/mydoc.xml");
 * } catch (IOException e) {
 *   System.err.println("I/O exception reading XML document");
 * } catch (SAXException e) {
 *   System.err.println("XML exception reading document.");
 * }
 * </pre>
 *
 * @param name The feature name, which is a fully-qualified URI.
 * @return The current value of the feature (true or false).
 * @exception org.xml.sax.SAXNotRecognizedException If the feature
 *            name is not one of the two names listed above.
 * @exception org.xml.sax.SAXNotSupportedException When the
 *            XMLReader recognizes the feature name but
 *            cannot determine its value at this time
 *            (never thrown by this implementation).
 * @see #setFeature
 */
public boolean getFeature (String name)
    throws SAXNotRecognizedException, SAXNotSupportedException
{
    if (name.equals ("http://xml.org/sax/features/namespaces"))
        return (mNameSpaces);
    if (name.equals ("http://xml.org/sax/features/namespace-prefixes"))
        return (mNameSpacePrefixes);
    // An unknown name is "not recognized"; SAXNotSupportedException is
    // reserved for recognized names whose value is unavailable.
    throw new SAXNotRecognizedException (name + " not yet understood");
}
/**
 * Set the value of a feature flag.
 *
 * <p>The feature name is any fully-qualified URI. This reader recognizes
 * <code>http://xml.org/sax/features/namespaces</code> and
 * <code>http://xml.org/sax/features/namespace-prefixes</code>, and accepts
 * either boolean value for both. The new value is used for the elements
 * reported after the call.</p>
 *
 * @param name The feature name, which is a fully-qualified URI.
 * @param value The requested value of the feature (true or false).
 * @exception org.xml.sax.SAXNotRecognizedException If the feature
 *            name is not one of the two names listed above.
 * @exception org.xml.sax.SAXNotSupportedException When the
 *            XMLReader recognizes the feature name but
 *            cannot set the requested value
 *            (never thrown by this implementation).
 * @see #getFeature
 */
public void setFeature (String name, boolean value)
    throws SAXNotRecognizedException, SAXNotSupportedException
{
    if (name.equals ("http://xml.org/sax/features/namespaces"))
        mNameSpaces = value;
    else if (name.equals ("http://xml.org/sax/features/namespace-prefixes"))
        mNameSpacePrefixes = value;
    else
        // An unknown name is "not recognized"; SAXNotSupportedException is
        // reserved for recognized names whose value cannot be changed.
        throw new SAXNotRecognizedException (name + " not yet understood");
}
/**
 * Look up the value of a property.
 *
 * <p>The property name is any fully-qualified URI. XMLReaders are not
 * required to recognize any specific property names, and this reader
 * recognizes none: every call fails.</p>
 *
 * @param name The property name, which is a fully-qualified URI.
 * @return Never returns normally.
 * @exception org.xml.sax.SAXNotRecognizedException Always, since no
 *            property name is recognized.
 * @exception org.xml.sax.SAXNotSupportedException When the
 *            XMLReader recognizes the property name but
 *            cannot determine its value at this time
 *            (never thrown by this implementation).
 * @see #setProperty
 */
public Object getProperty (String name)
    throws SAXNotRecognizedException, SAXNotSupportedException
{
    // No property is known to this reader, so the name is "not recognized"
    // rather than "recognized but unsupported".
    throw new SAXNotRecognizedException (name + " not yet understood");
}
/**
 * Set the value of a property.
 *
 * <p>The property name is any fully-qualified URI. XMLReaders are not
 * required to recognize any specific property names, and this reader
 * recognizes none: every call fails and nothing is stored.</p>
 *
 * <p>In SAX2 this method is also the standard mechanism for setting
 * extended handlers; none are supported here.</p>
 *
 * @param name The property name, which is a fully-qualified URI.
 * @param value The requested value for the property (ignored).
 * @exception org.xml.sax.SAXNotRecognizedException Always, since no
 *            property name is recognized.
 * @exception org.xml.sax.SAXNotSupportedException When the
 *            XMLReader recognizes the property name but
 *            cannot set the requested value
 *            (never thrown by this implementation).
 * @see #getProperty
 */
public void setProperty (String name, Object value)
    throws SAXNotRecognizedException, SAXNotSupportedException
{
    // No property is known to this reader, so the name is "not recognized"
    // rather than "recognized but unsupported".
    throw new SAXNotRecognizedException (name + " not yet understood");
}
////////////////////////////////////////////////////////////////////
// Event handlers.
////////////////////////////////////////////////////////////////////
/**
 * Register an entity resolver.
 *
 * <p>The resolver is remembered and can be read back with
 * {@link #getEntityResolver}. Entity resolution is not implemented,
 * so this class never calls it.</p>
 *
 * @param resolver The entity resolver.
 * @see #getEntityResolver
 */
public void setEntityResolver (EntityResolver resolver)
{
    this.mEntityResolver = resolver;
}
/**
 * Get the entity resolver currently registered.
 *
 * @return The resolver last passed to {@link #setEntityResolver},
 *         or <code>null</code> if none has been registered.
 * @see #setEntityResolver
 */
public EntityResolver getEntityResolver ()
{
    return this.mEntityResolver;
}
/**
 * Register a DTD event handler.
 *
 * <p>The handler is remembered and can be read back with
 * {@link #getDTDHandler}. DTD events are not implemented, so this
 * class never calls it.</p>
 *
 * @param handler The DTD handler.
 * @see #getDTDHandler
 */
public void setDTDHandler (DTDHandler handler)
{
    this.mDTDHandler = handler;
}
/**
 * Get the DTD handler currently registered.
 *
 * @return The handler last passed to {@link #setDTDHandler},
 *         or <code>null</code> if none has been registered.
 * @see #setDTDHandler
 */
public DTDHandler getDTDHandler ()
{
    return this.mDTDHandler;
}
/**
 * Register the content event handler.
 *
 * <p>Without a content handler the <code>parse</code> methods return
 * without doing anything. A handler registered in the middle of a parse
 * receives the events reported after the call.</p>
 *
 * @param handler The content handler.
 * @see #getContentHandler
 */
public void setContentHandler (ContentHandler handler)
{
    this.mContentHandler = handler;
}
/**
 * Get the content handler currently registered.
 *
 * @return The handler last passed to {@link #setContentHandler},
 *         or <code>null</code> if none has been registered.
 * @see #setContentHandler
 */
public ContentHandler getContentHandler ()
{
    return this.mContentHandler;
}
/**
 * Register the error event handler.
 *
 * <p>If no error handler is registered, fatal errors raised during
 * a parse are silently ignored; however, normal processing may not
 * continue. It is highly recommended that all SAX applications
 * implement an error handler to avoid unexpected bugs.</p>
 *
 * @param handler The error handler.
 * @see #getErrorHandler
 */
public void setErrorHandler (ErrorHandler handler)
{
    this.mErrorHandler = handler;
}
/**
 * Get the error handler currently registered.
 *
 * @return The handler last passed to {@link #setErrorHandler},
 *         or <code>null</code> if none has been registered.
 * @see #setErrorHandler
 */
public ErrorHandler getErrorHandler ()
{
    return this.mErrorHandler;
}
////////////////////////////////////////////////////////////////////
// Parsing.
////////////////////////////////////////////////////////////////////
/**
 * Parse a document from an input source.
 *
 * <p>The source is read from, in order of preference, its byte stream
 * (decoded with the encoding given by the source), its character stream,
 * or the resource named by its system identifier. A byte stream is tried
 * first because it was the only input this method used to accept.</p>
 *
 * <p>During the parse, information about the document is provided
 * through the registered event handlers. If no content handler is
 * registered this method returns immediately without reading anything.</p>
 *
 * <p>This method is synchronous: it will not return until parsing
 * has ended. A {@link SAXException} thrown by the content handler, or a
 * {@link ParserException} raised by the underlying parser, ends the parse
 * and is passed to the error handler's <code>fatalError</code> (wrapped in
 * a {@link SAXParseException} that keeps the original as its cause); when
 * no error handler is registered it is dropped.</p>
 *
 * @param input The input source for the top-level of the document.
 * @exception org.xml.sax.SAXException If the error handler throws it
 *            from <code>fatalError</code>, or if the input source supplies
 *            neither a stream nor a system identifier.
 * @exception java.io.IOException An IO exception from the parser,
 *            possibly from a byte stream or character stream
 *            supplied by the application.
 * @see org.xml.sax.InputSource
 * @see #parse(java.lang.String)
 * @see #setContentHandler
 * @see #setErrorHandler
 */
public void parse (InputSource input)
    throws IOException, SAXException
{
    if (null == mContentHandler)
        return;
    try
    {
        mParser = createParser (input);
        reportDocument ();
    }
    catch (ParserException pe)
    {
        reportFatal (pe, input.getPublicId (), input.getSystemId ());
    }
}

/**
 * Parse a document from a system identifier (URI).
 *
 * <p>This method is a shortcut for the common case of reading a
 * document from a system identifier, comparable to:</p>
 * <pre>
 * parse(new InputSource(systemId));
 * </pre>
 *
 * <p>If the system identifier is a URL, it must be fully resolved
 * by the application before it is passed to the parser. If no content
 * handler is registered this method returns immediately.</p>
 *
 * @param systemId The system identifier (URI).
 * @exception org.xml.sax.SAXException If the error handler throws it
 *            from <code>fatalError</code>.
 * @exception java.io.IOException An IO exception from the parser.
 * @see #parse(org.xml.sax.InputSource)
 */
public void parse (String systemId)
    throws IOException, SAXException
{
    if (null == mContentHandler)
        return;
    try
    {
        mParser = new Parser (systemId);
        reportDocument ();
    }
    catch (ParserException pe)
    {
        reportFatal (pe, "", systemId);
    }
}

/**
 * Build the underlying parser for an input source.
 * Uses the byte stream if there is one, otherwise the character stream
 * (read completely into memory), otherwise the system identifier.
 * @param input The input source to read from.
 * @return A parser positioned at the start of the input.
 * @exception IOException If a supplied stream cannot be read or the
 *            encoding is not supported.
 * @exception ParserException If the underlying parser cannot be set up.
 * @exception SAXException If the source offers nothing to read from.
 */
private Parser createParser (InputSource input)
    throws IOException, ParserException, SAXException
{
    if (null != input.getByteStream ())
        return (new Parser (
            new Lexer (
                new Page (
                    input.getByteStream (),
                    input.getEncoding ()))));

    java.io.Reader reader = input.getCharacterStream ();
    if (null != reader)
    {
        // characters are already decoded, so hand the text over as a string
        StringBuffer text = new StringBuffer ();
        char[] buffer = new char[4096];
        for (int count = reader.read (buffer); -1 != count; count = reader.read (buffer))
            text.append (buffer, 0, count);
        return (new Parser (new Lexer (new Page (text.toString ()))));
    }

    if (null != input.getSystemId ())
        return (new Parser (input.getSystemId ()));

    throw new SAXException (
        "InputSource supplies no byte stream, character stream or system identifier");
}

/**
 * Run {@link #mParser} over its input and report the nodes to the content handler.
 * Installs the parser feedback and the document locator first. A
 * SAXException raised while reporting is forwarded to the error
 * handler as a fatal error, or dropped when there is no error handler.
 * @exception ParserException If the underlying parser fails.
 * @exception SAXException If the error handler throws it from <code>fatalError</code>.
 */
private void reportDocument ()
    throws ParserException, SAXException
{
    Locator locator = new Locator (mParser);
    ParserFeedback feedback;
    if (null != mErrorHandler)
        feedback = new Feedback (mErrorHandler, locator);
    else
        feedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET);
    mParser.setFeedback (feedback);
    mContentHandler.setDocumentLocator (locator);
    try
    {
        mContentHandler.startDocument ();
        NodeIterator iterator = mParser.elements ();
        while (iterator.hasMoreNodes ())
            doSAX (iterator.nextNode ());
        mContentHandler.endDocument ();
    }
    catch (SAXException se)
    {
        if (null != mErrorHandler)
            mErrorHandler.fatalError (
                new SAXParseException ("contentHandler threw me", locator, se));
    }
}

/**
 * Pass a parser failure to the error handler as a fatal error.
 * Does nothing when no error handler is registered.
 * @param pe The failure; kept as the cause of the reported exception.
 * @param publicId The public identifier of the document, may be <code>null</code>.
 * @param systemId The system identifier of the document, may be <code>null</code>.
 * @exception SAXException If the error handler throws it.
 */
private void reportFatal (ParserException pe, String publicId, String systemId)
    throws SAXException
{
    if (null != mErrorHandler)
        mErrorHandler.fatalError (
            new SAXParseException (
                pe.getMessage (),
                // unknown identifiers stay empty strings, as they were before
                (null == publicId) ? "" : publicId,
                (null == systemId) ? "" : systemId,
                0,
                0,
                pe));
}
/**
 * Process nodes recursively on the ContentHandler.
 * <ul>
 * <li>A {@link Remark} is delivered through <code>ignorableWhitespace</code>.</li>
 * <li>A {@link Text} node is delivered through <code>characters</code>.</li>
 * <li>A {@link Tag} produces <code>startElement</code>, then its children
 * (recursively), then <code>endElement</code>. The end event is always
 * sent so that start and end events stay paired; it uses the name of the
 * end tag when the tag has one and the name of the tag itself otherwise.</li>
 * </ul>
 * Any other kind of node is ignored.
 * Does rudimentary namespace processing according to the state of {@link #mNameSpaces}
 * and {@link #mNameSpacePrefixes}.
 * @param node The htmlparser node to traverse.
 * @exception ParserException If a parse error occurs.
 * @exception SAXException If a SAX error occurs.
 */
protected void doSAX (Node node)
    throws
        ParserException,
        SAXException
{
    if (node instanceof Remark)
    {
        String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
        mContentHandler.ignorableWhitespace (text.toCharArray (), 0, text.length ());
    }
    else if (node instanceof Text)
    {
        String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ());
        mContentHandler.characters (text.toCharArray (), 0, text.length ());
    }
    else if (node instanceof Tag)
    {
        Tag tag = (Tag)node;
        splitName (tag.getTagName ());
        mContentHandler.startElement (
            mParts[0], // uri
            mParts[1], // local
            mParts[2], // raw
            new Attributes (tag, mSupport, mParts));
        NodeList children = tag.getChildren ();
        if (null != children)
            for (int i = 0; i < children.size (); i++)
                doSAX (children.elementAt (i));
        // mParts was overwritten while reporting attributes/children,
        // so the name is split again for the end event.
        // NOTE(review): a Tag node that is itself a stray end tag is still
        // reported as an (empty) element here - confirm whether it should be skipped.
        Tag end = tag.getEndTag ();
        splitName ((null != end) ? end.getTagName () : tag.getTagName ());
        mContentHandler.endElement (
            mParts[0], // uri
            mParts[1], // local
            mParts[2]); // raw
    }
}

/**
 * Fill {@link #mParts} with the uri, local name and raw name to report for an element.
 * @param name The tag name as supplied by the htmlparser tag.
 */
private void splitName (String name)
{
    if (mNameSpaces)
    {
        // processName returns null for an undeclared prefix; without this
        // fallback the previous element's uri/local name would be reported.
        if (null == mSupport.processName (name, mParts, false))
        {
            mParts[0] = "";
            mParts[1] = name;
        }
    }
    else
    {
        mParts[0] = "";
        mParts[1] = "";
    }
    // the raw name is left empty only when namespaces are on and prefixes are off
    if (mNameSpacePrefixes || !mNameSpaces)
        mParts[2] = name;
    else
        mParts[2] = "";
}
}