All downloads are free. The search and download functionality uses the official Maven repository.

org.htmlparser.sax.XMLReader Maven / Gradle / Ivy

// HTMLParser Library $Name: v1_5 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/sax/XMLReader.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/13 10:44:15 $
// $Revision: 1.3 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.sax;

import java.io.IOException;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;

import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.NamespaceSupport;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;

/**
 * SAX parser.
 * Generates callbacks on the {@link ContentHandler} based on encountered nodes.
 * 
Preliminary. *
 * org.xml.sax.XMLReader reader = org.xml.sax.helpers.XMLReaderFactory.createXMLReader ("org.htmlparser.sax.XMLReader");
 * org.xml.sax.ContentHandler content = new MyContentHandler ();
 * reader.setContentHandler (content);
 * org.xml.sax.ErrorHandler errors = new MyErrorHandler ();
 * reader.setErrorHandler (errors);
 * reader.parse ("http://cbc.ca");
 * 
*/ public class XMLReader implements org.xml.sax.XMLReader { /** * Determines if namespace handling is on. * All XMLReaders are required to recognize the feature names: *
    *
  • http://xml.org/sax/features/namespaces - * a value of "true" indicates namespace URIs and unprefixed * local names for element and attribute names will be available
  • *
  • http://xml.org/sax/features/namespace-prefixes - * a value of "true" indicates that XML qualified names (with * prefixes) and attributes (including xmlns* attributes) will * be available. *
*/ protected boolean mNameSpaces; // namespaces /** * Determines if namespace prefix handling is on. * @see #mNameSpaces */ protected boolean mNameSpacePrefixes; // namespace-prefixes /** * not implemented */ protected EntityResolver mEntityResolver; /** * not implemented */ protected DTDHandler mDTDHandler; /** * The content callback object. */ protected ContentHandler mContentHandler; /** * The error handler object. */ protected ErrorHandler mErrorHandler; /** * The underlying DOM parser. */ protected Parser mParser; /** * Namspace utility object. */ protected NamespaceSupport mSupport; /** * Qualified name parts. */ protected String mParts[]; /** * Create an SAX parser. */ public XMLReader () { mNameSpaces = true; mNameSpacePrefixes = false; mEntityResolver = null; mDTDHandler = null; mContentHandler = null; mErrorHandler = null; mSupport = new NamespaceSupport (); mSupport.pushContext (); mSupport.declarePrefix ("", "http://www.w3.org/TR/REC-html40"); // todo: // xmlns:html='http://www.w3.org/TR/REC-html40' // or xmlns:html='http://www.w3.org/1999/xhtml' mParts = new String[3]; } //////////////////////////////////////////////////////////////////// // Configuration. //////////////////////////////////////////////////////////////////// /** * Look up the value of a feature flag. * *

The feature name is any fully-qualified URI. It is * possible for an XMLReader to recognize a feature name but * temporarily be unable to return its value. * Some feature values may be available only in specific * contexts, such as before, during, or after a parse. * Also, some feature values may not be programmatically accessible. * (In the case of an adapter for SAX1 {@link Parser}, there is no * implementation-independent way to expose whether the underlying * parser is performing validation, expanding external entities, * and so forth.)

* *

All XMLReaders are required to recognize the * http://xml.org/sax/features/namespaces and the * http://xml.org/sax/features/namespace-prefixes feature names.

* *

Typical usage is something like this:

* *
     * XMLReader r = new MySAXDriver();
     *
     *                         // try to activate validation
     * try {
     *   r.setFeature("http://xml.org/sax/features/validation", true);
     * } catch (SAXException e) {
     *   System.err.println("Cannot activate validation."); 
     * }
     *
     *                         // register event handlers
     * r.setContentHandler(new MyContentHandler());
     * r.setErrorHandler(new MyErrorHandler());
     *
     *                         // parse the first document
     * try {
     *   r.parse("http://www.foo.com/mydoc.xml");
     * } catch (IOException e) {
     *   System.err.println("I/O exception reading XML document");
     * } catch (SAXException e) {
     *   System.err.println("XML exception reading document.");
     * }
     * 
* *

Implementors are free (and encouraged) to invent their own features, * using names built on their own URIs.

* * @param name The feature name, which is a fully-qualified URI. * @return The current value of the feature (true or false). * @exception org.xml.sax.SAXNotRecognizedException If the feature * value can't be assigned or retrieved. * @exception org.xml.sax.SAXNotSupportedException When the * XMLReader recognizes the feature name but * cannot determine its value at this time. * @see #setFeature */ public boolean getFeature (String name) throws SAXNotRecognizedException, SAXNotSupportedException { boolean ret; if (name.equals ("http://xml.org/sax/features/namespaces")) ret = mNameSpaces; else if (name.equals ("http://xml.org/sax/features/namespace-prefixes")) ret = mNameSpacePrefixes; else throw new SAXNotSupportedException (name + " not yet understood"); return (ret); } /** * Set the value of a feature flag. * *

The feature name is any fully-qualified URI. It is * possible for an XMLReader to expose a feature value but * to be unable to change the current value. * Some feature values may be immutable or mutable only * in specific contexts, such as before, during, or after * a parse.

* *

All XMLReaders are required to support setting * http://xml.org/sax/features/namespaces to true and * http://xml.org/sax/features/namespace-prefixes to false.

* * @param name The feature name, which is a fully-qualified URI. * @param value The requested value of the feature (true or false). * @exception org.xml.sax.SAXNotRecognizedException If the feature * value can't be assigned or retrieved. * @exception org.xml.sax.SAXNotSupportedException When the * XMLReader recognizes the feature name but * cannot set the requested value. * @see #getFeature */ public void setFeature (String name, boolean value) throws SAXNotRecognizedException, SAXNotSupportedException { if (name.equals ("http://xml.org/sax/features/namespaces")) mNameSpaces = value; else if (name.equals ("http://xml.org/sax/features/namespace-prefixes")) mNameSpacePrefixes = value; else throw new SAXNotSupportedException (name + " not yet understood"); } /** * Look up the value of a property. * *

The property name is any fully-qualified URI. It is * possible for an XMLReader to recognize a property name but * temporarily be unable to return its value. * Some property values may be available only in specific * contexts, such as before, during, or after a parse.

* *

XMLReaders are not required to recognize any specific * property names, though an initial core set is documented for * SAX2.

* *

Implementors are free (and encouraged) to invent their own properties, * using names built on their own URIs.

* * @param name The property name, which is a fully-qualified URI. * @return The current value of the property. * @exception org.xml.sax.SAXNotRecognizedException If the property * value can't be assigned or retrieved. * @exception org.xml.sax.SAXNotSupportedException When the * XMLReader recognizes the property name but * cannot determine its value at this time. * @see #setProperty */ public Object getProperty (String name) throws SAXNotRecognizedException, SAXNotSupportedException { throw new SAXNotSupportedException (name + " not yet understood"); } /** * Set the value of a property. * *

The property name is any fully-qualified URI. It is * possible for an XMLReader to recognize a property name but * to be unable to change the current value. * Some property values may be immutable or mutable only * in specific contexts, such as before, during, or after * a parse.

* *

XMLReaders are not required to recognize setting * any specific property names, though a core set is defined by * SAX2.

* *

This method is also the standard mechanism for setting * extended handlers.

* * @param name The property name, which is a fully-qualified URI. * @param value The requested value for the property. * @exception org.xml.sax.SAXNotRecognizedException If the property * value can't be assigned or retrieved. * @exception org.xml.sax.SAXNotSupportedException When the * XMLReader recognizes the property name but * cannot set the requested value. */ public void setProperty (String name, Object value) throws SAXNotRecognizedException, SAXNotSupportedException { throw new SAXNotSupportedException (name + " not yet understood"); } //////////////////////////////////////////////////////////////////// // Event handlers. //////////////////////////////////////////////////////////////////// /** * Allow an application to register an entity resolver. * *

If the application does not register an entity resolver, * the XMLReader will perform its own default resolution.

* *

Applications may register a new or different resolver in the * middle of a parse, and the SAX parser must begin using the new * resolver immediately.

* * @param resolver The entity resolver. * @see #getEntityResolver */ public void setEntityResolver (EntityResolver resolver) { mEntityResolver = resolver; } /** * Return the current entity resolver. * * @return The current entity resolver, or null if none * has been registered. * @see #setEntityResolver */ public EntityResolver getEntityResolver () { return (mEntityResolver); } /** * Allow an application to register a DTD event handler. * *

If the application does not register a DTD handler, all DTD * events reported by the SAX parser will be silently ignored.

* *

Applications may register a new or different handler in the * middle of a parse, and the SAX parser must begin using the new * handler immediately.

* * @param handler The DTD handler. * @see #getDTDHandler */ public void setDTDHandler (DTDHandler handler) { mDTDHandler = handler; } /** * Return the current DTD handler. * * @return The current DTD handler, or null if none * has been registered. * @see #setDTDHandler */ public DTDHandler getDTDHandler () { return (mDTDHandler); } /** * Allow an application to register a content event handler. * *

If the application does not register a content handler, all * content events reported by the SAX parser will be silently * ignored.

* *

Applications may register a new or different handler in the * middle of a parse, and the SAX parser must begin using the new * handler immediately.

* * @param handler The content handler. * @see #getContentHandler */ public void setContentHandler (ContentHandler handler) { mContentHandler = handler; } /** * Return the current content handler. * * @return The current content handler, or null if none * has been registered. * @see #setContentHandler */ public ContentHandler getContentHandler () { return (mContentHandler); } /** * Allow an application to register an error event handler. * *

If the application does not register an error handler, all * error events reported by the SAX parser will be silently * ignored; however, normal processing may not continue. It is * highly recommended that all SAX applications implement an * error handler to avoid unexpected bugs.

* *

Applications may register a new or different handler in the * middle of a parse, and the SAX parser must begin using the new * handler immediately.

* * @param handler The error handler. * @see #getErrorHandler */ public void setErrorHandler (ErrorHandler handler) { mErrorHandler = handler; } /** * Return the current error handler. * * @return The current error handler, or null if none * has been registered. * @see #setErrorHandler */ public ErrorHandler getErrorHandler () { return (mErrorHandler); } //////////////////////////////////////////////////////////////////// // Parsing. //////////////////////////////////////////////////////////////////// /** * Parse an XML document. * *

The application can use this method to instruct the XML * reader to begin parsing an XML document from any valid input * source (a character stream, a byte stream, or a URI).

* *

Applications may not invoke this method while a parse is in * progress (they should create a new XMLReader instead for each * nested XML document). Once a parse is complete, an * application may reuse the same XMLReader object, possibly with a * different input source. * Configuration of the XMLReader object (such as handler bindings and * values established for feature flags and properties) is unchanged * by completion of a parse, unless the definition of that aspect of * the configuration explicitly specifies other behavior. * (For example, feature flags or properties exposing * characteristics of the document being parsed.) *

* *

During the parse, the XMLReader will provide information * about the XML document through the registered event * handlers.

* *

This method is synchronous: it will not return until parsing * has ended. If a client application wants to terminate * parsing early, it should throw an exception.

* * @param input The input source for the top-level of the * XML document. * @exception org.xml.sax.SAXException Any SAX exception, possibly * wrapping another exception. * @exception java.io.IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. * @see org.xml.sax.InputSource * @see #parse(java.lang.String) * @see #setEntityResolver * @see #setDTDHandler * @see #setContentHandler * @see #setErrorHandler */ public void parse (InputSource input) throws IOException, SAXException { Locator locator; ParserFeedback feedback; if (null != mContentHandler) try { mParser = new Parser ( new Lexer ( new Page ( input.getByteStream (), input.getEncoding ()))); locator = new Locator (mParser); if (null != mErrorHandler) feedback = new Feedback (mErrorHandler, locator); else feedback = new DefaultParserFeedback (0); mParser.setFeedback (feedback); mContentHandler.setDocumentLocator (locator); try { mContentHandler.startDocument (); for (NodeIterator iterator = mParser.elements (); iterator.hasMoreNodes (); doSAX (iterator.nextNode ())); mContentHandler.endDocument (); } catch (SAXException se) { if (null != mErrorHandler) mErrorHandler.fatalError (new SAXParseException ( "contentHandler threw me", locator, se)); } } catch (ParserException pe) { if (null != mErrorHandler) mErrorHandler.fatalError (new SAXParseException ( pe.getMessage (), "", "", 0, 0)); } } /** * Parse an XML document from a system identifier (URI). * *

This method is a shortcut for the common case of reading a * document from a system identifier. It is the exact * equivalent of the following:

* *
     * parse(new InputSource(systemId));
     * 
* *

If the system identifier is a URL, it must be fully resolved * by the application before it is passed to the parser.

* * @param systemId The system identifier (URI). * @exception org.xml.sax.SAXException Any SAX exception, possibly * wrapping another exception. * @exception java.io.IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. * @see #parse(org.xml.sax.InputSource) */ public void parse (String systemId) throws IOException, SAXException { Locator locator; ParserFeedback feedback; if (null != mContentHandler) try { mParser = new Parser (systemId); locator = new Locator (mParser); if (null != mErrorHandler) feedback = new Feedback (mErrorHandler, locator); else feedback = new DefaultParserFeedback (DefaultParserFeedback.QUIET); mParser.setFeedback (feedback); // OK, try a simplistic parse mContentHandler.setDocumentLocator (locator); try { mContentHandler.startDocument (); for (NodeIterator iterator = mParser.elements (); iterator.hasMoreNodes (); ) doSAX (iterator.nextNode ()); mContentHandler.endDocument (); } catch (SAXException se) { if (null != mErrorHandler) mErrorHandler.fatalError ( new SAXParseException ("contentHandler threw me", locator, se)); } } catch (ParserException pe) { if (null != mErrorHandler) mErrorHandler.fatalError ( new SAXParseException (pe.getMessage (), "", systemId, 0, 0)); } } /** * Process nodes recursively on the DocumentHandler. * Calls methods on the handler based on the type and whether it's an end tag. * Processes composite tags recursively. * Does rudimentary namespace processing according to the state of {@link #mNameSpaces} * and {@link #mNameSpacePrefixes}. * @param node The htmlparser node to traverse. * @exception ParserException If a parse error occurs. * @exception SAXException If a SAX error occurs. 
*/ protected void doSAX (Node node) throws ParserException, SAXException { Tag tag; Tag end; if (node instanceof Remark) { String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ()); mContentHandler.ignorableWhitespace (text.toCharArray (), 0, text.length ()); } else if (node instanceof Text) { String text = mParser.getLexer ().getPage ().getText (node.getStartPosition (), node.getEndPosition ()); mContentHandler.characters (text.toCharArray (), 0, text.length ()); } else if (node instanceof Tag) { tag = (Tag)node; if (mNameSpaces) mSupport.processName (tag.getTagName (), mParts, false); else { mParts[0] = ""; mParts[1] = ""; } if (mNameSpacePrefixes) mParts[2] = tag.getTagName (); else if (mNameSpaces) mParts[2] = ""; else mParts[2] = tag.getTagName (); mContentHandler.startElement ( mParts[0], // uri mParts[1], // local mParts[2], // raw new Attributes (tag, mSupport, mParts)); NodeList children = tag.getChildren (); if (null != children) for (int i = 0; i < children.size (); i++) doSAX (children.elementAt (i)); end = tag.getEndTag (); if (null != end) { if (mNameSpaces) mSupport.processName (end.getTagName (), mParts, false); else { mParts[0] = ""; mParts[1] = ""; } if (mNameSpacePrefixes) mParts[2] = end.getTagName (); else if (mNameSpaces) mParts[2] = ""; else mParts[2] = end.getTagName (); mContentHandler.endElement ( mParts[0], // uri mParts[1], // local mParts[2]); // raw } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy