// com.sun.xml.xhtml.HtmlSaxParser Maven / Gradle / Ivy
/*
* $Id: HtmlSaxParser.java,v 1.3 1999/04/04 19:34:43 db Exp $
*
* Copyright (c) 1999 Sun Microsystems, Inc. All Rights Reserved.
*
* This software is the confidential and proprietary information of Sun
* Microsystems, Inc. ("Confidential Information"). You shall not
* disclose such Confidential Information and shall use it only in
* accordance with the terms of the license agreement you entered into
* with Sun.
*
* SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
* SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
* PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR ANY DAMAGES
* SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
* THIS SOFTWARE OR ITS DERIVATIVES.
*/
package com.sun.xml.xhtml;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Locale;
import javax.swing.text.BadLocationException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.*;
import org.xml.sax.*;
import org.xml.sax.helpers.AttributeListImpl;
import com.sun.xml.parser.LexicalEventListener;
import com.sun.xml.parser.Resolver;
// debug only:
// import com.sun.xml.tree.XmlDocumentBuilder;
/**
 * This is a SAX parser which processes HTML, producing an event stream
 * corresponding roughly to what would be produced when parsing a
 * well formed (but not in general valid) XHTML document.  It may be
 * useful when beginning to convert HTML content into (valid) XHTML
 * content, or when using DOM APIs to manipulate such HTML content.
 *
 * <P> The parser uses the HTML parser built into the SWING library, and
 * converts the events it reports into SAX event callbacks.  At this writing,
 * that parser understands HTML 3.2, a number of HTML 4.0 constructs at
 * least in part, and recovers from many common HTML errors in a manner
 * which is compatible with the handling of popular web browsers.
 *
 * <P> Note that the case of element and attribute names exposed by
 * this parser is always normalized to lower case.  This policy
 * is the one adopted by current W3C working drafts of XHTML.
 * It differs from that adopted by the Level 1 DOM HTML support, which
 * normalizes element names to uppercase and uppercases only the initial
 * letter of attribute names.
 *
 * <P> Also, note that when writing out HTML, you may want to avoid
 * character encodings such as UTF-8, since HTML processors are less
 * consistent in their support of internationalized text than are XML
 * processors.  ISO-8859-1 is safe, and newer HTML browsers tend to
 * handle other character encodings better than older ones.
 *
 * <P> An instance keeps per-parse state (the "disabled" flag and a single
 * reused attribute list) in unsynchronized fields, so it should only be
 * used for one document at a time.  The attribute list handed to
 * <code>startElement</code> is overwritten by the next element; handlers
 * that need the values later must copy them.
 *
 * @see com.sun.xml.parser.LexicalEventListener
 * @see com.sun.xml.tree.XmlDocumentBuilder
 * @see javax.swing.text.html.parser.ParserDelegator
 *
 * @author David Brownell
 * @version $Revision: 1.3 $
 */
public class HtmlSaxParser
    extends ParserCallback
    implements org.xml.sax.Parser
{
    /** The Swing HTML parser that does the real work. */
    private ParserDelegator         parser;

    /** Scratch attribute list, cleared and refilled for every element. */
    private AttributeListImpl       attributes;

    /**
     * Set once a handler exception has been deferred; while set, the
     * Swing callbacks stop forwarding events.  Cleared by each parse.
     */
    private boolean                 disabled;

    private DocumentHandler         docHandler;
    private DTDHandler              dtdHandler;
    private ErrorHandler            errHandler;
    private EntityResolver          resolver;
    private Locale                  locale;

    /** Non-null only when the document handler also listens for comments. */
    private LexicalEventListener    lexicalHandler;


    /**
     * Constructs a SAX parser accessing the HTML parser built in to
     * the Swing subsystem.  All four SAX handlers start out as a single
     * shared <code>HandlerBase</code> instance.
     */
    public HtmlSaxParser ()
    {
        HandlerBase base = new HandlerBase ();

        docHandler = base;
        dtdHandler = base;
        errHandler = base;
        resolver = base;

        parser = new ParserDelegator ();
        attributes = new AttributeListImpl ();
    }


    // SAX parser methods

    /**
     * Parses the HTML document provided through the specified
     * input source.
     *
     * <P> The input is taken from, in order of preference: the source's
     * character stream; its byte stream, decoded with the source's declared
     * encoding or "8859_1" when none is declared; or its system ID, which
     * is opened as a URL through <code>Resolver.createInputSource</code>.
     *
     * <P> The document handler sees <code>startDocument</code> first and,
     * when the parse completes without a deferred exception,
     * <code>endDocument</code> last.
     *
     * @param in describes where the HTML text comes from
     * @exception SAXException if the source names no input at all, or if
     *  one of the registered handlers threw this exception from a callback;
     *  a <code>SAXParseException</code> is also reported to the error
     *  handler's <code>fatalError</code> before being rethrown
     * @exception IOException if the input cannot be opened or read
     */
    public void parse (InputSource in) throws SAXException, IOException
    {
        Reader  reader = in.getCharacterStream ();

        if (reader == null) {
            InputStream input = in.getByteStream ();

            //
            // XXX must check the character set handling here,
            // along both paths !!!!
            //
            if (input != null) {
                String  encoding = in.getEncoding ();

                if (encoding == null) {
                    encoding = "8859_1";
                }
                reader = new InputStreamReader (input, encoding);

            } else if (in.getSystemId () != null) {
                in = Resolver.createInputSource (
                        new java.net.URL (in.getSystemId ()), false);
                reader = in.getCharacterStream ();

            } else {
                throw new SAXException (
                        "Bad InputSource: no Reader, InputStream, or URI");
            }
        }

        // A parse that ended with a deferred exception leaves this flag
        // set; without the reset, every later parse on this instance
        // would silently drop all of its content events.
        disabled = false;

        docHandler.startDocument ();
        try {
            parser.parse (reader, this, true);
        } catch (DeferredException x) {
            if (x.ioe != null) {
                throw x.ioe;
            }
            if (x.x instanceof SAXParseException) {
                errHandler.fatalError ((SAXParseException) x.x);
            }
            throw x.x;
        }
        docHandler.endDocument ();
    }

    /**
     * Parses the HTML document provided at the specified URI.
     *
     * @param uri system ID of the document; wrapped in an
     *  <code>InputSource</code> and handed to the other parse method
     * @exception SAXException as for the other parse method
     * @exception IOException as for the other parse method
     */
    public void parse (String uri) throws SAXException, IOException
    {
        parse (new InputSource (uri));
    }

    /**
     * Assigns the document handler through which all HTML content will
     * be reported.  This is the primary application interface to this
     * parser.
     *
     * <P> If this handler implements the lexical event listener interface,
     * it will also be notified of comments.  Otherwise comments are
     * dropped.
     *
     * @param handler receives the document events
     */
    public void setDocumentHandler (DocumentHandler handler)
    {
        docHandler = handler;
        if (handler instanceof LexicalEventListener) {
            lexicalHandler = (LexicalEventListener) handler;
        } else {
            lexicalHandler = null;
        }
    }

    /**
     * Not useful with any HTML parser, since HTML does not use
     * notations or unparsed entities.  The handler is stored but
     * this class never calls it.
     */
    public void setDTDHandler (DTDHandler handler)
    {
        dtdHandler = handler;
    }

    /**
     * Not useful with this HTML parser, since it does not read
     * doctype declarations.  The resolver is stored but this class
     * never calls it.
     */
    public void setEntityResolver (EntityResolver resolver)
    {
        this.resolver = resolver;
    }

    /**
     * Provides the error handler used to report fatal errors
     * and warnings.
     *
     * @param handler receives the Swing parser's diagnostics as warnings,
     *  and any <code>SAXParseException</code> that ends a parse as a
     *  fatal error
     */
    public void setErrorHandler (ErrorHandler handler)
    {
        errHandler = handler;
    }

    /**
     * Not useful with this HTML parser, since no control over the
     * language of its diagnostics is provided by the Java runtime.
     * The locale is stored but otherwise unused.
     */
    public void setLocale (Locale locale)
    {
        this.locale = locale;
    }


    // Swing HTML ParserCallback methods

    /**
     * Not intended for application use.
     * This is a Swing HTML ParserCallback method.  Nothing is buffered
     * by this class, so there is nothing to flush.
     */
    public void flush () throws BadLocationException
    {
    }

    /**
     * Not intended for application use.
     * This is a Swing HTML ParserCallback method.  The whole array is
     * reported as one <code>characters</code> event; the position is
     * not used.
     */
    public void handleText (char data [], int pos)
    {
        if (disabled) {
            return;
        }

        try {
            docHandler.characters (data, 0, data.length);
        } catch (SAXException e) {
            throw defer (e);
        }
    }

    /**
     * Not intended for application use.
     * This is the Swing HTML ParserCallback hook for comments; it hands
     * the comment to {@link #comment}, which does the reporting.
     */
    public void handleComment (char data [], int pos)
    {
        comment (data, pos);
    }

    /**
     * Not intended for application use.
     * Reports a comment to the lexical event listener, if the current
     * document handler is one; otherwise the comment is dropped.
     * Retained under this name for compatibility; the Swing parser
     * reaches it through {@link #handleComment}.
     */
    public void comment (char data [], int pos)
    {
        if (disabled || lexicalHandler == null) {
            return;
        }

        try {
            lexicalHandler.comment (new String (data));
        } catch (SAXException e) {
            throw defer (e);
        }
    }

    /**
     * Not intended for application use.
     * This is a Swing HTML ParserCallback method.  Reported as a
     * <code>startElement</code> event named after the tag.
     */
    public void handleStartTag (
        Tag                     tag,
        MutableAttributeSet     attributes,
        int                     pos
    ) {
        if (disabled) {
            return;
        }

        try {
            docHandler.startElement (tag.toString (),
                    toAttributeList (attributes));
        } catch (SAXException e) {
            throw defer (e);
        }
    }

    /**
     * Not intended for application use.
     * This is a Swing HTML ParserCallback method.  Reported as an
     * <code>endElement</code> event named after the tag.
     */
    public void handleEndTag (
        Tag                     tag,
        int                     pos
    ) {
        if (disabled) {
            return;
        }

        try {
            docHandler.endElement (tag.toString ());
        } catch (SAXException e) {
            throw defer (e);
        }
    }

    /**
     * Not intended for application use.
     * This is a Swing HTML ParserCallback method.  A simple tag is
     * reported as a <code>startElement</code> immediately followed by
     * the matching <code>endElement</code>.
     */
    public void handleSimpleTag (
        Tag                     tag,
        MutableAttributeSet     attributes,
        int                     pos
    ) {
        if (disabled) {
            return;
        }

        try {
            String      tagName = tag.toString ();

            docHandler.startElement (tagName, toAttributeList (attributes));
            docHandler.endElement (tagName);
        } catch (SAXException e) {
            throw defer (e);
        }
    }

    /**
     * Not intended for application use.
     * This is a Swing HTML ParserCallback method.  The diagnostic is
     * passed to the error handler as a warning carrying no public ID,
     * system ID, line or column information.
     */
    public void handleError (String diagnostic, int pos)
    {
        // Like every other callback: once one exception has been
        // deferred, stay quiet so a second one can never be thrown.
        if (disabled) {
            return;
        }

        try {
            errHandler.warning (new SAXParseException (
                    diagnostic,
                    null, null,
                    -1, -1
                    ));
        } catch (SAXException e) {
            throw defer (e);
        }
    }


    //
    // Convert Swing's model of an attribute set to SAX's.  The one
    // shared list is cleared and refilled on every call; each attribute
    // is reported with type "CDATA", and both names and values are the
    // toString() form of what Swing supplied.
    //
    private AttributeListImpl toAttributeList (MutableAttributeSet attrs)
    {
        attributes.clear ();

        for (Enumeration names = attrs.getAttributeNames ();
                names.hasMoreElements ();
                ) {
            Object      key = names.nextElement ();

            attributes.addAttribute (
                    key.toString (),
                    "CDATA",
                    attrs.getAttribute (key).toString ());
        }
        return attributes;
    }

    //
    // Wraps a handler's exception so it can cross the Swing parser
    // (whose callbacks declare no checked exceptions), and switches
    // off further event reporting.  The caller throws the result.
    //
    private DeferredException defer (SAXException e)
    {
        DeferredException       deferred = new DeferredException ();

        deferred.x = e;
        disabled = true;
        return deferred;
    }


    //
    // We wrap an exception (only one!) in this class and throw them.
    // The HotJava parser seems to continue reporting events after we
    // throw ... so remember to disable ongoing reporting!!
    //
    static class DeferredException extends RuntimeException
    {
        // Exception thrown by a SAX handler from inside a callback.
        SAXException    x;

        // I/O failure to rethrow from parse; nothing in this class
        // assigns it at present.
        IOException     ioe;
    }

    /*
    // TESTING ONLY
    public static void main (String argv [])
    {
        HtmlSaxParser parser = new HtmlSaxParser ();
        XmlDocumentBuilder builder = new XmlDocumentBuilder ();
        try {
            parser.setDocumentHandler (builder);
            parser.parse (argv [0]);
            builder.getDocument ().write (System.out);
        } catch (Throwable t) {
            t.printStackTrace ();
        }
    }
    */
}