All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sun.xml.xhtml.HtmlSaxParser Maven / Gradle / Ivy

The newest version!
/*
 * $Id: HtmlSaxParser.java,v 1.3 1999/04/04 19:34:43 db Exp $
 * 
 * Copyright (c) 1999 Sun Microsystems, Inc. All Rights Reserved.
 * 
 * This software is the confidential and proprietary information of Sun
 * Microsystems, Inc. ("Confidential Information").  You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Sun.
 * 
 * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
 * SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR ANY DAMAGES
 * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
 * THIS SOFTWARE OR ITS DERIVATIVES.
 */

package com.sun.xml.xhtml;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;

import java.util.Enumeration;
import java.util.Locale;

import javax.swing.text.BadLocationException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.*;

import org.xml.sax.*;
import org.xml.sax.helpers.AttributeListImpl;

import com.sun.xml.parser.LexicalEventListener;
import com.sun.xml.parser.Resolver;

// debug only:
// import com.sun.xml.tree.XmlDocumentBuilder;


/**
 * This is a SAX parser which processes HTML, producing an event stream
 * corresponding roughly to what would be produced when parsing a
 * well formed (but not in general valid) XHTML document.  It may be
 * useful when beginning to convert HTML content into (valid) XHTML
 * content, or when using DOM APIs to manipulate such HTML content.
 *
 * 

The parser uses the HTML parser built into the SWING library, and * converts the events it reports into SAX event callbacks. At this writing, * that parser understands HTML 3.2, a number of HTML 4.0 constructs at * least in part, and recovers from many common HTML errors in a manner * which is compatible with the handling of popular web browsers. * *

Note that the case of element and attribute names exposed by * this parser is always normalized to lower case. This policy * is the one adopted by current W3C working drafts of XHTML. * It differs from that adopted by the Level 1 DOM HTML support, which * normalizes element names to uppercase and uppercases only the initial * letter of attribute names. * *

Also, note that when writing out HTML, you may want to avoid * character encodings such as UTF-8, since HTML processors are less * consistent in their support of internationalized text than are XML * processors. ISO-8859-1 is safe, and newer HTML browsers tend to * handle other character encodings better than older ones. * * @see com.sun.xml.parser.LexicalEventListener * @see com.sun.xml.tree.XmlDocumentBuilder * @see javax.swing.text.html.parser.ParserDelegator * * @author David Brownell * @version $Revision: 1.3 $ */ public class HtmlSaxParser extends ParserCallback implements org.xml.sax.Parser { private ParserDelegator parser; private AttributeListImpl attributes; private boolean disabled; private DocumentHandler docHandler; private DTDHandler dtdHandler; private ErrorHandler errHandler; private EntityResolver resolver; private Locale locale; private LexicalEventListener lexicalHandler; /** * Constructs a SAX parser accessing the HTML parser built in to * the Swing subsystem. */ public HtmlSaxParser () { HandlerBase base = new HandlerBase (); docHandler = base; dtdHandler = base; errHandler = base; resolver = base; parser = new ParserDelegator (); attributes = new AttributeListImpl (); } // SAX parser methods /** * Parses the HTML document provided through the specified * input source. */ public void parse (InputSource in) throws SAXException, IOException { Reader reader = in.getCharacterStream (); if (reader == null) { InputStream input = in.getByteStream (); // // XXX must check the character set handling here, // along both paths !!!! // if (input != null) { String encoding = in.getEncoding (); if (encoding == null) encoding = "8859_1"; reader = new InputStreamReader (input, encoding); } else if (in.getSystemId () != null) { in = Resolver.createInputSource ( new java.net.URL (in.getSystemId ()), false); reader = in.getCharacterStream (); } else throw new SAXException ( "Bad InputSource: no Reader, InputStream, or URI"); } docHandler.startDocument (); try { parser.parse (reader, this, true); } catch (DeferredException x) { if (x.ioe != null) throw x.ioe; if (x.x instanceof SAXParseException) errHandler.fatalError ((SAXParseException) x.x); throw x.x; } docHandler.endDocument (); } /** * Parses the HTML document provided at the specified URI. */ public void parse (String uri) throws SAXException, IOException { parse (new InputSource (uri)); } /** * Assigns the document handler through which all HTML content will * be reported. This is the primary application interface to this * parser. * *

If this handler implements the lexical event listener interface, * it will also be notified of comments. */ public void setDocumentHandler (DocumentHandler handler) { docHandler = handler; if (handler instanceof LexicalEventListener) lexicalHandler = (LexicalEventListener) handler; else lexicalHandler = null; } /** * Not useful with any HTML parser, since HTML does not use * notations or unparsed entities. */ public void setDTDHandler (DTDHandler handler) { dtdHandler = handler; } /** * Not useful with this HTML parser, since it does not read * doctype declarations. */ public void setEntityResolver (EntityResolver resolver) { this.resolver = resolver; } /** * Provides the error handler used to report fatal errors * and warnings. */ public void setErrorHandler (ErrorHandler handler) { errHandler = handler; } /** * Not useful with this HTML parser, since no control over the * language of its diagnostics is provided by the Java runtime. */ public void setLocale (Locale locale) { this.locale = locale; } // Swing HTML ParserCallback methods /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void flush () throws BadLocationException { } /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void handleText (char data [], int pos) { if (disabled) return; try { // System.out.println ("text"); docHandler.characters (data, 0, data.length); } catch (SAXException e) { DeferredException x = new DeferredException (); x.x = e; disabled = true; throw x; } } /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void comment (char data [], int pos) { if (disabled || lexicalHandler == null) return; try { // System.out.println ("comment"); lexicalHandler.comment (new String (data)); } catch (SAXException e) { DeferredException x = new DeferredException (); x.x = e; disabled = true; throw x; } } /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void handleStartTag ( Tag tag, MutableAttributeSet attributes, int pos ) { if (disabled) return; try { // System.out.println ("start: " + tag.toString ()); docHandler.startElement (tag.toString (), toAttributeList (attributes)); } catch (SAXException e) { DeferredException x = new DeferredException (); x.x = e; disabled = true; throw x; } } /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void handleEndTag ( Tag tag, int pos ) { if (disabled) return; try { // System.out.println ("end: " + tag.toString ()); docHandler.endElement (tag.toString ()); } catch (SAXException e) { DeferredException x = new DeferredException (); x.x = e; disabled = true; throw x; } } /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void handleSimpleTag ( Tag tag, MutableAttributeSet attributes, int pos ) { if (disabled) return; try { String tagName = tag.toString (); // System.out.println ("start simple: " + tagName); docHandler.startElement (tagName, toAttributeList (attributes)); // System.out.println ("end simple: " + tagName); docHandler.endElement (tagName); } catch (SAXException e) { DeferredException x = new DeferredException (); x.x = e; disabled = true; throw x; } } /** * Not intended for application use. * This is a Swing HTML ParserCallback method. */ public void handleError (String diagnostic, int pos) { try { // System.out.println ("ERR: " + diagnostic); errHandler.warning (new SAXParseException ( diagnostic, null, null, -1, -1 )); } catch (SAXException e) { DeferredException x = new DeferredException (); x.x = e;; disabled = true; throw x; } } // // Convert Swing's model of an attribute set to SAX's. // private AttributeListImpl toAttributeList (MutableAttributeSet attrs) { attributes.clear (); if (attrs.getAttributeCount () != 0) { for (Enumeration e = attrs.getAttributeNames (); e.hasMoreElements (); ) { Object key = e.nextElement (); String name = key.toString (); String value = attrs.getAttribute (key).toString (); attributes.addAttribute (name, "CDATA", value); } } return attributes; } // // We wrap an exception (only one!) in this class and throw them. // The HotJava parser seems to continue reporting events after we // throw ... so remember to disable ongoing reporting!! // static class DeferredException extends RuntimeException { SAXException x; IOException ioe; } /* // TESTING ONLY public static void main (String argv []) { HtmlSaxParser parser = new HtmlSaxParser (); XmlDocumentBuilder builder = new XmlDocumentBuilder (); try { parser.setDocumentHandler (builder); parser.parse (argv [0]); builder.getDocument ().write (System.out); } catch (Throwable t) { t.printStackTrace (); } } */ }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy