All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.evpull.StaxToEventBridge Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015 Saxonica Limited.
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.evpull;

import net.sf.saxon.Configuration;
import net.sf.saxon.event.NamespaceReducer;
import net.sf.saxon.event.PipelineConfiguration;
import net.sf.saxon.expr.parser.ExplicitLocation;
import net.sf.saxon.om.FingerprintedQName;
import net.sf.saxon.om.NamePool;
import net.sf.saxon.om.NoNamespaceName;
import net.sf.saxon.pull.UnparsedEntity;
import net.sf.saxon.serialize.XMLEmitter;
import net.sf.saxon.trans.SaxonErrorCode;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.tree.tiny.CharSlice;
import net.sf.saxon.tree.util.Orphan;
import net.sf.saxon.type.Type;
import net.sf.saxon.type.Untyped;
import net.sf.saxon.value.Whitespace;

import javax.xml.stream.*;
import javax.xml.stream.events.EntityDeclaration;
import javax.xml.transform.SourceLocator;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/**
 * This class implements the Saxon EventIterator API on top of a standard StAX parser
 * (or any other StAX XMLStreamReader implementation)
 */

public class StaxToEventBridge implements EventIterator, SourceLocator {

    private Configuration config;
    private XMLStreamReader reader;
    private PipelineConfiguration pipe;
    /*@Nullable*/ private List unparsedEntities = null;
    PullEvent currentEvent = null;
    int depth = 0;
    boolean ignoreIgnorable = false;

    /**
     * Create a new instance of the class
     */

    public StaxToEventBridge() {

    }

    /**
     * Supply an input stream containing XML to be parsed. A StAX parser is created using
     * the JAXP XMLInputFactory.
     *
     * @param systemId    The Base URI of the input document
     * @param inputStream the stream containing the XML to be parsed
     * @throws net.sf.saxon.trans.XPathException
     *          if an error occurs creating the StAX parser
     */

    public void setInputStream(String systemId, InputStream inputStream) throws XPathException {
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            //XMLInputFactory factory = new WstxInputFactory();
            factory.setXMLReporter(new StaxErrorReporter());
            reader = factory.createXMLStreamReader(systemId, inputStream);
        } catch (XMLStreamException e) {
            throw new XPathException(e);
        }
    }

    /**
     * Supply an XMLStreamReader: the events reported by this XMLStreamReader will be translated
     * into EventIterator events
     *
     * @param reader the supplier of XML events, typically an XML parser
     */

    public void setXMLStreamReader(XMLStreamReader reader) {
        this.reader = reader;
    }

    /**
     * Set configuration information. This must only be called before any events
     * have been read.
     *
     * @param pipe the pipeline configuration
     */

    public void setPipelineConfiguration(PipelineConfiguration pipe) {
        this.pipe = new PipelineConfiguration(pipe);
        config = pipe.getConfiguration();
        ignoreIgnorable = config.getStripsWhiteSpace() != Whitespace.NONE;
    }

    /**
     * Get configuration information.
     *
     * @return the pipeline configuration
     */

    public PipelineConfiguration getPipelineConfiguration() {
        return pipe;
    }

    /**
     * Get the XMLStreamReader used by this StaxBridge. This is available only after
     * setInputStream() or setXMLStreamReader() has been called
     *
     * @return the instance of XMLStreamReader allocated when setInputStream() was called,
     *         or the instance supplied directly to setXMLStreamReader()
     */

    public XMLStreamReader getXMLStreamReader() {
        return reader;
    }

    /**
     * Get the name pool
     *
     * @return the name pool
     */

    public NamePool getNamePool() {
        return pipe.getConfiguration().getNamePool();
    }

    /**
     * Get the next event
     *
     * @return the next event; or null to indicate the end of the event stream
     */

    public PullEvent next() throws XPathException {
        if (currentEvent == null) {
            // StAX isn't reporting START_DOCUMENT so we supply it ourselves
            currentEvent = StartDocumentEvent.getInstance();
            return currentEvent;
        }
        if (currentEvent instanceof EndDocumentEvent) {
            try {
                reader.close();
            } catch (XMLStreamException e) {
                //
            }
            return null;
        }
        try {
            if (reader.hasNext()) {
                int event = reader.next();
                //System.err.println("Read event " + event);
                currentEvent = translate(event);
            } else {
                currentEvent = null;
            }
        } catch (XMLStreamException e) {
            String message = e.getMessage();
            // Following code recognizes the messages produced by the Sun Zephyr parser
            if (message.startsWith("ParseError at")) {
                int c = message.indexOf("\nMessage: ");
                if (c > 0) {
                    message = message.substring(c + 10);
                }
            }
            XPathException err = new XPathException("Error reported by XML parser: " + message);
            err.setErrorCode(SaxonErrorCode.SXXP0003);
            err.setLocator(translateLocation(e.getLocation()));
            throw err;
        }
        return currentEvent;
    }

    /**
     * Translate a StAX event into a Saxon PullEvent
     *
     * @param event the StAX event
     * @return the Saxon PullEvent
     * @throws XPathException
     */

    private PullEvent translate(int event) throws XPathException {
        //System.err.println("EVENT " + event);
        switch (event) {
            case XMLStreamConstants.ATTRIBUTE:
                return next();          // attributes are reported as part of StartElement
            case XMLStreamConstants.CDATA:
            case XMLStreamConstants.CHARACTERS:
                if (depth == 0 && reader.isWhiteSpace()) {
                    return next();
                } else {
                    Orphan o = new Orphan(config);
                    o.setNodeKind(Type.TEXT);
                    CharSlice value = new CharSlice(
                            reader.getTextCharacters(), reader.getTextStart(), reader.getTextLength());
                    o.setStringValue(value);
                    return o;
                }
            case XMLStreamConstants.COMMENT: {
                Orphan o = new Orphan(config);
                o.setNodeKind(Type.COMMENT);
                CharSlice value = new CharSlice(
                        reader.getTextCharacters(), reader.getTextStart(), reader.getTextLength());
                o.setStringValue(value);
                return o;
            }
            case XMLStreamConstants.DTD:
                unparsedEntities = (List) reader.getProperty("javax.xml.stream.entities");
                return next();
            case XMLStreamConstants.END_DOCUMENT:
                return EndDocumentEvent.getInstance();
            case XMLStreamConstants.END_ELEMENT:
                depth--;
                return EndElementEvent.getInstance();
            case XMLStreamConstants.ENTITY_DECLARATION:
                return next();
            case XMLStreamConstants.ENTITY_REFERENCE:
                return next();
            case XMLStreamConstants.NAMESPACE:
                return next();      // namespaces are reported as part of StartElement
            case XMLStreamConstants.NOTATION_DECLARATION:
                return next();
            case XMLStreamConstants.PROCESSING_INSTRUCTION: {
                Orphan o = new Orphan(config);
                o.setNodeKind(Type.PROCESSING_INSTRUCTION);
                String local = reader.getPITarget();
                o.setNodeName(new NoNamespaceName(local));
                o.setStringValue(reader.getText());
                return o;
            }
            case XMLStreamConstants.SPACE:
                if (depth == 0) {
                    return next();
                } else if (ignoreIgnorable) {
                    // (Brave attempt, but Woodstox doesn't seem to report ignorable whitespace)
                    return next();
                } else {
                    Orphan o = new Orphan(config);
                    o.setNodeKind(Type.TEXT);
                    o.setStringValue(reader.getText());
                    return o;
                }
            case XMLStreamConstants.START_DOCUMENT:
                return next();  // we supplied the START_DOCUMENT ourselves
            case XMLStreamConstants.START_ELEMENT:
                depth++;
                StartElementEvent see = new StartElementEvent(pipe);
                String elocal = reader.getLocalName();
                String euri = reader.getNamespaceURI();
                String eprefix = reader.getPrefix();
                if (eprefix == null) {
                    eprefix = "";
                }
                if (euri == null) {
                    euri = "";
                }
                see.setElementName(new FingerprintedQName(eprefix, euri, elocal));
                see.setTypeCode(Untyped.getInstance());
                int attCount = reader.getAttributeCount();
                for (int index = 0; index < attCount; index++) {
                    String local = reader.getAttributeLocalName(index);
                    String uri = reader.getAttributeNamespace(index);
                    String prefix = reader.getAttributePrefix(index);
                    if (prefix == null) {
                        prefix = "";
                    }
                    if (uri == null) {
                        uri = "";
                    }
                    Orphan o = new Orphan(config);
                    o.setNodeKind(Type.ATTRIBUTE);
                    o.setNodeName(new FingerprintedQName(prefix, uri, local));
                    o.setStringValue(reader.getAttributeValue(index));
                    see.addAttribute(o);
                }
                see.namespaceFixup();
                return see;
            default:
                throw new IllegalStateException("Unknown StAX event " + event);


        }
    }


    /**
     * Return the public identifier for the current document event.
     * 

*

The return value is the public identifier of the document * entity or of the external parsed entity in which the markup * triggering the event appears.

* * @return A string containing the public identifier, or * null if none is available. * @see #getSystemId */ public String getPublicId() { return reader.getLocation().getPublicId(); } /** * Return the system identifier for the current document event. *

*

The return value is the system identifier of the document * entity or of the external parsed entity in which the markup * triggering the event appears.

*

*

If the system identifier is a URL, the parser must resolve it * fully before passing it to the application. For example, a file * name must always be provided as a file:... URL, and other * kinds of relative URI are also resolved against their bases.

* * @return A string containing the system identifier, or null * if none is available. * @see #getPublicId */ public String getSystemId() { return reader.getLocation().getSystemId(); } /** * Return the line number where the current document event ends. * Lines are delimited by line ends, which are defined in * the XML specification. *

*

Warning: The return value from the method * is intended only as an approximation for the sake of diagnostics; * it is not intended to provide sufficient information * to edit the character content of the original XML document. * In some cases, these "line" numbers match what would be displayed * as columns, and in others they may not match the source text * due to internal entity expansion.

*

*

The return value is an approximation of the line number * in the document entity or external parsed entity where the * markup triggering the event appears.

*

*

If possible, the SAX driver should provide the line position * of the first character after the text associated with the document * event. The first line is line 1.

* * @return The line number, or -1 if none is available. * @see #getColumnNumber */ public int getLineNumber() { return reader.getLocation().getLineNumber(); } /** * Return the column number where the current document event ends. * This is one-based number of Java char values since * the last line end. *

*

Warning: The return value from the method * is intended only as an approximation for the sake of diagnostics; * it is not intended to provide sufficient information * to edit the character content of the original XML document. * For example, when lines contain combining character sequences, wide * characters, surrogate pairs, or bi-directional text, the value may * not correspond to the column in a text editor's display.

*

*

The return value is an approximation of the column number * in the document entity or external parsed entity where the * markup triggering the event appears.

*

*

If possible, the SAX driver should provide the line position * of the first character after the text associated with the document * event. The first column in each line is column 1.

* * @return The column number, or -1 if none is available. * @see #getLineNumber */ public int getColumnNumber() { return reader.getLocation().getColumnNumber(); } public String getSystemId(int locationId) { return getSystemId(); } public int getLineNumber(int locationId) { return getLineNumber(); } public int getColumnNumber(int locationId) { return getColumnNumber(); } /** * Get a list of unparsed entities. * * @return a list of unparsed entities, or null if the information is not available, or * an empty list if there are no unparsed entities. Each item in the list will * be an instance of {@link net.sf.saxon.pull.UnparsedEntity} */ public List getUnparsedEntities() { if (unparsedEntities == null) { return null; } List list = new ArrayList(unparsedEntities.size()); for (int i = 0; i < unparsedEntities.size(); i++) { Object ent = unparsedEntities.get(i); String name = null; String systemId = null; String publicId = null; String baseURI = null; if (ent instanceof EntityDeclaration) { // This is what we would expect from the StAX API spec EntityDeclaration ed = (EntityDeclaration) ent; name = ed.getName(); systemId = ed.getSystemId(); publicId = ed.getPublicId(); baseURI = ed.getBaseURI(); } else if (ent.getClass().getName().equals("com.ctc.wstx.ent.UnparsedExtEntity")) { // Woodstox 3.0.0 returns this: use introspection to get the data we need try { Class woodstoxClass = ent.getClass(); Class[] noArgs = new Class[0]; Method method = woodstoxClass.getMethod("getName", noArgs); name = (String) method.invoke(ent, (Object[]) noArgs); method = woodstoxClass.getMethod("getSystemId", noArgs); systemId = (String) method.invoke(ent, (Object[]) noArgs); method = woodstoxClass.getMethod("getPublicId", noArgs); publicId = (String) method.invoke(ent, (Object[]) noArgs); method = woodstoxClass.getMethod("getBaseURI", noArgs); baseURI = (String) method.invoke(ent, (Object[]) noArgs); } catch (NoSuchMethodException e) { // } catch (IllegalAccessException e) { // } catch (InvocationTargetException e) { // } } if (name != null) { try { systemId = new URI(baseURI).resolve(systemId).toString(); } catch (URISyntaxException err) { // } UnparsedEntity ue = new UnparsedEntity(); ue.setName(name); ue.setSystemId(systemId); ue.setPublicId(publicId); ue.setBaseURI(baseURI); list.add(ue); } } return list; } /** * Translate a StAX Location object to a Saxon Locator * * @param location the StAX Location object * @return a Saxon/SAX SourceLocator object */ private ExplicitLocation translateLocation(Location location) { if (location == null) { return ExplicitLocation.UNKNOWN_LOCATION; } else { return new ExplicitLocation(location.getSystemId(), location.getLineNumber(), location.getColumnNumber()); } } /** * Error reporting class for StAX parser errors */ private class StaxErrorReporter implements XMLReporter { public void report(String message, String errorType, Object relatedInformation, Location location) throws XMLStreamException { ExplicitLocation loc = translateLocation(location); XPathException err = new XPathException("Error reported by XML parser: " + message + " (" + errorType + ')'); err.setLocator(loc); pipe.getErrorListener().error(err); } } /** * Simple test program * Usage: java StaxBridge in.xml [out.xml] * * @param args command line arguments */ public static void main(String[] args) throws Exception { for (int i = 0; i < 1; i++) { long startTime = System.currentTimeMillis(); PipelineConfiguration pipe = new Configuration().makePipelineConfiguration(); StaxToEventBridge puller = new StaxToEventBridge(); File f = new File(args[0]); puller.setInputStream(f.toURI().toString(), new FileInputStream(f)); puller.setPipelineConfiguration(pipe); XMLEmitter emitter = new XMLEmitter(); emitter.setPipelineConfiguration(pipe); emitter.setOutputProperties(new Properties()); if (args.length > 1) { emitter.setOutputStream(new FileOutputStream(args[1])); } else { emitter.setOutputStream(System.out); } NamespaceReducer r = new NamespaceReducer(emitter); EventIteratorToReceiver.copy(puller, r); System.err.println("Elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } } /** * Determine whether the EventIterator returns a flat sequence of events, or whether it can return * nested event iterators * * @return true if the next() method is guaranteed never to return an EventIterator */ public boolean isFlatSequence() { return false; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy