All downloads are free. The search and download functionality uses the official Maven repository.

net.sf.saxon.pull.StaxBridge Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.pull;


import net.sf.saxon.event.PipelineConfiguration;
import net.sf.saxon.expr.parser.Loc;
import net.sf.saxon.functions.ResolveURI;
import net.sf.saxon.om.*;
import net.sf.saxon.str.*;
import net.sf.saxon.trans.SaxonErrorCode;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.trans.XmlProcessingIncident;
import net.sf.saxon.type.BuiltInAtomicType;
import net.sf.saxon.type.SchemaType;
import net.sf.saxon.type.Untyped;
import net.sf.saxon.value.AtomicValue;
import net.sf.saxon.value.Whitespace;

import javax.xml.namespace.QName;
import javax.xml.stream.*;
import javax.xml.stream.events.EntityDeclaration;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;

/**
 * This class implements the Saxon PullProvider API on top of a standard StAX parser
 * (or any other StAX XMLStreamReader implementation)
 */

public class StaxBridge implements PullProvider {

    // The StAX reader that supplies the raw events; set by setInputStream() or setXMLStreamReader()
    private XMLStreamReader reader;
    // Attributes of the element most recently reported as START_ELEMENT (assigned in next())
    private AttributeMap attributes;
    private PipelineConfiguration pipe;
    private NamePool namePool;
    // Element names already built by getNodeName(), indexed by local name
    private final HashMap<String, NodeName> nameCache = new HashMap<>();
    // In-scope namespaces: next() pushes one entry per START_ELEMENT on top of the initial empty map
    private final Stack<NamespaceMap> namespaceStack = new Stack<>();
    // Value of the reader property "javax.xml.stream.entities", captured when the DTD event is seen;
    // the element type depends on the StAX implementation, hence the wildcard
    private List<?> unparsedEntities = null;
    PullEvent currentEvent = PullEvent.START_OF_INPUT;
    // Element nesting depth: incremented/decremented by translate() on START_ELEMENT/END_ELEMENT
    int depth = 0;
    // True when a whitespace stripping rule is in force, in which case SPACE events are dropped
    boolean ignoreIgnorable = false;

    /**
     * Create a new instance of the class. The namespace stack is seeded with an
     * empty namespace map, so that the first element read has a parent context
     * to extend.
     */

    public StaxBridge() {
        namespaceStack.push(NamespaceMap.emptyMap());
    }

    /**
     * Create a StAX parser over the supplied stream and use it as the source of events.
     * The parser is obtained from the JAXP {@link XMLInputFactory}; problems reported by
     * the parser are routed through a {@code StaxErrorReporter}.
     *
     * @param systemId    the base URI of the document being parsed
     * @param inputStream the stream from which the XML is read
     * @throws XPathException if the StAX parser cannot be created
     */

    public void setInputStream(String systemId, InputStream inputStream) throws XPathException {
        final XMLInputFactory parserFactory = XMLInputFactory.newInstance();
        parserFactory.setXMLReporter(new StaxErrorReporter());
        try {
            reader = parserFactory.createXMLStreamReader(systemId, inputStream);
        } catch (XMLStreamException failure) {
            throw new XPathException(failure);
        }
    }

    /**
     * Supply an XMLStreamReader: the events reported by this XMLStreamReader will be translated
     * into PullProvider events. The reader is used as supplied; this method replaces any reader
     * previously set, including one created by setInputStream().
     *
     * @param reader the supplier of XML events, typically an XML parser
     */

    public void setXMLStreamReader(XMLStreamReader reader) {
        this.reader = reader;
    }

    /**
     * Set configuration information. This must only be called before any events
     * have been read.
     *
     * <p>A copy of the supplied pipeline configuration is retained. The name pool is taken
     * from its Configuration, and ignorable whitespace is dropped whenever the configured
     * space stripping rule is anything other than "strip no elements".</p>
     *
     * @param pipe the pipeline configuration to copy
     */

    @Override
    public void setPipelineConfiguration(PipelineConfiguration pipe) {
        this.pipe = new PipelineConfiguration(pipe);
        namePool = pipe.getConfiguration().getNamePool();
        boolean stripsNothing =
                pipe.getConfiguration().getParseOptions().getSpaceStrippingRule()
                        == NoElementsSpaceStrippingRule.getInstance();
        ignoreIgnorable = !stripsNothing;
    }

    /**
     * Get configuration information.
     *
     * @return the pipeline configuration held by this object (the copy made by
     *         setPipelineConfiguration()), or null if none has been set
     */

    @Override
    public PipelineConfiguration getPipelineConfiguration() {
        return pipe;
    }

    /**
     * Get the XMLStreamReader used by this StaxBridge. This is available only after
     * setInputStream() or setXMLStreamReader() has been called; before that the
     * result is null.
     *
     * @return the instance of XMLStreamReader allocated when setInputStream() was called,
     *         or the instance supplied directly to setXMLStreamReader()
     */

    public XMLStreamReader getXMLStreamReader() {
        return reader;
    }

    /**
     * Get the name pool. The pool is looked up through the pipeline configuration on
     * every call, so setPipelineConfiguration() must have been called first.
     *
     * @return the name pool of the Configuration referenced by the pipeline configuration
     */

    public NamePool getNamePool() {
        return pipe.getConfiguration().getNamePool();
    }

    /**
     * Get the next event.
     *
     * <p>The first call returns a synthesized {@link PullEvent#START_DOCUMENT}. Once
     * {@link PullEvent#END_DOCUMENT} or {@link PullEvent#END_OF_INPUT} has been reported, the
     * underlying reader is closed and {@link PullEvent#END_OF_INPUT} is returned. Otherwise the
     * next StAX event is read and mapped by translate(); for a START_ELEMENT the in-scope
     * namespaces and the attributes of the element are captured.</p>
     *
     * @return the {@link PullEvent} identifying the type of event.
     *         {@link PullEvent#END_OF_INPUT} is returned at the end of the sequence.
     * @throws XPathException if closing the reader fails, or if the XML parser reports an
     *         error (reported with error code SXXP0003 and the parser's location)
     */

    @Override
    public PullEvent next() throws XPathException {
        if (currentEvent == PullEvent.START_OF_INPUT) {
            // StAX isn't reporting START_DOCUMENT so we supply it ourselves
            currentEvent = PullEvent.START_DOCUMENT;
            return currentEvent;
        }
        if (currentEvent == PullEvent.END_OF_INPUT || currentEvent == PullEvent.END_DOCUMENT) {
            // Input is exhausted: release the parser. Note that currentEvent itself is not updated here.
            try {
                reader.close();
            } catch (XMLStreamException e) {
                throw new XPathException(e);
            }
            return PullEvent.END_OF_INPUT;
        }
        try {
            if (reader.hasNext()) {
                int event = reader.next();
                //System.err.println("Read event " + event);
                currentEvent = translate(event);
                if (currentEvent == PullEvent.START_ELEMENT) {
                    // Extend the enclosing element's in-scope namespaces with the declarations on this
                    // element; a null prefix or URI from the reader is normalized to the empty string
                    NamespaceMap nsMap = namespaceStack.peek();
                    int n = reader.getNamespaceCount();
                    for (int i = 0; i < n; i++) {
                        String prefix = reader.getNamespacePrefix(i);
                        String uri = reader.getNamespaceURI(i);
                        nsMap = nsMap.bind(prefix==null ? "" : prefix, NamespaceUri.of(uri==null ? "" : uri));
                    }
                    namespaceStack.push(nsMap);

                    int attCount = reader.getAttributeCount();
                    if (attCount == 0) {
                        attributes = EmptyAttributeMap.getInstance();
                    } else {
                        List attList = new ArrayList<>();
                        NamePool pool = getNamePool();
                        // NOTE(review): the text of this copy of the file appears to be truncated on the
                        // next line: the remainder of the attribute loop, the rest of the try body and the
                        // opening of the catch block (which declares 'e', 'message' and 'c') are missing.
                        // Restore them from the original source before building.
                        for (int i=0; i 0) {
                    message = message.substring(c + 10);
                }
            }
            // Re-throw the parser failure as a Saxon error carrying the parser's location
            XPathException err = new XPathException("Error reported by XML parser: " + message, e);
            err.setErrorCode(SaxonErrorCode.SXXP0003);
            err.setLocator(translateLocation(e.getLocation()));
            throw err;
        }
        return currentEvent;
    }


    /**
     * Map a StAX event code to the corresponding PullEvent, maintaining the element
     * nesting depth as a side effect. Events that have no PullProvider equivalent
     * (DTD, entity and notation declarations, entity references, the StAX START_DOCUMENT,
     * and whitespace that is to be ignored) are skipped by delegating to next(), whose
     * result is returned instead.
     *
     * @param event the event code returned by the XMLStreamReader
     * @return the PullEvent to be reported
     * @throws XPathException if reading a following event fails
     * @throws IllegalStateException if the event code is not recognized
     */
    private PullEvent translate(int event) throws XPathException {
        switch (event) {
            case XMLStreamConstants.START_ELEMENT:
                depth++;
                return PullEvent.START_ELEMENT;
            case XMLStreamConstants.END_ELEMENT:
                depth--;
                return PullEvent.END_ELEMENT;
            case XMLStreamConstants.END_DOCUMENT:
                return PullEvent.END_DOCUMENT;
            case XMLStreamConstants.ATTRIBUTE:
                return PullEvent.ATTRIBUTE;
            case XMLStreamConstants.NAMESPACE:
                return PullEvent.NAMESPACE;
            case XMLStreamConstants.COMMENT:
                return PullEvent.COMMENT;
            case XMLStreamConstants.PROCESSING_INSTRUCTION:
                return PullEvent.PROCESSING_INSTRUCTION;
            case XMLStreamConstants.CDATA:
                return PullEvent.TEXT;
            case XMLStreamConstants.CHARACTERS:
                // Whitespace outside the outermost element is not reported
                return (depth == 0 && reader.isWhiteSpace()) ? next() : PullEvent.TEXT;
            case XMLStreamConstants.SPACE:
                // Ignorable whitespace is dropped at the top level, and also inside elements
                // when stripping is in force
                // (Brave attempt, but Woodstox doesn't seem to report ignorable whitespace)
                return (depth == 0 || ignoreIgnorable) ? next() : PullEvent.TEXT;
            case XMLStreamConstants.DTD:
                // Remember the entity declarations so getUnparsedEntities() can report them
                unparsedEntities = (List) reader.getProperty("javax.xml.stream.entities");
                return next();
            case XMLStreamConstants.START_DOCUMENT:   // we supplied the START_DOCUMENT ourselves
            case XMLStreamConstants.ENTITY_DECLARATION:
            case XMLStreamConstants.ENTITY_REFERENCE:
            case XMLStreamConstants.NOTATION_DECLARATION:
                return next();
            default:
                throw new IllegalStateException("Unknown StAX event " + event);
        }
    }

    /**
     * Get the event most recently returned by next(), or by other calls that change
     * the position, for example getStringValue() and skipToMatchingEnd(). This
     * method does not change the position of the PullProvider.
     *
     * @return the current event; {@link PullEvent#START_OF_INPUT} if next() has not yet been called
     */

    @Override
    public PullEvent current() {
        return currentEvent;
    }

    /**
     * Get the attributes associated with the current element. This method must
     * be called only after a START_ELEMENT event has been notified. The contents
     * of the returned AttributeMap are immutable.
     *
     * <p>Attributes may be read before or after reading the namespaces of an element,
     * but must not be read after the first child node has been read, or after calling
     * one of the methods skipToEnd(), getStringValue(), or getTypedValue().</p>
     *
     * @return an AttributeMap representing the attributes of the element
     *         that has just been notified.
     */

    @Override
    public AttributeMap getAttributes() {
        return attributes;
    }

    /**
     * Get the namespace declarations associated with the current element. This method must
     * be called only after a START_ELEMENT event has been notified. In the case of a top-level
     * START_ELEMENT event (that is, an element that either has no parent node, or whose parent
     * is not included in the sequence being read), the NamespaceDeclarations object returned
     * will contain a namespace declaration for each namespace that is in-scope for this element
     * node. In the case of a non-top-level element, the NamespaceDeclarations will contain
     * a set of namespace declarations and undeclarations, representing the differences between
     * this element and its parent.
     *

     * <p>It is permissible for this method to return namespace declarations that are redundant.</p>
     *
     * <p>The NamespaceDeclarations object is guaranteed to remain unchanged until the next START_ELEMENT
     * event, but may then be overwritten. The object should not be modified by the client.</p>
     *
     * <p>Namespaces may be read before or after reading the attributes of an element,
     * but must not be read after the first child node has been read, or after calling
     * one of the methods skipToEnd(), getStringValue(), or getTypedValue().</p>

*/ @Override public NamespaceBinding[] getNamespaceDeclarations() { int n = reader.getNamespaceCount(); if (n == 0) { return NamespaceBinding.EMPTY_ARRAY; } else { NamespaceBinding[] bindings = new NamespaceBinding[n]; for (int i = 0; i < n; i++) { String prefix = reader.getNamespacePrefix(i); if (prefix == null) { prefix = ""; } String uri = reader.getNamespaceURI(i); if (uri == null) { uri = ""; } bindings[i] = new NamespaceBinding(prefix, NamespaceUri.of(uri)); } return bindings; } } /** * Skip the current subtree. This method may be called only immediately after * a START_DOCUMENT or START_ELEMENT event. This call returns the matching * END_DOCUMENT or END_ELEMENT event; the next call on next() will return * the event following the END_DOCUMENT or END_ELEMENT. */ @Override public PullEvent skipToMatchingEnd() throws XPathException { switch (currentEvent) { case START_DOCUMENT: currentEvent = PullEvent.END_DOCUMENT; return currentEvent; case START_ELEMENT: try { int skipDepth = 0; while (reader.hasNext()) { int event = reader.next(); if (event == XMLStreamConstants.START_ELEMENT) { skipDepth++; } else if (event == XMLStreamConstants.END_ELEMENT) { if (skipDepth-- == 0) { currentEvent = PullEvent.END_ELEMENT; return currentEvent; } } } } catch (XMLStreamException e) { throw new XPathException(e); } throw new IllegalStateException( "Element start has no matching element end"); default: throw new IllegalStateException( "Cannot call skipToMatchingEnd() except when at start of element or document"); } } /** * Close the event reader. This indicates that no further events are required. * It is not necessary to close an event reader after {@link PullEvent#END_OF_INPUT} has * been reported, but it is recommended to close it if reading terminates * prematurely. Once an event reader has been closed, the effect of further * calls on next() is undefined. 
*/ @Override public void close() { try { reader.close(); } catch (XMLStreamException e) { // } } /** * Get the NodeName identifying the name of the current node. This method * can be used after the {@link PullEvent#START_ELEMENT}, {@link PullEvent#PROCESSING_INSTRUCTION}, * {@link PullEvent#ATTRIBUTE}, or {@link PullEvent#NAMESPACE} events. With some PullProvider implementations, * it can also be used after {@link PullEvent#END_ELEMENT}, but this is not guaranteed. * If called at other times, the result is undefined and may result in an IllegalStateException. * If called when the current node is an unnamed namespace node (a node representing the default namespace) * the returned value is null. * * @return the NodeName. The NodeName can be used to obtain the prefix, local name, * and namespace URI. */ @Override public NodeName getNodeName() { if (currentEvent == PullEvent.START_ELEMENT || currentEvent == PullEvent.END_ELEMENT) { String local = reader.getLocalName(); NamespaceUri uri = NamespaceUri.of(reader.getNamespaceURI()); // We keep a cache indexed by local name, on the assumption that most of the time, a given // local name will only ever be used with the same prefix and URI NodeName cached = nameCache.get(local); if (cached != null && cached.hasURI(uri) && cached.getPrefix().equals(reader.getPrefix())) { return cached; } else { int fp = namePool.allocateFingerprint(uri, local); if (uri == null) { cached = new NoNamespaceName(local, fp); } else { cached = new FingerprintedQName(reader.getPrefix(), uri, local, fp); } nameCache.put(local, cached); return cached; } } else if (currentEvent == PullEvent.PROCESSING_INSTRUCTION) { String local = reader.getPITarget(); return new NoNamespaceName(local); } else { throw new IllegalStateException(); } } /** * Get the string value of the current element, text node, processing-instruction, * or top-level attribute or namespace node, or atomic value. *

     * <p>In other situations the result is undefined and may result in an IllegalStateException.</p>
     *
     * <p>If the most recent event was a {@link PullEvent#START_ELEMENT}, this method causes the content
     * of the element to be read. The current event on completion of this method will be the
     * corresponding {@link PullEvent#END_ELEMENT}. The next call of next() will return the event following
     * the END_ELEMENT event.</p>

* * @return the String Value of the node in question, defined according to the rules in the * XPath data model. */ @Override public UnicodeString getStringValue() throws XPathException { switch (currentEvent) { case TEXT: return StringTool.compress(reader.getTextCharacters(), reader.getTextStart(), reader.getTextLength(), true); case COMMENT: return StringView.of(new String(reader.getTextCharacters(), reader.getTextStart(), reader.getTextLength())); case PROCESSING_INSTRUCTION: String s = reader.getPIData(); // The BEA parser includes the separator space in the value, // which isn't part of the XPath data model return Whitespace.removeLeadingWhitespace(StringView.tidy(s)); case START_ELEMENT: UnicodeBuilder combinedText = null; try { int depth = 0; while (reader.hasNext()) { int event = reader.next(); if (event == XMLStreamConstants.CHARACTERS) { if (combinedText == null) { combinedText = new UnicodeBuilder(); } combinedText.accept( StringView.of(new String(reader.getTextCharacters(), reader.getTextStart(), reader.getTextLength()))); } else if (event == XMLStreamConstants.START_ELEMENT) { depth++; } else if (event == XMLStreamConstants.END_ELEMENT) { if (depth-- == 0) { currentEvent = PullEvent.END_ELEMENT; if (combinedText != null) { return combinedText.toUnicodeString(); } else { return EmptyUnicodeString.getInstance(); } } } } } catch (XMLStreamException e) { throw new XPathException(e); } default: throw new IllegalStateException("getStringValue() called when current event is " + currentEvent); } } /** * Get an atomic value. This call may be used only when the last event reported was * ATOMIC_VALUE. This indicates that the PullProvider is reading a sequence that contains * a free-standing atomic value; it is never used when reading the content of a node. */ @Override public AtomicValue getAtomicValue() { throw new IllegalStateException(); } /** * Get the type annotation of the current attribute or element node, or atomic value. 
* The result of this method is undefined unless the most recent event was START_ELEMENT, * ATTRIBUTE, or ATOMIC_VALUE. * * @return the type annotation. */ @Override public SchemaType getSchemaType() { if (currentEvent == PullEvent.START_ELEMENT) { return Untyped.getInstance(); } else if (currentEvent == PullEvent.ATTRIBUTE) { return BuiltInAtomicType.UNTYPED_ATOMIC; } else { return null; } } /** * Get the location of the current event. * For an event stream representing a real document, the location information * should identify the location in the lexical XML source. For a constructed document, it should * identify the location in the query or stylesheet that caused the node to be created. * A value of null can be returned if no location information is available. */ @Override public net.sf.saxon.s9api.Location getSourceLocator() { return translateLocation(reader.getLocation()); } /** * Translate a StAX Location object to a Saxon Locator * * @param location the StAX Location object * @return a Saxon/SAX SourceLocator object */ private Loc translateLocation(Location location) { if (location == null) { return Loc.NONE; } else { return new Loc(location.getSystemId(), location.getLineNumber(), location.getColumnNumber()); } } /** * Get a list of unparsed entities. * * @return a list of unparsed entities, or null if the information is not available, or * an empty list if there are no unparsed entities. 
* Each item in the list will be an instance of {@link net.sf.saxon.pull.UnparsedEntity}
     */

    @Override
    public List<UnparsedEntity> getUnparsedEntities() {
        if (unparsedEntities == null) {
            return null;
        }
        List<UnparsedEntity> list = new ArrayList<>(unparsedEntities.size());
        for (Object ent : unparsedEntities) {
            String name = null;
            String systemId = null;
            String publicId = null;
            String baseURI = null;
            if (ent instanceof EntityDeclaration) {
                // This is what we would expect from the StAX API spec
                EntityDeclaration ed = (EntityDeclaration) ent;
                name = ed.getName();
                systemId = ed.getSystemId();
                publicId = ed.getPublicId();
                baseURI = ed.getBaseURI();
            } else if (ent != null && ent.getClass().getName().equals("com.ctc.wstx.ent.UnparsedExtEntity")) {
                // Woodstox 3.0.0 returns this: use introspection to get the data we need
                try {
                    Class<?> woodstoxClass = ent.getClass();
                    Class<?>[] noArgClasses = new Class<?>[0];
                    Object[] noArgs = new Object[0];
                    Method method = woodstoxClass.getMethod("getName", noArgClasses);
                    name = (String) method.invoke(ent, noArgs);
                    method = woodstoxClass.getMethod("getSystemId", noArgClasses);
                    systemId = (String) method.invoke(ent, noArgs);
                    method = woodstoxClass.getMethod("getPublicId", noArgClasses);
                    publicId = (String) method.invoke(ent, noArgs);
                    method = woodstoxClass.getMethod("getBaseURI", noArgClasses);
                    baseURI = (String) method.invoke(ent, noArgs);
                } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException ignored) {
                    // Best effort: keep whatever was obtained before the failure; if the name
                    // was not obtained, the entity is left out of the result below
                }
            }
            // Entries of an unrecognized type, or without a name, are not reported
            if (name != null) {
                if (baseURI != null && systemId != null) {
                    try {
                        systemId = ResolveURI.makeAbsolute(systemId, baseURI).toString();
                    } catch (URISyntaxException ignored) {
                        // Best effort: report the system identifier as declared, unresolved
                    }
                }
                UnparsedEntity ue = new UnparsedEntity();
                ue.setName(name);
                ue.setSystemId(systemId);
                ue.setPublicId(publicId);
                ue.setBaseURI(baseURI);
                list.add(ue);
            }
        }
        return list;
    }

    /**
     * Error reporting class for StAX parser errors. Each problem reported by the parser is
     * passed, with its location, to the error reporter of the pipeline configuration.
     */

    private class StaxErrorReporter implements XMLReporter {

        @Override
        public void report(String message, String errorType, Object relatedInformation, Location location) {
            XmlProcessingIncident err = new XmlProcessingIncident("Error reported by XML parser: " + message + " (" + errorType + ')');
            err.setLocation(translateLocation(location));
            pipe.getErrorReporter().report(err);
        }

    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy