All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.event.ReceivingContentHandler Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.event;

import net.sf.saxon.Configuration;
import net.sf.saxon.expr.parser.Loc;
import net.sf.saxon.functions.ResolveURI;
import net.sf.saxon.lib.Feature;
import net.sf.saxon.om.*;
import net.sf.saxon.s9api.Location;
import net.sf.saxon.str.*;
import net.sf.saxon.trans.Err;
import net.sf.saxon.trans.QuitParsingException;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.trans.XmlProcessingException;
import net.sf.saxon.type.*;
import net.sf.saxon.value.Whitespace;
import org.xml.sax.*;
import org.xml.sax.ext.Attributes2;
import org.xml.sax.ext.LexicalHandler;

import javax.xml.transform.Result;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

/**
 * ReceivingContentHandler is a glue class that provides a standard SAX ContentHandler
 * interface to a Saxon Receiver. To achieve this it needs to map names supplied
 * as strings to numeric name codes, for which purpose it needs access to a name
 * pool. The class also performs the function of assembling adjacent text nodes.
 *
 * <p>If the input stream contains the processing instructions assigned by JAXP to switch
 * disable-output-escaping on or off, these will be reflected in properties set in the corresponding
 * characters events. In this case adjacent text nodes will not be combined.</p>
 *
 * <p>The {@code ReceivingContentHandler} is written on the assumption that it is receiving events
 * from a parser configured with {@code http://xml.org/sax/features/namespaces} set to true
 * and {@code http://xml.org/sax/features/namespace-prefixes} set to false.</p>
 *
 * <p>When running as a {@code TransformerHandler}, we have no control over the feature settings
 * of the sender of the events, and if the events do not follow this pattern then the class may
 * fail in unpredictable ways.</p>
* */ public class ReceivingContentHandler implements ContentHandler, LexicalHandler, DTDHandler { private PipelineConfiguration pipe; private Receiver receiver; private boolean inDTD = false; // true while processing the DTD private LocalLocator localLocator = new LocalLocator(Loc.NONE); private boolean lineNumbering; private Location lastTextNodeLocator; // buffer for accumulating character data, until the next markup event is received private char[] buffer = new char[512]; private int charsUsed = 0; //private CharSlice slice = new CharSlice(buffer, 0, 0); // stack for accumulating namespace information private Stack namespaceStack = new Stack<>(); private NamespaceMap currentNamespaceMap; // determine whether ignorable whitespace is ignored private boolean ignoreIgnorable = false; // determine whether DTD attribute types are retained private boolean retainDTDAttributeTypes = false; // determine whether DTD attribute value defaults should be suppressed //private boolean suppressDTDAttributeDefaults = false; // indicate that escaping is allowed to be disabled using the JAXP-defined processing instructions private boolean allowDisableOutputEscaping = false; // indicate that escaping is disabled private boolean escapingDisabled = false; // flag to indicate whether the last tag was a start tag or an end tag private boolean afterStartTag = true; /** * A local cache is used to avoid allocating namecodes for the same name more than once. * This reduces contention on the NamePool. This is a two-level hashmap: the first level * has the namespace URI as its key, and returns a HashMap which maps lexical QNames to integer * namecodes. */ private final HashMap> nameCache = new HashMap<>(10); private HashMap noNamespaceNameCache = new HashMap<>(10); // Action to be taken with defaulted attributes. 
0=process normally, -1=suppress, +1=mark as defaulted private int defaultedAttributesAction = 0; // Stack holding depth of nesting of elements within external entities; created on first use private Stack elementDepthWithinEntity; /** * Create a ReceivingContentHandler and initialise variables */ public ReceivingContentHandler() { currentNamespaceMap = NamespaceMap.emptyMap(); namespaceStack.push(currentNamespaceMap); } /** * Set the ReceivingContentHandler to its initial state, except for the local name cache, * which is retained */ public void reset() { pipe = null; receiver = null; ignoreIgnorable = false; retainDTDAttributeTypes = false; charsUsed = 0; namespaceStack = new Stack<>(); currentNamespaceMap = NamespaceMap.emptyMap(); namespaceStack.push(currentNamespaceMap); localLocator = new LocalLocator(Loc.NONE); allowDisableOutputEscaping = false; escapingDisabled = false; lineNumbering = false; } /** * Set the receiver to which events are passed. ReceivingContentHandler is essentially a translator * that takes SAX events as input and produces Saxon Receiver events as output; these Receiver events * are passed to the supplied Receiver * * @param receiver the Receiver of events */ public void setReceiver(Receiver receiver) { this.receiver = receiver; //receiver = new TracingFilter(receiver); } /** * Get the receiver to which events are passed. * * @return the underlying Receiver */ public Receiver getReceiver() { return receiver; } /** * Set the pipeline configuration * * @param pipe the pipeline configuration. 
This holds a reference to the Saxon configuration, as well as * information that can vary from one pipeline to another */ public void setPipelineConfiguration(PipelineConfiguration pipe) { this.pipe = pipe; Configuration config = pipe.getConfiguration(); ignoreIgnorable = pipe.getParseOptions().getSpaceStrippingRule() != NoElementsSpaceStrippingRule.getInstance(); retainDTDAttributeTypes = config.getBooleanProperty(Feature.RETAIN_DTD_ATTRIBUTE_TYPES); if (!pipe.getParseOptions().isExpandAttributeDefaults()) { defaultedAttributesAction = -1; } else if (config.getBooleanProperty(Feature.MARK_DEFAULTED_ATTRIBUTES)) { defaultedAttributesAction = +1; } allowDisableOutputEscaping = config.getConfigurationProperty(Feature.USE_PI_DISABLE_OUTPUT_ESCAPING); lineNumbering = pipe.getParseOptions().isLineNumbering(); } /** * Get the pipeline configuration * * @return the pipeline configuration as supplied to * {@link #setPipelineConfiguration(PipelineConfiguration)} */ public PipelineConfiguration getPipelineConfiguration() { return pipe; } /** * Get the Configuration object * * @return the Saxon configuration */ public Configuration getConfiguration() { return pipe.getConfiguration(); } /** * Set whether "ignorable whitespace" should be ignored. This method is effective only * if called after setPipelineConfiguration, since the default value is taken from the * configuration. * * @param ignore true if ignorable whitespace (whitespace in element content that is notified * via the {@link #ignorableWhitespace(char[], int, int)} method) should be ignored, false if * it should be treated as ordinary text. */ public void setIgnoreIgnorableWhitespace(boolean ignore) { ignoreIgnorable = ignore; } /** * Determine whether "ignorable whitespace" is ignored. This returns the value that was set * using {@link #setIgnoreIgnorableWhitespace} if that has been called; otherwise the value * from the configuration. 
* * @return true if ignorable whitespace is being ignored */ public boolean isIgnoringIgnorableWhitespace() { return ignoreIgnorable; } /** * Receive notification of the beginning of a document. */ @Override public void startDocument() throws SAXException { // System.err.println("ReceivingContentHandler#startDocument"); try { charsUsed = 0; currentNamespaceMap = NamespaceMap.emptyMap(); namespaceStack = new Stack<>(); namespaceStack.push(currentNamespaceMap); receiver.setPipelineConfiguration(pipe); String systemId = localLocator.getSystemId(); if (systemId != null) { receiver.setSystemId(localLocator.getSystemId()); } receiver.open(); receiver.startDocument(ReceiverOption.NONE); } catch (QuitParsingException quit) { getPipelineConfiguration().getErrorReporter().report( new XmlProcessingException(quit).asWarning()); throw new SAXException(quit); } catch (XPathException err) { throw new SAXException(err); } } /** * Receive notification of the end of a document */ @Override public void endDocument() throws SAXException { // System.err.println("RCH: end document"); try { flush(true); receiver.endDocument(); receiver.close(); } catch (ValidationException err) { err.setLocator(localLocator); throw new SAXException(err); } catch (QuitParsingException err) { // no action: not worth bothering at this stage of the game } catch (XPathException err) { err.maybeSetLocation(localLocator); throw new SAXException(err); } } /** * Supply a locator that can be called to give information about location in the source document * being parsed. 
*/ @Override public void setDocumentLocator(Locator locator) { localLocator = new LocalLocator(locator); if (!lineNumbering) { lastTextNodeLocator = localLocator; } } /** * Notify a namespace prefix to URI binding */ @Override public void startPrefixMapping(String prefix, String uri) { //System.err.println("StartPrefixMapping " + prefix + "=" + uri); if (prefix.equals("xmlns")) { // the binding xmlns:xmlns="http://www.w3.org/2000/xmlns/" // should never be reported, but it's been known to happen return; } currentNamespaceMap = currentNamespaceMap.bind(prefix, uri); } /** * Notify that a namespace binding is going out of scope */ @Override public void endPrefixMapping(String prefix) { //System.err.println("endPrefixMapping " + prefix); } /** * Receive notification of the beginning of an element. * *

     * <p>The Parser will invoke this method at the beginning of every
     * element in the XML document; there will be a corresponding
     * {@link #endElement endElement} event for every startElement event
     * (even when the element is empty). All of the element's content will be
     * reported, in order, before the corresponding endElement
     * event.</p>
     *
     * <p>This event allows up to three name components for each
     * element:</p>
     *
     * <ol>
     * <li>the Namespace URI;</li>
     * <li>the local name; and</li>
     * <li>the qualified (prefixed) name.</li>
     * </ol>
     *
     * <p>Saxon expects all three of these to be provided.</p>
     *
     * <p>The attribute list provided should contain only
     * attributes with explicit values (specified or defaulted):
     * #IMPLIED attributes should be omitted. The attribute list
     * should not contain attributes used for Namespace declarations
     * (xmlns* attributes); if it does, Saxon will ignore them,
     * which may lead to unresolved namespace prefixes.</p>
* * @param uri the Namespace URI, or the empty string if the * element has no Namespace URI or if Namespace * processing is not being performed * @param localname the local name (without prefix), or the * empty string if Namespace processing is not being * performed * @param rawname the qualified name (with prefix), or the * empty string if qualified names are not available * @param atts the attributes attached to the element. If * there are no attributes, it shall be an empty * Attributes object. The value of this object after * startElement returns is undefined * @throws org.xml.sax.SAXException any SAX exception, possibly * wrapping another exception * @see #endElement * @see org.xml.sax.Attributes * @see org.xml.sax.helpers.AttributesImpl */ @Override public void startElement(String uri, String localname, String rawname, Attributes atts) throws SAXException { //System.err.println("ReceivingContentHandler#startElement " + localname + " sysId=" + localLocator.getSystemId()); //for (int a=0; a list = new ArrayList<>(atts.getLength()); for (int a=0; a map2 = uri.isEmpty() ? noNamespaceNameCache : nameCache.get(uri); if (map2 == null) { map2 = new HashMap<>(50); nameCache.put(uri, map2); if (uri.isEmpty()) { noNamespaceNameCache = map2; } } NodeName n = map2.get(rawname); // we use the rawname (qname) rather than the local name because we want to retain the prefix // Note that the NodeName objects generated do not contain a namecode or fingerprint; it will be generated // later if we are building a TinyTree, but not necessarily on other paths (e.g. an identity transformation). // The NodeName object is shared by all elements with the same name, so when the namecode is allocated to one // of them, it is there for all of them. 
if (n == null) { if (uri.isEmpty()) { NoNamespaceName qn = new NoNamespaceName(localname); map2.put(rawname, qn); return qn; } else { String prefix = NameChecker.getPrefix(rawname); FingerprintedQName qn = new FingerprintedQName(prefix, uri, localname); map2.put(rawname, qn); return qn; } } else { return n; } } /** * Report the end of an element (the close tag) */ @Override public void endElement(String uri, String localname, String rawname) throws SAXException { //System.err.println("ReceivingContentHandler#End element " + rawname + " (depth=" + namespaceStack.size() + ")"); try { // don't attempt whitespace compression if this end tag follows a start tag flush(!afterStartTag); localLocator.levelInEntity--; receiver.endElement(); } catch (ValidationException err) { err.maybeSetLocation(localLocator); if (!err.hasBeenReported()) { pipe.getErrorReporter().report(new XmlProcessingException(err)); } err.setHasBeenReported(true); throw new SAXException(err); } catch (XPathException err) { err.maybeSetLocation(localLocator); throw new SAXException(err); } afterStartTag = false; namespaceStack.pop(); currentNamespaceMap = namespaceStack.peek(); } /** * Report character data. 
Note that contiguous character data may be reported as a sequence of * calls on this method, with arbitrary boundaries */ @Override public void characters(char[] ch, int start, int length) { // System.err.println("characters (" + length + ")"); // need to concatenate chunks of text before we can decide whether a node is all-white while (charsUsed + length > buffer.length) { buffer = Arrays.copyOf(buffer, buffer.length*2); //slice = new CharSlice(buffer, 0, 0); } System.arraycopy(ch, start, buffer, charsUsed, length); charsUsed += length; if (lineNumbering) { lastTextNodeLocator = localLocator.saveLocation(); } } /** * Report character data classified as "Ignorable whitespace", that is, whitespace text nodes * appearing as children of elements with an element-only content model */ @Override public void ignorableWhitespace(char[] ch, int start, int length) { if (!ignoreIgnorable) { characters(ch, start, length); } } /** * Notify the existence of a processing instruction */ @Override public void processingInstruction(String name, String remainder) throws SAXException { try { flush(true); if (!inDTD) { if (name == null) { // trick used by the old James Clark xp parser to notify a comment comment(remainder.toCharArray(), 0, remainder.length()); } else { // some parsers allow through PI names containing colons if (!NameChecker.isValidNCName(name)) { throw new SAXException("Invalid processing instruction name (" + name + ')'); } if (allowDisableOutputEscaping) { if (name.equals(Result.PI_DISABLE_OUTPUT_ESCAPING)) { //flush(); escapingDisabled = true; return; } else if (name.equals(Result.PI_ENABLE_OUTPUT_ESCAPING)) { //flush(); escapingDisabled = false; return; } } UnicodeString data; if (remainder == null) { // allowed by the spec but rarely seen: see Saxon bug 2491 data = EmptyUnicodeString.getInstance(); } else { // not strictly necessary (the parser should have done this) but needed in practice data = Whitespace.removeLeadingWhitespace(StringView.tidy(remainder)); } 
receiver.processingInstruction(name, data, localLocator, ReceiverOption.NONE); } } } catch (XPathException err) { throw new SAXException(err); } } /** * Notify the existence of a comment. Note that in SAX this is part of LexicalHandler interface * rather than the ContentHandler interface. */ @Override public void comment(char[] ch, int start, int length) throws SAXException { try { flush(true); if (!inDTD) { receiver.comment(StringView.of(new String(ch, start, length)), localLocator, ReceiverOption.NONE); } } catch (XPathException err) { throw new SAXException(err); } } /** * Flush buffer for accumulated character data * * @param compress true if compression of whitespace should be attempted. This is an expensive * operation, so we avoid doing it when we hit an end tag that follows after a start tag, as * it's not likely to succeed in that situation. * @throws XPathException if flushing the character data fails */ private void flush(boolean compress) throws XPathException { if (charsUsed > 0) { //CharSlice slice = new CharSlice(buffer, 0, charsUsed); UnicodeString content = StringTool.compress(buffer, 0, charsUsed, compress); receiver.characters(content, lastTextNodeLocator, escapingDisabled ? ReceiverOption.DISABLE_ESCAPING : ReceiverOption.WHOLE_TEXT_NODE); charsUsed = 0; escapingDisabled = false; } } /** * Notify a skipped entity. Saxon ignores this event */ @Override public void skippedEntity(String name) { } // No-op methods to satisfy lexical handler interface /** * Register the start of the DTD. Saxon ignores the DTD; however, it needs to know when the DTD starts and * ends so that it can ignore comments in the DTD, which are reported like any other comment, but which * are skipped because they are not part of the XPath data model */ @Override public void startDTD(String name, String publicId, String systemId) { inDTD = true; } /** * Register the end of the DTD. 
Comments in the DTD are skipped because they * are not part of the XPath data model */ @Override public void endDTD() { inDTD = false; } @Override public void startEntity(String name) { if (elementDepthWithinEntity == null) { elementDepthWithinEntity = new Stack<>(); } elementDepthWithinEntity.push(localLocator.levelInEntity); localLocator.levelInEntity = 0; } @Override public void endEntity(String name) { localLocator.levelInEntity = elementDepthWithinEntity.pop(); } @Override public void startCDATA() { } @Override public void endCDATA() { } ////////////////////////////////////////////////////////////////////////////// // Implement DTDHandler interface ////////////////////////////////////////////////////////////////////////////// @Override public void notationDecl(String name, String publicId, String systemId) { } @Override public void unparsedEntityDecl(String name, String publicId, String systemId, String notationName) throws SAXException { // Some (non-conformant) SAX parsers report the systemId as written. // We need to turn it into an absolute URL. String uri = systemId; if (localLocator != null) { try { URI suppliedURI = new URI(systemId); if (!suppliedURI.isAbsolute()) { String baseURI = localLocator.getSystemId(); if (baseURI != null) { // See bug 21679 uri = ResolveURI.makeAbsolute(systemId, baseURI).toString(); } } } catch (URISyntaxException err) { // fallback - no action } } try { receiver.setUnparsedEntity(name, uri, publicId); } catch (XPathException err) { throw new SAXException(err); } } /** * An implementation of the Saxon {@link Location} interface that wraps the SAX Locator * information. Note that this object is mutable and changes continually as parsing proceeds; * it is therefore necessary to call its {@link #saveLocation()} method to obtain an * immutable location that still has meaning once parsing is finished. 
*/
    public static class LocalLocator implements Location {

        private final Locator saxLocator;

        // Element nesting depth within the current entity; maintained by the enclosing handler
        public int levelInEntity;

        LocalLocator(Locator saxLocator) {
            this.levelInEntity = 0;
            this.saxLocator = saxLocator;
        }

        /**
         * Return the system identifier for the current document event,
         * as reported by the wrapped SAX locator.
         *
         * @return A string containing the system identifier, or
         * null if none is available.
         */
        @Override
        public String getSystemId() {
            return saxLocator.getSystemId();
        }

        /**
         * Return the public identifier for the current document event,
         * as reported by the wrapped SAX locator.
         *
         * @return A string containing the public identifier, or
         * null if none is available.
         */
        @Override
        public String getPublicId() {
            return saxLocator.getPublicId();
        }

        /**
         * Return the line number where the current document event ends,
         * as reported by the wrapped SAX locator.
         *
         * @return The line number, or -1 if none is available.
         */
        @Override
        public int getLineNumber() {
            return saxLocator.getLineNumber();
        }

        /**
         * Return the character position where the current document event ends,
         * as reported by the wrapped SAX locator.
         *
         * @return The column number, or -1 if none is available.
         */
        @Override
        public int getColumnNumber() {
            return saxLocator.getColumnNumber();
        }

        /**
         * Get an immutable copy of this Location object. By default Location objects may be mutable, so they
         * should not be saved for later use. The result of this operation holds the same system identifier,
         * line number and column number, captured at the moment of the call.
         */
        @Override
        public Location saveLocation() {
            String systemId = getSystemId();
            int line = getLineNumber();
            int column = getColumnNumber();
            return new Loc(systemId, line, column);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy