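// net.sf.saxon.event.ReceivingContentHandler -- source listing taken from the Saxon-HE artifact
// (navigation text from the hosting web page: "Maven / Gradle / Ivy", "Show all versions of Saxon-HE", "Show documentation")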
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2022 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
package net.sf.saxon.event;
import net.sf.saxon.Configuration;
import net.sf.saxon.expr.parser.Loc;
import net.sf.saxon.functions.ResolveURI;
import net.sf.saxon.lib.Feature;
import net.sf.saxon.om.*;
import net.sf.saxon.s9api.Location;
import net.sf.saxon.str.*;
import net.sf.saxon.trans.Err;
import net.sf.saxon.trans.QuitParsingException;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.trans.XmlProcessingException;
import net.sf.saxon.type.*;
import net.sf.saxon.value.Whitespace;
import org.xml.sax.*;
import org.xml.sax.ext.Attributes2;
import org.xml.sax.ext.LexicalHandler;
import javax.xml.transform.Result;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
/**
* ReceivingContentHandler is a glue class that provides a standard SAX ContentHandler
* interface to a Saxon Receiver. To achieve this it needs to map names supplied
* as strings to numeric name codes, for which purpose it needs access to a name
* pool. The class also performs the function of assembling adjacent text nodes.
* If the input stream contains the processing instructions assigned by JAXP to switch
* disable-output-escaping on or off, these will be reflected in properties set in the corresponding
* characters events. In this case adjacent text nodes will not be combined.
* The {@code ReceivingContentHandler} is written on the assumption that it is receiving events
* from a parser configured with {@code http://xml.org/sax/features/namespaces} set to true
* and {@code http://xml.org/sax/features/namespace-prefixes} set to false.
* When running as a {@code TransformerHandler}, we have no control over the feature settings
* of the sender of the events, and if the events do not follow this pattern then the class may
* fail in unpredictable ways.
*
*/
public class ReceivingContentHandler
implements ContentHandler, LexicalHandler, DTDHandler
{
private PipelineConfiguration pipe;
private Receiver receiver;
private boolean inDTD = false; // true while processing the DTD
private LocalLocator localLocator = new LocalLocator(Loc.NONE);
private boolean lineNumbering;
private Location lastTextNodeLocator;
// buffer for accumulating character data, until the next markup event is received
private char[] buffer = new char[512];
private int charsUsed = 0;
//private CharSlice slice = new CharSlice(buffer, 0, 0);
// stack for accumulating namespace information
private Stack namespaceStack = new Stack<>();
private NamespaceMap currentNamespaceMap;
// determine whether ignorable whitespace is ignored
private boolean ignoreIgnorable = false;
// determine whether DTD attribute types are retained
private boolean retainDTDAttributeTypes = false;
// determine whether DTD attribute value defaults should be suppressed
//private boolean suppressDTDAttributeDefaults = false;
// indicate that escaping is allowed to be disabled using the JAXP-defined processing instructions
private boolean allowDisableOutputEscaping = false;
// indicate that escaping is disabled
private boolean escapingDisabled = false;
// flag to indicate whether the last tag was a start tag or an end tag
private boolean afterStartTag = true;
/**
* A local cache is used to avoid allocating namecodes for the same name more than once.
* This reduces contention on the NamePool. This is a two-level hashmap: the first level
* has the namespace URI as its key, and returns a HashMap which maps lexical QNames to integer
* namecodes.
*/
private final HashMap> nameCache = new HashMap<>(10);
private HashMap noNamespaceNameCache = new HashMap<>(10);
// Action to be taken with defaulted attributes. 0=process normally, -1=suppress, +1=mark as defaulted
private int defaultedAttributesAction = 0;
// Stack holding depth of nesting of elements within external entities; created on first use
private Stack elementDepthWithinEntity;
/**
 * Construct a ReceivingContentHandler. The namespace stack starts with a single entry,
 * the empty namespace map, which is also the current namespace map.
 */
public ReceivingContentHandler() {
    this.currentNamespaceMap = NamespaceMap.emptyMap();
    this.namespaceStack.push(this.currentNamespaceMap);
}
/**
 * Set the ReceivingContentHandler to its initial state, except for the local name cache,
 * which is retained.
 *
 * <p>All per-parse state is restored to the value it has in a newly constructed handler,
 * including the flags that track whether the parser is inside the DTD, whether the last
 * tag seen was a start tag, the action for defaulted attributes, and the entity nesting
 * stack. This matters when the previous parse was abandoned part-way through (for example
 * inside the DTD), since otherwise the stale state would affect the next document.</p>
 */
public void reset() {
    pipe = null;
    receiver = null;
    // settings derived from the pipeline configuration
    ignoreIgnorable = false;
    retainDTDAttributeTypes = false;
    defaultedAttributesAction = 0;
    allowDisableOutputEscaping = false;
    lineNumbering = false;
    // buffered text and namespace context
    charsUsed = 0;
    namespaceStack = new Stack<>();
    currentNamespaceMap = NamespaceMap.emptyMap();
    namespaceStack.push(currentNamespaceMap);
    // location and parse-progress tracking
    localLocator = new LocalLocator(Loc.NONE);
    elementDepthWithinEntity = null; // re-created on first use by startEntity
    inDTD = false;
    afterStartTag = true;
    escapingDisabled = false;
}
/**
 * Supply the destination for the translated events. This class takes SAX events as
 * input and emits the corresponding Saxon Receiver events to the Receiver given here.
 *
 * @param receiver the Receiver that is to be notified of events
 */
public void setReceiver(Receiver receiver) {
    // store the target; events are forwarded to it as SAX callbacks arrive
    this.receiver = receiver;
}
/**
 * Get the destination to which the translated events are sent.
 *
 * @return the Receiver most recently supplied to {@link #setReceiver(Receiver)},
 * or null if none is currently set
 */
public Receiver getReceiver() {
    return this.receiver;
}
/**
 * Set the pipeline configuration, and derive from it the options that control how
 * SAX events are translated: whether ignorable whitespace is dropped, whether DTD
 * attribute types are retained, how defaulted attributes are treated, whether the
 * JAXP disable-output-escaping processing instructions are recognized, and whether
 * line numbering is in effect.
 *
 * <p>Every derived option is assigned on each call, so that a handler which is
 * configured a second time does not keep a setting left over from the earlier
 * configuration.</p>
 *
 * @param pipe the pipeline configuration. This holds a reference to the Saxon configuration, as well as
 * information that can vary from one pipeline to another
 */
public void setPipelineConfiguration(PipelineConfiguration pipe) {
    this.pipe = pipe;
    Configuration config = pipe.getConfiguration();
    ignoreIgnorable = pipe.getParseOptions().getSpaceStrippingRule() != NoElementsSpaceStrippingRule.getInstance();
    retainDTDAttributeTypes = config.getBooleanProperty(Feature.RETAIN_DTD_ATTRIBUTE_TYPES);
    // 0=process normally, -1=suppress, +1=mark as defaulted
    if (!pipe.getParseOptions().isExpandAttributeDefaults()) {
        defaultedAttributesAction = -1;
    } else if (config.getBooleanProperty(Feature.MARK_DEFAULTED_ATTRIBUTES)) {
        defaultedAttributesAction = +1;
    } else {
        // explicitly restore the default, in case an earlier configuration selected -1 or +1
        defaultedAttributesAction = 0;
    }
    allowDisableOutputEscaping = config.getConfigurationProperty(Feature.USE_PI_DISABLE_OUTPUT_ESCAPING);
    lineNumbering = pipe.getParseOptions().isLineNumbering();
}
/**
 * Get the pipeline configuration.
 *
 * @return the object most recently passed to
 * {@link #setPipelineConfiguration(PipelineConfiguration)}, or null if there is none
 */
public PipelineConfiguration getPipelineConfiguration() {
    return this.pipe;
}
/**
 * Get the Saxon Configuration, obtained from the current pipeline configuration.
 *
 * @return the Saxon configuration
 */
public Configuration getConfiguration() {
    final PipelineConfiguration current = pipe;
    return current.getConfiguration();
}
/**
 * Say whether "ignorable whitespace" is to be discarded. Because
 * {@link #setPipelineConfiguration(PipelineConfiguration)} assigns this setting from the
 * configuration, a call on this method only has a lasting effect if it is made afterwards.
 *
 * @param ignore true if whitespace notified through
 * {@link #ignorableWhitespace(char[], int, int)} is to be dropped; false if it is to be
 * handled like any other character data
 */
public void setIgnoreIgnorableWhitespace(boolean ignore) {
    this.ignoreIgnorable = ignore;
}
/**
 * Ask whether "ignorable whitespace" is being discarded. The answer is whichever value was
 * assigned last, either by {@link #setIgnoreIgnorableWhitespace} or from the pipeline
 * configuration.
 *
 * @return true if ignorable whitespace is being dropped
 */
public boolean isIgnoringIgnorableWhitespace() {
    return this.ignoreIgnorable;
}
/**
 * Receive notification of the beginning of a document. Buffered text and the namespace
 * context are discarded, the receiver is given the pipeline configuration and (if known)
 * the system identifier, and it is then opened and notified of the document start.
 *
 * @throws SAXException wrapping any XPathException raised by the receiver
 */
@Override
public void startDocument() throws SAXException {
    try {
        // begin with no pending text and only the empty namespace map in scope
        charsUsed = 0;
        namespaceStack = new Stack<>();
        currentNamespaceMap = NamespaceMap.emptyMap();
        namespaceStack.push(currentNamespaceMap);

        receiver.setPipelineConfiguration(pipe);
        final String systemId = localLocator.getSystemId();
        if (systemId != null) {
            receiver.setSystemId(systemId);
        }
        receiver.open();
        receiver.startDocument(ReceiverOption.NONE);
    } catch (QuitParsingException quit) {
        // an early exit is reported as a warning, but still terminates the parse
        getPipelineConfiguration().getErrorReporter().report(
                new XmlProcessingException(quit).asWarning());
        throw new SAXException(quit);
    } catch (XPathException failure) {
        throw new SAXException(failure);
    }
}
/**
 * Receive notification of the end of a document. Any pending character data is passed on,
 * then the receiver is notified of the document end and closed.
 *
 * @throws SAXException wrapping any XPathException raised by the receiver, other than a
 * QuitParsingException, which is ignored at this point
 */
@Override
public void endDocument() throws SAXException {
    try {
        flush(true);
        receiver.endDocument();
        receiver.close();
    } catch (ValidationException invalid) {
        invalid.setLocator(localLocator);
        throw new SAXException(invalid);
    } catch (QuitParsingException ignored) {
        // no action: not worth bothering at this stage of the game
    } catch (XPathException failure) {
        failure.maybeSetLocation(localLocator);
        throw new SAXException(failure);
    }
}
/**
 * Accept the SAX locator that reports the current position in the document being parsed.
 * It is wrapped in a {@link LocalLocator}; when line numbering is off, that same wrapper
 * also serves as the location reported for text nodes.
 */
@Override
public void setDocumentLocator(Locator locator) {
    final LocalLocator wrapped = new LocalLocator(locator);
    localLocator = wrapped;
    if (lineNumbering) {
        // with line numbering, text node locations are captured as character data arrives
        return;
    }
    lastTextNodeLocator = wrapped;
}
/**
 * Notify a namespace prefix to URI binding. The binding is added to the current
 * namespace map, except for a binding of the prefix "xmlns", which is discarded.
 */
@Override
public void startPrefixMapping(String prefix, String uri) {
    // the binding xmlns:xmlns="http://www.w3.org/2000/xmlns/"
    // should never be reported, but it's been known to happen: skip it
    if (!prefix.equals("xmlns")) {
        currentNamespaceMap = currentNamespaceMap.bind(prefix, uri);
    }
}
/**
 * Notify that a namespace binding is going out of scope. No action is taken here.
 */
@Override
public void endPrefixMapping(String prefix) {
    // nothing to do
}
/**
 * Receive notification of the beginning of an element.
 *
 * <p>The Parser will invoke this method at the beginning of every
 * element in the XML document; there will be a corresponding
 * {@link #endElement endElement} event for every startElement event
 * (even when the element is empty). All of the element's content will be
 * reported, in order, before the corresponding endElement
 * event.</p>
 *
 * <p>This event allows up to three name components for each
 * element:</p>
 *
 * <ul>
 * <li>the Namespace URI;</li>
 * <li>the local name; and</li>
 * <li>the qualified (prefixed) name.</li>
 * </ul>
 *
 * <p>Saxon expects all three of these to be provided.</p>
 *
 * <p>The attribute list provided should contain only
 * attributes with explicit values (specified or defaulted):
 * #IMPLIED attributes should be omitted. The attribute list
 * should not contain attributes used for Namespace declarations
 * (xmlns* attributes); if it does, Saxon will ignore them,
 * which may lead to unresolved namespace prefixes.</p>
 *
 * @param uri the Namespace URI, or the empty string if the
 * element has no Namespace URI or if Namespace
 * processing is not being performed
 * @param localname the local name (without prefix), or the
 * empty string if Namespace processing is not being
 * performed
 * @param rawname the qualified name (with prefix), or the
 * empty string if qualified names are not available
 * @param atts the attributes attached to the element. If
 * there are no attributes, it shall be an empty
 * Attributes object. The value of this object after
 * startElement returns is undefined
 * @throws org.xml.sax.SAXException any SAX exception, possibly
 * wrapping another exception
 * @see #endElement
 * @see org.xml.sax.Attributes
 * @see org.xml.sax.helpers.AttributesImpl
 */
@Override
public void startElement(String uri, String localname, String rawname, Attributes atts)
throws SAXException {
//System.err.println("ReceivingContentHandler#startElement " + localname + " sysId=" + localLocator.getSystemId());
// NOTE(review): the text from here down to the name-cache lookup looks truncated. It appears
// that everything between a '<' and a later '>' was dropped when this listing was extracted,
// so the remainder of startElement and the opening of the helper that looks up a NodeName for
// (uri, localname, rawname) are missing, and the next two lines are fused fragments. This
// does not compile as it stands; restore the missing text from the upstream source -- TODO confirm.
//for (int a=0; a list = new ArrayList<>(atts.getLength());
for (int a=0; a map2 = uri.isEmpty() ? noNamespaceNameCache : nameCache.get(uri);
// no second-level cache yet for this namespace URI: create and register one
if (map2 == null) {
map2 = new HashMap<>(50);
nameCache.put(uri, map2);
if (uri.isEmpty()) {
noNamespaceNameCache = map2;
}
}
NodeName n = map2.get(rawname);
// we use the rawname (qname) rather than the local name because we want to retain the prefix
// Note that the NodeName objects generated do not contain a namecode or fingerprint; it will be generated
// later if we are building a TinyTree, but not necessarily on other paths (e.g. an identity transformation).
// The NodeName object is shared by all elements with the same name, so when the namecode is allocated to one
// of them, it is there for all of them.
if (n == null) {
// cache miss: build the name object, remember it under its lexical QName, and return it
if (uri.isEmpty()) {
NoNamespaceName qn = new NoNamespaceName(localname);
map2.put(rawname, qn);
return qn;
} else {
String prefix = NameChecker.getPrefix(rawname);
FingerprintedQName qn = new FingerprintedQName(prefix, uri, localname);
map2.put(rawname, qn);
return qn;
}
} else {
// cache hit: reuse the shared name object
return n;
}
}
/**
 * Report the end of an element (the close tag). Pending character data is passed on first;
 * the receiver is then notified, and the namespace context that was in force before the
 * matching start tag is reinstated.
 *
 * @throws SAXException wrapping any XPathException raised while flushing text or notifying
 * the receiver
 */
@Override
public void endElement(String uri, String localname, String rawname) throws SAXException {
    try {
        // don't attempt whitespace compression if this end tag follows a start tag
        final boolean compress = !afterStartTag;
        flush(compress);
        localLocator.levelInEntity--;
        receiver.endElement();
    } catch (ValidationException invalid) {
        invalid.maybeSetLocation(localLocator);
        // report the validation failure once only
        if (!invalid.hasBeenReported()) {
            pipe.getErrorReporter().report(new XmlProcessingException(invalid));
        }
        invalid.setHasBeenReported(true);
        throw new SAXException(invalid);
    } catch (XPathException failure) {
        failure.maybeSetLocation(localLocator);
        throw new SAXException(failure);
    }
    afterStartTag = false;
    // discard this element's namespace map and revert to the one beneath it
    namespaceStack.pop();
    currentNamespaceMap = namespaceStack.peek();
}
/**
 * Report character data. Contiguous character data may arrive as a sequence of calls on
 * this method, with arbitrary boundaries, so the characters are appended to an internal
 * buffer and are only passed on when the next markup event causes the buffer to be flushed.
 */
@Override
public void characters(char[] ch, int start, int length) {
    // need to concatenate chunks of text before we can decide whether a node is all-white
    final int required = charsUsed + length;
    while (buffer.length < required) {
        // keep doubling until the incoming chunk fits
        buffer = Arrays.copyOf(buffer, buffer.length * 2);
    }
    System.arraycopy(ch, start, buffer, charsUsed, length);
    charsUsed += length;
    if (lineNumbering) {
        // the locator is mutable, so take a snapshot of the current position
        lastTextNodeLocator = localLocator.saveLocation();
    }
}
/**
 * Report character data classified as "Ignorable whitespace", that is, whitespace text nodes
 * appearing as children of elements with an element-only content model. Unless such
 * whitespace is being ignored, it is handled exactly like ordinary character data.
 */
@Override
public void ignorableWhitespace(char[] ch, int start, int length) {
    if (ignoreIgnorable) {
        return;
    }
    characters(ch, start, length);
}
/**
 * Notify the existence of a processing instruction. Pending character data is passed on
 * first. Processing instructions within the DTD are not forwarded. When enabled, the
 * JAXP-defined instructions for switching output escaping off and on are consumed here
 * and only change the escaping state.
 *
 * @param name the processing instruction target; null is treated as notification of a comment
 * @param remainder the data part of the processing instruction, which may be null
 * @throws SAXException if the target is not a valid NCName, or wrapping any XPathException
 * raised by the receiver
 */
@Override
public void processingInstruction(String name, String remainder) throws SAXException {
    try {
        flush(true);
        if (inDTD) {
            return;
        }
        if (name == null) {
            // trick used by the old James Clark xp parser to notify a comment
            comment(remainder.toCharArray(), 0, remainder.length());
            return;
        }
        // some parsers allow through PI names containing colons
        if (!NameChecker.isValidNCName(name)) {
            throw new SAXException("Invalid processing instruction name (" + name + ')');
        }
        if (allowDisableOutputEscaping) {
            if (name.equals(Result.PI_DISABLE_OUTPUT_ESCAPING)) {
                escapingDisabled = true;
                return;
            }
            if (name.equals(Result.PI_ENABLE_OUTPUT_ESCAPING)) {
                escapingDisabled = false;
                return;
            }
        }
        // A null data part is allowed by the spec but rarely seen: see Saxon bug 2491.
        // Stripping leading whitespace is not strictly necessary (the parser should have
        // done this) but is needed in practice.
        final UnicodeString data = remainder == null
                ? EmptyUnicodeString.getInstance()
                : Whitespace.removeLeadingWhitespace(StringView.tidy(remainder));
        receiver.processingInstruction(name, data, localLocator, ReceiverOption.NONE);
    } catch (XPathException failure) {
        throw new SAXException(failure);
    }
}
/**
 * Notify the existence of a comment. Note that in SAX this is part of LexicalHandler interface
 * rather than the ContentHandler interface. Pending character data is passed on first;
 * comments within the DTD are not forwarded.
 *
 * @throws SAXException wrapping any XPathException raised by the receiver
 */
@Override
public void comment(char[] ch, int start, int length) throws SAXException {
    try {
        flush(true);
        if (inDTD) {
            return;
        }
        final String text = new String(ch, start, length);
        receiver.comment(StringView.of(text), localLocator, ReceiverOption.NONE);
    } catch (XPathException failure) {
        throw new SAXException(failure);
    }
}
/**
 * Pass any accumulated character data to the receiver as a single text node, and empty
 * the buffer. If escaping has been disabled, the text is sent with the disable-escaping
 * option, and escaping is then switched back on.
 *
 * @param compress true if compression of whitespace should be attempted. This is an expensive
 * operation, so we avoid doing it when we hit an end tag that follows after a start tag, as
 * it's not likely to succeed in that situation.
 * @throws XPathException if flushing the character data fails
 */
private void flush(boolean compress) throws XPathException {
    if (charsUsed <= 0) {
        // nothing buffered
        return;
    }
    final UnicodeString text = StringTool.compress(buffer, 0, charsUsed, compress);
    receiver.characters(text, lastTextNodeLocator,
            escapingDisabled ? ReceiverOption.DISABLE_ESCAPING : ReceiverOption.WHOLE_TEXT_NODE);
    charsUsed = 0;
    escapingDisabled = false;
}
/**
 * Notify a skipped entity. Saxon ignores this event.
 */
@Override
public void skippedEntity(String name) {
    // deliberately empty
}
// No-op methods to satisfy lexical handler interface
/**
 * Register the start of the DTD. The DTD itself is ignored, but its extent must be known:
 * comments inside the DTD are notified like any other comment, yet they have to be
 * skipped because they are not part of the XPath data model.
 */
@Override
public void startDTD(String name, String publicId, String systemId) {
    this.inDTD = true;
}
/**
 * Register the end of the DTD. From this point comments are no longer skipped as being
 * part of the DTD.
 */
@Override
public void endDTD() {
    this.inDTD = false;
}
/**
 * Register the start of an entity. The current depth of element nesting is saved on a
 * stack (created on first use), and counting restarts from zero within the entity.
 */
@Override
public void startEntity(String name) {
    if (elementDepthWithinEntity == null) {
        elementDepthWithinEntity = new Stack<>();
    }
    final int outerDepth = localLocator.levelInEntity;
    elementDepthWithinEntity.push(outerDepth);
    localLocator.levelInEntity = 0;
}
/**
 * Register the end of an entity, reinstating the element nesting depth that was saved
 * when the start of the entity was notified.
 */
@Override
public void endEntity(String name) {
    // pop the depth saved by the matching startEntity
    localLocator.levelInEntity = elementDepthWithinEntity.pop();
}
/**
 * Register the start of a CDATA section. No action is taken here.
 */
@Override
public void startCDATA() {
    // deliberately empty
}
/**
 * Register the end of a CDATA section. No action is taken here.
 */
@Override
public void endCDATA() {
    // deliberately empty
}
//////////////////////////////////////////////////////////////////////////////
// Implement DTDHandler interface
//////////////////////////////////////////////////////////////////////////////
/**
 * Receive notification of a notation declaration. No action is taken here.
 */
@Override
public void notationDecl(String name, String publicId, String systemId) {
    // deliberately empty
}
/**
 * Receive notification of an unparsed entity declaration, and pass it to the receiver.
 * A relative system identifier is first resolved against the system identifier reported
 * by the locator, when one is available.
 *
 * @throws SAXException wrapping any XPathException raised by the receiver
 */
@Override
public void unparsedEntityDecl(String name,
                               String publicId,
                               String systemId,
                               String notationName) throws SAXException {
    // Some (non-conformant) SAX parsers report the systemId as written.
    // We need to turn it into an absolute URL.
    String resolved = systemId;
    if (localLocator != null) {
        try {
            if (!new URI(systemId).isAbsolute()) {
                final String base = localLocator.getSystemId();
                if (base != null) { // See bug 21679
                    resolved = ResolveURI.makeAbsolute(systemId, base).toString();
                }
            }
        } catch (URISyntaxException ignored) {
            // fallback - no action: the system identifier is used as supplied
        }
    }
    try {
        receiver.setUnparsedEntity(name, resolved, publicId);
    } catch (XPathException failure) {
        throw new SAXException(failure);
    }
}
/**
 * A Saxon {@link Location} that delegates to a SAX Locator. Because the answers it gives
 * change continually as parsing proceeds, a caller that needs a location which remains
 * meaningful after parsing has moved on must call {@link #saveLocation()} to obtain an
 * immutable snapshot.
 */
public static class LocalLocator implements Location {
    /** The SAX locator to which all location queries are delegated. */
    private final Locator saxLocator;
    /** Depth of element nesting within the current entity; maintained by the enclosing handler. */
    public int levelInEntity = 0;

    LocalLocator(Locator saxLocator) {
        this.saxLocator = saxLocator;
    }

    /**
     * Get the system identifier for the current document event.
     *
     * @return the system identifier reported by the SAX locator, or
     * null if none is available.
     */
    @Override
    public String getSystemId() {
        return this.saxLocator.getSystemId();
    }

    /**
     * Get the public identifier for the current document event.
     *
     * @return the public identifier reported by the SAX locator, or
     * null if none is available.
     */
    @Override
    public String getPublicId() {
        return this.saxLocator.getPublicId();
    }

    /**
     * Get the line number where the current document event ends.
     *
     * @return the line number reported by the SAX locator, or -1 if none is available.
     */
    @Override
    public int getLineNumber() {
        return this.saxLocator.getLineNumber();
    }

    /**
     * Get the character position where the current document event ends.
     *
     * @return the column number reported by the SAX locator, or -1 if none is available.
     */
    @Override
    public int getColumnNumber() {
        return this.saxLocator.getColumnNumber();
    }

    /**
     * Take an immutable snapshot of the current location. The result carries the system
     * identifier, line number and column number as they are at the moment of the call.
     *
     * @return a new location object holding the current values
     */
    @Override
    public Location saveLocation() {
        final String systemId = getSystemId();
        final int line = getLineNumber();
        final int column = getColumnNumber();
        return new Loc(systemId, line, column);
    }
}
}