All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.saxon.s9api.DocumentBuilder Maven / Gradle / Ivy

There is a newer version: 12.5
Show newest version
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2018-2023 Saxonica Limited
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

package net.sf.saxon.s9api;

import net.sf.saxon.Configuration;
import net.sf.saxon.event.*;
import net.sf.saxon.expr.EarlyEvaluationContext;
import net.sf.saxon.expr.JPConverter;
import net.sf.saxon.lib.AugmentedSource;
import net.sf.saxon.lib.ParseOptions;
import net.sf.saxon.lib.Validation;
import net.sf.saxon.om.NoElementsSpaceStrippingRule;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.om.TreeInfo;
import net.sf.saxon.om.TreeModel;
import net.sf.saxon.query.XQueryExpression;
import net.sf.saxon.serialize.SerializationProperties;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.transpile.CSharpModifiers;
import net.sf.saxon.value.Whitespace;

import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import java.io.File;
import java.net.URI;
import java.util.Objects;

/**
 * A document builder holds properties controlling how a Saxon document tree should be built, and
 * provides methods to invoke the tree construction.
 *
 * <p>This class has no public constructor. To construct a {@code DocumentBuilder},
 * use the factory method {@link net.sf.saxon.s9api.Processor#newDocumentBuilder()}.</p>
 *
 * <p>All documents used in a single Saxon query, transformation, or validation episode must
 * be built with the same {@link net.sf.saxon.Configuration}. However, there is no requirement that they
 * should use the same {@code DocumentBuilder}.</p>
 *
 * <p>Sharing of a {@code DocumentBuilder} across multiple threads is not recommended. However,
 * in the current implementation sharing a {@code DocumentBuilder} (once initialized) will only
 * cause problems if a {@code SchemaValidator} is used.</p>
 *
 * @since 9.0
 */

@CSharpModifiers(code = {"internal"})
public class DocumentBuilder {

    private final Configuration config;
    private SchemaValidator schemaValidator;
    private boolean dtdValidation;
    private boolean lineNumbering;
    private TreeModel treeModel = TreeModel.TINY_TREE;
    private WhitespaceStrippingPolicy whitespacePolicy = WhitespaceStrippingPolicy.UNSPECIFIED;
    private URI baseURI;
    private XQueryExecutable projectionQuery;

    /**
     * Create a DocumentBuilder. This is a protected constructor. Users should construct a DocumentBuilder
     * by calling the factory method {@link net.sf.saxon.s9api.Processor#newDocumentBuilder()}.
     *
     * @param config the Saxon configuration
     */

    protected DocumentBuilder(Configuration config) {
        this.config = config;
    }

    /**
     * Set the tree model to be used for documents constructed using this DocumentBuilder.
     * By default, the TinyTree is used (irrespective of the TreeModel set in the underlying
     * Configuration).
     *
     * @param model typically one of the constants {@link net.sf.saxon.om.TreeModel#TINY_TREE},
     *              {@link TreeModel#TINY_TREE_CONDENSED}, or {@link TreeModel#LINKED_TREE}. It can also be
     *              an external object model such as {@link net.sf.saxon.option.xom.XOMObjectModel}
     * @since 9.2
     */

    public void setTreeModel(TreeModel model) {
        this.treeModel = model;
    }

    /**
     * Get the tree model to be used for documents constructed using this DocumentBuilder.
     * By default, the TinyTree is used (irrespective of the TreeModel set in the underlying
     * Configuration).
     *
     * @return the tree model in use: typically one of the constants {@link net.sf.saxon.om.TreeModel#TINY_TREE},
     *         {@link net.sf.saxon.om.TreeModel#TINY_TREE_CONDENSED}, or {@link TreeModel#LINKED_TREE}. However,
     *         in principle a user-defined tree model can be used.
     * @since 9.2
     */

    public TreeModel getTreeModel() {
        return treeModel;
    }

    /**
     * Say whether line and column numbering is to be enabled for documents constructed using this
     * DocumentBuilder. This has the effect that the line and column number in the original source document
     * is maintained in the constructed tree, for each element node (and only for elements). The line and
     * column number in question are generally the position at which the closing "&gt;" of the element
     * start tag appears.
     *
     * <p>By default, line and column numbers are not maintained.</p>
     *
     * <p>Errors relating to document parsing and validation will generally contain line numbers whether or not
     * this option is set, because such errors are detected during document construction.</p>
     *
     * <p>Line numbering is not available for all kinds of source: for example,
     * it is not available when loading from an existing DOM Document.</p>
     *
     * <p>The resulting line and column numbers are accessible to applications using the
     * XPath extension functions {@code saxon:line-number()} and {@code saxon:column-number()} applied to a
     * node, or using the Java methods {@link net.sf.saxon.om.NodeInfo#getLineNumber()} and
     * {@link net.sf.saxon.om.NodeInfo#getColumnNumber()}.</p>
     *
     * <p>Line and column numbers are maintained only for element nodes; the line number
     * returned for any other node will be that of the most recent element. For an element node, the
     * line and column number are generally that of the closing angle bracket at the end of the start tag
     * (this is what a SAX parser notifies).</p>
     *
     * @param option true if line numbers are to be maintained, false otherwise.
     */

    public void setLineNumbering(boolean option) {
        lineNumbering = option;
    }

    /**
     * Ask whether line and column numbering is enabled for documents loaded using this
     * DocumentBuilder.
     *
     * <p>By default, line and column numbering is disabled.</p>
     *
     * <p>Line numbering is not available for all kinds of source: in particular,
     * it is not available when loading from an existing DOM Document.</p>
     *
     * <p>The resulting line and column numbers are accessible to applications using the
     * extension functions {@code saxon:line-number()} and {@code saxon:column-number()} applied to a node,
     * or using the Java methods {@link net.sf.saxon.om.NodeInfo#getLineNumber()} and
     * {@link net.sf.saxon.om.NodeInfo#getColumnNumber()}.</p>
     *
     * <p>Line and column numbers are maintained only for element nodes; the line number
     * returned for any other node will be that of the most recent element. For an element node, the
     * line number is generally that of the closing angle bracket at the end of the start tag
     * (this is what a SAX parser notifies).</p>
     *
     * @return true if line numbering is enabled
     */

    public boolean isLineNumbering() {
        return lineNumbering;
    }

    /**
     * Set options for schema validation. This determines whether schema validation is applied to an input
     * document and whether type annotations in a supplied document are retained. If no schemaValidator
     * is supplied, then schema validation does not take place.
     *
     * <p>This option requires the schema-aware version of the Saxon product (Saxon-EE).</p>
     *
     * <p>The supplied {@code SchemaValidator} is not actually used directly when a document is built
     * using {@link #parse(File, Destination)} or {@link #parse(Source, Destination)}
     * (the {@link SchemaValidator#validate(Source)}
     * method is never called). Rather, some of the properties of the {@code SchemaValidator} are used to
     * control how schema validation is performed by the {@code DocumentBuilder}. The particular properties
     * that take effect include:</p>
     *
     * <ul>
     *     <li>The validation mode (strict or lax)</li>
     *     <li>The required top-level element declaration
     *     (see {@link SchemaValidator#setDocumentElementName(QName)})</li>
     *     <li>The required type of the top-level element
     *     (see {@link SchemaValidator#setDocumentElementTypeName(QName)})</li>
     *     <li>The option {@link SchemaValidator#isUseXsiSchemaLocation()}</li>
     *     <li>The option {@link SchemaValidator#isExpandAttributeDefaults()}</li>
     *     <li>Validation parameters set using {@link SchemaValidator#setParameter}</li>
     *     <li>The {@link net.sf.saxon.lib.InvalidityHandler}</li>
     * </ul>
     *
     * <p>Properties that do NOT have any effect include:</p>
     *
     * <ul>
     *     <li>The option {@link SchemaValidator#isCollectStatistics()}</li>
     * </ul>
     *
     * @param validator the SchemaValidator to be used
     */

    public void setSchemaValidator(SchemaValidator validator) {
        schemaValidator = validator;
    }

    /**
     * Get the SchemaValidator used to validate documents loaded using this
     * DocumentBuilder.
     *
     * @return the SchemaValidator if one has been set; otherwise null.
     */

    public SchemaValidator getSchemaValidator() {
        return schemaValidator;
    }

    /**
     * Set whether DTD validation should be applied to documents loaded using this
     * DocumentBuilder.
     *
     * <p>By default, no DTD validation takes place.</p>
     *
     * @param option true if DTD validation is to be applied to the document
     */

    public void setDTDValidation(boolean option) {
        dtdValidation = option;
    }

    /**
     * Ask whether DTD validation is to be applied to documents loaded using this DocumentBuilder.
     *
     * @return true if DTD validation is to be applied
     */

    public boolean isDTDValidation() {
        return dtdValidation;
    }

    /**
     * Set the whitespace stripping policy applied when loading a document
     * using this DocumentBuilder.
     *
     * <p>If DTD or schema validation is applied, the policy must be
     * {@link WhitespaceStrippingPolicy#IGNORABLE} or left as
     * {@link WhitespaceStrippingPolicy#UNSPECIFIED} (a policy whose {@code ordinal()} equals
     * {@link Whitespace#XSLT} is also accepted). Any other value results
     * in a {@link SaxonApiException} when a document is subsequently built or parsed, for example
     * from the {@link #build(File)} method.</p>
     *
     * @param policy the policy for stripping whitespace-only text nodes from
     *               source documents. A null value is treated as if no policy had been specified.
     */

    public void setWhitespaceStrippingPolicy(WhitespaceStrippingPolicy policy) {
        whitespacePolicy = policy;
    }

    /**
     * Get the whitespace stripping policy applied when loading a document
     * using this DocumentBuilder.
     *
     * @return the policy for stripping whitespace-only text nodes
     */

    public WhitespaceStrippingPolicy getWhitespaceStrippingPolicy() {
        return whitespacePolicy;
    }

    /**
     * Set the base URI of a document loaded using this DocumentBuilder.
     *
     * <p>This is used for resolving any relative URIs appearing
     * within the document, for example in references to DTDs and external entities.</p>
     *
     * <p>This information is required when the document is loaded from a source that does not
     * provide an intrinsic URI, notably when loading from a Stream or a DOMSource. The value is
     * ignored when loading from a source that does have an intrinsic base URI.</p>
     *
     * @param uri the base URI of documents loaded using this DocumentBuilder. This
     *            must be an absolute URI.
     * @throws NullPointerException     if the supplied URI is null
     * @throws IllegalArgumentException if the baseURI supplied is not an absolute URI
     */

    public void setBaseURI(URI uri) {
        Objects.requireNonNull(uri, "uri");
        if (!uri.isAbsolute()) {
            throw new IllegalArgumentException("Supplied base URI must be absolute");
        }
        baseURI = uri;
    }

    /**
     * Get the base URI of documents loaded using this DocumentBuilder when no other URI is available.
     *
     * @return the base URI to be used, or null if no value has been set.
     */

    public URI getBaseURI() {
        return baseURI;
    }

    /**
     * Set a compiled query to be used for implementing document projection. The effect of using
     * this option is that the tree constructed by the DocumentBuilder contains only those parts
     * of the source document that are needed to answer this query. Running this query against
     * the projected document should give the same results as against the raw document, but the
     * projected document typically occupies significantly less memory. It is permissible to run
     * other queries against the projected document, but unless they are carefully chosen, they
     * will give the wrong answer, because the document being used is different from the original.
     *
     * <p>The query should be written to use the projected document as its initial context item.
     * For example, if the query is {@code //ITEM[COLOR='blue']}, then only {@code ITEM}
     * elements and their {@code COLOR} children will be retained in the projected document.</p>
     *
     * <p>This facility is only available in Saxon-EE; if the facility is not available,
     * calling this method has no effect.</p>
     *
     * @param query the compiled query used to control document projection
     * @since 9.3
     */

    public void setDocumentProjectionQuery(XQueryExecutable query) {
        this.projectionQuery = query;
    }

    /**
     * Get the compiled query to be used for implementing document projection.
     *
     * @return the query set using {@link #setDocumentProjectionQuery} if this
     *         has been called, or null otherwise
     * @since 9.3. In 9.4 the unused and undocumented first argument is removed.
     */

    public XQueryExecutable getDocumentProjectionQuery() {
        return this.projectionQuery;
    }

    /**
     * Load an XML document, to create a tree representation of the document in memory.
     *
     * @param source A JAXP Source object identifying the source of the document. This can always be
     *               a {@link javax.xml.transform.stream.StreamSource} or a
     *               {@link javax.xml.transform.sax.SAXSource}.
     *               Some kinds of Source are consumed by this method, and should only be used once.
     *               <p>If a SAXSource is supplied, the XMLReader held within the SAXSource may be modified
     *               (by setting features and properties) to reflect the options selected on this
     *               DocumentBuilder.</p>
     *               <p>If the source is an instance of {@link net.sf.saxon.om.NodeInfo} then the subtree
     *               rooted at this node will be copied (applying schema validation if requested) to create
     *               a new tree.</p>
     *               <p>Saxon also accepts an instance of {@link javax.xml.transform.stax.StAXSource} or
     *               {@link net.sf.saxon.pull.PullSource}, which can be used to supply a document that is
     *               to be parsed using a StAX parser.</p>
     *               <p>(9.8.0.5) This method now (once again) accepts an instance of
     *               {@link net.sf.saxon.lib.AugmentedSource}.
     *               If an {@code AugmentedSource} is supplied, the properties of the {@code AugmentedSource}
     *               take precedence over any properties set on this {@code DocumentBuilder}, which in turn
     *               take precedence over properties set at the {@link Processor} or {@link Configuration}
     *               level. The concept of "taking precedence" is explained more fully at
     *               {@link ParseOptions#merge(ParseOptions)}.</p>
     * @return An {@code XdmNode}. This will be
     *         the document node at the root of the tree of the resulting in-memory document.
     * @throws NullPointerException     if the source argument is null
     * @throws IllegalArgumentException if the kind of source is not recognized
     * @throws SaxonApiException        if any other failure occurs building the document, for example
     *                                  a parsing error
     */

    public XdmNode build(Source source) throws SaxonApiException {
        Objects.requireNonNull(source, "source");
        ParseOptions options = getParseOptions(source);
        try {
            TreeInfo doc = config.buildDocumentTree(source, options);
            return new XdmNode(doc.getRootNode());
        } catch (XPathException e) {
            throw new SaxonApiException(e);
        }
    }

    /**
     * Assemble the {@link ParseOptions} for a build or parse operation, starting from the
     * Configuration's defaults and overlaying the properties set on this DocumentBuilder.
     *
     * <p>Side effect: if the supplied source has no system ID and a base URI has been set
     * on this DocumentBuilder, the base URI is written to the source as its system ID.</p>
     *
     * @param source the source about to be processed; must not be null
     * @return the parse options to be used
     * @throws SaxonApiException if DTD or schema validation has been requested together with a
     *                           whitespace stripping policy that is incompatible with validation
     */

    private ParseOptions getParseOptions(Source source) throws SaxonApiException {
        // A null policy is treated as "unspecified", matching the null-tolerant handling further down;
        // previously a null policy caused a NullPointerException on the ordinal() call here.
        boolean policyCompatibleWithValidation =
                whitespacePolicy == null
                        || whitespacePolicy == WhitespaceStrippingPolicy.UNSPECIFIED
                        || whitespacePolicy == WhitespaceStrippingPolicy.IGNORABLE
                        || whitespacePolicy.ordinal() == Whitespace.XSLT;
        if (!policyCompatibleWithValidation) {
            if (dtdValidation) {
                throw new SaxonApiException("When DTD validation is used, the whitespace stripping policy must be IGNORABLE");
            }
            if (schemaValidator != null) {
                throw new SaxonApiException("When schema validation is used, the whitespace stripping policy must be IGNORABLE");
            }
        }

        ParseOptions options = config.getParseOptions()
                .withDTDValidationMode(dtdValidation ? Validation.STRICT : Validation.STRIP);

        // Only selected properties of the SchemaValidator are copied; the validator itself is not invoked.
        if (schemaValidator != null) {
            options = options.withSchemaValidationMode(schemaValidator.isLax() ? Validation.LAX : Validation.STRICT);
            if (schemaValidator.getDocumentElementName() != null) {
                QName qn = schemaValidator.getDocumentElementName();
                options = options.withTopLevelElement(qn.getStructuredQName());
            }
            if (schemaValidator.getDocumentElementType() != null) {
                options = options.withTopLevelType(schemaValidator.getDocumentElementType());
            }
            options = options.withExpandAttributeDefaults(schemaValidator.isExpandAttributeDefaults());
            options = options.withUseXsiSchemaLocation(schemaValidator.isUseXsiSchemaLocation());
            options = options.withValidationParams(schemaValidator.getValidationParameters());
            options = options.withInvalidityHandler(schemaValidator.getInvalidityHandler());
        }

        if (treeModel != null) {
            options = options.withModel(treeModel);
        }

        if (whitespacePolicy != null && whitespacePolicy != WhitespaceStrippingPolicy.UNSPECIFIED) {
            int option = whitespacePolicy.ordinal();
            if (option == Whitespace.XSLT) {
                // For this kind of policy the stripping is done by a filter supplied by the policy itself,
                // so the built-in stripping rule is set to strip nothing.
                options = options.withSpaceStrippingRule(NoElementsSpaceStrippingRule.getInstance());
                options = options.withFilter(whitespacePolicy.makeStripper());
            } else {
                options = options.withSpaceStrippingRule(whitespacePolicy.getSpaceStrippingRule());
            }
        }

        options = options.withLineNumbering(lineNumbering);

        // The configured base URI is only a fallback: an intrinsic system ID on the source wins.
        if (source.getSystemId() == null && baseURI != null) {
            source.setSystemId(baseURI.toString());
        }

        if (projectionQuery != null) {
            XQueryExpression exp = projectionQuery.getUnderlyingCompiledQuery();
            FilterFactory ff = config.makeDocumentProjector(exp);
            if (ff != null) {
                options = options.withFilter(ff);
            }
        }

        // Merged last, so that options carried by an AugmentedSource are combined with those set here.
        if (source instanceof AugmentedSource) {
            options = options.merge(((AugmentedSource) source).getParseOptions());
        }
        return options;
    }

    /**
     * Build a document from a supplied XML file.
     *
     * @param file the supplied file
     * @return the XdmNode representing the root of the document tree
     * @throws SaxonApiException if any failure occurs retrieving or parsing the document
     */

    public XdmNode build(File file) throws SaxonApiException {
        return build(new StreamSource(file));
    }

    /**
     * Get an {@link org.xml.sax.ContentHandler} that may be used to build the document programmatically.
     *
     * @return a newly constructed {@link BuildingContentHandler}, which implements the ContentHandler
     *         interface. If schema validation has been requested for this DocumentBuilder, then the document
     *         constructed using the ContentHandler will be validated as it is written.
     *         <p>Note that the returned ContentHandler expects namespace scopes to be indicated
     *         explicitly by calls to {@link org.xml.sax.ContentHandler#startPrefixMapping} and
     *         {@link org.xml.sax.ContentHandler#endPrefixMapping}.</p>
     *         <p>If the stream of events supplied to the ContentHandler does not constitute
     *         a well formed (and namespace-well-formed) document, the effect is undefined; Saxon may fail
     *         to detect the error, and construct an unusable tree.</p>
     * @throws SaxonApiException if any failure occurs
     * @since 9.3
     */

    public BuildingContentHandler newBuildingContentHandler() throws SaxonApiException {
        PipelineConfiguration pipe = config.makePipelineConfiguration();
        Builder builder = treeModel.makeBuilder(pipe);
        if (baseURI != null) {
            builder.setSystemId(baseURI.toASCIIString());
        }
        builder.setLineNumbering(lineNumbering);
        Receiver r = builder;
        r = new NamespaceReducer(r);
        r = injectValidator(r, builder);
        return new BuildingContentHandlerImpl(r, builder);
    }

    /**
     * If a SchemaValidator has been set, obtain its receiver and place it in front of the
     * supplied receiver; otherwise return the supplied receiver unchanged.
     *
     * @param r       the receiver that is to receive the (possibly validated) events
     * @param builder the tree builder, whose pipeline configuration is used for the validator
     * @return the validating receiver if schema validation was requested, otherwise {@code r}
     * @throws SaxonApiException if the validating receiver cannot be obtained
     */

    private Receiver injectValidator(Receiver r, Builder builder) throws SaxonApiException {
        if (schemaValidator != null) {
            PipelineConfiguration pipe = builder.getPipelineConfiguration();
            Receiver val = schemaValidator.getReceiver(pipe, config.obtainDefaultSerializationProperties());
            val.setPipelineConfiguration(pipe);
            // NOTE(review): the validator is only linked to r when it is a ProxyReceiver;
            // presumably that is always the case in practice - confirm.
            if (val instanceof ProxyReceiver) {
                ((ProxyReceiver) val).setUnderlyingReceiver(r);
            }
            return val;
        }
        return r;
    }

    /**
     * Private implementation of BuildingContentHandler
     */

    private static class BuildingContentHandlerImpl extends ReceivingContentHandler implements BuildingContentHandler {

        private final Builder builder;

        public BuildingContentHandlerImpl(Receiver r, Builder b) {
            setReceiver(r);
            setPipelineConfiguration(r.getPipelineConfiguration());
            this.builder = b;
        }

        @Override
        public XdmNode getDocumentNode() {
            return new XdmNode(builder.getCurrentRoot());
        }
    }

    /**
     * Get an {@link javax.xml.stream.XMLStreamWriter} that may be used to build the document programmatically.
     *
     * @return a newly constructed {@link BuildingStreamWriter}, which implements the XMLStreamWriter
     *         interface. If schema validation has been requested for this DocumentBuilder, then the document
     *         constructed using the XMLStreamWriter will be validated as it is written.
     *         <p>If the stream of events supplied to the XMLStreamWriter does not constitute
     *         a well formed (and namespace-well-formed) document, the effect is undefined; Saxon may fail
     *         to detect the error, and construct an unusable tree.</p>
     * @throws SaxonApiException if any failure occurs
     * @since 9.3
     */

    public BuildingStreamWriterImpl newBuildingStreamWriter() throws SaxonApiException {
        PipelineConfiguration pipe = config.makePipelineConfiguration();
        Builder builder = treeModel.makeBuilder(pipe);
        builder.setLineNumbering(lineNumbering);
        Receiver r = builder;
        r = new NamespaceReducer(r);
        r = injectValidator(r, builder);
        return new BuildingStreamWriterImpl(r, builder);
    }

    /**
     * Create a node by wrapping a recognized external node from a supported object model.
     *
     * <p>If the supplied object implements the {@link net.sf.saxon.om.NodeInfo} interface then it
     * will be wrapped as an {@code XdmNode} without copying and without change. The {@code NodeInfo}
     * must have been created using a {@link net.sf.saxon.Configuration} compatible
     * with the one used by this {@code Processor} (specifically, one that uses the same
     * {@link net.sf.saxon.om.NamePool}).</p>
     *
     * <p>To wrap nodes from other object models, such as DOM, the support module for the external object
     * model must be on the class path and registered with the Saxon configuration. The support modules
     * for DOM, JDOM, DOM4J and XOM are registered automatically if they can be found on the classpath.</p>
     *
     * <p>It is best to avoid calling this method repeatedly to wrap different nodes in the same document.
     * Each such wrapper conceptually creates a new XDM tree instance with its own identity. Although the
     * memory is shared, operations that rely on node identity might not have the expected result. It is
     * best to create a single wrapper for the document node, and then to navigate to the other nodes in the
     * tree using S9API interfaces.</p>
     *
     * @param node the node in the external tree representation. Either an instance of
     *             {@link net.sf.saxon.om.NodeInfo}, or an instance of a node in an external object model.
     *             Nodes in other object models (such as DOM, JDOM, etc) are recognized only if
     *             the support module for the external object model is known to the Configuration.
     * @return the supplied node wrapped as an XdmNode
     * @throws NullPointerException     if the supplied node is null
     * @throws IllegalArgumentException if the type of object supplied is not recognized. This may be because
     *                                  node was created using a different Saxon Processor, or because the
     *                                  required code for the external object model is not on the class path.
     *                                  Where the failure was triggered by an underlying exception, that
     *                                  exception is available as the cause.
     */

    public XdmNode wrap(Object node) throws IllegalArgumentException {
        Objects.requireNonNull(node, "node");
        if (node instanceof NodeInfo) {
            NodeInfo nodeInfo = (NodeInfo) node;
            if (nodeInfo.getConfiguration().isCompatible(config)) {
                return new XdmNode(nodeInfo);
            } else {
                throw new IllegalArgumentException("Supplied NodeInfo was created using a different Configuration");
            }
        } else {
            try {
                JPConverter converter = JPConverter.allocate(node.getClass(), null, config);
                NodeInfo nodeInfo = (NodeInfo) converter.convert(node, new EarlyEvaluationContext(config));
                return XdmItem.wrapItem(nodeInfo);
            } catch (XPathException e) {
                // Chain the cause so the underlying conversion failure is not lost
                throw new IllegalArgumentException(e.getMessage(), e);
            } catch (ClassCastException e) {
                // The converter produced something that is not a node
                throw new IllegalArgumentException("Class " + node.getClass() + " is not a recognized external node type", e);
            }
        }
    }

    /**
     * Parse a source document, sending it to a supplied {@link Destination}.
     *
     * <p>The process is streamed; no tree is constructed in memory.</p>
     *
     * @param source      The source document to be parsed
     * @param destination The destination to which the document is to be sent
     * @throws NullPointerException if the source or the destination is null
     * @throws SaxonApiException    if parsing fails, or if the destination reports an error
     */

    public void parse(Source source, Destination destination) throws SaxonApiException {
        Objects.requireNonNull(source, "source");
        Objects.requireNonNull(destination, "destination");
        try {
            ParseOptions options = getParseOptions(source);
            PipelineConfiguration pipe = config.makePipelineConfiguration();
            Sender.send(source, destination.getReceiver(pipe, new SerializationProperties()), options);
        } catch (XPathException e) {
            throw new SaxonApiException(e);
        }
    }

    /**
     * Parse a source document from a File, sending it to a supplied {@link Destination}.
     *
     * <p>The process is streamed; no tree is constructed in memory.</p>
     *
     * @param file        The file containing the XML source document to be parsed
     * @param destination The destination to which the document is to be sent
     * @throws SaxonApiException if parsing fails, or if the destination reports an error
     */

    public void parse(File file, Destination destination) throws SaxonApiException {
        parse(new StreamSource(file), destination);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy