All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.topologi.diffx.load.DOMRecorder Maven / Gradle / Ivy

Go to download

docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.

There is a newer version: 6.1.2
Show newest version
/*
 * This file is part of the DiffX library.
 *
 * For licensing information please see the file license.txt included in the release.
 * A copy of this licence can also be found at
 *   http://www.opensource.org/licenses/artistic-license-2.0.php
 */
package com.topologi.diffx.load;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;

import org.docx4j.XmlUtils;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

import com.topologi.diffx.config.DiffXConfig;
import com.topologi.diffx.event.AttributeEvent;
import com.topologi.diffx.event.CloseElementEvent;
import com.topologi.diffx.event.OpenElementEvent;
import com.topologi.diffx.event.TextEvent;
import com.topologi.diffx.event.impl.EventFactory;
import com.topologi.diffx.event.impl.ProcessingInstructionEvent;
import com.topologi.diffx.load.text.TextTokenizer;
import com.topologi.diffx.load.text.TokenizerFactory;
import com.topologi.diffx.sequence.EventSequence;
import com.topologi.diffx.sequence.PrefixMapping;

/**
 * Loads a DOM documents as a sequence of events.
 * 
 * 

This class implements the methods {@link Recorder#process(File)} and * {@link Recorder#process(String)} for convenience, but is it much more efficient * to feed this recorder directly with a DOM. * *

This class is not synchronised. * * @author Christophe Lauret * @version 10 May 2010 * * @since 0.7 */ public final class DOMRecorder implements XMLRecorder { // class attributes ------------------------------------------------------------------------------- /** * The DiffX configuration to use */ private DiffXConfig config = new DiffXConfig(); // state variables -------------------------------------------------------------------------------- /** * The factory that will produce events according to the configuration. */ private transient EventFactory efactory; /** * The text tokenizer used by this recorder. */ private transient TextTokenizer tokenizer; /** * The sequence of event for this recorder. */ private transient EventSequence sequence; /** * The sequence of event for this recorder. */ private transient PrefixMapping mapping; /** * The weight of the current element. */ private transient int currentWeight = -1; /** * The stack of events' weight, should only contain Integer. */ private transient List weights = new ArrayList(); /** * Indicates whether the given document is a fragment. * *

An fragment is a portion of XML that is not necessarily well-formed by itself, because the * namespace has been declared higher in the hierarchy, in which if the DOM tree was serialised * it would not produce well-formed XML. * *

This option indicates that the recorder should try to generate the prefix mapping without * the declaration. */ private transient boolean isFragment = true; // methods ---------------------------------------------------------------------------------------- /** * Returns the configuration used by this recorder. * * @return the configuration used by this recorder. */ public DiffXConfig getConfig() { return this.config; } /** * Sets the configuration used by this recorder. * * @param config The configuration used by this recorder. */ public void setConfig(DiffXConfig config) { this.config = config; } /** * {@inheritDoc} */ public EventSequence process(File file) throws LoadingException, IOException { InputStream in = new BufferedInputStream(new FileInputStream(file)); return process(new InputSource(in)); } /** * {@inheritDoc} */ public EventSequence process(String xml) throws LoadingException { return process(new InputSource(new StringReader(xml))); } /** * Runs the recorder on the specified input source. * * @param is The input source. * * @return The recorded sequence of events. * * @throws LoadingException If thrown while parsing. */ public EventSequence process(InputSource is) throws LoadingException { this.isFragment = false; // input source is not a fragment // DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); // dbFactory.setNamespaceAware(this.config.isNamespaceAware()); // dbFactory.setExpandEntityReferences(true); // dbFactory.setValidating(false); try { DocumentBuilder builder = XmlUtils.getNewDocumentBuilder(); Document document = builder.parse(is); return this.process(document); } catch (Exception ex) { throw new LoadingException(ex); } } /** * Processes the given node and returns the corresponding event sequence. * * @param node The W3C DOM node to be processed. * * @return The recorded sequence of events. * * @throws LoadingException If thrown while parsing. */ public EventSequence process(Node node) throws LoadingException { // initialise the state variables. this.efactory = new EventFactory(this.config.isNamespaceAware()); this.tokenizer = TokenizerFactory.get(this.config); this.sequence = new EventSequence(); this.mapping = this.sequence.getPrefixMapping(); // start processing the nodes loadNode(node); this.isFragment = true; return this.sequence; } /** * Processes the given node list and returns the corresponding event sequence. * *

This method only returns the event sequence from the first node in the node list, if the * node list is empty, this method returns an empty sequence. * * @param node The W3C DOM node to be processed. * * @return The recorded sequence of events. * * @throws LoadingException If thrown while parsing. */ public EventSequence process(NodeList node) throws LoadingException { if (node.getLength() == 0) return new EventSequence(); return process(node.item(0)); } // specific loaders --------------------------------------------------------------------- /** * Loads the given node in the current sequence. * * @param node The W3C DOM node to load. * * @throws LoadingException If thrown while parsing. */ private void loadNode(Node node) throws LoadingException { // dispatch to the correct loader performance: order by occurrence if (node instanceof Element) { load((Element)node); } if (node instanceof Text) { load((Text)node); } else if (node instanceof Attr) { load((Attr)node); } else if (node instanceof Document) { load((Document)node); } else if (node instanceof ProcessingInstruction) { load((ProcessingInstruction)node); // all other node types are ignored } } /** * Loads the given document in the current sequence. * * @param document The W3C DOM document node to load. * * @throws LoadingException If thrown while parsing. */ private void load(Document document) throws LoadingException { load(document.getDocumentElement()); } /** * Loads the given element in the current sequence. * * @param element The W3C DOM element node to load. * * @throws LoadingException If thrown while parsing. */ private void load(Element element) throws LoadingException { if (this.currentWeight > 0) { this.weights.add(Integer.valueOf(this.currentWeight)); } this.currentWeight = 1; // namespace handling OpenElementEvent open = null; // namespace aware configuration if (this.config.isNamespaceAware()) { String uri = element.getNamespaceURI() == null? "" : element.getNamespaceURI(); String name = element.getLocalName(); handlePrefixMapping(uri, element.getPrefix()); open = this.efactory.makeOpenElement(uri, name); // not namespace aware } else { open = this.efactory.makeOpenElement(null, element.getNodeName()); } this.sequence.addEvent(open); NamedNodeMap atts = element.getAttributes(); // only 1 attribute, just load it if (atts.getLength() == 1) { load((Attr)atts.item(0)); // several attributes sort them in alphabetical order // TODO: also use URI } else if (atts.getLength() > 1) { String[] names = new String[atts.getLength()]; for (int i = 0; i < atts.getLength(); i++) { Attr attr = (Attr)atts.item(i); names[i] = attr.getName(); } Arrays.sort(names); for (String name : names) { load((Attr)atts.getNamedItem(name)); } } // load all the child nodes NodeList list = element.getChildNodes(); for (int i = 0; i < list.getLength(); i++) { loadNode(list.item(i)); } CloseElementEvent close = this.efactory.makeCloseElement(open); this.sequence.addEvent(close); // handle the weights close.setWeight(this.currentWeight); open.setWeight(this.currentWeight); this.currentWeight += popWeight(); } /** * Loads the given text in the current sequence depending on the configuration. * * @param text The W3C DOM text node to load. * * @throws LoadingException If thrown while parsing. */ private void load(Text text) throws LoadingException { List events = this.tokenizer.tokenize(text.getData()); for (TextEvent e : events) { this.sequence.addEvent(e); } this.currentWeight += events.size(); } /** * Loads the given processing instruction in the current sequence. * * @param pi The W3C DOM PI node to load. * * @throws LoadingException If thrown while parsing. */ private void load(ProcessingInstruction pi) throws LoadingException { this.sequence.addEvent(new ProcessingInstructionEvent(pi.getTarget(), pi.getData())); this.currentWeight++; } /** * Returns the last weight and remove it from the stack. * * @return The weight on top of the stack. */ private int popWeight() { if (this.weights.size() > 0) return ((Integer)this.weights.remove(this.weights.size() - 1)).intValue(); else return 0; } /** * Loads the given attribute in the current sequence. * * @param attr The W3C DOM attribute node to load. */ private void load(Attr attr) { handlePrefixMapping(attr.getNamespaceURI(), attr.getPrefix()); load(this.efactory.makeAttribute(attr.getNamespaceURI(), attr.getLocalName(), attr.getNodeName(), attr.getValue())); } /** * Loads the given attribute in the current sequence. * * @param e An attribute event. */ private void load(AttributeEvent e) { // a namespace declaration, translate the event into a prefix mapping if ("http://www.w3.org/2000/xmlns/".equals(e.getURI())) { this.sequence.mapPrefix(e.getValue(), e.getName()); // a regular attribute } else { e.setWeight(2); this.currentWeight += 2; this.sequence.addEvent(e); } } /** * Handles the prefix mapping. * * If the current process is working on a fragment, * * @param uri The namespace URI. * @param prefix The prefix used for the namespace. */ private void handlePrefixMapping(String uri, String prefix) { if (this.isFragment) { if (this.mapping.getPrefix(uri) != null) return; if (prefix == null && !"".equals(uri)) { this.mapping.add(uri, ""); } else if (prefix != null && !"xmlns".equals(prefix)) { this.mapping.add(uri, prefix); } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy