com.topologi.diffx.load.DOMRecorder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of docx4j Show documentation
docx4j is a library which helps you to work with the Office Open XML file format as used in docx documents, pptx presentations, and xlsx spreadsheets.
There is a newer version: 6.1.2
Show newest version
package com.topologi.diffx.load;

/* ============================================================================
 * ARTISTIC LICENCE
 * 
 * Preamble
 * 
 * The intent of this document is to state the conditions under which a Package
 * may be copied, such that the Copyright Holder maintains some semblance of 
 * artistic control over the development of the package, while giving the users
 * of the package the right to use and distribute the Package in a more-or-less
 * customary fashion, plus the right to make reasonable modifications.
 *
 * Definitions:
 *  - "Package" refers to the collection of files distributed by the Copyright 
 *    Holder, and derivatives of that collection of files created through 
 *    textual modification.
 *  - "Standard Version" refers to such a Package if it has not been modified, 
 *    or has been modified in accordance with the wishes of the Copyright 
 *    Holder.
 *  - "Copyright Holder" is whoever is named in the copyright or copyrights 
 *    for the package.
 *  - "You" is you, if you're thinking about copying or distributing this 
 *    Package.
 *  - "Reasonable copying fee" is whatever you can justify on the basis of 
 *    media cost, duplication charges, time of people involved, and so on. 
 *    (You will not be required to justify it to the Copyright Holder, but only 
 *    to the computing community at large as a market that must bear the fee.)
 *  - "Freely Available" means that no fee is charged for the item itself, 
 *    though there may be fees involved in handling the item. It also means 
 *    that recipients of the item may redistribute it under the same conditions
 *    they received it.
 *
 * 1. You may make and give away verbatim copies of the source form of the 
 *    Standard Version of this Package without restriction, provided that you 
 *    duplicate all of the original copyright notices and associated 
 *    disclaimers.
 *
 * 2. You may apply bug fixes, portability fixes and other modifications 
 *    derived from the Public Domain or from the Copyright Holder. A Package 
 *    modified in such a way shall still be considered the Standard Version.
 *
 * 3. You may otherwise modify your copy of this Package in any way, provided 
 *    that you insert a prominent notice in each changed file stating how and 
 *    when you changed that file, and provided that you do at least ONE of the 
 *    following:
 * 
 *    a) place your modifications in the Public Domain or otherwise make them 
 *       Freely Available, such as by posting said modifications to Usenet or 
 *       an equivalent medium, or placing the modifications on a major archive 
 *       site such as ftp.uu.net, or by allowing the Copyright Holder to 
 *       include your modifications in the Standard Version of the Package.
 * 
 *    b) use the modified Package only within your corporation or organization.
 *
 *    c) rename any non-standard executables so the names do not conflict with 
 *       standard executables, which must also be provided, and provide a 
 *       separate manual page for each non-standard executable that clearly 
 *       documents how it differs from the Standard Version.
 * 
 *    d) make other distribution arrangements with the Copyright Holder.
 *
 * 4. You may distribute the programs of this Package in object code or 
 *    executable form, provided that you do at least ONE of the following:
 * 
 *    a) distribute a Standard Version of the executables and library files, 
 *       together with instructions (in the manual page or equivalent) on where
 *       to get the Standard Version.
 *
 *    b) accompany the distribution with the machine-readable source of the 
 *       Package with your modifications.
 * 
 *    c) accompany any non-standard executables with their corresponding 
 *       Standard Version executables, giving the non-standard executables 
 *       non-standard names, and clearly documenting the differences in manual 
 *       pages (or equivalent), together with instructions on where to get 
 *       the Standard Version.
 *
 *    d) make other distribution arrangements with the Copyright Holder.
 *
 * 5. You may charge a reasonable copying fee for any distribution of this 
 *    Package. You may charge any fee you choose for support of this Package. 
 *    You may not charge a fee for this Package itself. However, you may 
 *    distribute this Package in aggregate with other (possibly commercial) 
 *    programs as part of a larger (possibly commercial) software distribution 
 *    provided that you do not advertise this Package as a product of your own.
 *
 * 6. The scripts and library files supplied as input to or produced as output 
 *    from the programs of this Package do not automatically fall under the 
 *    copyright of this Package, but belong to whomever generated them, and may
 *    be sold commercially, and may be aggregated with this Package.
 *
 * 7. C or perl subroutines supplied by you and linked into this Package shall 
 *    not be considered part of this Package.
 *
 * 8. The name of the Copyright Holder may not be used to endorse or promote 
 *    products derived from this software without specific prior written 
 *    permission.
 * 
 * 9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED 
 *    WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 
 *    MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 * ============================================================================
 */

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

import com.topologi.diffx.Docx4jDriver;
import com.topologi.diffx.config.DiffXConfig;
import com.topologi.diffx.event.AttributeEvent;
import com.topologi.diffx.event.CloseElementEvent;
import com.topologi.diffx.event.OpenElementEvent;
import com.topologi.diffx.event.impl.EventFactory;
import com.topologi.diffx.event.impl.ProcessingInstructionEvent;
import com.topologi.diffx.load.text.TextTokeniser;
import com.topologi.diffx.load.text.TokeniserFactory;
import com.topologi.diffx.sequence.EventSequence;

/**
 * Loads a DOM document as a sequence of events.
 * 
 * <p>This class implements the methods {@link Recorder#process(File)} and
 * {@link Recorder#process(String)} for convenience, but it is much more efficient
 * to feed this recorder directly with a DOM.
 * 
 * <p>This class is not synchronised.
 * 
 * @author Christophe Lauret
 * @version 26 April 2005
 */
public final class DOMRecorder implements XMLRecorder {

  /**
   * Set to <code>true</code> to show debug info.
   * 
   * NOTE(review): this flag is not read anywhere in the visible source.
   */
  private static final boolean DEBUG = false;

// class attributes ---------------------------------------------------------------------

  /**
   * The DiffX configuration to use; a default configuration until
   * {@link #setConfig(DiffXConfig)} is called.
   */
  private DiffXConfig config = new DiffXConfig(); 

// state variables ----------------------------------------------------------------------

  /**
   * The factory that will produce events according to the configuration.
   * 
   * It is re-created at the start of every call to {@link #process(Node)}.
   */
  private transient EventFactory efactory = null;

  /**
   * The factory that will produce text tokenisers according to the configuration.
   * 
   * It is re-created at the start of every call to {@link #process(Node)}.
   */
  private transient TokeniserFactory tfactory = null;

  /**
   * The sequence of events for this recorder.
   * 
   * A new sequence is created at the start of every call to {@link #process(Node)}.
   */
  private transient EventSequence sequence = null;

  /**
   * The weight of the current element.
   * 
   * It is set to 1 when an element is opened, incremented by 1 for each text token
   * and processing instruction, and by 2 for each regular attribute.
   */
  private transient int currentWeight = -1;

  /**
   * The stack of events' weight, should only contain <code>Integer</code>.
   * 
   * The weight accumulated so far is pushed when an element is opened and popped
   * again when that element is closed.
   */
  private transient ArrayList weights = new ArrayList();

  /**
   * Indicates whether the given document is a fragment.
   * 
   * <p>A fragment is a portion of XML that is not necessarily well-formed by
   * itself, because the namespace has been declared higher in the hierarchy, in
   * which case serialising the DOM tree would not produce well-formed XML.
   * 
   * <p>This option indicates that the recorder should try to generate the prefix
   * mapping without the declaration: when it is <code>true</code> and the
   * configuration is namespace aware, the prefix of each element is mapped to its
   * namespace URI in the sequence.
   */
  private transient boolean isFragment = true;

// methods ------------------------------------------------------------------------------

  /**
   * Gives access to the DiffX configuration currently held by this recorder.
   * 
   * @return the configuration this recorder uses when loading.
   */
  public DiffXConfig getConfig() {
    return this.config;
  }

  /**
   * Replaces the DiffX configuration held by this recorder.
   * 
   * The new configuration is read the next time one of the <code>process</code>
   * methods is invoked.
   * 
   * @param config The configuration this recorder should use from now on.
   */
  public void setConfig(DiffXConfig config) {
    this.config = config;
  }

  /**
   * Runs the recorder on the specified file.
   * 
   * The file is read through a buffered stream which is always closed before this
   * method returns, whether or not the parsing succeeded.
   * 
   * @param file The XML file to load.
   * 
   * @return The recorded sequence of events.
   * 
   * @throws LoadingException If thrown whilst parsing.
   * @throws IOException      If the file cannot be opened.
   * 
   * @see Recorder#process(java.io.File)
   */
  public EventSequence process(File file) throws LoadingException, IOException {
    InputStream in = new BufferedInputStream(new FileInputStream(file));
    try {
      return process(new InputSource(in));
    } finally {
      // the document has been fully parsed (or parsing has failed) at this point,
      // so the stream is no longer needed
      try {
        in.close();
      } catch (IOException ignored) {
        // the stream was only read from; a failure to close it must not mask
        // the result or the exception of the parsing itself
      }
    }
  }

  /**
   * Runs the recorder on XML held in a string.
   * 
   * The string is wrapped in a reader-backed input source and handed over to
   * {@link #process(InputSource)}.
   * 
   * @param xml The XML text to load.
   * 
   * @return The recorded sequence of events.
   * 
   * @throws LoadingException If thrown whilst parsing.
   * 
   * @see Recorder#process(java.lang.String)
   */
  public EventSequence process(String xml) throws LoadingException {
    StringReader reader = new StringReader(xml);
    InputSource source = new InputSource(reader);
    return process(source);
  }

  /**
   * Runs the recorder on the specified input source.
   * 
   * The source is parsed into a DOM document with a non-validating parser whose
   * namespace awareness follows the configuration, and the resulting document is
   * then loaded with {@link #process(Node)}.
   * 
   * The input is treated as a complete document rather than a fragment for the
   * duration of the call; the fragment flag is restored afterwards even when the
   * parsing fails.
   * 
   * @param is The input source.
   * 
   * @return The recorded sequence of events.
   * 
   * @throws LoadingException If thrown whilst parsing; any other exception raised
   *                          by the parser is wrapped in a loading exception.
   */
  public EventSequence process(InputSource is) throws LoadingException {
    // NOTE(review): entity references are expanded and neither DTDs nor external
    // entities are disabled on this factory. If the input can come from an
    // untrusted source this is exposed to XXE; consider hardening the parser.
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    dbFactory.setNamespaceAware(config.isNamespaceAware());
    dbFactory.setExpandEntityReferences(true);
    dbFactory.setValidating(false);
    this.isFragment = false; // input source is not a fragment
    try {
      DocumentBuilder builder = dbFactory.newDocumentBuilder();
      Document document = builder.parse(is);
      return this.process(document);
    } catch (LoadingException ex) {
      // already the right type: do not wrap it a second time
      throw ex;
    } catch (Exception ex) {
      throw new LoadingException(ex);
    } finally {
      // never leave the recorder in 'document' mode, otherwise a later call with
      // a DOM fragment would not get its prefix mappings generated
      this.isFragment = true;
    }
  }

  /**
   * Processes the given node and returns the corresponding event sequence.
   * 
   * Every call starts from a clean state: new factories are created from the
   * current configuration, a new sequence is started and the weight bookkeeping is
   * reset, so that a previous run (in particular one that failed half-way) cannot
   * influence this one.
   * 
   * @param node The W3C DOM node to be processed.
   * 
   * @return The recorded sequence of events.
   * 
   * @throws LoadingException If thrown whilst parsing.
   */
  public EventSequence process(Node node) throws LoadingException {
    // initialise the state variables.
    this.efactory = new EventFactory(config.isNamespaceAware());
    this.tfactory = new TokeniserFactory(config);
    this.sequence = new EventSequence();
    this.currentWeight = -1;
    this.weights.clear();
    try {
      // start processing the nodes
      loadNode(node);
      return this.sequence;
    } finally {
      // back to the default mode, even if the loading failed
      this.isFragment = true;
    }
  }

// specific loaders ---------------------------------------------------------------------

  /**
   * Loads the given node in the current sequence.
   * 
   * The node is dispatched to the loader matching its type; exactly one loader is
   * invoked per node. Elements, text nodes, attributes, documents and processing
   * instructions are supported, any other type of node is silently ignored.
   * 
   * @param node The W3C DOM node to load.
   * 
   * @throws LoadingException If thrown whilst parsing.
   */
  private void loadNode(Node node) throws LoadingException {
    // dispatch to the correct loader
    // performance: order by occurrence
    if (node instanceof Element) {
      load((Element)node);
    } else if (node instanceof Text) {
      load((Text)node);
    } else if (node instanceof Attr) {
      load((Attr)node);
    } else if (node instanceof Document) {
      load((Document)node);
    } else if (node instanceof ProcessingInstruction) {
      load((ProcessingInstruction)node);
    }
    // all other node types are ignored
  }

  /**
   * Loads the given document in the current sequence.
   * 
   * Only the document element is loaded; the other children of the document node
   * are not visited. A document without a document element contributes no event.
   * 
   * @param document The W3C DOM document node to load.
   * 
   * @throws LoadingException If thrown whilst parsing.
   */
  private void load(Document document) throws LoadingException {
    Element root = document.getDocumentElement();
    // an empty document (e.g. freshly created) has no document element
    if (root == null) {
      return;
    }
    load(root);
  }

  /**
   * Loads the given element in the current sequence.
   * 
   * The events are added in this order: the open element event, the attributes
   * (sorted by name when there are several), the events of all the child nodes and
   * finally the close element event. The open and close events are both given the
   * weight accumulated while loading the element, and that weight is then added
   * to the weight of the enclosing element.
   * 
   * When the configuration is namespace aware and the recorder is loading a
   * fragment, the prefix of the element (if any) is mapped to its namespace URI.
   * 
   * @param element The W3C DOM element node to load.
   * 
   * @throws LoadingException If thrown whilst parsing.
   */
  private void load(Element element) throws LoadingException {
    // put aside the weight accumulated so far by the enclosing element
    if (currentWeight > 0) {
      weights.add(Integer.valueOf(currentWeight));
    }
    currentWeight = 1;
    OpenElementEvent open = null;
    // namespace aware configuration
    if (config.isNamespaceAware()) {
      String uri = element.getNamespaceURI();
      if (uri == null) {
        uri = "";
      }
      String name = element.getLocalName();
      // elements created with a DOM Level 1 method have no local name
      if (name == null) {
        name = element.getNodeName();
      }
      if (isFragment) {
        String prefix = element.getPrefix();
        if (prefix != null) {
          this.sequence.mapPrefix(uri, prefix);
        }
      }
      open = efactory.makeOpenElement(uri, name);
    // not namespace aware
    } else {
      open = efactory.makeOpenElement(null, element.getNodeName());
    }

    this.sequence.addEvent(open);
    NamedNodeMap atts = element.getAttributes();
    int count = atts.getLength();
    // only 1 attribute, just load it
    if (count == 1) {
      load((Attr)atts.item(0));
    // several attributes sort them in alphabetical order
    // TODO: also use URI
    } else if (count > 1) {
      String[] names = new String[count];
      for (int i = 0; i < count; i++) {
        Attr attr = (Attr)atts.item(i);
        names[i] = attr.getName();
      }
      Arrays.sort(names);
      for (int i = 0; i < names.length; i++) {
        load((Attr)atts.getNamedItem(names[i]));
      }
    }
    // load all the child nodes
    NodeList list = element.getChildNodes();
    for (int i = 0; i < list.getLength(); i++) {
      loadNode(list.item(i));
    }
    CloseElementEvent close = efactory.makeCloseElement(open);
    this.sequence.addEvent(close);
    // handle the weights: both events carry the weight of the whole element,
    // which is then added to the weight of the enclosing element
    close.setWeight(this.currentWeight);
    open.setWeight(this.currentWeight);
    this.currentWeight += popWeight();
  }

  /**
   * Tokenises the content of the given text node and appends the tokens to the
   * current sequence.
   * 
   * The tokeniser is obtained from the tokeniser factory, which is itself built
   * from the configuration. Each token added increases the current weight by one.
   * 
   * @param text The W3C DOM text node to load.
   * 
   * @throws LoadingException If thrown whilst parsing.
   */
  private void load(Text text) throws LoadingException {
    String data = text.getData();
    TextTokeniser tokeniser = tfactory.makeTokeniser(data);
    int consumed = 0;
    // NOTE(review): the token count is deliberately re-read on every iteration;
    // whether countTokens() is a fixed total or a remaining count is not visible
    // from this file - confirm against the TextTokeniser contract.
    while (consumed < tokeniser.countTokens()) {
      this.sequence.addEvent(tokeniser.nextToken());
      this.currentWeight += 1;
      consumed++;
    }
  }

  /**
   * Appends an event for the given processing instruction to the current sequence.
   * 
   * The event is built from the target and the data of the instruction, and the
   * current weight is increased by one.
   * 
   * @param pi The W3C DOM PI node to load.
   * 
   * @throws LoadingException If thrown whilst parsing.
   */
  private void load(ProcessingInstruction pi) throws LoadingException {
    ProcessingInstructionEvent event =
        new ProcessingInstructionEvent(pi.getTarget(), pi.getData());
    this.sequence.addEvent(event);
    this.currentWeight += 1;
  }

  /**
   * Removes the weight on top of the stack and returns it.
   * 
   * @return The weight that was on top of the stack, or 0 if the stack is empty.
   */
  private int popWeight() {
    int top = this.weights.size() - 1;
    // nothing was put aside
    if (top < 0) {
      return 0;
    }
    Integer weight = (Integer)this.weights.remove(top);
    return weight.intValue();
  }

  /**
   * Adds the attributes of the given element to the current sequence.
   * 
   * A single attribute is loaded directly. When there are several, an attribute
   * event is created for each of them, the events are sorted with an
   * {@link AttributeComparator} and loaded in that order. An element without
   * attributes adds nothing.
   * 
   * NOTE(review): this method is not invoked anywhere in the visible source.
   * 
   * @param element The element which attributes have to be handled.
   */
  private void handleAttributes(Element element) {
    NamedNodeMap attributes = element.getAttributes();
    int count = attributes.getLength();
    // a single attribute does not need any sorting
    if (count == 1) {
      load((Attr)attributes.item(0));
      return;
    }
    if (count < 2) {
      return;
    }
    // several attributes: turn them into events, then sort the events
    AttributeEvent[] sorted = new AttributeEvent[count];
    for (int index = 0; index < count; index++) {
      Attr current = (Attr)attributes.item(index);
      sorted[index] = efactory.makeAttribute(current.getNamespaceURI(),
                                             current.getLocalName(),
                                             current.getNodeName(),
                                             current.getValue());
    }
    Arrays.sort(sorted, new AttributeComparator());
    for (int index = 0; index < sorted.length; index++) {
      load(sorted[index]);
    }
  }

  /**
   * Converts the given attribute node into an attribute event and loads it in the
   * current sequence.
   * 
   * The event is created by the event factory from the namespace URI, the local
   * name, the node name and the value of the attribute.
   * 
   * @param attr The W3C DOM attribute node to load.
   */
  private void load(Attr attr) {
    AttributeEvent event = efactory.makeAttribute(attr.getNamespaceURI(),
                                                  attr.getLocalName(),
                                                  attr.getNodeName(),
                                                  attr.getValue());
    load(event);
  }

  /**
   * Loads the given attribute event in the current sequence.
   * 
   * A regular attribute is given a weight of 2, which is also added to the current
   * weight, and is appended to the sequence. An attribute in the XML namespace
   * declaration namespace is not added as an event: it is translated into a prefix
   * mapping on the sequence instead.
   * 
   * @param e An attribute event.
   */
  private void load(AttributeEvent e) {
    boolean namespaceDeclaration = "http://www.w3.org/2000/xmlns/".equals(e.getURI());

    // a regular attribute
    if (!namespaceDeclaration) {
      e.setWeight(2);
      this.currentWeight += 2;
      this.sequence.addEvent(e);
      return;
    }

    // Trap/handle xmlns:xmlns="", which JAXB seems to produce and which would
    // later cause errors, since a non-default namespace cannot map to the empty
    // URI in XML 1.0 documents (Namespaces 1.0, section 2)
    if (e.getName().equals("xmlns") && e.getValue().equals("")) {
      Docx4jDriver.log("Ignoring xmlns:xmlns='' ");
      return;
    }

    // a namespace declaration, translate the event into a prefix mapping
    this.sequence.mapPrefix(e.getValue(), e.getName());
  }
}