All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.sirix.service.xml.shredder.XmlShredder Maven / Gradle / Ivy

Go to download

SirixDB is a hybrid on-disk and in-memory document oriented, versioned database system. It has a lightweight buffer manager, stores everything in a huge persistent and durable tree and allows efficient reconstruction of every revision. Furthermore, SirixDB implements change tracking, diffing and supports time travel queries.

There is a newer version: 0.11.0
Show newest version
/**
 * Copyright (c) 2011, University of Konstanz, Distributed Systems Group All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met: * Redistributions of source code must retain the
 * above copyright notice, this list of conditions and the following disclaimer. * Redistributions
 * in binary form must reproduce the above copyright notice, this list of conditions and the
 * following disclaimer in the documentation and/or other materials provided with the distribution.
 * * Neither the name of the University of Konstanz nor the names of its contributors may be used to
 * endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package io.sirix.service.xml.shredder;

import io.sirix.access.DatabaseConfiguration;
import io.sirix.access.Databases;
import io.sirix.access.ResourceConfiguration;
import io.sirix.api.xml.XmlNodeTrx;
import io.sirix.api.xml.XmlResourceSession;
import io.sirix.exception.SirixException;
import io.sirix.exception.SirixIOException;
import io.sirix.node.xml.ElementNode;
import io.sirix.service.InsertPosition;
import io.sirix.service.ShredderCommit;
import io.sirix.settings.Constants;
import io.sirix.utils.LogWrapper;
import io.brackit.query.atomic.QNm;
import org.slf4j.LoggerFactory;

import javax.xml.namespace.QName;
import javax.xml.stream.*;
import javax.xml.stream.events.*;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.Callable;

import static java.util.Objects.requireNonNull;

/**
 * This class appends the content of a given {@link XMLEventReader} to a {@link XmlNodeTrx}. The
 * content of the stream is added as a subtree. The point of insertion is identified by an
 * {@link InsertPosition} which is handed to the {@link AbstractShredder} super class.
 *
 * <p>
 * Instances are created through the nested {@link Builder}. Invoking {@link #call()} reads the
 * events, inserts the corresponding nodes and afterwards applies the configured
 * {@link ShredderCommit} strategy.
 *
 * @author Marc Kramis, Seabix
 * @author Sebastian Graf, University of Konstanz
 * @author Johannes Lichtenberger, University of Konstanz
 *
 */
public final class XmlShredder extends AbstractShredder implements Callable<Long> {

  /** {@link LogWrapper} reference. */
  private static final LogWrapper LOGWRAPPER = new LogWrapper(LoggerFactory.getLogger(XmlShredder.class));

  /** {@link XmlNodeTrx} the nodes are inserted with. */
  private final XmlNodeTrx wtx;

  /** {@link XMLEventReader} providing the events to shred. */
  private final XMLEventReader reader;

  /** Determines if changes are going to be committed right after shredding. */
  private final ShredderCommit commit;

  /** Insertion position. */
  private final InsertPosition insert;

  /** Determines if comments should be included. */
  private final boolean includeComments;

  /** Determines if processing instructions should be included. */
  private final boolean includePIs;

  /**
   * Builder to build an {@link XmlShredder} instance.
   */
  public static class Builder {

    /** {@link XmlNodeTrx} implementation. */
    private final XmlNodeTrx wtx;

    /** {@link XMLEventReader} implementation. */
    private final XMLEventReader reader;

    /** Insertion position. */
    private final InsertPosition insert;

    /** Determines if comments should be included. */
    private boolean includeComments = true;

    /** Determines if processing instructions should be included. */
    private boolean includePIs = true;

    /**
     * Determines if after shredding the transaction should be immediately committed.
     */
    private ShredderCommit commit = ShredderCommit.NOCOMMIT;

    /**
     * Constructor.
     *
     * @param wtx {@link XmlNodeTrx} implementation
     * @param reader {@link XMLEventReader} implementation
     * @param insert insertion position
     * @throws NullPointerException if any of the arguments is {@code null}
     */
    public Builder(final XmlNodeTrx wtx, final XMLEventReader reader, final InsertPosition insert) {
      this.wtx = requireNonNull(wtx);
      this.reader = requireNonNull(reader);
      this.insert = requireNonNull(insert);
    }

    /**
     * Include comments or not (default: yes).
     *
     * @param include include comments
     * @return this builder instance
     */
    public Builder includeComments(final boolean include) {
      includeComments = include;
      return this;
    }

    /**
     * Include processing instructions or not (default: yes).
     *
     * @param include include processing instructions
     * @return this builder instance
     */
    public Builder includePIs(final boolean include) {
      includePIs = include;
      return this;
    }

    /**
     * Commit afterwards (default: no commit).
     *
     * @return this builder instance
     */
    public Builder commitAfterwards() {
      commit = ShredderCommit.COMMIT;
      return this;
    }

    /**
     * Build an instance.
     *
     * @return {@link XmlShredder} instance
     */
    public XmlShredder build() {
      return new XmlShredder(this);
    }
  }

  /**
   * Private constructor.
   *
   * @param builder builder reference
   */
  private XmlShredder(final Builder builder) {
    super(builder.wtx, builder.insert);
    wtx = builder.wtx;
    reader = builder.reader;
    insert = builder.insert;
    includeComments = builder.includeComments;
    includePIs = builder.includePIs;
    commit = builder.commit;
  }

  /**
   * Invoking the shredder: inserts the content of the reader and applies the configured
   * {@link ShredderCommit} strategy afterwards.
   *
   * @return the revision number reported by the transaction before the content has been inserted
   * @throws SirixException if any kind of sirix exception has occurred
   */
  @Override
  public Long call() {
    // The revision number is read up front, i.e. before inserting and (optionally) committing.
    final long revision = wtx.getRevisionNumber();
    insertNewContent();
    commit.commit(wtx);
    return revision;
  }

  /**
   * Insert new content based on the events of the StAX {@link XMLEventReader}.
   *
   * <p>
   * Events are consumed until either the reader is exhausted or the end tag matching the first
   * start element has been seen. Afterwards the transaction is moved to the node key which has
   * been recorded right after the first element was added.
   *
   * @throws SirixException if something went wrong while inserting
   */
  private void insertNewContent() {
    try {
      boolean firstElement = true;
      int level = 0;
      QName rootElement = null;
      boolean endElemReached = false;
      final StringBuilder textBuffer = new StringBuilder();
      long insertedRootNodeKey = -1;

      // Iterate over all nodes.
      while (reader.hasNext() && !endElemReached) {
        final XMLEvent event = reader.nextEvent();

        switch (event.getEventType()) {
          case XMLStreamConstants.START_ELEMENT: {
            level++;
            final StartElement startElement = event.asStartElement();
            addNewElement(startElement);
            if (firstElement) {
              // Remember the first element in order to detect its end tag and to move back to it.
              firstElement = false;
              insertedRootNodeKey = wtx.getNodeKey();
              rootElement = startElement.getName();
            }
            break;
          }
          case XMLStreamConstants.END_ELEMENT: {
            level--;
            final QName name = event.asEndElement().getName();
            if (level == 0 && rootElement != null && rootElement.equals(name)) {
              endElemReached = true;
            }
            processEndTag(new QNm(name.getNamespaceURI(), name.getPrefix(), name.getLocalPart()));
            break;
          }
          case XMLStreamConstants.CHARACTERS: {
            // A parser may report one text run as several consecutive CHARACTERS events. The chunks
            // are appended untrimmed, such that whitespace at chunk boundaries is preserved, and the
            // assembled text is trimmed once as a whole.
            textBuffer.append(event.asCharacters().getData());
            // peek() returns null if no further events are available.
            final XMLEvent next = reader.peek();
            if (next == null || next.getEventType() != XMLStreamConstants.CHARACTERS) {
              processText(textBuffer.toString().trim());
              textBuffer.setLength(0);
            }
            break;
          }
          case XMLStreamConstants.COMMENT:
            if (includeComments) {
              processComment(((Comment) event).getText());
            }
            break;
          case XMLStreamConstants.PROCESSING_INSTRUCTION:
            if (includePIs) {
              final ProcessingInstruction pi = (ProcessingInstruction) event;
              processPI(pi.getData(), pi.getTarget());
            }
            break;
          default:
            // Node kind not known.
        }
      }

      wtx.moveTo(insertedRootNodeKey);
    } catch (final XMLStreamException e) {
      throw new SirixIOException(e);
    }
  }

  /**
   * Add a new element node including its namespace declarations and attributes.
   *
   * @param event the current event from the StAX parser
   * @throws SirixException if adding {@link ElementNode} fails
   */
  private void addNewElement(final StartElement event) throws SirixException {
    assert event != null;
    final QName qName = event.getName();
    final QNm name = new QNm(qName.getNamespaceURI(), qName.getPrefix(), qName.getLocalPart());
    processStartTag(name);

    // Parse namespaces. After each insertion the transaction is moved back to the parent.
    for (final Iterator<Namespace> it = event.getNamespaces(); it.hasNext();) {
      final Namespace namespace = it.next();
      wtx.insertNamespace(new QNm(namespace.getNamespaceURI(), namespace.getPrefix(), ""));
      wtx.moveToParent();
    }

    // Parse attributes. After each insertion the transaction is moved back to the parent.
    for (final Iterator<Attribute> it = event.getAttributes(); it.hasNext();) {
      final Attribute attribute = it.next();
      final QName attName = attribute.getName();
      wtx.insertAttribute(new QNm(attName.getNamespaceURI(), attName.getPrefix(), attName.getLocalPart()),
                          attribute.getValue());
      wtx.moveToParent();
    }
  }

  /**
   * Main method. Shreds the XML file into the resource "shredded" of a newly created database and
   * commits afterwards.
   *
   * <p>
   * Note that {@link Databases#removeDatabase(Path)} is invoked on the target path before the
   * database is created.
   *
   * @param args the XML input file, the database path and optionally {@code true} or
   *        {@code false} to determine if comments and processing instructions are shredded
   *        (not shredded if omitted)
   * @throws IllegalArgumentException if neither two nor three arguments are given
   * @throws XMLStreamException if the XML stream isn't valid
   * @throws IOException if an I/O error occurs
   * @throws SirixException if a Sirix error occurs
   */
  public static void main(final String... args) throws SirixException, IOException, XMLStreamException {
    if (args.length != 2 && args.length != 3) {
      throw new IllegalArgumentException("Usage: XMLShredder XMLFile Database [true/false] (shredder comment|PI)");
    }
    LOGWRAPPER.info("Shredding '" + args[0] + "' to '" + args[1] + "' ... ");
    final long time = System.nanoTime();
    final Path target = Paths.get(args[1]);
    final DatabaseConfiguration config = new DatabaseConfiguration(target);
    Databases.removeDatabase(target);
    Databases.createXmlDatabase(config);

    try (final var db = Databases.openXmlDatabase(target)) {
      db.createResource(new ResourceConfiguration.Builder("shredded").build());
      try (final XmlResourceSession resMgr = db.beginResourceSession("shredded");
           final XmlNodeTrx wtx = resMgr.beginNodeTrx();
           final FileInputStream fis = new FileInputStream(Paths.get(args[0]).toFile())) {
        final XMLEventReader reader = createFileReader(fis);
        // Comments and processing instructions are only included if explicitly requested.
        final boolean includeCoPI = args.length == 3 && Boolean.parseBoolean(args[2]);
        final XmlShredder shredder =
            new XmlShredder.Builder(wtx, reader, InsertPosition.AS_FIRST_CHILD).commitAfterwards()
                                                                               .includeComments(includeCoPI)
                                                                               .includePIs(includeCoPI)
                                                                               .build();
        shredder.call();
      }
    }

    LOGWRAPPER.info(" done [" + (System.nanoTime() - time) / 1000000 + " ms].");
  }

  /**
   * Create a new {@link XMLEventReader} instance on a file.
   *
   * @param fis the file input stream
   * @return an {@link XMLEventReader}
   * @throws NullPointerException if {@code fis} is {@code null}
   * @throws SirixException if creating the xml event reader fails.
   */
  public static XMLEventReader createFileReader(final FileInputStream fis) {
    requireNonNull(fis);
    final XMLInputFactory factory = XMLInputFactory.newInstance();
    setProperties(factory);
    try {
      return factory.createXMLEventReader(fis);
    } catch (XMLStreamException e) {
      throw new SirixException(e.getMessage(), e);
    }
  }

  /**
   * Configure the factory: DTD support and external entities are switched off, whereas the
   * replacement of entity references is switched on.
   *
   * @param factory the {@link XMLInputFactory} to configure
   */
  private static void setProperties(final XMLInputFactory factory) {
    factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
    factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
    factory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true);
  }

  /**
   * Create a new {@link XMLEventReader} instance on a string.
   *
   * @param xmlString the XML file as a string to parse
   * @return an {@link XMLEventReader}
   * @throws NullPointerException if {@code xmlString} is {@code null}
   * @throws SirixException if creating the xml event reader fails.
   */
  public static XMLEventReader createStringReader(final String xmlString) {
    requireNonNull(xmlString);
    final XMLInputFactory factory = XMLInputFactory.newInstance();
    setProperties(factory);
    try {
      final InputStream in = new ByteArrayInputStream(xmlString.getBytes(Constants.DEFAULT_ENCODING));
      return factory.createXMLEventReader(in);
    } catch (XMLStreamException e) {
      throw new SirixException(e.getMessage(), e);
    }
  }

  /**
   * Create a new StAX reader based on a {@link Queue} of {@link XMLEvent}s.
   *
   * @param events {@link XMLEvent}s
   * @return an {@link XMLEventReader}
   * @throws NullPointerException if {@code events} is {@code null}
   * @throws IOException if I/O operation fails
   * @throws XMLStreamException if any parsing error occurs
   */
  public static XMLEventReader createQueueReader(final Queue<XMLEvent> events) throws IOException, XMLStreamException {
    return new QueueEventReader(requireNonNull(events));
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy