gate.corpora.DocumentXmlUtils Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation

GATE - General Architecture for Text Engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core GATE Embedded with its essential dependencies. You will be able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins or for applications that need to customize the GATE dependencies due to conflicts with their own dependencies or for a lower footprint.

The newest version!

/*
 *  DocumentXmlUtils.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Ian Roberts, 20/Jul/2006
 *
 *  $Id: DocumentXmlUtils.java 17580 2014-03-07 18:58:06Z markagreenwood $
 */
package gate.corpora;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.TextualDocument;
import gate.event.StatusListener;
import gate.util.Err;
import gate.util.Strings;

import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * This class contains useful static methods for working with the GATE XML
 * format.  Many of the methods in this class were originally in {@link
 * DocumentImpl} but as they are not specific to any one implementation of the
 * Document interface they have been moved here.
 */
public class DocumentXmlUtils {

  /**
   * Multiplier used to pre-size the StringBuffers created by the toXml()
   * methods: the initial capacity is the size of the document content
   * multiplied by this value, to leave room for the markup that is added
   * around the text. It is intended to improve the performance of
   * StringBuffer.
   */
  public static final int DOC_SIZE_MULTIPLICATION_FACTOR = 40;

  /**
   * Returns a GateXml document, a custom XML format for which there is a
   * reader inside GATE called gate.xml.GateFormatXmlHandler. This method
   * serializes a GATE document into that XML format.
   * 
   * @param doc the document to serialize.
   * @return a string representing a Gate Xml document.
   */
  public static String toXml(TextualDocument doc) {
    // Pre-size the buffer to several times the size of the document content,
    // because the markup added around the text makes the output much larger
    // than the text itself. The estimate is computed as a long so that it
    // cannot overflow int for very large documents (which would yield a
    // negative or meaningless capacity); if it does not fit in an int we fall
    // back to the content length as the initial capacity.
    long contentLength = doc.getContent().size().longValue();
    long estimatedCapacity = DOC_SIZE_MULTIPLICATION_FACTOR * contentLength;
    if(estimatedCapacity > Integer.MAX_VALUE) {
      estimatedCapacity = Math.min(contentLength, Integer.MAX_VALUE);
    }
    StringBuffer xmlContent = new StringBuffer((int)estimatedCapacity);

    // NOTE(review): several literals appended below are empty or contain only
    // newlines. They look like they originally carried XML markup (header,
    // element tags, XML comments) that was lost before this file reached us;
    // they are reproduced exactly as found - confirm against the upstream
    // source before relying on the output format.

    // Add xml header
    xmlContent.append("");
    xmlContent.append(Strings.getNl());
    // Add the root element
    xmlContent.append("\n");
    xmlContent.append("\n\n");
    // Document-level features
    xmlContent.append("\n");
    xmlContent.append(featuresToXml(doc.getFeatures(),null));
    xmlContent.append("\n");
    xmlContent.append("\n\n");
    // Add plain text element
    xmlContent.append("");
    xmlContent.append(textWithNodes(doc, doc.getContent().toString()));
    xmlContent.append("\n");

    // Progress is reported through the registered status listener, if any.
    StatusListener sListener = (StatusListener)gate.Gate
            .getListeners().get("gate.event.StatusListener");

    // Serialize the default AnnotationSet
    if(sListener != null) {
      sListener.statusChanged("Saving the default annotation set ");
    }
    xmlContent.append("\n\n");
    annotationSetToXml(doc.getAnnotations(), xmlContent);

    // Serialize all the other (named) AnnotationSets. The map is typed so
    // that its values can be used as AnnotationSet without a raw-type
    // assignment, which does not compile.
    Map<String, AnnotationSet> namedAnnotSets = doc.getNamedAnnotationSets();
    if(namedAnnotSets != null) {
      for(AnnotationSet annotSet : namedAnnotSets.values()) {
        xmlContent.append("\n\n");
        if(sListener != null) {
          sListener.statusChanged("Saving " + annotSet.getName()
                  + " annotation set ");
        }
        // Serialize it as XML
        annotationSetToXml(annotSet, xmlContent);
      }
    }

    // Add the end of GateDocument
    xmlContent.append("");
    if(sListener != null) {
      sListener.statusChanged("Done !");
    }
    // return the XmlGateDocument
    return xmlContent.toString();
  }


  /**
   * This method saves a FeatureMap as XML elements.
   * 
   * @param aFeatureMap
   *          the feature map that has to be saved as XML.
   * @return a String like this: <Feature><Name>...</Name> <Value>...</Value></Feature><Feature>...</Feature>
   */
  public static StringBuffer featuresToXml(FeatureMap aFeatureMap, Map normalizedFeatureNames) {
    if(aFeatureMap == null) return new StringBuffer();
    StringBuffer buffer = new StringBuffer(1024);
    Set