gate.composite.impl.AbstractCombiningMethod Maven / Gradle / Ivy

Go to download
package gate.composite.impl;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Utils;
import gate.composite.CombiningMethod;
import gate.composite.CombiningMethodException;
import gate.composite.CompositeDocument;
import gate.composite.OffsetDetails;
import gate.compound.CompoundDocument;
import gate.corpora.DocumentImpl;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;

import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

/**
 * Abstract implementation of the combining method. Classes extending
 * this class must use startDocument() before adding any content (i.e.
 * addContent) and must finalizeDocument() at the end of all additions.
 * 
 * @author niraj
 */
public abstract class AbstractCombiningMethod implements CombiningMethod {

  private static final long serialVersionUID = -6866546515510128190L;

  protected HashMap> offsetMappings;

  protected StringBuffer documentContent;

  protected String toAdd;

  protected CompoundDocument containerDocument;

  protected List annotations;

  protected List offsets;

  protected Set annotationTypesToCopy;

  private boolean startDocumentCalled = false;

  protected boolean debug = false;

  /**
   * User must call this method to start a composite document
   * 
   * @param containerDocument - instance of compound document that the
   *          new composite is going to become member of.
   * @param annotationTypesToCopy - list of types of annotations to copy
   *          underlying the unit annotation. Supply null to copy all
   *          the annotations. Supply an empty set to copy nothing.
   */
  protected void startDocument(CompoundDocument containerDocument,
          Set annotationTypesToCopy) throws CombiningMethodException {
    if(debug) {
      System.out.println("Start Document called");
    }
    
    offsetMappings = new HashMap>();
    this.containerDocument = containerDocument;
    this.annotationTypesToCopy = annotationTypesToCopy;
    this.annotations = new ArrayList();
    this.offsets = new ArrayList();
    documentContent = new StringBuffer();
    toAdd = "";
    startDocumentCalled = true;

    if(debug) {
      System.out.println("Exiting Start Document");
    }
    
  }

  protected CompositeDocument finalizeDocument()
          throws CombiningMethodException {
    if(debug) {
      System.out.println("FinalizeDocument called");
    }

    if(!startDocumentCalled)
      throw new CombiningMethodException(
              "CompositeDocument is not initialized - please "
                      + "call the startDocument() method to initialize the "
                      + "composite document");

    XMLOutputFactory outputFactory = XMLOutputFactory.newInstance();
    String encoding = containerDocument.getEncoding();
    if(encoding == null) encoding = "UTF-8";
    StringWriter sw = new StringWriter();
    try {

      XMLStreamWriter xsw = outputFactory.createXMLStreamWriter(sw);
      xsw.writeStartDocument(encoding, "1.0");
      xsw.writeStartElement("", "composite");
      char[] result = documentContent.toString().toCharArray();
      replaceXMLIllegalCharacters(result);
      xsw.writeCharacters(new String(result));
      xsw.writeEndElement();
      xsw.writeEndDocument();
      xsw.close();
    }
    catch(XMLStreamException e2) {
      throw new CombiningMethodException(e2);
    }

    CompositeDocument doc = null;
    try {
      FeatureMap features = Factory.newFeatureMap();
      features.put("collectRepositioningInfo", containerDocument
              .getCollectRepositioningInfo());
      features.put("encoding", encoding);
      features.put("markupAware", new Boolean(true));
      features.put("preserveOriginalContent", containerDocument
              .getPreserveOriginalContent());
      features.put(DocumentImpl.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, sw
              .toString());
      FeatureMap subFeatures = Factory.newFeatureMap();
      Gate.setHiddenAttribute(subFeatures, true);

      doc = (CompositeDocument)Factory.createResource(
              "gate.composite.impl.CompositeDocumentImpl", features,
              subFeatures);
    }
    catch(ResourceInstantiationException e1) {
      throw new CombiningMethodException(e1);
    }

    ((gate.composite.impl.CompositeDocumentImpl)doc).disableListener = true;

    // lets add all annotations now
    for(OffsetDetails od : annotations) {
      // obtain annotation set to add annotations to
      AnnotationSet aSet = od.getAsName() == null
              || od.getAsName().trim().length() == 0
              ? doc.getAnnotations()
              : doc.getAnnotations(od.getAsName());
      String type = od.getOriginalAnnotation().getType();
      gate.FeatureMap f = od.getOriginalAnnotation().getFeatures();
      Integer id = od.getOriginalAnnotation().getId();
      try {
        aSet.add(id,new Long(od.getNewStartOffset()), new Long(od
                .getNewEndOffset()), type, f);
        od.setNewAnnotation(aSet.get(id));
      }
      catch(InvalidOffsetException e) {
        System.out.println("Offsets :" + od.getNewStartOffset() + "=>"
                + od.getNewEndOffset());
        throw new CombiningMethodException(e);
      }
    }
    ((gate.composite.impl.CompositeDocumentImpl)doc).disableListener = false;

    doc.setCombiningMethod(this);
    doc.setOffsetMappingInformation(offsetMappings);
    doc.setCombinedDocumentsIds(new HashSet(containerDocument
            .getDocumentIDs()));
    doc.setCompoundDocument(containerDocument);
    if(debug) {
      System.out.println("Exiting FinalizDocument");
    }
    
    return doc;
  }

  /**
   * Returns the Ids of combined documents
   */
  public Set getCombinedDocumentsIds() {
    return offsetMappings.keySet();
  }

  /**
   * This method returns the new offset for where the content was added
   */
  protected long[] addContent(Document srcDocument, Annotation unitAnnotation)
          throws CombiningMethodException {
    if(debug) {
      System.out.println("AddContent called");
    }

    if(!startDocumentCalled)
      throw new CombiningMethodException(
              "CompositeDocument is not initialized - please "
                      + "call the startDocument() method to initialize the "
                      + "composite document");

    String documentID = srcDocument.getName();
    offsets = offsetMappings.get(documentID);
    if(offsets == null) {
      offsets = new ArrayList();
      offsetMappings.put(documentID, offsets);
    }

    OffsetDetails offset = new OffsetDetails();
    offset.setOldStartOffset(unitAnnotation.getStartNode().getOffset()
            .longValue());
    offset.setOldEndOffset(unitAnnotation.getEndNode().getOffset().longValue());
    offset.setNewStartOffset(documentContent.length());
    documentContent.append(gate.Utils.contentFor(srcDocument, unitAnnotation));
    offset.setNewEndOffset(documentContent.length());
    offset.setOriginalAnnotation(unitAnnotation);
    offsets.add(offset);
    annotations.add(offset);

    if(debug) {
      System.out.println("Unit annotation:" + unitAnnotation.getType() + "=>"
              + offset.getOldStartOffset() + "=>" + offset.getOldEndOffset()
              + "=>" + offset.getNewStartOffset() + "=>"
              + offset.getNewEndOffset());
    }

    OffsetDetails unitAnnotDetails = new OffsetDetails();
    unitAnnotDetails.setOldStartOffset(offset.getOldStartOffset());
    unitAnnotDetails.setOldEndOffset(offset.getOldEndOffset());
    unitAnnotDetails.setNewStartOffset(offset.getNewStartOffset());
    unitAnnotDetails.setNewEndOffset(offset.getNewEndOffset());
    offsets.add(unitAnnotDetails);

    if(annotationTypesToCopy == null || !annotationTypesToCopy.isEmpty()) {

      if(debug) {
        System.out
                .println("copying annotations from the default Annotation set");
      }
      // copy annotations under the default annotation set
      copyAnnotations(srcDocument.getAnnotations(), unitAnnotation, offset);

      // copy annotations from all the named annotation set
      Map annotationSets = srcDocument
              .getNamedAnnotationSets();
      if(annotationSets != null) {
        for(String asName : annotationSets.keySet()) {
          if(debug) {
            System.out.println("copying annotations from the :" + asName
                    + " Annotation set");
          }
          copyAnnotations(srcDocument.getAnnotations(asName), unitAnnotation,
                  offset);
        }
      }
    }
    documentContent.append("\n");
    if(debug) {
      System.out.println("Exiting AddContent");
    }

    return new long[] {unitAnnotDetails.getNewStartOffset(),
        unitAnnotDetails.getNewEndOffset()};
  }

  private void copyAnnotations(AnnotationSet inputAS,
          Annotation unitAnnotation, OffsetDetails boundaries) {
    if(debug) {
      System.out.println("CopyAnnotations called");
    }

    if(debug) {
      System.out.println("Obtaning annotations between :"
              + Utils.start(unitAnnotation) + " and "
              + Utils.end(unitAnnotation));
    }
    AnnotationSet tempSet = inputAS.getContained(Utils.start(unitAnnotation),
            Utils.end(unitAnnotation));
    if(annotationTypesToCopy != null && !annotationTypesToCopy.isEmpty()) {
      tempSet = tempSet.get(annotationTypesToCopy);
    }

    Iterator iter = tempSet.iterator();
    while(iter.hasNext()) {

      Annotation anAnnot = iter.next();
      if(anAnnot == unitAnnotation) continue;
      Long start = Utils.start(anAnnot);
      Long end = Utils.end(anAnnot);
      if(start < boundaries.getOldStartOffset() || start > boundaries.getOldEndOffset())
        continue;
      
      if(end < boundaries.getOldStartOffset() || end > boundaries.getOldEndOffset())
        continue;
      
      OffsetDetails anOffset = new OffsetDetails();
      anOffset.setOldStartOffset(start);
      anOffset.setOldEndOffset(end);

      long stDiff = anOffset.getOldStartOffset()
              - boundaries.getOldStartOffset();
      long len = anOffset.getOldEndOffset() - anOffset.getOldStartOffset();

      anOffset.setNewStartOffset(boundaries.getNewStartOffset() + stDiff);
      anOffset.setNewEndOffset(anOffset.getNewStartOffset() + len);
      anOffset.setOriginalAnnotation(anAnnot);

      if(debug) {
        System.out.println("\tCopied" + anAnnot.getType() + "="
                + anOffset.getOldStartOffset() + "="
                + anOffset.getOldEndOffset() + "="
                + anOffset.getNewStartOffset() + "="
                + anOffset.getNewEndOffset());
      }

      // this will be interned - making it easier to store and less
      // expensive
      anOffset.setAsName(inputAS.getName());
      offsets.add(anOffset);
      annotations.add(anOffset);
    }
    if(debug) {
      System.out.println("Exiting copy contents");
    }
    
  }

  static void replaceXMLIllegalCharacters(char[] buf) {
    for(int i = 0; i < buf.length; i++) {
      if(buf[i] <= 0x0008 || buf[i] == 0x000B || buf[i] == 0x000C
              || (buf[i] >= 0x000E && buf[i] <= 0x001F)) {
        buf[i] = ' ';
        continue;
      }

      // buf[i) is a high surrogate...
      if(buf[i] >= 0xD800 && buf[i] <= 0xDBFF) {
        // if we're not at the end of the buffer we can look ahead
        if(i < buf.length - 1) {
          // followed by a low surrogate is OK
          if(buf[i + 1] >= 0xDC00 && buf[i + 1] <= 0xDFFF) {
            continue;
          }
        }
        buf[i] = ' ';
        continue;
      }

      // buf[i) is a low surrogate...
      if(buf[i] >= 0xDC00 && buf[i] <= 0xDFFF) {
        // if we're not at the start of the buffer we can look behind
        if(i > 0) {
          // preceded by a high surrogate is OK
          if(buf[i - 1] >= 0xD800 && buf[i - 1] <= 0xDBFF) {
            continue;
          }
        }

        buf[i] = ' ';
        continue;
      }

      // buf[i) is a BOM character
      if(buf[i] == 0xFFFE || buf[i] == 0xFFFF) {
        buf[i] = ' ';
        continue;
      }
    }
  }
}