All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.lib.basicdocument.GateDocumentUpdater Maven / Gradle / Ivy

/*
 * Copyright (c) 2019 The University of Sheffield.
 *
 * This file is part of gateplugin-Format_Bdoc 
 * (see https://github.com/GateNLP/gateplugin-Format_Bdoc).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

package gate.lib.basicdocument;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.ResourceInstantiationException;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// NOTE: offsets coming from a bdoc/changelog whose offset type is python ("p")
//   are converted to Java offsets by convertOffset() before annotations are copied



/**
 * Updates a GATE document from a BasicDocument (bdoc) or from a bdoc ChangeLog.
 * <p>
 * An updater wraps exactly one GATE document. Options are configured through
 * the chainable setter-style methods, then {@link #fromBdoc(BdocDocument)} or
 * {@link #fromChangeLog(ChangeLog)} carries out the update.
 *
 * @author Johann Petrak
 */
public class GateDocumentUpdater {

  /**
   * What to do when adding an annotation that already exists in the document.
   */
  public enum HandleExistingAnns {
    /**
     * Completely replace the annotation with the new one.
     */
    REPLACE_ANNOTATION,
    /**
     * Completely replace the features of the existing annotation.
     */
    REPLACE_FEATURES,
    /**
     * Add new and update existing features, do not delete any.
     */
    UPDATE_FEATURES,
    /**
     * Only add new features.
     */
    ADD_NEW_FEATURES,
    /**
     * Ignore that annotation.
     */
    IGNORE,
    /**
     * Add as a new annotation with a new id.
     */
    ADD_WITH_NEW_ID,
  }

  /**
   * What to do when adding a new annotation.
   */
  public enum HandleNewAnns {
    /**
     * Add as a new annotation with a new id.
     */
    ADD_WITH_NEW_ID,
    /**
     * Add as a new annotation, keeping the id it has in the bdoc/changelog.
     */
    ADD_WITH_BDOC_ID,
  }

  private final Document gateDocument;

  private HandleExistingAnns handleExistingAnns = HandleExistingAnns.ADD_WITH_NEW_ID;
  private HandleNewAnns handleNewAnns = HandleNewAnns.ADD_WITH_BDOC_ID;

  /**
   * If null, use all, otherwise the set of annotation set names to use.
   */
  private Set<String> annsetnames;

  /**
   * If null, use all, otherwise the set of document feature names to use.
   */
  private Set<String> featurenames;

  /**
   * If true update the GATE document name from the bdoc document name.
   */
  private boolean updateName = true;

  /**
   * OffsetMapper for converting offsets to Java.
   * It is created lazily by {@link #convertOffset(int, String)} the first time
   * an offset of type "p" (python) has to be converted, and is built from the
   * content of the wrapped GATE document.
   */
  private OffsetMapper offsetMapper = null;

  /**
   * Create a document updater with the default options. Initially, all
   * information from the update source except text will be used to update the
   * GATE document. Use the noXxx() methods followed by useXxx() methods to
   * select a specific set of information.
   *
   * @param doc the GATE document to update
   */
  public GateDocumentUpdater(Document doc) {
    this.gateDocument = doc;
  }

  /**
   * Create a document updater for updating a brand new document with this text.
   *
   * This can be used to convert a BdocDocument to a GATE document and still
   * control, if necessary, which annotations/features of the BdocDocument
   * should get converted.
   *
   * @param text initial text to start building the document from
   * @throws GateRuntimeException if the GATE document cannot be created
   */
  public GateDocumentUpdater(String text) {
    try {
      this.gateDocument = Factory.newDocument(text);
    } catch (ResourceInstantiationException ex) {
      throw new GateRuntimeException("Could not create GATE document from the given text", ex);
    }
  }

  // Methods to set options about how to update the document
  // These can be chained as necessary

  /**
   * Set the current list of known annotation set names to add to empty.
   * Initially, all annotation sets are added, this can be used to start giving
   * an explicit list of annotation set names to use by subsequently calling
   * useAnnotationSet(name)
   *
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater noAnnotationSet() {
    annsetnames = new HashSet<>();
    return this;
  }

  /**
   * Set if the document name should get updated (default is yes).
   *
   * @param flag if false, prevents the update
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater updateName(boolean flag) {
    updateName = flag;
    return this;
  }

  /**
   * Include this annotation set in the updates.
   * If no explicit list has been started yet (noAnnotationSet() was not
   * called), this starts one that contains only the given name.
   *
   * @param name name of annotation set to include
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater useAnnotationSet(String name) {
    if (annsetnames == null) {
      // previously this dereferenced null when noAnnotationSet() was not called first
      annsetnames = new HashSet<>();
    }
    annsetnames.add(name);
    return this;
  }

  /**
   * Clear the list of document feature names to use for updating.
   *
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater noFeature() {
    featurenames = new HashSet<>();
    return this;
  }

  /**
   * Add feature name to include for updating.
   * If no explicit list has been started yet (noFeature() was not called),
   * this starts one that contains only the given name.
   *
   * @param name the name of the feature
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater useFeature(String name) {
    if (featurenames == null) {
      // previously this dereferenced null when noFeature() was not called first
      featurenames = new HashSet<>();
    }
    featurenames.add(name);
    return this;
  }

  /**
   * Specify how annotations with an id that already exists should be
   * handled. Default is ADD_WITH_NEW_ID.
   *
   * @param option The annotation handling option to use
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater handleExistingAnnotation(HandleExistingAnns option) {
    handleExistingAnns = option;
    return this;
  }

  /**
   * Specify how new annotations should be handled.
   * Default is ADD_WITH_BDOC_ID.
   *
   * For restoring a document exactly as it was from a BDOC representation,
   * ADD_WITH_BDOC_ID is necessary!
   *
   * @param option The annotation handling option to use
   * @return modified GateDocumentUpdater
   */
  public GateDocumentUpdater handleNewAnnotation(HandleNewAnns option) {
    handleNewAnns = option;
    return this;
  }

  /**
   * Return the annotation set of the wrapped document for the given name,
   * where null and the empty string both denote the default set.
   */
  private AnnotationSet gateSetFor(String setname) {
    if (setname == null || setname.isEmpty()) {
      return gateDocument.getAnnotations();
    }
    return gateDocument.getAnnotations(setname);
  }

  /**
   * Add one annotation to the set, either with the given id or, if the id is
   * null, with an id assigned by the set.
   */
  private void insertAnnotation(AnnotationSet gateset, Integer id,
          long startoffset, long endoffset, String type,
          Map<String, Object> features) {
    try {
      FeatureMap fm = gate.Utils.toFeatureMap(features);
      if (id == null) {
        gateset.add(startoffset, endoffset, type, fm);
      } else {
        gateset.add(id, startoffset, endoffset, type, fm);
      }
    } catch (InvalidOffsetException ex) {
      throw new GateRuntimeException("Cannot add annotation", ex);
    }
  }

  /**
   * Add an annotation to the GATE annotation set.
   * This uses the information from a changelog or a bdoc document to
   * add an annotation to the GATE annotation set, or update an annotation.
   * The flags handleNewAnns and handleExistingAnns are used to influence
   * the behavior.
   *
   * @param gateset the GATE annotation set to update
   * @param bdocannid the annotation id of the annotation from changelog/bdoc
   * @param bdocstart start offset as given in the changelog/bdoc
   * @param bdocend end offset as given in the changelog/bdoc
   * @param bdoctype annotation type
   * @param bdocfeatures annotation features, may be null
   * @param offsetType offset type of the changelog/bdoc, "p" triggers conversion
   * @throws GateRuntimeException if the annotation cannot be added or if the
   *   offsets of an existing annotation do not match the given ones
   */
  private void addAnnotation(AnnotationSet gateset,
          int bdocannid, int bdocstart, int bdocend, String bdoctype,
          Map<String, Object> bdocfeatures, String offsetType) {
    // make sure we always have the offsets as java offsets
    long startoffset = convertOffset(bdocstart, offsetType);
    long endoffset = convertOffset(bdocend, offsetType);

    // make sure we always have a non-null feature map
    Map<String, Object> bdocFm
            = (bdocfeatures == null) ? new HashMap<>() : bdocfeatures;

    Annotation gateann = gateset.get(bdocannid);

    // The annotation does not exist yet: add it with a new id or its own id
    if (gateann == null) {
      if (handleNewAnns == HandleNewAnns.ADD_WITH_NEW_ID) {
        insertAnnotation(gateset, null, startoffset, endoffset, bdoctype, bdocFm);
      } else if (handleNewAnns == HandleNewAnns.ADD_WITH_BDOC_ID) {
        insertAnnotation(gateset, bdocannid, startoffset, endoffset, bdoctype, bdocFm);
      }
      return;
    }

    // The annotation exists, but we want to add the incoming one with a new id
    if (handleExistingAnns == HandleExistingAnns.ADD_WITH_NEW_ID) {
      insertAnnotation(gateset, null, startoffset, endoffset, bdoctype, bdocFm);
      return;
    }

    // All other cases operate on the existing annotation.
    // NOTE: the offsets we get from the bdoc/chlog should correspond to
    // the offsets of the existing annotation!
    // We check this here to catch any bugs that may still exist!
    if (!gateann.getStartNode().getOffset().equals(startoffset)
            || !gateann.getEndNode().getOffset().equals(endoffset)) {
      throw new GateRuntimeException(
              "Annotation offsets do not match for GATE annotation: "
              + gateann
              + " and bdoc/chlog annotation: from(orig)=" + bdocstart
              + ", from(converted)=" + startoffset
              + ", to(orig)=" + bdocend
              + ", to(converted)=" + endoffset
      );
    }

    FeatureMap gatefm = gateann.getFeatures();
    switch (handleExistingAnns) {
      case ADD_NEW_FEATURES:
        // Feature keys in a GATE feature map are arbitrary objects, so compare
        // by string name. In theory this could yield duplicates but those
        // features really should all have string names! Null keys are ignored.
        Set<String> existingNames = new HashSet<>();
        for (Object key : gatefm.keySet()) {
          if (key != null) {
            existingNames.add(key.toString());
          }
        }
        for (Map.Entry<String, Object> entry : bdocFm.entrySet()) {
          if (!existingNames.contains(entry.getKey())) {
            gatefm.put(entry.getKey(), entry.getValue());
          }
        }
        break;
      case REPLACE_ANNOTATION:
        // there is no way to update in place: remove and add again with the id
        gateset.remove(gateann);
        insertAnnotation(gateset, bdocannid, startoffset, endoffset, bdoctype, bdocFm);
        break;
      case REPLACE_FEATURES:
        gatefm.clear();
        for (Map.Entry<String, Object> entry : bdocFm.entrySet()) {
          gatefm.put(entry.getKey(), entry.getValue());
        }
        break;
      case UPDATE_FEATURES:
        for (Map.Entry<String, Object> entry : bdocFm.entrySet()) {
          gatefm.put(entry.getKey(), entry.getValue());
        }
        break;
      case IGNORE:
        break;
      default:
        // ADD_WITH_NEW_ID already got handled above
        throw new RuntimeException("Should never happen!");
    }
  }

  /**
   * Copy all annotations of a bdoc annotation set into the GATE annotation
   * set of the same name (null or empty name: the default set).
   *
   * @param annset the bdoc annotation set to copy from
   * @param offsetType offset type of the bdoc the set belongs to
   */
  private void addAnnotationSet(BdocAnnotationSet annset, String offsetType) {
    if (annset.annotations == null) {
      return;
    }
    AnnotationSet gateset = gateSetFor(annset.name);
    annset.annotations.forEach((bdocann) -> {
      addAnnotation(gateset,
              bdocann.id, bdocann.start, bdocann.end, bdocann.type,
              bdocann.features, offsetType);
    });
  }

  /**
   * Actually carry out the update of the GATE document from the BdocDocument.
   *
   * This carries out the update with whatever options have been set.
   * Document features and annotation sets that were selected explicitly but
   * are not present in the bdoc are skipped.
   *
   * @param bdoc the bdoc to use for the updates
   * @return the updated GATE document
   */
  public Document fromBdoc(BdocDocument bdoc) {
    // can only assign features if there are any in the bdoc
    if (bdoc.features != null) {
      if (featurenames == null) {
        gateDocument.getFeatures().putAll(bdoc.features);
      } else {
        for (String fname : featurenames) {
          // do not overwrite an existing GATE feature with null just because
          // the bdoc does not have a feature of that name
          if (bdoc.features.containsKey(fname)) {
            gateDocument.getFeatures().put(fname, bdoc.features.get(fname));
          }
        }
      }
    }
    if (bdoc.annotation_sets != null) {
      Set<String> names
              = (annsetnames == null) ? bdoc.annotation_sets.keySet() : annsetnames;
      for (String annsetname : names) {
        BdocAnnotationSet annset = bdoc.annotation_sets.get(annsetname);
        // a selected set name may not exist in this bdoc
        if (annset != null) {
          addAnnotationSet(annset, bdoc.offset_type);
        }
      }
    }
    if (updateName && bdoc.name != null && !bdoc.name.isEmpty()) {
      gateDocument.setName(bdoc.name);
    }
    return gateDocument;
  }

  /**
   * Actually carry out the update of the GATE document from the Bdoc ChangeLog.
   *
   * This carries out the update with whatever options have been set.
   * Commands which are not known are ignored. Feature changes and removals
   * for annotations which do not exist in the set are ignored as well.
   *
   * @param chlog the changelog to use for the updates
   * @return returns the updated GATE document
   * @throws GateRuntimeException if a change has no command or if an
   *   annotation:add change lacks its set, id, start or end
   */
  public Document fromChangeLog(ChangeLog chlog) {
    for (Map<String, Object> chg : chlog.changes) {
      // Handled commands and the entries they read:
      // doc-features:clear
      // doc-feature:set, feature, value
      // doc-feature:remove, feature
      // name:set, name
      // ann-features:clear, set, id
      // ann-feature:set, set, id, feature, value
      // ann-feature:remove, set, id, feature
      // annotation:add, set, start, end, type, features, id
      // annotation:remove, set, id
      // annotations:clear, set
      // annotations:remove, set
      String cmd = (String) chg.get("command");
      if (cmd == null) {
        throw new GateRuntimeException("Change log entry without a command: " + chg);
      }
      String setname = (String) chg.get("set");
      // only look up (and thereby possibly create) a set if the change names one
      AnnotationSet annset = (setname == null) ? null : gateSetFor(setname);
      Integer id = (Integer) chg.get("id");
      String feature = (String) chg.get("feature");
      Object value = chg.get("value");
      switch (cmd) {
        case "doc-features:clear":
          gateDocument.getFeatures().clear();
          break;
        case "ann-features:clear":
          if (annset != null) {
            Annotation ann = annset.get(id);
            // silently ignore if missing, that annotation could have been removed
            if (ann != null) {
              ann.getFeatures().clear();
            }
          }
          break;
        case "doc-feature:set":
          gateDocument.getFeatures().put(feature, value);
          break;
        case "name:set":
          if (updateName) {
            String name = (String) chg.get("name");
            if (name != null && !name.trim().isEmpty()) {
              gateDocument.setName(name);
            }
          }
          break;
        case "ann-feature:set":
          if (annset != null) {
            Annotation ann = annset.get(id);
            // IMPORTANT: a missing annotation is silently ignored because the
            // changelog can sometimes contain feature changes for annotations
            // which are not in the set any longer. This happens if an
            // annotation gets removed from the set, but still exists as an
            // annotation and somebody sets a feature on that annotation.
            if (ann != null) {
              ann.getFeatures().put(feature, value);
            }
          } // TODO: how could it happen that there is no annset?
          break;
        case "doc-feature:remove":
          gateDocument.getFeatures().remove(feature);
          break;
        case "ann-feature:remove":
          if (annset != null) {
            Annotation ann = annset.get(id);
            // ignored if missing, for the same reason as for ann-feature:set
            if (ann != null) {
              ann.getFeatures().remove(feature);
            }
          }
          break;
        case "annotation:add": {
          Integer start = (Integer) chg.get("start");
          Integer end = (Integer) chg.get("end");
          if (annset == null || id == null || start == null || end == null) {
            throw new GateRuntimeException(
                    "Change annotation:add needs set, id, start and end: " + chg);
          }
          String type = (String) chg.get("type");
          @SuppressWarnings("unchecked")
          Map<String, Object> features = (Map<String, Object>) chg.get("features");
          addAnnotation(annset, id, start, end, type, features, chlog.offset_type);
          break;
        }
        case "annotation:remove":
          if (annset != null) {
            Annotation gateann = annset.get(id);
            // nothing to remove if the set has no annotation with that id
            if (gateann != null) {
              annset.remove(gateann);
            }
          }
          break;
        case "annotations:clear":
          if (annset != null) {
            annset.clear();
          }
          break;
        case "annotations:remove":
          if (annset != null) {
            if (setname.isEmpty()) {
              // the default set cannot be removed by name, so just empty it
              annset.clear();
            } else {
              gateDocument.removeAnnotationSet(setname);
            }
          }
          break;
        default:
          // unknown commands are ignored
          break;
      }
    }
    return gateDocument;
  }

  /**
   * This converts the given offset from python to java, if necessary.
   * If the offsetType is "p" (python), then the offset mapper is used to
   * convert the offset to Java, and if we do not have an offset mapper yet, we
   * create it on the fly from the content of the GATE document.
   *
   * @param offset the offset from the bdoc or changelog to convert
   * @param offsetType the offset type of the bdoc or changelog
   * @return converted offset, or the unchanged offset for any other offset type
   */
  private long convertOffset(int offset, String offsetType) {
    if (!"p".equals(offsetType)) {
      return offset;
    }
    if (offsetMapper == null) {
      offsetMapper = new OffsetMapper(gateDocument.getContent().toString());
    }
    return (long) offsetMapper.convertToJava(offset);
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy