gate.corpora.ConllDocumentFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
GATE - General Architecture for Text Engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core of GATE Embedded with its essential dependencies. You will be able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins, or for applications that need to customize the GATE dependencies due to conflicts with their own dependencies or to achieve a lower footprint.
The newest version!
/*
 *  ConllDocumentFormat.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  $Id: ConllDocumentFormat.java 19658 2016-10-10 06:46:13Z markagreenwood $
 */

package gate.corpora;

import java.util.*;
import gate.*;
import gate.creole.ANNIEConstants;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.util.DocumentFormatException;
import gate.util.InvalidOffsetException;


/** Document format for handling CoNLL/IOB documents:
  * He PRP B-NP
  * accepted VBD B-VP
  * the DT B-NP
  * position NN I-NP
  * ...
  *
  * The first whitespace-separated item of every non-blank line is the token;
  * every further item is a tag for that token, identified by its column
  * number.  Tags of the form B-FOO, I-FOO, L-FOO and U-FOO (and the bare
  * tag O) drive multi-token "chunk" annotations of type FOO; any other tag
  * becomes a single-token annotation whose type is the tag itself.
  *
  * The document content is rebuilt as the tokens, each followed by one
  * space, with a newline appended for every blank input line.  All
  * annotations are created in the "Original markups" annotation set.
  */
@CreoleResource(name = "GATE CoNLL Document Format", isPrivate = true,
    autoinstances = {@AutoInstance(hidden = true)})
public class ConllDocumentFormat extends TextualDocumentFormat {
  
  private static final long serialVersionUID = 5756433194230855515L;
  
  /** Feature holding the (1-based) input column a tag was read from. */
  public static final String ANNOTATION_COLUMN_FEATURE = "column";
  /** Feature holding "chunk" or "token", depending on the tag structure. */
  public static final String ANNOTATION_KIND_FEATURE = "kind";
  
  
  /** Debug flag */
  private static final boolean DEBUG = false;

  /* Matches ONE line terminator.  Splitting on a run of terminators would
   * swallow the empty lines that separate sentences, so the blank-line
   * handling in unpackMarkup would never see them.   */
  private static final java.util.regex.Pattern LINE_BREAK =
      java.util.regex.Pattern.compile("\\r\\n|[\\n\\r]");

  /* Column separator within a line. */
  private static final java.util.regex.Pattern WHITESPACE =
      java.util.regex.Pattern.compile("\\s+");

  /** Default construction */
  public ConllDocumentFormat() { super();}


  /** Replace the document content with the space-separated tokens of the
    * CoNLL input and create the corresponding Original markups annotations
    * (Token, SpaceToken and one annotation per tag).
    *
    * @param doc the document whose content is to be parsed and replaced
    * @throws DocumentFormatException if the document is null, has no
    *         content, or an annotation cannot be created at the computed
    *         offsets
    */
  @Override
  public void unpackMarkup(gate.Document doc) throws DocumentFormatException{
    if ( (doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null) ) {
      throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Only the content is parsed below, so a document that has a source URL
    // but no content cannot be handled either; fail cleanly instead of
    // with a NullPointerException.
    if (doc.getContent() == null) {
      throw new DocumentFormatException("GATE document has no content. Nothing to parse!");
    }

    setNewLineProperty(doc);

    // Trailing empty lines are dropped by split(); empty lines elsewhere
    // are kept and treated as blank lines in the loop below.
    String[] lines = LINE_BREAK.split(doc.getContent().toString());
    StringBuilder newContent = new StringBuilder();
    
    // Items of data to be turned into Original markups annotations
    List<Annotandum> annotanda = new ArrayList<Annotandum>();
    
    // Currently open tags: created by "B-FOO", extended by "I-FOO", closed
    // by "O" or end of sentence.
    Map<String, Annotandum> inProgress = new HashMap<String, Annotandum>();

    /* Note: I-Foo handling currently has a weak spot.
     * 
     * this    B-Foo
     * is      B-Bar
     * strange I-Foo
     * 
     * will result in a Foo annotation spanning "this is strange", because
     * the I-Foo extends the existing B-Foo.  If the sentence is cut off 
     * before hitting another I-Foo, however, the Foo annotation will not
     * have been extended.  But this situation will not occur in carefully
     * edited input.  
     */

    long oldEnd = 0L;   // end offset of the previous line's token
    long start = 0L;    // start offset of the current token in the new content
    long end = 0L;      // end offset of the current token (excluding the space)

    for (String line : lines) {
      oldEnd = end;
      start = newContent.length();
      String[] items = WHITESPACE.split(line);
      
      // blank line: stick a newline in the document content and close
      // any annotations in progress
      if (isBlank(items)) {
        newContent.append("\n");
        end = newContent.length();
        finishAllTags(inProgress, annotanda, oldEnd);
      }
      
      else {
        String token = items[0];
        // We've agreed to put the space after every token.
        newContent.append(token);
        end = newContent.length();
        newContent.append(' ');
        
        // Create Token and following SpaceToken annotation.
        annotanda.add(Annotandum.makeToken(start, end, token));
        annotanda.add(Annotandum.makeSpaceToken(end));

        for (int column=1 ; column < items.length ; column++) {
          String tag = items[column];
          // A prefixed tag needs at least one character after "X-".
          boolean prefixed = tag.length() > 2;

          // O means close all annotations in progress
          if (tag.equals("O")) {
            finishAllTags(inProgress, annotanda, oldEnd);
          }
          
          // "U-FOO": unigram, single-token "FOO"
          // annotation, after closing any "FOO" already in progress
          else if (prefixed && tag.startsWith("U-")) {
            String type = tag.substring(2);
            finishTag(type, inProgress, annotanda, oldEnd);
            annotanda.add(new Annotandum(type, start, end, column, true));
          }
          
          // "L-FOO": last bit of "FOO": extend and 
          // close any "FOO" already in progress
          else if (prefixed && tag.startsWith("L-")) {
            String type = tag.substring(2);
            // bad data containing L-FOO without a B-FOO is treated as if
            // it were B-FOO, so the annotation covers just this token
            extendOrOpen(type, inProgress, start, end, column);
            finishTag(type, inProgress, annotanda, end);
          }
          
          // "B-FOO": start a new "FOO" annotation
          // after closing any "FOO" already in progress
          else if (prefixed && tag.startsWith("B-")) {
            String type = tag.substring(2);
            finishTag(type, inProgress, annotanda, oldEnd);
            inProgress.put(type, new Annotandum(type, start, end, column, true));
          }
          
          // "I-FOO": extend current "FOO" annotation; bad data containing
          // I-FOO without a B-FOO is treated as if it were B-FOO
          else if (prefixed && tag.startsWith("I-")) {
            extendOrOpen(tag.substring(2), inProgress, start, end, column);
          }
          
          // "FOO": treat as single-token annotation (such as POS tag)
          else { 
            annotanda.add(new Annotandum(tag, start, end, column, false));
          }
        }
      }
      
    }
    // end of input: close any remaining annotations
    finishAllTags(inProgress, annotanda, end);

    
    // set new content & create Original markups annotations
    try {
      DocumentContent newContentImpl = new DocumentContentImpl(newContent.toString());
      doc.edit(0L, doc.getContent().size(), newContentImpl);
      long newSize = doc.getContent().size();
      
      AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      for (Annotandum ann : annotanda) {
        if (DEBUG) {
          String string = Utils.stringFor(doc, ann.startOffset, 
              (ann.endOffset <= newSize) ? ann.endOffset : newSize);
          System.out.format("%d  %d  %s  %s\n", ann.startOffset, ann.endOffset, ann.type, string);
        }
        originalMarkups.add(ann.startOffset, ann.endOffset, ann.type, ann.features);
      }
    }
    catch (InvalidOffsetException e) {
      throw new DocumentFormatException(e);
    }
  }


  /* True if the split of a line yielded no token: either nothing at all
   * (whitespace-only line) or a single empty string (empty line).   */
  private static boolean isBlank(String[] items) {
    return items.length == 0
        || (items.length == 1 && items[0].length() == 0);
  }


  /* If an annotation of this type is in progress, move its end offset to
   * the end of the current token; otherwise open a new chunk annotation
   * covering just the current token.    */
  private static void extendOrOpen(String type, Map<String, Annotandum> annsUnderway,
      long start, long end, int column) {
    Annotandum open = annsUnderway.get(type);
    if (open != null) {
      open.endOffset = end;
    }
    else {
      annsUnderway.put(type, new Annotandum(type, start, end, column, true));
    }
  }

  
  /* Close any open annotations (typically at the end of a sentence).  Leave the existing
   * end offset on an annotation that has one, but chop it off if it's still unspecified.
   */
  private void finishAllTags(Map<String, Annotandum> annsUnderway,
      List<Annotandum> annsFinished, long cutoff) {
    for (Annotandum ann : annsUnderway.values()) {
      if (ann.endOffset == null) {
        ann.endOffset = cutoff;
      }
      annsFinished.add(ann);
    }
    annsUnderway.clear();
  }
  
  
  /* If there is an annotation in progress of this type, close it;
   * if not, do nothing.    */
  private void finishTag(String type, Map<String, Annotandum> annsUnderway,
      List<Annotandum> annsFinished, long cutoff) {
    Annotandum ann = annsUnderway.remove(type);
    if (ann != null) {
      if (ann.endOffset == null) {
        ann.endOffset = cutoff;
      }
      annsFinished.add(ann);
    }
  }
  
  
  
  /** Initialise this resource, and return it. */
  @Override
  public Resource init() throws ResourceInstantiationException{
    // Register ad hoc MIME-type
    MimeType mime = new MimeType("text","x-conll");
    // Register the class handler for this MIME-type
    mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), this);
    // Register the mime type with string
    mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
    // Register file suffixes for this mime type
    suffixes2mimeTypeMap.put("conll",mime);
    suffixes2mimeTypeMap.put("iob",mime);
    // Register magic numbers for this mime type
    //magic2mimeTypeMap.put("Subject:",mime);
    // Set the mimeType for this language resource
    setMimeType(mime);
    return this;
  }
  
}


/** Holder for the offsets, type and features of one annotation that is
 * still to be created in the "Original markups" annotation set.
 */
class Annotandum {
  protected Long startOffset;
  protected Long endOffset;
  protected String type;
  protected FeatureMap features;
  
  /* Annotation data with a fresh, empty feature map. */
  protected Annotandum(String type, Long startOffset, Long endOffset) {
    this(type, startOffset, endOffset, Factory.newFeatureMap());
  }
  
  /* Annotation data for a tag read from the given input column.
   * Chunkiness is decided by the tag structure, not by the span: a "B-Foo"
   * covering a single token is still chunky.  Tags inside the B/I/L/U
   * system are stored with kind "chunk", all other tags with kind "token".   */
  protected Annotandum(String type, Long startOffset, Long endOffset, int column, boolean chunky) {
    this(type, startOffset, endOffset);
    String kind;
    if (chunky) {
      kind = "chunk";
    }
    else {
      kind = "token";
    }
    this.features.put(ConllDocumentFormat.ANNOTATION_COLUMN_FEATURE, column);
    this.features.put(ConllDocumentFormat.ANNOTATION_KIND_FEATURE, kind);
  }

  /* Annotation data using the supplied feature map as is. */
  protected Annotandum(String type, Long startOffset, Long endOffset, FeatureMap features) {
    this.type = type;
    this.startOffset = startOffset;
    this.endOffset = endOffset;
    this.features = features;
  }

  /* Data for a "Token" annotation over [start, end) carrying the token's
   * length and string features.   */
  protected static Annotandum makeToken(long start, long end, String string) {
    FeatureMap tokenFeatures = Factory.newFeatureMap();
    int tokenLength = (int) (end - start);
    tokenFeatures.put(ANNIEConstants.TOKEN_LENGTH_FEATURE_NAME, tokenLength);
    tokenFeatures.put(ANNIEConstants.TOKEN_STRING_FEATURE_NAME, string);
    return new Annotandum("Token", start, end, tokenFeatures);
  }
  
  /* Data for a one-character "SpaceToken" annotation beginning at start. */
  protected static Annotandum makeSpaceToken(long start) {
    FeatureMap spaceFeatures = Factory.newFeatureMap();
    spaceFeatures.put(ANNIEConstants.TOKEN_LENGTH_FEATURE_NAME, 1);
    spaceFeatures.put(ANNIEConstants.TOKEN_STRING_FEATURE_NAME, " ");
    return new Annotandum("SpaceToken", start, start + 1L, spaceFeatures);
  }
  
  
}