All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.corpora.DataSiftFormat Maven / Gradle / Ivy

The newest version!
/*
 *  Copyright (c) 1995-2014, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 *  Mark A. Greenwood, 23/06/2014
 */
package gate.corpora;

import gate.AnnotationSet;
import gate.DocumentContent;
import gate.FeatureMap;
import gate.Resource;
import gate.corpora.datasift.DataSift;
import gate.corpora.datasift.Interaction;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.util.DocumentFormatException;
import gate.util.InvalidOffsetException;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;

@CreoleResource(name = "GATE DataSift JSON Document Format", isPrivate = true, autoinstances = {@AutoInstance(hidden = true)}, comment = "Format parser for DataSift JSON files")
public class DataSiftFormat extends TextualDocumentFormat {

  private static final long serialVersionUID = -1768496491819413313L;

  /** Initialise this resource, and return it. */
  public Resource init() throws ResourceInstantiationException {
    // Register ad hoc MIME-type
    // There is an application/json mime type, but I don't think
    // we want everything to be handled this way?
    MimeType mime = new MimeType("text", "x-json-datasift");
    // Register the class handler for this MIME-type
    mimeString2ClassHandlerMap.put(mime.getType() + "/" + mime.getSubtype(),
        this);
    // Register the mime type with string
    mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
    // Register file suffixes for this mime type
    suffixes2mimeTypeMap.put("datasift.json", mime);
    // Register magic numbers for this mime type
    // magic2mimeTypeMap.put("Subject:",mime);
    // Set the mimeType for this language resource
    setMimeType(mime);
    return this;
  }

  @Override
  public void cleanup() {
    super.cleanup();

    MimeType mime = getMimeType();

    mimeString2ClassHandlerMap.remove(mime.getType() + "/" + mime.getSubtype());
    mimeString2mimeTypeMap.remove(mime.getType() + "/" + mime.getSubtype());
    suffixes2mimeTypeMap.remove("datasift.json");
  }

  @Override
  public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    if((doc == null)
        || (doc.getSourceUrl() == null && doc.getContent() == null)) { throw new DocumentFormatException(
        "GATE document is null or no content found. Nothing to parse!"); }

    setNewLineProperty(doc);
    String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());

    // TODO build the new content
    StringBuilder concatenation = new StringBuilder();
    
    try {
      ObjectMapper om = new ObjectMapper();
      
      JsonFactory factory = new JsonFactory(om);
      JsonParser parser = factory.createParser(jsonString);
      
      Map offsets = new HashMap();
      
      Iterator it = parser.readValuesAs(DataSift.class);
      while(it.hasNext()) {
        DataSift ds = it.next();
        offsets.put(ds,(long)concatenation.length());
        concatenation.append(ds.getInteraction().getContent()).append("\n\n");
      }
      
      // Set new document content
      DocumentContent newContent =
          new DocumentContentImpl(concatenation.toString());

      doc.edit(0L, doc.getContent().size(), newContent);
      
      AnnotationSet originalMarkups = doc.getAnnotations("Original markups");
      for (Map.Entry item : offsets.entrySet()) {
        DataSift ds = item.getKey();
        Interaction interaction = ds.getInteraction();
        Long start = item.getValue();
        
        FeatureMap features = interaction.asFeatureMap();        
        features.putAll(ds.getFurtherData());
        
        originalMarkups.add(start,start+interaction.getContent().length(),"Interaction",features);            
      }
      
    } catch(InvalidOffsetException | IOException e) {
      throw new DocumentFormatException(e);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy