All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.plugin.python.PythonWorker Maven / Gradle / Ivy

/*
 * Copyright (c) 2019 The University of Sheffield.
 *
 * This file is part of gateplugin-Python 
 * (see https://github.com/GateNLP/gateplugin-Python).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

package gate.plugin.python;

import gate.Corpus;
import gate.CorpusController;
import gate.Document;
import gate.DocumentExporter;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.ProcessingResource;
import gate.Resource;
import gate.corpora.DocumentStaxUtils;
import gate.creole.AbstractController;
import gate.creole.ExecutionException;
import gate.creole.Plugin;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.gui.ResourceHelper;
import gate.gui.creole.manager.PluginUpdateManager;
import gate.persist.PersistenceException;
import gate.util.GateException;
import gate.util.GateRuntimeException;
import gate.util.persistence.PersistenceManager;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import javax.xml.stream.XMLStreamException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import py4j.GatewayServer;

/**
 * Run Java/GATE from python through this class.
 * 
 * @author Johann Petrak
 */
public class PythonWorker {

  /**
   * Logger used for the optional per-action messages written by the worker
   * methods (see {@link #logActions}).
   */
  public transient final Logger LOGGER = LoggerFactory.getLogger(this.getClass());

  /**
   * The py4j gateway server; {@link #kill()} shuts it down.
   * NOTE(review): this field is never assigned inside this class, presumably
   * it is set by a subclass or by the code that starts the worker -- confirm.
   */
  protected GatewayServer server;

  /**
   * If we want to log all commands.
   */
  public boolean logActions = true;

  /**
   * Indicate if the worker should be kept running. This is the case if it was started by the PythonWorkerLr
   * or if the worker running does not clear this.
   */
  public boolean keepRunning = true;

  // Scratch corpus created once in the constructor; run4Document clears it and
  // puts the single document to process into it before running the pipeline.
  private Corpus tmpCorpus;
  
  // For using the Format_Bdoc API
  // Since the Python plugin run-time depends on the format bdoc plugin,
  // this should ALWAYS SUCCEED without a problem!
  // The lookup takes the first instantiation registered for the API class.
  protected ResourceHelper rhBdocApi = (ResourceHelper)Gate.getCreoleRegister()
                     .get("gate.plugin.format.bdoc.API")
                     .getInstantiations().iterator().next();    
  
  /**
   * Create a worker instance together with its internal scratch corpus.
   *
   * @throws ResourceInstantiationException if the scratch corpus cannot be created
   */
  public PythonWorker() throws ResourceInstantiationException {
    this.tmpCorpus = Factory.newCorpus("tmpCorpus");
  }
  
  /**
   * Register the Maven plugin with the given coordinates in the CREOLE register.
   *
   * @param group maven group id of the plugin
   * @param artifact maven artifact id of the plugin
   * @param version maven version of the plugin
   * @throws gate.util.GateException if registering the plugin fails
   */
  public void loadMavenPlugin(
          String group, String artifact, String version) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: load Maven plugin "+group+"/"+artifact+"/"+version);
    }
    Plugin.Maven mavenPlugin = new Plugin.Maven(group, artifact, version);
    Gate.getCreoleRegister().registerPlugin(mavenPlugin);
  }
  
  /**
   * Restore a saved pipeline (gapp/xgapp) from a file.
   *
   * Persistence, I/O and instantiation problems are rethrown wrapped in a
   * GateRuntimeException.
   *
   * @param path path of the gapp/xgapp file
   * @return the loaded object, cast to a corpus controller
   */
  public CorpusController loadPipelineFromFile(String path) {
    if (logActions) {
      LOGGER.info("Worker run: load controller from "+path);
    }
    File pipelineFile = new File(path);
    try {
      Object loaded = PersistenceManager.loadObjectFromFile(pipelineFile);
      return (CorpusController)loaded;
    } catch (PersistenceException | IOException | ResourceInstantiationException e) {
      throw new GateRuntimeException("Could not load pipeline from "+path, e);
    }
  }
  
  /**
   * Find and return a known Maven plugin instance.
   *
   * The plugins currently registered with the CREOLE register are searched
   * first, followed by the default plugins reported by the
   * PluginUpdateManager. The first plugin whose group and artifact both match
   * is returned; the version is not considered.
   *
   * @param group plugin group
   * @param artifact plugin artifact
   * @return the plugin instance or null if nothing is found
   */
  public Plugin.Maven findMavenPlugin(String group, String artifact) {
    if (logActions) {
      LOGGER.info("Worker run: find Maven plugin "+group+"/"+artifact);
    }
    // The element type must be declared: a raw Set cannot be iterated as Plugin.
    // LinkedHashSet keeps the registered plugins ahead of the default ones.
    Set<Plugin> allPlugins = new LinkedHashSet<>(Gate.getCreoleRegister().getPlugins());
    allPlugins.addAll(PluginUpdateManager.getDefaultPlugins());
    for (Plugin plugin : allPlugins) {
      if (plugin instanceof Plugin.Maven) {
        Plugin.Maven mp = (Plugin.Maven)plugin;
        if (mp.getGroup().equals(group) && mp.getArtifact().equals(artifact)) {
          return mp;
        }
      }
    }
    return null;
  }
  
  
  /**
   * Load a pipeline from the maven plugin resources.
   * 
   * Example: "uk.ac.gate.plugins", "annie", "/resources/ANNIE_with_defaults.gapp"
   * 
   * The plugin is looked up with findMavenPlugin, so it has to be known
   * already; otherwise a GateRuntimeException is thrown. Problems creating the
   * resource reference or restoring the pipeline are rethrown as
   * GateRuntimeException with the original exception as cause.
   * 
   * @param group the plugin group
   * @param artifact the plugin artifact
   * @param path the path in the plugin resources
   * @return controller
   * @throws java.net.URISyntaxException if the resource reference cannot be 
   * converted to a URI
   */
  public CorpusController loadPipelineFromPlugin(String group, String artifact, String path) throws URISyntaxException {
    if (logActions) {
      LOGGER.info("Worker run: load pipeline "+path+" from plugin "+group+"/"+artifact);
    }
    Plugin.Maven mp = findMavenPlugin(group, artifact);
    if(mp == null) {
      throw new GateRuntimeException(
              "Could not find plugin "+group+"/"+artifact+", please load it first!");
    }
    ResourceReference rr;
    try {
      rr = new ResourceReference(mp, path);
    } catch (URISyntaxException ex) {
      // keep the cause so the offending path shows up in the stack trace
      throw new GateRuntimeException("Could not create ResourceReference for the pipeline", ex);
    }
    // no null check needed here: a constructor call never yields null
    try {
      return (CorpusController)PersistenceManager.loadObjectFromUri(rr.toURI());
    } catch (PersistenceException | IOException | ResourceInstantiationException ex) {
      throw new GateRuntimeException("Could not load pipeline from "+path, ex);
    } 
  }
  
  /**
   * Load a document from a file without specifying a mime type.
   * 
   * Delegates to the two-argument variant with a null mime type, so no
   * "mimeType" parameter is set when the document is created.
   * 
   * @param path file path of the document to load
   * @return the loaded document
   */
  public Document loadDocumentFromFile(String path) {
    if (logActions) {
      LOGGER.info("Worker run: load document from "+path);
    }
    final String noMimeType = null;
    return loadDocumentFromFile(path, noMimeType);
  }
  
  /**
   * Build a new GATE document whose content is the given text.
   * 
   * @param content the text of the document
   * @return the newly created document
   */
  public Document createDocument(String content) {
    if (logActions) {
      LOGGER.info("Worker run: createDocument from content");
    }
    Document created;
    try {
      created = Factory.newDocument(content);
    } catch (ResourceInstantiationException e) {
      throw new GateRuntimeException("Could not create document", e);
    }
    return created;
  }
  
  /**
   * Create a new, empty corpus.
   * 
   * The corpus name is "Corpus_" followed by a symbol generated by
   * Gate.genSym().
   * 
   * @return the new corpus
   */
  public Corpus newCorpus() {
    if (logActions) {
      LOGGER.info("Worker run: create new corpus");
    }
    try {
      return Factory.newCorpus("Corpus_"+Gate.genSym());
    } catch (ResourceInstantiationException ex) {
      // the message used to say "document", which was misleading for a corpus
      throw new GateRuntimeException("Could not create corpus", ex);
    }    
  }
  
  /**
   * Delete a GATE resource and release memory.
   * 
   * @param res the resource to remove
   */
  public void deleteResource(Resource res) {
    if (logActions) {
      // separator added: the resource name used to be glued onto "resource"
      LOGGER.info("Worker run: remove resource "+res.getName());
    }
    Factory.deleteResource(res);
  }
  
  /**
   * Execute a pipeline on one document.
   * 
   * The document is placed, as the only member, into the internal scratch 
   * corpus which is then set as the corpus of the pipeline. For pipelines that
   * are AbstractController instances the controller callbacks are disabled 
   * before execution.
   * 
   * @param pipeline the pipeline to run
   * @param doc the document to process
   */
  public void run4Document(CorpusController pipeline, Document doc) {
    if (logActions) {
      LOGGER.info("Worker run: run controller "+pipeline.getName()+" for "+doc.getName());
    }
    tmpCorpus.clear();
    tmpCorpus.add(doc);
    if (pipeline instanceof AbstractController) {
      AbstractController controller = (AbstractController)pipeline;
      controller.setControllerCallbacksEnabled(false);
    }
    pipeline.setCorpus(tmpCorpus);
    try {
      pipeline.execute();
    } catch (ExecutionException e) {
      throw new GateRuntimeException("Exception when running the pipeline", e);
    }
  }
  
  /**
   * Invoke the "controller execution started" code of a pipeline.
   * 
   * Meant to be called once before documents are processed one by one with
   * run4Document. Nothing happens for pipelines that are not 
   * AbstractController instances.
   * 
   * @param pipeline pipeline
   */
  public void runExecutionStarted(CorpusController pipeline) {
    if (logActions) {
      LOGGER.info("Worker run: run executionStarted for "+pipeline.getName());
    }
    if (!(pipeline instanceof AbstractController)) {
      return;
    }
    AbstractController controller = (AbstractController)pipeline;
    try {
      controller.invokeControllerExecutionStarted();
    } catch (ExecutionException e) {
      throw new GateRuntimeException("Problem running ExecutionStarted", e);
    }
  }

  /**
   * Invoke the "controller execution finished" code of a pipeline.
   * 
   * Meant to be called once after all documents have been processed one by one
   * with run4Document. Nothing happens for pipelines that are not 
   * AbstractController instances.
   * 
   * @param pipeline pipeline
   */
  public void runExecutionFinished(CorpusController pipeline) {
    if (logActions) {
      LOGGER.info("Worker run: run executionFinished for "+pipeline.getName());
    }
    if (!(pipeline instanceof AbstractController)) {
      return;
    }
    AbstractController controller = (AbstractController)pipeline;
    try {
      controller.invokeControllerExecutionFinished();
    } catch (ExecutionException e) {
      throw new GateRuntimeException("Problem running ExecutionFinished", e);
    }
  }
  
  /**
   * Run the pipeline on the given corpus.
   * 
   * For pipelines that are AbstractController instances the controller 
   * callbacks are enabled before execution. An ExecutionException is rethrown
   * wrapped in a GateRuntimeException.
   * 
   * @param pipeline the pipeline
   * @param corpus  the corpus
   */
  public void run4Corpus(CorpusController pipeline, Corpus corpus) {
    if (logActions) {
      // log text corrected: it used to read "pipline"
      LOGGER.info("Worker run: run pipeline "+pipeline.getName()+" for corpus "+corpus.getName());
    }
    if(pipeline instanceof AbstractController) {
      ((AbstractController)pipeline).setControllerCallbacksEnabled(true);
    }
    pipeline.setCorpus(corpus);
    try {
      pipeline.execute();
    } catch (ExecutionException ex) {
      throw new GateRuntimeException("Exception when running the pipeline", ex);
    }  
  }
  
  /**
   * Load a document from a file, optionally forcing a mime type.
   * 
   * The document is created as a gate.corpora.DocumentImpl with the file URL as
   * "sourceUrl" and the encoding fixed to "utf-8". The "mimeType" parameter is
   * only set when a non-null mime type is given.
   * 
   * @param path path of the file to load
   * @param mimeType mime type to use, or null to not specify one
   * @return the loaded document
   */
  public Document loadDocumentFromFile(String path, String mimeType) {
    if (logActions) {
      LOGGER.info("Worker run: load document from "+path+" mimetype:"+mimeType);
    }
    FeatureMap docParams = Factory.newFeatureMap();
    try {
      docParams.put("sourceUrl", new File(path).toURI().toURL());
      if (mimeType != null) {
        docParams.put("mimeType", mimeType);
      }
      docParams.put("encoding", "utf-8");
      return (Document)Factory.createResource("gate.corpora.DocumentImpl", docParams);
    } catch (ResourceInstantiationException | MalformedURLException e) {
      throw new GateRuntimeException("Could not load document from "+path, e);
    }
  }
  
  /**
   * Save document to a file.
   * 
   * NOTE: currently there is no way in GATE to register a document format
   * for saving a document with a specific mime type. So this function currently
   * only recognizes a few hard-coded mime types and rejects all others with
   * a GateRuntimeException.
   * 
   * The mime types are: null or "" (empty string) for the default GATE xml 
   * serialization; "application/fastinfoset"; and the Format_Bdoc types
   * "text/bdocsjson", "text/bdocsjs", "text/bdocjson", "text/bdocjs",
   * "text/bdocsjson+gzip", "text/bdocsjs+gzip", "text/bdocjson+gzip",
   * "text/bdocjs+gzip" and "application/bdocmp".
   * 
   * NOTE: for fastinfoset the plugin must first have been loaded with 
   * loadMavenPlugin("uk.ac.gate.plugins","format-fastinfoset","8.5") or 
   * whatever the wanted version is.
   * 
   * @param doc the document to save
   * @param path file
   * @param mimetype  mime type
   * @throws java.io.IOException if something goes wrong saving
   * @throws javax.xml.stream.XMLStreamException if something goes wrong when saving
   */
  public void saveDocumentToFile(Document doc, String path, String mimetype)
          throws IOException, XMLStreamException {
    if (logActions) {
      LOGGER.info("Worker run: save document to "+path+" mimetype:"+mimetype);
    }
    File target = new File(path);
    if(mimetype == null || mimetype.isEmpty()) {
      DocumentStaxUtils.writeDocument(doc, target);
      return;
    }
    String exporterClass;
    switch (mimetype) {
      case "application/fastinfoset":
        exporterClass = "gate.corpora.FastInfosetExporter";
        break;
      case "text/bdocsjson":
      case "text/bdocsjs":
        exporterClass = "gate.plugin.format.bdoc.FormatBdocSimpleJson";
        break;
      case "text/bdocjson":
      case "text/bdocjs":
        exporterClass = "gate.plugin.format.bdoc.FormatBdocJson";
        break;
      case "text/bdocsjson+gzip":
      case "text/bdocsjs+gzip":
        exporterClass = "gate.plugin.format.bdoc.FormatBdocSimpleJsonGzip";
        break;
      case "text/bdocjson+gzip":
      case "text/bdocjs+gzip":
        exporterClass = "gate.plugin.format.bdoc.FormatBdocJsonGzip";
        break;
      case "application/bdocmp":
        exporterClass = "gate.plugin.format.bdoc.BdocMsgPack";
        break;
      default:
        // fail loudly: silently writing nothing would look like a successful save
        throw new GateRuntimeException(
                "Unsupported mime type for saving a document: "+mimetype);
    }
    findDocumentExporter(exporterClass).export(doc, target, Factory.newFeatureMap());
  }

  /**
   * Look up the first registered instance of the exporter with the given class name.
   * 
   * @param className fully qualified class name of the exporter resource
   * @return the exporter instance
   */
  private DocumentExporter findDocumentExporter(String className) {
    if(!Gate.getCreoleRegister().containsKey(className)) {
      throw new GateRuntimeException(
              "Exporter "+className+" is not registered, has the plugin been loaded?");
    }
    List<Resource> instances = Gate.getCreoleRegister().get(className).getInstantiations();
    if(instances.isEmpty()) {
      throw new GateRuntimeException("No instance available for exporter "+className);
    }
    return (DocumentExporter)instances.get(0);
  }
  
  /**
   * Serialize a document to Bdoc JSON.
   * 
   * Invokes the "json_from_doc" action of the Format_Bdoc API resource helper.
   * 
   * @param doc the document to serialize
   * @return the JSON string
   */
  public String getBdocJson(Document doc) {
    if (logActions) {
      LOGGER.info("Worker run: get bdocjson for "+doc.getName());
    }
    ResourceHelper bdocApi = (ResourceHelper)Gate.getCreoleRegister()
            .get("gate.plugin.format.bdoc.API")
            .getInstantiations().iterator().next();
    try {
      return (String)bdocApi.call("json_from_doc", doc);
    } catch (NoSuchMethodException | IllegalArgumentException
            | IllegalAccessException | InvocationTargetException e) {
      throw new GateRuntimeException("Could not convert GATE document to json", e);
    }
  }
  
  /**
   * Build a new GATE document from a Bdoc JSON string.
   * 
   * Invokes the "doc_from_json" action of the Format_Bdoc API resource helper,
   * passing null as the resource and the JSON as the only parameter.
   * 
   * @param bdocjson the Bdoc JSON serialization
   * @return the document created from the JSON
   * @throws gate.creole.ResourceInstantiationException should never occur
   */
  public Document getDocument4BdocJson(String bdocjson) 
          throws ResourceInstantiationException {
    if (logActions) {
      LOGGER.info("Worker run: create document from bdocjson");
    }
    ResourceHelper bdocApi = (ResourceHelper)Gate.getCreoleRegister()
            .get("gate.plugin.format.bdoc.API")
            .getInstantiations().iterator().next();
    try {
      return (Document)bdocApi.call("doc_from_json", null, bdocjson);
    } catch (NoSuchMethodException | IllegalArgumentException
            | IllegalAccessException | InvocationTargetException e) {
      throw new GateRuntimeException("Error invoking Format Bdoc API method doc_from_json", e);
    }
  }

  /**
   * Write a string to the standard output stream of the worker process.
   * 
   * The text is written as is, without a trailing new line.
   * 
   * @param txt the text to write
   */
  public void print2out(String txt) {
    System.out.print(txt);
  }

  /**
   * Write a string to the standard error stream of the worker process.
   * 
   * The text is written as is, without a trailing new line.
   * 
   * @param txt the text to write
   */
  public void print2err(String txt) {
    System.err.print(txt);
  }
  
  /**
   * Activate the GUI.
   * 
   * Does nothing if the GATE main frame is already visible. Otherwise the
   * user preferences are applied and the main frame is made visible through
   * SwingUtilities.invokeLater.
   * 
   * Caution: experimental!
   * 
   */
  public void showGui() {
    if (logActions) {
      // log text corrected: it used to read "sho GUI"
      LOGGER.info("Worker run: show GUI");
    }
    if (gate.gui.MainFrame.getInstance().isVisible()) {
      return;
    }
    Runnable r = new Runnable() {
      @Override
      public void run() {
        gate.Main.applyUserPreferences();
        gate.gui.MainFrame.getInstance().setVisible(true);
      }      
    };
    javax.swing.SwingUtilities.invokeLater(r);
  }
  
  /**
   * Return a list of all resources with the given name.
   * 
   * All instances of type "gate.Resource" known to the CREOLE register are 
   * checked and only those whose name equals the given name are returned.
   * 
   * @param name resource name
   * @return list of matching resources, empty if there is none
   * @throws GateException if the instances cannot be retrieved
   */
  public List<Resource> getResources4Name(String name) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get resource for name "+name);
    }
    List<Resource> matching = new ArrayList<>();
    for (Resource candidate : Gate.getCreoleRegister().getAllInstances("gate.Resource")) {
      String candidateName = candidate.getName();
      // null-safe comparison: the name argument used to be ignored entirely
      if (name == null ? candidateName == null : name.equals(candidateName)) {
        matching.add(candidate);
      }
    }
    return matching;
  }
  
  /**
   * Return a list of all resources with the given name and class.
   * 
   * All instances of the given type known to the CREOLE register are checked 
   * and only those whose name equals the given name are returned.
   * 
   * @param name resource name
   * @param clazz resource class name
   * @return list of matching resources, empty if there is none
   * @throws GateException if the instances cannot be retrieved
   */
  public List<Resource> getResources4Name(String name, String clazz) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get resource for name "+name+" and class "+clazz);
    }
    List<Resource> matching = new ArrayList<>();
    for (Resource candidate : Gate.getCreoleRegister().getAllInstances(clazz)) {
      String candidateName = candidate.getName();
      // null-safe comparison: the name argument used to be ignored entirely
      if (name == null ? candidateName == null : name.equals(candidateName)) {
        matching.add(candidate);
      }
    }
    return matching;
  }
  
  /**
   * Look up a document by name.
   * 
   * Returns the first entry of the list that getResources4Name produces for 
   * the name and the type "gate.Document".
   * 
   * @param name the document name
   * @return the first document in that list, or null if the list is empty
   * @throws GateException if the lookup fails
   */
  public Document getDocument4Name(String name) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get document for name "+name);
    }
    List<?> found = getResources4Name(name, "gate.Document");
    return found.isEmpty() ? null : (Document)found.get(0);
  }

  /**
   * Look up a corpus by name.
   * 
   * Returns the first entry of the list that getResources4Name produces for 
   * the name and the type "gate.Corpus".
   * 
   * @param name the corpus name
   * @return the first corpus in that list, or null if the list is empty
   * @throws GateException if the lookup fails
   */
  public Corpus getCorpus4Name(String name) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get corpus with name "+name);
    }
    List<?> found = getResources4Name(name, "gate.Corpus");
    if (found.isEmpty()) {
      return null;
    }
    return (Corpus)found.get(0);
  }

  /**
   * Look up a pipeline by name.
   * 
   * Returns the first entry of the list that getResources4Name produces for 
   * the name and the type "gate.CorpusController".
   * 
   * @param name the pipeline name
   * @return the first pipeline in that list, or null if the list is empty
   * @throws GateException if the lookup fails
   */
  public CorpusController getPipeline4Name(String name) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get pipeline with name "+name);
    }
    List<?> found = getResources4Name(name, "gate.CorpusController");
    return found.isEmpty() ? null : (CorpusController)found.get(0);
  }

  /**
   * Look up a processing resource by name.
   * 
   * Returns the first entry of the list that getResources4Name produces for 
   * the name and the type "gate.ProcessingResource".
   * 
   * @param name the pr name
   * @return the first pr in that list, or null if the list is empty
   * @throws GateException if the lookup fails
   */
  public ProcessingResource getPr4Name(String name) throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get processing resource with name "+name);
    }
    List<?> found = getResources4Name(name, "gate.ProcessingResource");
    if (found.isEmpty()) {
      return null;
    }
    return (ProcessingResource)found.get(0);
  }
  
  /**
   * Return list of all known document names.
   * 
   * The names are those of all "gate.Document" instances known to the CREOLE
   * register, in the order the register returns them.
   * 
   * @return list of names
   * @throws GateException if the instances cannot be retrieved
   */
  public List<String> getDocumentNames() throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get known document names");
    }
    // element types declared so that the loop below is type-correct
    List<Resource> instances = Gate.getCreoleRegister().getAllInstances("gate.Document");
    List<String> names = new ArrayList<>(instances.size());
    for(Resource r : instances) {
      names.add(r.getName());
    }
    return names;
  }

  /**
   * Return list of all known corpus names.
   * 
   * The names are those of all "gate.Corpus" instances known to the CREOLE
   * register, in the order the register returns them.
   * 
   * @return list of names
   * @throws GateException if the instances cannot be retrieved
   */
  public List<String> getCorpusNames() throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get known corpus names");
    }
    // element types declared so that the loop below is type-correct
    List<Resource> instances = Gate.getCreoleRegister().getAllInstances("gate.Corpus");
    List<String> names = new ArrayList<>(instances.size());
    for(Resource r : instances) {
      names.add(r.getName());
    }
    return names;
  }
  
  /**
   * Return list of all known pipeline names.
   * 
   * The names are those of all "gate.CorpusController" instances known to the
   * CREOLE register, in the order the register returns them.
   * 
   * @return list of names
   * @throws GateException if the instances cannot be retrieved
   */
  public List<String> getPipelineNames() throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get known pipeline names");
    }
    // element types declared so that the loop below is type-correct
    List<Resource> instances = Gate.getCreoleRegister().getAllInstances("gate.CorpusController");
    List<String> names = new ArrayList<>(instances.size());
    for(Resource r : instances) {
      names.add(r.getName());
    }
    return names;
  }

  /**
   * Return list of all known processing resource names.
   * 
   * The names are those of all "gate.ProcessingResource" instances known to
   * the CREOLE register, in the order the register returns them.
   * 
   * @return list of names
   * @throws GateException if the instances cannot be retrieved
   */
  public List<String> getPrNames() throws GateException {
    if (logActions) {
      LOGGER.info("Worker run: get known resource names");
    }
    // element types declared so that the loop below is type-correct
    List<Resource> instances = Gate.getCreoleRegister().getAllInstances("gate.ProcessingResource");
    List<String> names = new ArrayList<>(instances.size());
    for(Resource r : instances) {
      names.add(r.getName());
    }
    return names;
  }

  
  public String jsonAnnsets4Doc(Document doc, List> annsets) {
    try {
      return (String)rhBdocApi.call("jsonannsets_from_docanns", doc, annsets);
    } catch (IllegalAccessException | IllegalArgumentException | 
            NoSuchMethodException | InvocationTargetException ex) {
      throw new GateRuntimeException("Problem retrieving the annotations", ex);
    }
  }
  
  /**
   * Get a JSON string for annotation sets of a document without giving an
   * annotation set specification.
   * 
   * Delegates to the two-argument variant, passing null as the second argument.
   * 
   * @param doc the document
   * @return the JSON string returned by the two-argument variant
   */
  public String jsonAnnsets4Doc(Document doc) {
    String json = jsonAnnsets4Doc(doc, null);
    return json;
  }
  
  
  /**
   * Switch logging of the worker actions on or off.
   * 
   * The change itself is logged only if logging was enabled before the call.
   *
   * @param flag true to enable logging, false to disable it
   */
  public void logActions(boolean flag) {
    if (this.logActions) {
      LOGGER.info("Worker run: set logActions to "+flag);
    }
    this.logActions = flag;
  }

  /**
   * Kill the worker by shutting down the gateway server.
   * 
   * This may cause data to get lost as the worker may not send something that 
   * is expected to the python master.
   */
  public void kill() {
    this.server.shutdown();
  }

  /**
   * Tell if the worker may be closed.
   * 
   * This is the negation of the keepRunning flag.
   *
   * @return true if keepRunning is false, false otherwise
   */
  public boolean isClosable() {
    if (keepRunning) {
      return false;
    }
    return true;
  }

  /**
   * Get the version of the Python plugin as reported by VersionLogger.
   * 
   * @return the plugin version
   */
  public String pluginVersion() {
    String version = VersionLogger.getPluginVersion();
    return version;
  }
  
  /**
   * Get the build (short commit id) of the Python plugin as reported by 
   * VersionLogger.
   * 
   * @return the plugin build
   */
  public String pluginBuild() {
    String build = VersionLogger.getPluginBuild();
    return build;
  }
  
  /**
   * Get the GATE version, read from gate.Main.version.
   * 
   * @return the GATE version
   */
  public String gate_version() {
    String version = gate.Main.version;
    return version;
  }
  
  /**
   * Get the GATE build (short commit id), read from gate.Main.build.
   * 
   * @return the GATE build
   */
  public String gate_build() {
    String build = gate.Main.build;
    return build;
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy