All downloads are free. Search and download functionality uses the official Maven repository.

gate.plugin.learningframework.engines.EngineDVFileJson Maven / Gradle / Ivy

Go to download

A GATE plugin that provides many different machine learning algorithms for a wide range of NLP-related machine learning tasks like text classification, tagging, or chunking.

There is a newer version: 4.2
Show newest version
/*
 *  Copyright (c) The University of Sheffield.
 *
 *  This file is free software, licensed under the 
 *  GNU Library General Public License, Version 2.1, June 1991.
 *  See the file LICENSE.txt that comes with this software.
 *
 */
package gate.plugin.learningframework.engines;

import com.fasterxml.jackson.databind.ObjectMapper;
import gate.Annotation;
import gate.AnnotationSet;
import gate.lib.interaction.process.Process4StringStream;
import gate.lib.interaction.process.ProcessBase;
import gate.lib.interaction.process.ProcessSimple;
import gate.plugin.learningframework.EvaluationMethod;
import gate.plugin.learningframework.ModelApplication;
import gate.plugin.learningframework.data.CorpusRepresentation;
import gate.plugin.learningframework.data.CorpusRepresentationVolatileDense2JsonStream;
import gate.plugin.learningframework.data.InstanceRepresentation;
import gate.plugin.learningframework.features.FeatureInfo;
import gate.plugin.learningframework.features.TargetType;
import gate.util.Files;
import gate.util.GateRuntimeException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.yaml.snakeyaml.Yaml;

/**
 * Common base class for all Engines which are dense, volatile and write JSON to a file.
 * 
 * 

* See Wiki * * @author Johann Petrak */ public abstract class EngineDVFileJson extends EngineDV { // NOTEs about how to find python: // On linux, python is usually on the path, but we cannot be sure if it is python 3 // In some cases, python3 is the version 3 command. // On Windows (10) if installing using anaconda3, the installer recommends NOT // putting python on the path and to install for the user only by default. // !! In that case, it gets installed by default into C:\\Users\\username\\Anaconda3 which // contains python.exe, but things like ipython.exe are in the Scripts subdirectory. // If installing for all, it gets installed by default into C:\\ProgramData\\Anaconda3 // If installing pythong from python.org we have the options: // 1) Windows executable installer (python-3.6.5-amd64.exe): // this one recommends to install for all users but does not add to PATH by default // Also recommends to disable the path length limit // Installing for all: This gets installed into C:\\Users\\username\\AppData\\Local\\Programs\\Python\\Python36 // which contains python, Scripts contains pip // 2) Web-based installer (python-3.6.5-amd64-webinstall.exe): seems to use the same location // (could not test after already installed using the other installer) // So in order to find python we do the following: // 1) check if there is a config file in the data dir, use the pythonhome variable from there // 2) check if the PYTHON_BIN environment variable is set, use it as full path to the executable // 3) try to find it on the executable path (this is not easy in Java, instead try to run python/python3 // or python.exe with parameter "-V" to get version. May use Runtime.getRuntime().exec("...") in a // try catch for that or own interaction library to get back the output. 
// 3) if on Windows, check one of the two paths above // 4) if on Linux check in decreasing order of importance: /usr/bin/python3 or /usr/bin/python // Wrapper name: this is set by the actual implementing engine class and // will influence the file name of scripts, config files etc specific to that // wrapper. protected String WRAPPER_NAME = "WRAPPER_NAME MUST BE OVERRIDEN BY IMPLEMENTING SUBCLASS"; protected String MODEL_BASENAME = "MODEL_BASENAME MUST BE OVERRIDEN BY IMPLEMENTING SUBCLASS"; // all implementing subclasses will follow these name conventions protected String COMMAND_BASE_TRAIN = "train"; protected String COMMAND_BASE_APPLY = "apply"; protected File dataDir; // the model/data directory as specified when creating the engine but as a File protected String getWrapperHome() { File wrapperRoot = new File(dataDir, WRAPPER_NAME); return wrapperRoot.getAbsolutePath(); } /** * Get the command path to run for training. * * @return the training command path */ protected String getCommandPathTrain() { File wrapperRoot = new File(dataDir, WRAPPER_NAME); String ext = getShellExtension(); File cmd = new File(wrapperRoot, COMMAND_BASE_TRAIN+ext); return cmd.getAbsolutePath(); } /** * Get the command path to run for application. * * @return application command path */ protected String getCommandPathApply() { File wrapperRoot = new File(dataDir, WRAPPER_NAME); String ext = getShellExtension(); File cmd = new File(wrapperRoot, COMMAND_BASE_APPLY + ext); return cmd.getAbsolutePath(); } /** * Get the shell extensions for the operating system. * @return shell extension, including the dot */ protected String getShellExtension() { return getOsType() == OsType.LINUXLIKE ? 
".sh" : ".cmd"; } /** * Return LINUXLIKE, WINDOWSLIKE, or throw an exception for anything else * * @return OS type */ protected OsType getOsType() { boolean linuxLike = System.getProperty("file.separator").equals("/"); boolean windowsLike = System.getProperty("file.separator").equals("\\"); if(linuxLike) { return OsType.LINUXLIKE; } else if(windowsLike) { return OsType.WINDOWSLIKE; } else { throw new GateRuntimeException("It appears this OS is not supported"); } } /** * Known OS types. */ public enum OsType { WINDOWSLIKE, LINUXLIKE } /** * Read a WRAPPERNAME.yaml file from the data dir, if it exists. * @return the map of settings */ @SuppressWarnings("unchecked") public Map getWrapperConfig() { File wrapperInfoFile = new File(dataDir,WRAPPER_NAME+".yaml"); if(!wrapperInfoFile.exists()) { // Windows is just insane and hides a txt extension if a user creates // a "text file" with a yaml extensions, so lets allow .yaml.txt as well wrapperInfoFile = new File(dataDir,WRAPPER_NAME+".yaml.txt"); } // System.err.println("DEBUG: wrapper file: "+wrapperInfoFile.getAbsolutePath()); if(wrapperInfoFile.exists()) { // System.err.println("DEBUG: seems to exist ..."); Yaml yaml = new Yaml(); Object obj; try { obj = yaml.load(new InputStreamReader(new FileInputStream(wrapperInfoFile),"UTF-8")); } catch (FileNotFoundException | UnsupportedEncodingException ex) { throw new GateRuntimeException("Could not load yaml file "+wrapperInfoFile,ex); } Map map = null; if(obj instanceof Map) { map = (Map)obj; // System.err.println("DEBUG: got map: "+map); } else { throw new GateRuntimeException("Info file has strange format: "+wrapperInfoFile.getAbsolutePath()); } return map; } else { // System.err.println("DEBUG: does not exist, returning empty map"); return new HashMap<>(); } } protected ProcessBase process; // For this engine, this will always be a CorpusRepresentationVolatileDense2JsonStream protected CorpusRepresentationVolatileDense2JsonStream corpusRepresentation; @Override public 
CorpusRepresentation getCorpusRepresentation() { return corpusRepresentation; } @Override protected void initWhenCreating(URL directory, Algorithm algorithm, String parms, FeatureInfo featureInfo, TargetType targetType) { dataDir = Files.fileFromURL(directory); this.featureInfo = featureInfo; corpusRepresentation = new CorpusRepresentationVolatileDense2JsonStream(dataDir, featureInfo); corpusRepresentation.startAdding(); this.featureInfo = featureInfo; // NOTE: we are copying the wrapper code only when starting training, not // here. This allows the user to copy their own code while the PR is running // but creating the corpus has not yet finished. } @Override protected void loadAndSetCorpusRepresentation(URL directory) { //System.err.println("DEBUG EngineDVFileJson: running loadAndSetCorpusRepresentation "+directory); // this does not actually need to load anything but the featureInfo ... // this is needed to convert our instance data to JSON, which is then sent // off to the script or server which is responsible to use any other saved // model info (the model itself, scaling info, vocab info, embeddings etc) dataDir = Files.fileFromURL(directory); featureInfo = FeatureInfo.load(directory); corpusRepresentation = new CorpusRepresentationVolatileDense2JsonStream(dataDir, featureInfo); } protected String getDefaultPythonBin() { // TODO: depending on OS and the result of doing the equivalent of "which python" // provide some useful default here if(getOsType()==OsType.LINUXLIKE) { return "python"; } else { // On windows, use C:\\User\\username\\Miniconda3\python.exe String drive = System.getenv("HOMEDRIVE"); // System.err.println("DEBUG: Windows drive is "+drive); String path = System.getenv("HOMEPATH"); // System.err.println("DEBUG: Windows home path is "+path); return drive + path + "\\Miniconda3\\python.exe"; } } @Override protected void loadModel(URL directory, String parms) { loadAndSetCorpusRepresentation(directory); // the loadModel method should get called 
before all the applyModel // calls, so here we can start the external process with which we communicate // in applyModel // first, check if the wrapper is present. Normally this should be the case, // but sometimes it may be required to update the wrapper on purpose, e.g. // for debugging or for a bugfix. This can be achieved by removing the wrapper // directory and running apply again which will then re-install the wrapper // here. if(!new File(dataDir,"WRAPPER_NAME").exists()) { Utils4Engines.copyWrapper(WRAPPER_NAME, dataDir); } // Start the process ArrayList finalCommand = new ArrayList<>(); String modelBaseName = new File(dataDir, WRAPPER_NAME+".model").getAbsolutePath(); finalCommand.add(getCommandPathApply()); finalCommand.add(modelBaseName); finalCommand.add(corpusRepresentation.getMetaFile().getAbsolutePath()); finalCommand.add(new File(dataDir,WRAPPER_NAME).getAbsolutePath()); if(!parms.trim().isEmpty()) { String[] tmp = parms.trim().split("\\s+",-1); finalCommand.addAll(Arrays.asList(tmp)); } // if we have a shell command prepend that, and if we have shell parms too, include them Map config = getWrapperConfig(); String shellcmd = config.get("shellcmd"); String shellparms = config.get("shellparms"); if(shellcmd != null) { finalCommand.add(0,shellcmd); if(shellparms != null) { String[] sps = shellparms.trim().split("\\s+"); int i=0; for(String sp : sps) { finalCommand.add(++i,sp); } } } //System.err.println("Running: "); //for(int i=0; i"+finalCommand.get(i)+"<"); //} Map env = new HashMap<>(); env.put("WRAPPER_HOME",getWrapperHome()); env.put("GATE_LF_DATA_DIR", dataDir.getAbsolutePath()); String pythonbin = config.get("PYTHON_BIN"); if (pythonbin != null) { env.put("PYTHON_BIN", pythonbin); } else { env.put("PYTHON_BIN", getDefaultPythonBin()); } process = Process4StringStream.create(dataDir,env,finalCommand); } @Override protected void saveCorpusRepresentation(File directory) { } @Override protected void saveModel(File directory) { // this is all 
handled by the script we are running for training, nothing // needed in here. } // NOTE: if we already pass and initialise the dataDir when initialising, we // do not need the file as a parameter here??? Refactor to remove this parm! @Override public void trainModel(File dataDirectory, String instanceType, String parms) { // first of all close the corpus and save the metadata corpusRepresentation.finishAdding(); if (corpusRepresentation.nrInstances()==0) { throw new RuntimeException("No training instances found in the corpus, cannot train!"); } // update the info instance with stuff we should know now info.classLabels = corpusRepresentation.getTargetLabels(); info.nrTargetValues = info.classLabels.size(); info.nrTrainingDimensions = corpusRepresentation.getNrFeatures(); // first of all copy the wrapper files into the data directory if needed Utils4Engines.copyWrapper(WRAPPER_NAME, dataDir); ArrayList finalCommand = new ArrayList<>(); String modelBaseName = new File(dataDir, WRAPPER_NAME+".model").getAbsolutePath(); finalCommand.add(getCommandPathTrain()); finalCommand.add(corpusRepresentation.getMetaFile().getAbsolutePath()); finalCommand.add(modelBaseName); if(!parms.trim().isEmpty()) { String[] tmp = parms.trim().split("\\s+",-1); finalCommand.addAll(Arrays.asList(tmp)); } // if we have a shell command prepend that, and if we have shell parms too, include them Map config = getWrapperConfig(); String shellcmd = config.get("shellcmd"); String shellparms = config.get("shellparms"); if(shellcmd != null) { finalCommand.add(0,shellcmd); if(shellparms != null) { String[] sps = shellparms.trim().split("\\s+"); int i=0; for(String sp : sps) { finalCommand.add(++i,sp); } } } //System.err.println("Running: "); //for(int i=0; i"+finalCommand.get(i)+"<"); //} // Create a fake Model jsut to make LF_Apply... 
happy which checks if this is null model = new Object(); Map env = new HashMap<>(); env.put("WRAPPER_HOME",getWrapperHome()); env.put("GATE_LF_DATA_DIR", dataDir.getAbsolutePath()); String pythonbin = config.get("PYTHON_BIN"); // System.err.println("DEBUG: config python bin: "+pythonbin); if (pythonbin != null) { // System.err.println("DEBUG: python bin from config: "+pythonbin); env.put("PYTHON_BIN", pythonbin); } else { env.put("PYTHON_BIN", getDefaultPythonBin()); // System.err.println("DEBUG: python bin from default: "+getDefaultPythonBin()); } process = ProcessSimple.create(dataDir,env,finalCommand); process.waitFor(); // we also need to save the updated info file info.nrTrainingInstances = corpusRepresentation.nrInstances(); info.engineClass = this.getClass().getName(); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); info.modelWhenTrained = sdf.format(new Date()); info.algorithmParameters = parms; info.save(dataDir); featureInfo.save(dataDir); } @Override public EvaluationResult evaluate(String algorithmParameters, EvaluationMethod evaluationMethod, int numberOfFolds, double trainingFraction, int numberOfRepeats) { throw new UnsupportedOperationException("Not supported (yet?)"); } @Override @SuppressWarnings("unchecked") public List applyModel(AnnotationSet instancesAS, AnnotationSet inputAS, AnnotationSet sequenceAS, String parms) { //System.err.println("DEBUG: running applyModel"); corpusRepresentation.stopGrowth(); ObjectMapper mapper = new ObjectMapper(); List modelapps = new ArrayList<>(); if(sequenceAS==null) { // non-sequences List instanceAnnotations = instancesAS.inDocumentOrder(); // We have two choices here: send each instace separately or send them all // together in one go. For now we send each instance separately. // TODO: figure out which mode is better/faster!! 
for (Annotation instanceAnnotation : instanceAnnotations) { InstanceRepresentation inst = corpusRepresentation.unlabeledAnnotation2Instance(instanceAnnotation, inputAS, null); String json = corpusRepresentation.internal2Json(inst,true); //System.err.println("DEBUG - sending json: "+json); process.writeObject(json); //System.err.println("DEBUG - before reading response"); String returnJson = (String)process.readObject(); //System.err.println("DEBUG - received return json: "+returnJson); Object obj = null; try { obj = mapper.readValue(returnJson,Map.class); } catch (IOException ex) { throw new GateRuntimeException("Could not interpret response json: "+returnJson,ex); } // we always expect a map as a response! MapretMap = (Map)obj; // we always expect these keys, having these types! String status = (String)retMap.get("status"); if(status==null) { status = ""; } if(!"ok".equals(status.toLowerCase())) { // try to get the exception from json String exc = (String)retMap.get("error"); throw new GateRuntimeException("Something went wrong applying the model, got status: "+status+ " error is: "+exc); } // Here we do NOT have a sequence tagging problem, so for now, the only // thing we should get is the label and the scores, which should be // a list of nclasses values. In addition we get the labels array so // we know which score belongs to which label // TODO: we should retrieve the label array only once through a special command! 
ModelApplication ma = null; if(info.task.equals(AlgorithmKind.REGRESSOR.toString())) { throw new GateRuntimeException("Not implemented yet: task REGRESSION"); // NOTE: this is not actually supported yet, we do not support REGRESSION /* Double output = (Double)retMap.get("output"); if(output==null) { throw new GateRuntimeException("Did not get a regression result from model"); } // NOTE: eventually we may get variance or confidence interval boundaries here: "ci_upper"/"ci_lower"/"ci_p" // Double variance = (Double)retMap.get("variance"); ma = new ModelApplication(instanceAnnotation,output); */ } else if(info.task.equals(AlgorithmKind.CLUSTERING.toString())) { throw new GateRuntimeException("Not implemented yet: task CLUSTERING"); } else if(info.task.equals(AlgorithmKind.CLASSIFIER.toString())) { String output = (String)retMap.get("output"); if(output==null) { throw new GateRuntimeException("Did not get a classification result from model"); } Double conf = (Double)retMap.get("conf"); List dist = (List)retMap.get("dist"); List labels = (List)retMap.get("labels"); ma = new ModelApplication(instanceAnnotation,output, conf, labels, dist); } else if(info.task.equals(AlgorithmKind.SEQUENCE_TAGGER.toString())) { // error: if no sequence AS is specified we should not get this! throw new GateRuntimeException("Model application not possible: no sequenceAS but model expects it!"); } modelapps.add(ma); } } else { // sequences // Again, we could send the data for all sequences in one go but for // now we just send each sequence separately. // TODO: figure out what is better! // DONE: we generally send instances separately. This is mainly for the non-sequence // case where we may need to inject the prediction of the previous instance, which is // really only properly possible if we handle each instance separately. 
for(Annotation sequenceAnn : sequenceAS) { int seq_id = sequenceAnn.getId(); List instanceAnnotations = gate.Utils.getContainedAnnotations( instancesAS, sequenceAnn).inDocumentOrder(); List insts4seq = corpusRepresentation.unlabeledInstancesForSequence(instancesAS, sequenceAnn, inputAS); String json = corpusRepresentation.internal2Json(insts4seq,true); process.writeObject(json); // TODO: need to decide on the format of the response. Probably best to // expect a map with both data and metadata String returnJson = (String)process.readObject(); Object obj = null; try { obj = mapper.readValue(returnJson,Map.class); } catch (IOException ex) { throw new GateRuntimeException("Could not interpret response json: ",ex); } // we always expect a map as a response! MapretMap = (Map)obj; // we always expect these keys, having these types! String status = (String)retMap.get("status"); if(status==null) { status = ""; } if(!"ok".equals(status.toLowerCase())) { String exc = (String)retMap.get("error"); throw new GateRuntimeException("Something went wrong applying the model, got status: "+status+ " error is: "+exc); } // we expect output to be a list of string and if confidence exists, a list of double List output = (List)retMap.get("output"); if(output==null) { throw new GateRuntimeException("Did not get a classification result from model"); } // note: the confidence actually may be null (missing in the map) meaning we do not have it Listconfidence = (List)retMap.get("conf"); Listlabels = (List)retMap.get("labels"); List>dist = (List>)retMap.get("dist"); ModelApplication ma; if(info.task.equals(AlgorithmKind.SEQUENCE_TAGGER.toString())) { // we need to get back as many labels as there are instances in the insts4seq list int i = 0; for(Annotation ann : instanceAnnotations) { // expects class, confidence, sequence span id Double conf = null; if(confidence!=null) { conf=confidence.get(i); } if(dist!=null && labels != null) { if(dist.get(i) != null) { ma = new ModelApplication(ann, 
output.get(i), conf, labels, dist.get(i), seq_id); } else { ma = new ModelApplication(ann, output.get(i), conf, seq_id); } } else { ma = new ModelApplication(ann, output.get(i), conf, seq_id); } modelapps.add(ma); i++; } } else { // error: sequence AS is specified but this is not a Sequence tagger model throw new GateRuntimeException("Model application not possible: sequenceAS specified but model does not expect it!"); } } corpusRepresentation.startGrowth(); } // * use the predictions to create the return list // * TODO: how do we terminate the process again? return modelapps; } @Override public void initializeAlgorithm(Algorithm algorithm, String parms) { // TODO } protected void updateInfo() { // TODO: } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy