// Class: gate.plugin.learningframework.engines.EngineDVFileJson
// Artifact: learningframework
/*
* Copyright (c) The University of Sheffield.
*
* This file is free software, licensed under the
* GNU Library General Public License, Version 2.1, June 1991.
* See the file LICENSE.txt that comes with this software.
*
*/
package gate.plugin.learningframework.engines;
import com.fasterxml.jackson.databind.ObjectMapper;
import gate.Annotation;
import gate.AnnotationSet;
import gate.lib.interaction.process.Process4StringStream;
import gate.lib.interaction.process.ProcessBase;
import gate.lib.interaction.process.ProcessSimple;
import gate.plugin.learningframework.EvaluationMethod;
import gate.plugin.learningframework.ModelApplication;
import gate.plugin.learningframework.data.CorpusRepresentation;
import gate.plugin.learningframework.data.CorpusRepresentationVolatileDense2JsonStream;
import gate.plugin.learningframework.data.InstanceRepresentation;
import gate.plugin.learningframework.features.FeatureInfo;
import gate.plugin.learningframework.features.TargetType;
import gate.util.Files;
import gate.util.GateRuntimeException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.yaml.snakeyaml.Yaml;
/**
* Common base class for all Engines which are dense, volatile and write JSON to a file.
*
*
* See Wiki
*
* @author Johann Petrak
*/
public abstract class EngineDVFileJson extends EngineDV {
// NOTEs about how to find python:
// On linux, python is usually on the path, but we cannot be sure if it is python 3
// In some cases, python3 is the version 3 command.
// On Windows (10) if installing using anaconda3, the installer recommends NOT
// putting python on the path and to install for the user only by default.
// !! In that case, it gets installed by default into C:\\Users\\username\\Anaconda3 which
// contains python.exe, but things like ipython.exe are in the Scripts subdirectory.
// If installing for all, it gets installed by default into C:\\ProgramData\\Anaconda3
// If installing Python from python.org we have the options:
// 1) Windows executable installer (python-3.6.5-amd64.exe):
// this one recommends to install for all users but does not add to PATH by default
// Also recommends to disable the path length limit
// Installing for all: This gets installed into C:\\Users\\username\\AppData\\Local\\Programs\\Python\\Python36
// which contains python, Scripts contains pip
// 2) Web-based installer (python-3.6.5-amd64-webinstall.exe): seems to use the same location
// (could not test after already installed using the other installer)
// So in order to find python we do the following:
// 1) check if there is a config file in the data dir, use the pythonhome variable from there
// 2) check if the PYTHON_BIN environment variable is set, use it as full path to the executable
// 3) try to find it on the executable path (this is not easy in Java, instead try to run python/python3
// or python.exe with parameter "-V" to get version. May use Runtime.getRuntime().exec("...") in a
// try catch for that or own interaction library to get back the output.
// 4) if on Windows, check one of the two paths above
// 5) if on Linux check in decreasing order of importance: /usr/bin/python3 or /usr/bin/python
// Wrapper name: this is set by the actual implementing engine class and
// will influence the file name of scripts, config files etc specific to that
// wrapper.
// In this class it is used as the name of the wrapper sub-directory of dataDir,
// as the base name of the optional WRAPPER_NAME.yaml settings file and as the
// base name of the model path (WRAPPER_NAME + ".model") passed to the scripts.
protected String WRAPPER_NAME = "WRAPPER_NAME MUST BE OVERRIDEN BY IMPLEMENTING SUBCLASS";
// NOTE(review): MODEL_BASENAME is not read anywhere in this class; presumably
// it is used by the implementing subclasses - confirm.
protected String MODEL_BASENAME = "MODEL_BASENAME MUST BE OVERRIDEN BY IMPLEMENTING SUBCLASS";
// all implementing subclasses will follow these name conventions
// Base names (without extension) of the training and application scripts in the
// wrapper directory; the OS specific shell extension gets appended to them.
protected String COMMAND_BASE_TRAIN = "train";
protected String COMMAND_BASE_APPLY = "apply";
// Assigned from the directory URL in initWhenCreating and loadAndSetCorpusRepresentation.
protected File dataDir; // the model/data directory as specified when creating the engine but as a File
/**
 * Get the location of the wrapper directory, i.e. the directory named
 * WRAPPER_NAME directly below the data directory.
 *
 * @return absolute path of the wrapper directory
 */
protected String getWrapperHome() {
  return new File(dataDir, WRAPPER_NAME).getAbsolutePath();
}
/**
 * Build the path of the script that gets run for training.
 *
 * The script lives in the wrapper directory and is named COMMAND_BASE_TRAIN
 * followed by the shell extension for the current operating system.
 *
 * @return absolute path of the training command
 */
protected String getCommandPathTrain() {
  final File wrapperDir = new File(dataDir, WRAPPER_NAME);
  final String scriptName = COMMAND_BASE_TRAIN + getShellExtension();
  return new File(wrapperDir, scriptName).getAbsolutePath();
}
/**
 * Build the path of the script that gets run for applying a model.
 *
 * The script lives in the wrapper directory and is named COMMAND_BASE_APPLY
 * followed by the shell extension for the current operating system.
 *
 * @return absolute path of the application command
 */
protected String getCommandPathApply() {
  final File wrapperDir = new File(dataDir, WRAPPER_NAME);
  final String scriptName = COMMAND_BASE_APPLY + getShellExtension();
  return new File(wrapperDir, scriptName).getAbsolutePath();
}
/**
 * Get the script file extension to use on the current operating system.
 *
 * @return ".sh" for a Linux-like OS, ".cmd" otherwise; the dot is included
 */
protected String getShellExtension() {
  if (getOsType() == OsType.LINUXLIKE) {
    return ".sh";
  }
  return ".cmd";
}
/**
 * Classify the operating system by looking at the "file.separator" system
 * property: a forward slash means LINUXLIKE, a backslash means WINDOWSLIKE.
 *
 * @return the detected OS type
 * @throws GateRuntimeException if the separator is neither of the two
 */
protected OsType getOsType() {
  final String separator = System.getProperty("file.separator");
  switch (separator) {
    case "/":
      return OsType.LINUXLIKE;
    case "\\":
      return OsType.WINDOWSLIKE;
    default:
      throw new GateRuntimeException("It appears this OS is not supported");
  }
}
/**
* Known OS types.
*
* These are the two families the engine distinguishes when it picks the
* script extension and the default python location.
*/
public enum OsType {
// reported by getOsType() when the file separator is a backslash
WINDOWSLIKE,
// reported by getOsType() when the file separator is a forward slash
LINUXLIKE
}
/**
 * Read the WRAPPER_NAME.yaml settings file from the data directory, if it exists.
 *
 * If there is no such file, WRAPPER_NAME.yaml.txt is tried instead, because
 * Windows hides a trailing ".txt" when a user creates a "text file" with a
 * yaml extension. The file is read as UTF-8 and must contain a YAML mapping.
 *
 * @return the map of settings, or a new empty map if neither file exists
 * @throws GateRuntimeException if the file cannot be read or its top-level
 *         content is not a map
 */
@SuppressWarnings("unchecked")
public Map<String, String> getWrapperConfig() {
  File wrapperInfoFile = new File(dataDir, WRAPPER_NAME + ".yaml");
  if (!wrapperInfoFile.exists()) {
    wrapperInfoFile = new File(dataDir, WRAPPER_NAME + ".yaml.txt");
  }
  if (!wrapperInfoFile.exists()) {
    return new HashMap<>();
  }
  Object obj;
  // try-with-resources so the file handle is released even if parsing fails
  try (FileInputStream in = new FileInputStream(wrapperInfoFile);
       InputStreamReader reader = new InputStreamReader(in, "UTF-8")) {
    // NOTE(review): Yaml.load() on an unrestricted Yaml instance can construct
    // arbitrary types from tags; acceptable only as long as the data directory
    // is trusted.
    obj = new Yaml().load(reader);
  } catch (IOException ex) {
    throw new GateRuntimeException("Could not load yaml file " + wrapperInfoFile, ex);
  }
  if (!(obj instanceof Map)) {
    throw new GateRuntimeException("Info file has strange format: " + wrapperInfoFile.getAbsolutePath());
  }
  // The values are handed out as strings because that is how the settings
  // (shellcmd, shellparms, PYTHON_BIN) are consumed in this class.
  return (Map<String, String>) obj;
}
// Handle of the external wrapper process: assigned in loadModel (application
// script) and in trainModel (training script); applyModel writes JSON to it
// and reads the JSON response back from it.
protected ProcessBase process;
// For this engine, this will always be a CorpusRepresentationVolatileDense2JsonStream
protected CorpusRepresentationVolatileDense2JsonStream corpusRepresentation;
/**
 * Get the corpus representation used by this engine.
 *
 * @return the dense JSON stream corpus representation, as set when the engine
 *         was created or loaded
 */
@Override
public CorpusRepresentation getCorpusRepresentation() {
  return this.corpusRepresentation;
}
/**
 * Initialise a newly created engine: remember the data directory and the
 * feature info, create the corpus representation and start adding to it.
 *
 * The algorithm, parms and targetType arguments are not used here.
 *
 * @param directory URL of the model/data directory
 * @param algorithm not used by this implementation
 * @param parms not used by this implementation
 * @param featureInfo the feature info to use for the corpus representation
 * @param targetType not used by this implementation
 */
@Override
protected void initWhenCreating(URL directory, Algorithm algorithm,
        String parms, FeatureInfo featureInfo, TargetType targetType) {
  dataDir = Files.fileFromURL(directory);
  this.featureInfo = featureInfo;
  corpusRepresentation = new CorpusRepresentationVolatileDense2JsonStream(dataDir, featureInfo);
  corpusRepresentation.startAdding();
  // NOTE: we are copying the wrapper code only when starting training, not
  // here. This allows the user to copy their own code while the PR is running
  // but creating the corpus has not yet finished.
}
/**
 * Set up the corpus representation for an already trained model.
 *
 * Nothing but the feature info is actually loaded: it is what is needed to
 * convert instance data to JSON. The JSON is then sent to the script or
 * server, which is responsible for using any other saved model info (the
 * model itself, scaling info, vocab info, embeddings etc).
 *
 * @param directory URL of the model/data directory
 */
@Override
protected void loadAndSetCorpusRepresentation(URL directory) {
  final File modelDir = Files.fileFromURL(directory);
  this.dataDir = modelDir;
  this.featureInfo = FeatureInfo.load(directory);
  this.corpusRepresentation =
          new CorpusRepresentationVolatileDense2JsonStream(modelDir, this.featureInfo);
}
/**
 * Get the python executable to use when the wrapper config does not set PYTHON_BIN.
 *
 * On a Linux-like OS this is just "python", to be found on the path. On
 * Windows it is python.exe in the Miniconda3 directory below the user home,
 * where the home is taken from the HOMEDRIVE and HOMEPATH environment
 * variables, or from the "user.home" system property if either is not set.
 *
 * @return the default python command or path
 */
protected String getDefaultPythonBin() {
  // TODO: depending on OS and the result of doing the equivalent of "which python"
  // provide some useful default here
  if (getOsType() == OsType.LINUXLIKE) {
    return "python";
  }
  String drive = System.getenv("HOMEDRIVE");
  String path = System.getenv("HOMEPATH");
  // Without this guard a missing variable would be concatenated as the text "null".
  String home = (drive != null && path != null)
          ? drive + path
          : System.getProperty("user.home");
  return home + "\\Miniconda3\\python.exe";
}
/**
 * Prepare the engine for application and start the external apply process.
 *
 * The command consists of the apply script, the model base path
 * (WRAPPER_NAME + ".model" in the data directory), the corpus meta file, the
 * wrapper directory and then the whitespace separated tokens of parms. If
 * the wrapper config defines "shellcmd" (and optionally "shellparms"), these
 * are put in front of the command. The process gets the environment
 * variables WRAPPER_HOME, GATE_LF_DATA_DIR and PYTHON_BIN.
 *
 * @param directory URL of the model/data directory
 * @param parms additional parameters to pass on to the apply script, may be empty
 */
@Override
protected void loadModel(URL directory, String parms) {
  loadAndSetCorpusRepresentation(directory);
  // the loadModel method should get called before all the applyModel
  // calls, so here we can start the external process with which we communicate
  // in applyModel

  // first, check if the wrapper is present. Normally this should be the case,
  // but sometimes it may be required to update the wrapper on purpose, e.g.
  // for debugging or for a bugfix. This can be achieved by removing the wrapper
  // directory and running apply again which will then re-install the wrapper
  // here.
  // The check has to use the WRAPPER_NAME field: testing for a directory that
  // is literally called "WRAPPER_NAME" would never find the wrapper.
  if (!new File(dataDir, WRAPPER_NAME).exists()) {
    Utils4Engines.copyWrapper(WRAPPER_NAME, dataDir);
  }

  // Assemble the command line for the apply script
  List<String> finalCommand = new ArrayList<>();
  String modelBaseName = new File(dataDir, WRAPPER_NAME + ".model").getAbsolutePath();
  finalCommand.add(getCommandPathApply());
  finalCommand.add(modelBaseName);
  finalCommand.add(corpusRepresentation.getMetaFile().getAbsolutePath());
  finalCommand.add(new File(dataDir, WRAPPER_NAME).getAbsolutePath());
  if (parms != null && !parms.trim().isEmpty()) {
    finalCommand.addAll(Arrays.asList(parms.trim().split("\\s+", -1)));
  }

  // if we have a shell command prepend that, and if we have shell parms too, include them
  Map<String, String> config = getWrapperConfig();
  String shellcmd = config.get("shellcmd");
  String shellparms = config.get("shellparms");
  if (shellcmd != null) {
    List<String> prefix = new ArrayList<>();
    prefix.add(shellcmd);
    // a blank shellparms setting must not turn into an empty argument
    if (shellparms != null && !shellparms.trim().isEmpty()) {
      prefix.addAll(Arrays.asList(shellparms.trim().split("\\s+")));
    }
    finalCommand.addAll(0, prefix);
  }

  // Environment for the wrapper script
  Map<String, String> env = new HashMap<>();
  env.put("WRAPPER_HOME", getWrapperHome());
  env.put("GATE_LF_DATA_DIR", dataDir.getAbsolutePath());
  String pythonbin = config.get("PYTHON_BIN");
  env.put("PYTHON_BIN", pythonbin != null ? pythonbin : getDefaultPythonBin());

  process = Process4StringStream.create(dataDir, env, finalCommand);
}
/**
* Does nothing in this class: no corpus representation data is written here.
* NOTE(review): presumably the JSON stream corpus representation has already
* written its data by the time this is called - confirm.
*
* @param directory not used
*/
@Override
protected void saveCorpusRepresentation(File directory) {
}
/**
* Does nothing in this class: saving the model is left to the training script.
*
* @param directory not used
*/
@Override
protected void saveModel(File directory) {
// this is all handled by the script we are running for training, nothing
// needed in here.
}
// NOTE: if we already pass and initialise the dataDir when initialising, we
// do not need the file as a parameter here??? Refactor to remove this parm!
/**
 * Train a model by running the external training script and waiting for it.
 *
 * The corpus is closed first and the info instance is updated from it. Then
 * the wrapper files are copied into the data directory and the training
 * command is run: the train script, the corpus meta file, the model base
 * path (WRAPPER_NAME + ".model" in the data directory) and then the
 * whitespace separated tokens of parms. If the wrapper config defines
 * "shellcmd" (and optionally "shellparms"), these are put in front of the
 * command. The process gets the environment variables WRAPPER_HOME,
 * GATE_LF_DATA_DIR and PYTHON_BIN. Finally the info and feature info are
 * saved to the data directory.
 *
 * @param dataDirectory not used, the data directory set at initialisation is used
 * @param instanceType not used by this implementation
 * @param parms additional parameters to pass on to the training script, may be empty
 * @throws RuntimeException if the corpus does not contain any training instances
 */
@Override
public void trainModel(File dataDirectory, String instanceType, String parms) {
  // first of all close the corpus and save the metadata
  corpusRepresentation.finishAdding();
  if (corpusRepresentation.nrInstances() == 0) {
    throw new RuntimeException("No training instances found in the corpus, cannot train!");
  }
  // update the info instance with stuff we should know now
  info.classLabels = corpusRepresentation.getTargetLabels();
  info.nrTargetValues = info.classLabels.size();
  info.nrTrainingDimensions = corpusRepresentation.getNrFeatures();

  // first of all copy the wrapper files into the data directory if needed
  Utils4Engines.copyWrapper(WRAPPER_NAME, dataDir);

  // Assemble the command line for the training script
  List<String> finalCommand = new ArrayList<>();
  String modelBaseName = new File(dataDir, WRAPPER_NAME + ".model").getAbsolutePath();
  finalCommand.add(getCommandPathTrain());
  finalCommand.add(corpusRepresentation.getMetaFile().getAbsolutePath());
  finalCommand.add(modelBaseName);
  if (parms != null && !parms.trim().isEmpty()) {
    finalCommand.addAll(Arrays.asList(parms.trim().split("\\s+", -1)));
  }

  // if we have a shell command prepend that, and if we have shell parms too, include them
  Map<String, String> config = getWrapperConfig();
  String shellcmd = config.get("shellcmd");
  String shellparms = config.get("shellparms");
  if (shellcmd != null) {
    List<String> prefix = new ArrayList<>();
    prefix.add(shellcmd);
    // a blank shellparms setting must not turn into an empty argument
    if (shellparms != null && !shellparms.trim().isEmpty()) {
      prefix.addAll(Arrays.asList(shellparms.trim().split("\\s+")));
    }
    finalCommand.addAll(0, prefix);
  }

  // Create a fake Model just to make LF_Apply... happy which checks if this is null
  model = new Object();

  // Environment for the wrapper script
  Map<String, String> env = new HashMap<>();
  env.put("WRAPPER_HOME", getWrapperHome());
  env.put("GATE_LF_DATA_DIR", dataDir.getAbsolutePath());
  String pythonbin = config.get("PYTHON_BIN");
  env.put("PYTHON_BIN", pythonbin != null ? pythonbin : getDefaultPythonBin());

  process = ProcessSimple.create(dataDir, env, finalCommand);
  // NOTE(review): the outcome of the training process is not inspected here,
  // so the info below is saved even if the script failed - confirm intended.
  process.waitFor();

  // we also need to save the updated info file
  info.nrTrainingInstances = corpusRepresentation.nrInstances();
  info.engineClass = this.getClass().getName();
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  info.modelWhenTrained = sdf.format(new Date());
  info.algorithmParameters = parms;
  info.save(dataDir);
  featureInfo.save(dataDir);
}
/**
 * Evaluation is not available for this kind of engine.
 *
 * @param algorithmParameters ignored
 * @param evaluationMethod ignored
 * @param numberOfFolds ignored
 * @param trainingFraction ignored
 * @param numberOfRepeats ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public EvaluationResult evaluate(
        String algorithmParameters,
        EvaluationMethod evaluationMethod,
        int numberOfFolds,
        double trainingFraction,
        int numberOfRepeats) {
  throw new UnsupportedOperationException("Not supported (yet?)");
}
@Override
@SuppressWarnings("unchecked")
public List applyModel(AnnotationSet instancesAS, AnnotationSet inputAS, AnnotationSet sequenceAS, String parms) {
//System.err.println("DEBUG: running applyModel");
corpusRepresentation.stopGrowth();
ObjectMapper mapper = new ObjectMapper();
List modelapps = new ArrayList<>();
if(sequenceAS==null) {
// non-sequences
List instanceAnnotations = instancesAS.inDocumentOrder();
// We have two choices here: send each instace separately or send them all
// together in one go. For now we send each instance separately.
// TODO: figure out which mode is better/faster!!
for (Annotation instanceAnnotation : instanceAnnotations) {
InstanceRepresentation inst =
corpusRepresentation.unlabeledAnnotation2Instance(instanceAnnotation, inputAS, null);
String json = corpusRepresentation.internal2Json(inst,true);
//System.err.println("DEBUG - sending json: "+json);
process.writeObject(json);
//System.err.println("DEBUG - before reading response");
String returnJson = (String)process.readObject();
//System.err.println("DEBUG - received return json: "+returnJson);
Object obj = null;
try {
obj = mapper.readValue(returnJson,Map.class);
} catch (IOException ex) {
throw new GateRuntimeException("Could not interpret response json: "+returnJson,ex);
}
// we always expect a map as a response!
MapretMap = (Map)obj;
// we always expect these keys, having these types!
String status = (String)retMap.get("status");
if(status==null) {
status = "";
}
if(!"ok".equals(status.toLowerCase())) {
// try to get the exception from json
String exc = (String)retMap.get("error");
throw new GateRuntimeException("Something went wrong applying the model, got status: "+status+
" error is: "+exc);
}
// Here we do NOT have a sequence tagging problem, so for now, the only
// thing we should get is the label and the scores, which should be
// a list of nclasses values. In addition we get the labels array so
// we know which score belongs to which label
// TODO: we should retrieve the label array only once through a special command!
ModelApplication ma = null;
if(info.task.equals(AlgorithmKind.REGRESSOR.toString())) {
throw new GateRuntimeException("Not implemented yet: task REGRESSION");
// NOTE: this is not actually supported yet, we do not support REGRESSION
/*
Double output = (Double)retMap.get("output");
if(output==null) {
throw new GateRuntimeException("Did not get a regression result from model");
}
// NOTE: eventually we may get variance or confidence interval boundaries here: "ci_upper"/"ci_lower"/"ci_p"
// Double variance = (Double)retMap.get("variance");
ma = new ModelApplication(instanceAnnotation,output);
*/
} else if(info.task.equals(AlgorithmKind.CLUSTERING.toString())) {
throw new GateRuntimeException("Not implemented yet: task CLUSTERING");
} else if(info.task.equals(AlgorithmKind.CLASSIFIER.toString())) {
String output = (String)retMap.get("output");
if(output==null) {
throw new GateRuntimeException("Did not get a classification result from model");
}
Double conf = (Double)retMap.get("conf");
List dist = (List)retMap.get("dist");
List labels = (List)retMap.get("labels");
ma = new ModelApplication(instanceAnnotation,output, conf, labels, dist);
} else if(info.task.equals(AlgorithmKind.SEQUENCE_TAGGER.toString())) {
// error: if no sequence AS is specified we should not get this!
throw new GateRuntimeException("Model application not possible: no sequenceAS but model expects it!");
}
modelapps.add(ma);
}
} else {
// sequences
// Again, we could send the data for all sequences in one go but for
// now we just send each sequence separately.
// TODO: figure out what is better!
// DONE: we generally send instances separately. This is mainly for the non-sequence
// case where we may need to inject the prediction of the previous instance, which is
// really only properly possible if we handle each instance separately.
for(Annotation sequenceAnn : sequenceAS) {
int seq_id = sequenceAnn.getId();
List instanceAnnotations = gate.Utils.getContainedAnnotations(
instancesAS, sequenceAnn).inDocumentOrder();
List insts4seq
= corpusRepresentation.unlabeledInstancesForSequence(instancesAS, sequenceAnn, inputAS);
String json = corpusRepresentation.internal2Json(insts4seq,true);
process.writeObject(json);
// TODO: need to decide on the format of the response. Probably best to
// expect a map with both data and metadata
String returnJson = (String)process.readObject();
Object obj = null;
try {
obj = mapper.readValue(returnJson,Map.class);
} catch (IOException ex) {
throw new GateRuntimeException("Could not interpret response json: ",ex);
}
// we always expect a map as a response!
MapretMap = (Map)obj;
// we always expect these keys, having these types!
String status = (String)retMap.get("status");
if(status==null) {
status = "";
}
if(!"ok".equals(status.toLowerCase())) {
String exc = (String)retMap.get("error");
throw new GateRuntimeException("Something went wrong applying the model, got status: "+status+
" error is: "+exc);
}
// we expect output to be a list of string and if confidence exists, a list of double
List output = (List)retMap.get("output");
if(output==null) {
throw new GateRuntimeException("Did not get a classification result from model");
}
// note: the confidence actually may be null (missing in the map) meaning we do not have it
Listconfidence = (List)retMap.get("conf");
Listlabels = (List)retMap.get("labels");
List>dist = (List>)retMap.get("dist");
ModelApplication ma;
if(info.task.equals(AlgorithmKind.SEQUENCE_TAGGER.toString())) {
// we need to get back as many labels as there are instances in the insts4seq list
int i = 0;
for(Annotation ann : instanceAnnotations) {
// expects class, confidence, sequence span id
Double conf = null;
if(confidence!=null) {
conf=confidence.get(i);
}
if(dist!=null && labels != null) {
if(dist.get(i) != null) {
ma = new ModelApplication(ann, output.get(i), conf, labels, dist.get(i), seq_id);
} else {
ma = new ModelApplication(ann, output.get(i), conf, seq_id);
}
} else {
ma = new ModelApplication(ann, output.get(i), conf, seq_id);
}
modelapps.add(ma);
i++;
}
} else {
// error: sequence AS is specified but this is not a Sequence tagger model
throw new GateRuntimeException("Model application not possible: sequenceAS specified but model does not expect it!");
}
}
corpusRepresentation.startGrowth();
}
// * use the predictions to create the return list
// * TODO: how do we terminate the process again?
return modelapps;
}
/**
* Not implemented yet: currently does nothing with the given arguments.
*
* @param algorithm not used
* @param parms not used
*/
@Override
public void initializeAlgorithm(Algorithm algorithm, String parms) {
// TODO
}
/**
* Not implemented yet: currently does nothing.
*/
protected void updateInfo() {
// TODO:
}
}