// gate.plugin.learningframework.ModelApplication
// (from the "learningframework" artifact; available via Maven / Gradle / Ivy)
//
// A GATE plugin that provides many different machine learning
// algorithms for a wide range of NLP-related machine learning tasks like
// text classification, tagging, or chunking.
/*
* Copyright (c) 2015-2016 The University Of Sheffield.
*
* This file is part of gateplugin-LearningFramework
* (see https://github.com/GateNLP/gateplugin-LearningFramework).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
package gate.plugin.learningframework;
import java.util.List;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.plugin.learningframework.features.SeqEncoder;
import gate.util.GateRuntimeException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
public class ModelApplication {
private Annotation instance;
private String classAssigned;
private Double targetAssigned = null;
private boolean numericTarget = false;
private Double confidenceScore;
private Integer seqSpanID;
private List classList;
private List confidenceList;
public ModelApplication(Annotation instance, String classAssigned,
Double confidenceScore) {
this.instance = instance;
this.classAssigned = classAssigned;
this.confidenceScore = confidenceScore;
}
public ModelApplication(Annotation instance, double targetAssigned) {
this.instance = instance;
this.targetAssigned = targetAssigned;
numericTarget = true;
}
public ModelApplication(Annotation instance, String classAssigned,
Double confidenceScore, List classes, List confidences) {
this.instance = instance;
this.classAssigned = classAssigned;
this.confidenceScore = confidenceScore;
this.classList = classes;
this.confidenceList = confidences;
}
public ModelApplication(Annotation instance, String classAssigned,
Double confidenceScore, Integer sequenceSpanID) {
this.instance = instance;
this.classAssigned = classAssigned;
this.confidenceScore = confidenceScore;
this.seqSpanID = sequenceSpanID;
}
public ModelApplication(Annotation instance, String classAssigned,
Double confidenceScore, List classes, List confidences,
Integer sequenceSpanID) {
this.instance = instance;
this.classAssigned = classAssigned;
this.confidenceScore = confidenceScore;
this.classList = classes;
this.confidenceList = confidences;
this.seqSpanID = sequenceSpanID;
}
public Annotation getInstance() {
return instance;
}
public void setInstance(Annotation instance) {
this.instance = instance;
}
public String getClassAssigned() {
return classAssigned;
}
public void setClassAssigned(String classAssigned) {
this.classAssigned = classAssigned;
}
public Double getConfidenceScore() {
return confidenceScore;
}
public void setConfidenceScore(Double confidenceScore) {
this.confidenceScore = confidenceScore;
}
public Integer getSeqSpanID() {
return seqSpanID;
}
public List getClassList() {
return classList;
}
public List getConfidenceList() {
return confidenceList;
}
public void setSeqSpanID(Integer sequenceSpanID) {
this.seqSpanID = sequenceSpanID;
}
public boolean isNumericTarget() {
return numericTarget;
}
public Double getNumericTargetAssigned() {
return targetAssigned;
}
/**
* Utility function to apply a list of ModelApplication to a document.
* This creates classification/regression output from a list of ModelApplication objects.
If outputAS is null, then the original instance annotations are modified and receive the
target features and additional LearningFramework-specific features (confidence etc.).
If outputAS is specified, new annotations which are a copy of the instance annotations
are created in the outputAS and the target features are stored in those copies.
*
* If the minConfidence parameter is not null or Double.NaN and the actual classification
* for the instance has a non-null, non-NaN confidence, then nothing is done if
* the confidence of the classification is not at least the minConfidence (no target feature
* is set or updated in that case, or no copy of the instance annotation with the target feature
* set is created)
*
* NOTE: if the original entity confidence is missing, the target will still be assigned
* a default confidence score of 0.0 to make that value always numeric.
*
* @param doc document where the classifications are carried out (only needed for logging)
* @param gcs the list of classifications
* @param targetFeature the name of the target feature
* @param outputAS the annotation set where to place copies or null for updating existing annotations
* @param minConfidence the minimum confidence score a classification must have
*/
public static void applyClassification(Document doc,
List gcs,
String targetFeature,
AnnotationSet outputAS,
Double minConfidence) {
for(ModelApplication gc : gcs) {
Double conf = gc.getConfidenceScore();
if (minConfidence != null &&
conf != null &&
!Double.isNaN(conf) &&
!Double.isNaN(minConfidence) &&
conf < minConfidence) {
//Skip it
continue;
}
if(conf==null) {
conf=0.0;
}
FeatureMap fm;
if(outputAS == null) {
fm = gc.getInstance().getFeatures();
} else {
fm = gate.Utils.toFeatureMap(gc.getInstance().getFeatures());
}
if(gc.isNumericTarget()) {
fm.put(targetFeature, gc.getNumericTargetAssigned());
} else {
fm.put(targetFeature, gc.getClassAssigned());
fm.put(Globals.outputClassFeature, gc.getClassAssigned());
fm.put(Globals.outputProbFeature, conf);
if (gc.getClassList() != null && gc.getConfidenceList() != null) {
fm.put(Globals.outputClassFeature + "_list", gc.getClassList());
fm.put(Globals.outputProbFeature + "_list", gc.getConfidenceList());
}
if (gc.getSeqSpanID() != null) {
fm.put(Globals.outputSequenceSpanIDFeature, gc.getSeqSpanID());
}
}
if(outputAS != null) {
int id = gate.Utils.addAnn(outputAS, gc.getInstance(), gc.getInstance().getType(), fm);
Annotation ann = outputAS.get(id);
// System.err.println("DEBUG adding ann "+ann+", target feature "+targetFeature+" should be "+gc.getClassAssigned());
}
} // for
}
/**
* From an annotation set with e.g. BIO class annotations on the instances,
* create an output annotation set with the actual sequence annotations.
*
*
* TODO/NOTE: for some reasons this passes on intputAS but we do not use it.
* TODO/NOTE: we should really also get the sequence annotation and limit
* resolving BIO to within each sequence.
*
* NOTE: originally, this was just using B/I/O, we now changed to using Type|B
* Type|I and O. However, this should really get moved to the corresponding SeqEncode subclass.
*
* @param unused currently unused, API will change
* @param instanceAS instance annotation set
* @param outputAS output annotation set
* @param outputAnnType output annotation type
* @param minConfidence minimum confidence for making prediction, if null always
* @param seqEncoder sequence encoder instance
*/
public static void addSurroundingAnnotations(
AnnotationSet unused,
AnnotationSet instanceAS,
AnnotationSet outputAS,
String outputAnnType,
Double minConfidence,
SeqEncoder seqEncoder) {
// TODO!! we need to delegate this to the proper method of seqEncoder, in a way
// that abstracts away a little from annotations etc., ideally!!!
// Probably best to process a whole sequence everytime we call the seqEncoder method,
// if we do not have a sequence, then a whole document.
// map of open annotations, per sequence annotation type
Map annsToAdd = new HashMap<>();
int oldSeqId = -1; // keep track of which sequence annotation we are in
for (Annotation inst : instanceAS.inDocumentOrder()) {
// get the sequence id of the current instance, or 0 if no sequence (whole document)
Integer sequenceSpanID = (Integer) inst.getFeatures().get(Globals.outputSequenceSpanIDFeature);
if (sequenceSpanID == null) {
sequenceSpanID = 0;
}
if(sequenceSpanID != oldSeqId) {
// if the oldSeqId is -1, do not worry, this is just the first instance annotation
if(oldSeqId == -1) {
oldSeqId = sequenceSpanID;
} else {
// close any annotations still open and remove
Iterator> it = annsToAdd.entrySet().iterator();
while(it.hasNext()) {
Map.Entry entry = it.next();
//System.err.println("Finishing at seq end: "+entry.getValue().thisEnd);
addSequenceAnn(entry.getValue(), outputAS, minConfidence);
it.remove();
}
oldSeqId = sequenceSpanID;
}
}
// Type|B, Type|I or O??
// We could also get a sequence of different TypeX|B or TypeY|I here
String target = (String) inst.getFeatures().get(Globals.outputClassFeature);
String[] typesAndCodes;
if(target == null) {
target = SeqEncoder.CODE_OUTSIDE;
}
// now we have two cases: either we got an outside or we got one or more
// type/code pairs
// if we have an outside, just end all the open annotations, if any
if(target.equals(SeqEncoder.CODE_OUTSIDE)) {
// finish any open anns of the same type and remove the open anns
Iterator> it = annsToAdd.entrySet().iterator();
while(it.hasNext()) {
Map.Entry entry = it.next();
//System.err.println("Finishing because of O "+entry.getValue().thisEnd);
addSequenceAnn(entry.getValue(), outputAS, minConfidence);
it.remove();
}
} else {
typesAndCodes = target.split(SeqEncoder.TYPESEP_PATTERN);
// otherwise: iterate over all types and codes and process accordingly
// after processing all types and codes, finish all the types which
// are open but where not in the target
Set touchedTypes = new HashSet<>();
for(String typeAndCode : typesAndCodes) {
String[] tac = typeAndCode.split(SeqEncoder.CODESEP_PATTERN);
// This should never happen, but if some external model returns odd targets, we catch
// the case where it cannot be split into the two parts we expect here.
if (tac.length != 2) {
throw new GateRuntimeException("DEBUG: odd type and code of length "+tac.length+": "+typeAndCode+" from label "+target);
}
//System.err.println("type/code="+tac[0]+"/"+tac[1]);
switch (tac[1]) {
case SeqEncoder.CODE_BEGIN:
{
touchedTypes.add(tac[0]);
// finish any ann which is of the same type and remove
Iterator> it = annsToAdd.entrySet().iterator();
while(it.hasNext()) {
Map.Entry entry = it.next();
if(entry.getKey().equals(tac[0])) {
//System.err.println("Finishing because B: "+entry.getValue().thisEnd);
addSequenceAnn(entry.getValue(), outputAS, minConfidence);
it.remove();
}
} // now add a new open annotation for that type
AnnToAdd ata = new AnnToAdd();
ata.thisStart = inst.getStartNode().getOffset();
ata.annType = tac[0];
//Update the end on the offchance that this is it
ata.thisEnd = inst.getEndNode().getOffset();
Object tmpfv = inst.getFeatures().get(Globals.outputProbFeature);
ata.conf += (tmpfv == null ? 0.0 : (Double)tmpfv);
ata.len++;
annsToAdd.put(tac[0], ata);
break;
}
case SeqEncoder.CODE_INSIDE:
{
// go through the open annotations and if we find one with that type, continue
// it
Iterator> it = annsToAdd.entrySet().iterator();
while(it.hasNext()) {
Map.Entry entry = it.next();
if(entry.getKey().equals(tac[0])) {
//System.err.println("extending existing annotation to offset "+inst.getEndNode().getOffset());
touchedTypes.add(tac[0]);
// continue the ann and extend the span
Object tmpfv = inst.getFeatures().get(Globals.outputProbFeature);
entry.getValue().conf += (tmpfv == null ? 0.0 : (Double)tmpfv);
entry.getValue().len++;
//Update the end on the offchance that this is it
entry.getValue().thisEnd = inst.getEndNode().getOffset();
}
} break;
}
default:
throw new GateRuntimeException("Unexpected SeqEncoder code: "+tac[1]+" from label "+target);
}
} // for typeAndCode : typesAndCodes
// after processing all the types/codes in the target, go through the
// open annotations and close those which have not been touched by this target
//System.err.println("Set of touched types: "+touchedTypes);
Iterator> it = annsToAdd.entrySet().iterator();
while(it.hasNext()) {
Map.Entry entry = it.next();
// if this is an open annotation with a type which has not been included
// in the target, close and remove it
if(!touchedTypes.contains(entry.getKey())) {
//System.err.println("finishing untouched ann at "+entry.getValue().thisEnd);
addSequenceAnn(entry.getValue(), outputAS, minConfidence);
it.remove();
}
}
} // if we do not have CODE_OUTSIDE
} // for all instance annotations
}
/**
* If confidence constraint is satisfied, add Annotation and return it, otherwise
* add nothing and return null.
*
* @param annToAdd AnnToAdd instance
* @param outputAS output annotation set
* @param minConfidence minimum confidence for prediction, if null always predict
* @return newly created annotation or null if no annotation should get added
*/
private static Annotation addSequenceAnn(AnnToAdd annToAdd, AnnotationSet outputAS, Double minConfidence) {
Double entityConfidence = annToAdd.conf == null ? null : annToAdd.conf / annToAdd.len;
if(annToAdd.thisStart != -1 && annToAdd.thisEnd != -1 &&
(minConfidence == null || entityConfidence == null || entityConfidence >= minConfidence)) {
FeatureMap fm = Factory.newFeatureMap();
if(entityConfidence == null) {
entityConfidence = 0.0;
}
fm.put(Globals.outputProbFeature, entityConfidence);
// TODO: add the sequence span id? UPDATE: since we return the annotation
// we just created, the caller can add anything to the feature map
int id = gate.Utils.addAnn(outputAS, annToAdd.thisStart, annToAdd.thisEnd, annToAdd.annType, fm);
return outputAS.get(id);
} else {
return null;
}
}
private static class AnnToAdd {
long thisStart = -1;
long thisEnd = -1;
int len = 0;
Double conf = 0.0;
String annType = "INVALID";
}
@Override
public String toString() {
return "ModelApplication{type="+instance.getType()+",at="+gate.Utils.start(instance)+
",target="+(numericTarget?targetAssigned:classAssigned)+"}";
}
}