gate.plugin.learningframework.features.FeatureSpecification Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of learningframework Show documentation
Show all versions of learningframework Show documentation
A GATE plugin that provides many different machine learning
algorithms for a wide range of NLP-related machine learning tasks like
text classification, tagging, or chunking.
/*
* Copyright (c) 2015-2016 The University Of Sheffield.
*
* This file is part of gateplugin-LearningFramework
* (see https://github.com/GateNLP/gateplugin-LearningFramework).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
package gate.plugin.learningframework.features;
import gate.util.GateRuntimeException;
import java.io.File;
import java.io.StringReader;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
/**
* Parse a feature specification and create an initial FeatureInfo object.
*
* @author Johann Petrak
*/
public class FeatureSpecification {
//private static Logger LOGGER = Logger.getLogger(FeatureSpecification.class.getName());
// The parsed XML document of the feature specification; set by each constructor.
private org.jdom.Document jdomDocConf = null;
// Only assigned by the URL constructor; used there for error messages.
private URL url;
// for error checking we remember all mappings from embedding ids to
// each of the things that can be specified about embeddings:
// file, dims, minfreq, train
// The value types mirror how the values are read back: file and train are
// kept as strings, dims and minfreq as parsed integers.
private Map<String, String> embeddingId2file = new HashMap<>();
private Map<String, Integer> embeddingId2dims = new HashMap<>();
private Map<String, Integer> embeddingId2minfreq = new HashMap<>();
private Map<String, String> embeddingId2train = new HashMap<>();
/**
 * Create a feature specification by loading and parsing the XML found at a URL.
 *
 * @param configFileURL URL of feature config XML file
 * @throws GateRuntimeException if the document cannot be read or is not
 * parseable XML, or if the specification itself is invalid
 */
public FeatureSpecification(URL configFileURL) {
  this.url = configFileURL;
  final SAXBuilder builder = new SAXBuilder(false);
  try {
    this.jdomDocConf = builder.build(configFileURL);
    parseConfigXml();
  } catch (JDOMException jde) {
    // XML-level problem reported by the parser
    throw new GateRuntimeException(jde);
  } catch (java.io.IOException ex) {
    // the document could not be read at all
    throw new GateRuntimeException("Error parsing config file URL " + url, ex);
  }
}
/**
* Constructor from String
* @param configString XML string
*/
public FeatureSpecification(String configString) {
SAXBuilder saxBuilder = new SAXBuilder(false);
try {
try {
jdomDocConf = saxBuilder.build(new StringReader(configString));
parseConfigXml();
} catch (JDOMException jde) {
throw new GateRuntimeException(jde);
}
} catch (java.io.IOException ex) {
throw new GateRuntimeException("Error parsing config file String:\n" + configString, ex);
}
}
/**
* Constructor from File
* @param configFile File for feature config XML file
*/
public FeatureSpecification(File configFile) {
SAXBuilder saxBuilder = new SAXBuilder(false);
try {
try {
this.jdomDocConf = saxBuilder.build(configFile);
parseConfigXml();
} catch (JDOMException jde) {
throw new GateRuntimeException(jde);
}
} catch (java.io.IOException ex) {
throw new GateRuntimeException("Error parsing config file " + configFile, ex);
}
}
/**
 * Convert the parsed XML document into feature specifications.
 *
 * Every child of the root element must be an ATTRIBUTE, ATTRIBUTELIST or
 * NGRAM element (the element name is matched case-insensitively); each one is
 * parsed and added to featureInfo in document order. Afterwards the embedding
 * settings remembered per embedding id are copied into every nominal
 * specification that left them unset.
 *
 * @throws GateRuntimeException if an unknown element is found or a required
 * FROM/TO element of an ATTRIBUTELIST is missing
 * @throws NumberFormatException if FROM or TO is present but not an integer
 */
private void parseConfigXml() {
  // TODO: process children in order, then dispatch how to parse based on type.
  // Then, parsing ATTRIBUTE and ATTRIBUTELIST is nearly identical except that
  // we parse the range in addition for ATTRIBUTELIST.
  // Make an else part where we document how we might add additional stuff...
  Element rootElement = jdomDocConf.getRootElement();
  @SuppressWarnings("unchecked")
  List<Element> elements = rootElement.getChildren();
  int n = 0;
  for (Element element : elements) {
    n++;
    // Locale.ROOT: with the default locale the mapping depends on the JVM
    // settings (e.g. a Turkish locale lower-cases "I" to a dotless i), which
    // would make valid element names unrecognizable.
    String elementName = element.getName().toLowerCase(Locale.ROOT);
    if (elementName.equals("attribute")) {
      featureInfo.add(parseSimpleAttribute(element, n));
    } else if (elementName.equals("attributelist")) {
      FeatureSpecSimpleAttribute att = parseSimpleAttribute(element, n);
      int from = parseRequiredIntChild(element, "FROM", n);
      int to = parseRequiredIntChild(element, "TO", n);
      String withinType = getChildTextOrElse(element, "WITHIN", null);
      featureInfo.add(new FeatureSpecAttributeList(att, withinType, from, to));
    } else if (elementName.equals("ngram")) {
      featureInfo.add(parseNgramAttribute(element, n));
    } else {
      throw new GateRuntimeException("Not a recognized element name for the LearningFramework config file: " + elementName);
    }
  }
  // go through all the feature specifications and make sure the
  // embedding settings are set to whatever we have stored for the id.
  // If after this some values are still blank, it is the responsibility
  // of the backend code to find the appropriate default values since
  // different backends or algorithms could work better with different
  // defaults
  for (FeatureSpecAttribute fs : featureInfo.featureSpecs) {
    if (fs.datatype == Datatype.nominal) {
      if (fs.emb_file.isEmpty()) {
        String tmp_emb_file = embeddingId2file.get(fs.emb_id);
        if (tmp_emb_file != null) {
          fs.emb_file = tmp_emb_file;
        }
      }
      if (fs.emb_dims == 0) {
        Integer tmp_emb_dims = embeddingId2dims.get(fs.emb_id);
        if (tmp_emb_dims != null) {
          fs.emb_dims = tmp_emb_dims;
        }
      }
      if (fs.emb_train.isEmpty()) {
        String tmp_emb_train = embeddingId2train.get(fs.emb_id);
        if (tmp_emb_train != null) {
          fs.emb_train = tmp_emb_train;
        }
      }
      if (fs.emb_minfreq == 0) {
        Integer tmp_emb_minfreq = embeddingId2minfreq.get(fs.emb_id);
        if (tmp_emb_minfreq != null) {
          fs.emb_minfreq = tmp_emb_minfreq;
        } else {
          // minfreq is the only embedding setting that gets a default here
          fs.emb_minfreq = 1;
        }
      }
    }
  }
} // parseConfigXml

/**
 * Read a mandatory integer child element of an ATTRIBUTELIST element.
 * Surrounding whitespace in the element text is ignored.
 *
 * @param parent the ATTRIBUTELIST element
 * @param name name of the child element, e.g. FROM or TO
 * @param n running number of the element, used for the error message
 * @return the parsed integer
 * @throws GateRuntimeException if the child element does not exist
 * @throws NumberFormatException if the text is not a valid integer
 */
private static int parseRequiredIntChild(Element parent, String name, int n) {
  String text = parent.getChildTextTrim(name);
  if (text == null) {
    throw new GateRuntimeException(name + " not specified for ATTRIBUTELIST " + n);
  }
  return Integer.parseInt(text);
}
/**
 * Add the settings of a nested EMBEDDINGS element, if there is one, to a
 * feature specification.
 *
 * The settings (file, train, dims, minfreq) are tracked per embedding id in
 * the instance maps: a setting given here must not contradict a value given
 * earlier for the same id, and a setting left out here is filled in from the
 * earlier value if there is one. If no ID is given, the empty string is used
 * as the id.
 *
 * @param element the parent element which may contain an EMBEDDINGS child
 * @param i running number of the parent element, used for error messages
 * @param spec the specification to update
 * @return the same spec instance that was passed in
 * @throws GateRuntimeException if TRAIN has an unknown value or a setting
 * contradicts an earlier one for the same id
 * @throws NumberFormatException if DIMS or MINFREQ is not an integer
 */
private FeatureSpecAttribute parseAndAddEmbeddingInfo(Element element, int i, FeatureSpecAttribute spec) {
  // the element is the parent, so lets first get the embedding child, if any
  Element emb = getChildOrNull(element, "EMBEDDINGS");
  if (emb == null) {
    return spec; // nothing there, nothing to do
  }
  // get all the possible settings for the embedding
  String emb_id = getChildTextOrElse(emb, "ID", "");
  String emb_file = getChildTextOrElse(emb, "FILE", "");
  String emb_dims_str = getChildTextOrElse(emb, "DIMS", "");
  String emb_minfreq_str = getChildTextOrElse(emb, "MINFREQ", "");
  String emb_train = getChildTextOrElse(emb, "TRAIN", "");

  // TODO: check the file exists already here!!
  String file = reconcileEmbeddingSetting(
      embeddingId2file, emb_id, emb_file.isEmpty() ? null : emb_file, "file", i);
  if (file != null) {
    spec.emb_file = file;
  }

  if (!emb_train.isEmpty()
      && !emb_train.equals("yes") && !emb_train.equals("no")
      && !emb_train.equals("mapping") && !emb_train.equals("onehot")) {
    throw new GateRuntimeException("EMBEDDING TRAIN setting must be one of yes, no, onehot or mapping for attribute " + i);
  }
  String train = reconcileEmbeddingSetting(
      embeddingId2train, emb_id, emb_train.isEmpty() ? null : emb_train, "train", i);
  if (train != null) {
    spec.emb_train = train;
  }

  Integer dims = reconcileEmbeddingSetting(
      embeddingId2dims, emb_id,
      emb_dims_str.isEmpty() ? null : Integer.valueOf(emb_dims_str), "dims", i);
  if (dims != null) {
    spec.emb_dims = dims;
  }

  Integer minfreq = reconcileEmbeddingSetting(
      embeddingId2minfreq, emb_id,
      emb_minfreq_str.isEmpty() ? null : Integer.valueOf(emb_minfreq_str), "minfreq", i);
  if (minfreq != null) {
    spec.emb_minfreq = minfreq;
  }

  spec.emb_id = emb_id;
  return spec;
}

/**
 * Reconcile one embedding setting with what is already known for its id.
 *
 * @param known the map from embedding id to the value seen so far
 * @param embId the embedding id
 * @param given the value specified now, or null if the setting was left out
 * @param what name of the setting, used in the error message
 * @param i running number of the attribute, used in the error message
 * @return the value the specification should use, or null if the setting was
 * neither given now nor earlier (the specification is then left unchanged)
 * @throws GateRuntimeException if given differs from the value stored earlier
 */
private static <T> T reconcileEmbeddingSetting(Map<String, T> known, String embId, T given, String what, int i) {
  T have = known.get(embId);
  if (given == null) {
    // not specified here: fall back to whatever was set earlier, if anything
    return have;
  }
  if (have == null) {
    known.put(embId, given);
  } else if (!given.equals(have)) {
    throw new GateRuntimeException("EMBEDDING setting " + what + " to " + given + " for attribute " + i + " contradicts "
        + have + " set previously");
  }
  return given;
}
/**
 * Build the specification of a simple attribute from its XML element.
 *
 * Reads the NAME, FEATURE, DATATYPE, TYPE, CODEAS, MISSINGVALUETREATMENT,
 * FEATURENAME4VALUE, LISTSEP, WITHIN and MISSINGVALUE children, applies the
 * defaults described in the comments below and, for nominal attributes, adds
 * any EMBEDDINGS settings.
 *
 * @param attributeElement the element to parse
 * @param i running number of the element, used for error messages
 * @return the attribute specification
 * @throws GateRuntimeException if the combination of settings is not allowed
 */
private FeatureSpecSimpleAttribute parseSimpleAttribute(Element attributeElement, int i) {
  final String attrName = getChildTextOrElse(attributeElement, "NAME", "").trim();
  final String featureName = getChildTextOrElse(attributeElement, "FEATURE", "").trim();
  String datatypeText = getChildTextOrElse(attributeElement, "DATATYPE", null);
  if (featureName.isEmpty()) {
    // without a feature the datatype may be left out (defaults to bool) or
    // must be given as bool/boolean
    if (datatypeText == null) {
      datatypeText = "bool";
    } else if (!(datatypeText.equals("bool") || datatypeText.equals("boolean"))) {
      throw new GateRuntimeException("DATATYPE must be bool or not specified if no feature given in ATTRIBUTE "+i);
    }
  } else if (datatypeText == null) {
    throw new GateRuntimeException("DATATYPE not specified for ATTRIBUTE " + i);
  }
  // allow both "boolean" and "bool" but internally use bool to avoid a keyword clash
  final Datatype datatype = Datatype.valueOf(datatypeText.equals("boolean") ? "bool" : datatypeText);
  final boolean isNominal = (datatype == Datatype.nominal);

  // TODO: this should be named ANNOTATIONTYPE or ANNTYPE to avoid confusion
  // with the datatype.
  // If empty, the instance annotation type is used, whatever that is.
  final String annType = getChildTextOrElse(attributeElement, "TYPE", "");

  String codeAsText = getChildTextOrElse(attributeElement, "CODEAS", "").toLowerCase();
  if (codeAsText.isEmpty()) {
    // defaults: one_of_k for nominal, number for everything else
    codeAsText = isNominal ? "one_of_k" : "number";
  } else {
    if (!(codeAsText.equals("one_of_k") || codeAsText.equals("number"))) {
      throw new GateRuntimeException("CODEAS for ATTRIBUTE " + i + " specified but not one_of_k or number but " + codeAsText);
    }
    // codeas only makes sense for nominal, so complain if it is given for
    // any other datatype
    if (!isNominal) {
      throw new GateRuntimeException("CODEAS can only be used with DATATYPE nominal for ATTRIBUTE "+i);
    }
  }
  final CodeAs codeAs = CodeAs.valueOf(codeAsText);

  // The default missing value treatment depends on the kind of attribute:
  // "keep" for one-of-k coded nominal, "zero_value" for bool and
  // "special_value" for everything else.
  final boolean oneOfKNominal = isNominal && codeAs == CodeAs.one_of_k;
  final String mvtDefault;
  if (oneOfKNominal) {
    mvtDefault = "keep";
  } else if (datatype == Datatype.bool) {
    mvtDefault = "zero_value";
  } else {
    mvtDefault = "special_value";
  }
  final String mvtText = getChildTextOrElse(attributeElement, "MISSINGVALUETREATMENT", mvtDefault);
  // FEATURENAME4VALUE is only read for one-of-k coded nominal attributes
  final String featureName4Value =
      oneOfKNominal ? getChildTextOrElse(attributeElement,"FEATURENAME4VALUE","") : "";
  final MissingValueTreatment mvt = MissingValueTreatment.valueOf(mvtText);

  // LISTSEP (for automatic list splitting) is only allowed for nominal
  final String listSeparator = getChildTextOrElse(attributeElement, "LISTSEP", "");
  if (!listSeparator.isEmpty() && !isNominal) {
    throw new GateRuntimeException("LISTSEP only allowed if datatype is nominal");
  }

  final String within = getChildTextOrElse(attributeElement, "WITHIN", null);

  final String missingValueDefault;
  if (datatype == Datatype.bool) {
    missingValueDefault = "false";
  } else if (datatype == Datatype.numeric) {
    missingValueDefault = "0.0";
  } else {
    missingValueDefault = "";
  }
  final String missingValue = getChildTextOrElse(attributeElement, "MISSINGVALUE", missingValueDefault);

  // TODO: not implemented yet, but we should add this!!
  final String scaling = "";
  final String transform = "";

  final FeatureSpecSimpleAttribute result = new FeatureSpecSimpleAttribute(
      attrName, annType, featureName, datatype, codeAs, mvt, missingValue,
      scaling, transform, within, listSeparator, featureName4Value);

  // only nominal attributes may carry an embedding block
  if (isNominal) {
    return (FeatureSpecSimpleAttribute)parseAndAddEmbeddingInfo(attributeElement, i, result);
  }
  if (getChildOrNull(attributeElement, "EMBEDDINGS") != null) {
    throw new GateRuntimeException("EMBEDDINGS not allowed for non-NOMINAL ATTRIBUTE "+i);
  }
  return result;
}
/**
 * Build the specification of an n-gram attribute from its XML element.
 *
 * TYPE and FEATURE are mandatory. NUMBER defaults to 1, MAXLEN to 0 and
 * SHORTEN to the empty string. Any EMBEDDINGS settings are added to the
 * resulting specification.
 *
 * @param ngramElement the NGRAM element to parse
 * @param i running number of the element, used for error messages
 * @return the n-gram specification
 * @throws GateRuntimeException if TYPE or FEATURE is missing or empty, or if
 * SHORTEN has a value that is not accepted
 * @throws NumberFormatException if NUMBER or MAXLEN is not an integer
 */
private FeatureSpecAttribute parseNgramAttribute(Element ngramElement, int i) {
  String aname = getChildTextOrElse(ngramElement,"NAME","").trim();
  String annType = getChildTextOrElse(ngramElement,"TYPE","").trim();
  if (annType.isEmpty()) {
    throw new GateRuntimeException("TYPE in NGRAM " + i + " must not be missing or empty");
  }
  String numberString = getChildTextOrElse(ngramElement,"NUMBER","1").trim();
  String featureName4Value = getChildTextOrElse(ngramElement,"FEATURENAME4VALUE","");
  String maxlen = getChildTextOrElse(ngramElement,"MAXLEN","0");
  String shorten = getChildTextOrElse(ngramElement,"SHORTEN","").toLowerCase(Locale.US);
  String feature = getChildTextOrElse(ngramElement,"FEATURE","").trim();
  if (feature.isEmpty()) {
    throw new GateRuntimeException("FEATURE in NGRAM " + i + " must not be missing or empty");
  }
  // The message lists exactly the values this check accepts.
  // NOTE(review): the previous message also advertised "right", which this
  // check has never accepted - confirm whether "right" should become a
  // valid value before adding it here.
  if (!(shorten.equals("") || shorten.equals("left") ||
          shorten.equals("both") || shorten.equals("middle"))) {
    throw new GateRuntimeException("SHORTEN in NGRAM " + i
        + " must be missing, empty or one of left, middle, both, but is: " + shorten);
  }
  FeatureSpecNgram ng = new FeatureSpecNgram(
          aname,
          Integer.parseInt(numberString),
          annType,
          feature,
          featureName4Value
  );
  ng.maxlen = Integer.parseInt(maxlen);
  ng.shorten = shorten;
  ng = (FeatureSpecNgram)parseAndAddEmbeddingInfo(ngramElement, i, ng);
  return ng;
}
// Collects the feature specifications as they are parsed.
private FeatureInfo featureInfo = new FeatureInfo();
/**
 * Return the FeatureInfo object for this specification.
 *
 * Each call creates a new FeatureInfo through the FeatureInfo copy
 * constructor, so the instance kept inside this FeatureSpecification is
 * never handed out directly.
 *
 * @return FeatureInfo instance
 */
public FeatureInfo getFeatureInfo() {
  final FeatureInfo copy = new FeatureInfo(featureInfo);
  return copy;
}
//// HELPER METHODS
/**
 * Return the trimmed text of a single child element or a default value.
 *
 * The child is looked up first under the all-uppercase and then under the
 * all-lowercase variant of the given name; the first variant that exists is
 * used. If neither exists, elseVal is returned. An existing but empty child
 * yields the empty string, not elseVal. The case of the text is preserved.
 *
 * @param parent the element whose child should be read
 * @param name the name of the child element
 * @param elseVal the value to return if there is no such child, may be null
 * @return the trimmed child text or elseVal
 * @throws GateRuntimeException if there is more than one child with the
 * variant of the name that is being used
 */
private static String getChildTextOrElse(Element parent, String name, String elseVal) {
  // Locale.ROOT keeps the case mapping independent of the JVM default locale
  // (a Turkish locale would e.g. map "FILE" to a name with a dotless i).
  String[] variants = { name.toUpperCase(Locale.ROOT), name.toLowerCase(Locale.ROOT) };
  for (String variant : variants) {
    int count = parent.getChildren(variant).size();
    if (count > 1) {
      throw new GateRuntimeException("Element " + parent.getName() + " has more than one nested " + variant + " element");
    }
    if (count == 1) {
      return parent.getChildTextTrim(variant);
    }
  }
  return elseVal;
}
/**
 * Return the single child element with the given name, or null if there is
 * none. The name is matched exactly as given.
 *
 * @param parent the element whose child should be returned
 * @param name the name of the child element
 * @return the child element or null if no child has that name
 * @throws GateRuntimeException if there is more than one child with that name
 */
private static Element getChildOrNull(Element parent, String name) {
  // the element type is needed so that get(0) yields an Element
  @SuppressWarnings("unchecked")
  List<Element> children = parent.getChildren(name);
  if (children.size() > 1) {
    throw new GateRuntimeException("Element " + parent.getName() + " has more than one nested " + name + " element");
  } else if (children.isEmpty()) {
    return null;
  } else {
    return children.get(0);
  }
}
@Override
public String toString() {
  // Apart from the embedding id mappings, everything this object knows is
  // held by the feature info, so its string form is used as-is.
  final String description = featureInfo.toString();
  return description;
}
}