gate.plugin.learningframework.export.CorpusExporterMRARFF Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of learningframework Show documentation
Show all versions of learningframework Show documentation
A GATE plugin that provides many different machine learning
algorithms for a wide range of NLP-related machine learning tasks like
text classification, tagging, or chunking.
/*
* Copyright (c) 2015-2016 The University Of Sheffield.
*
* This file is part of gateplugin-LearningFramework
* (see https://github.com/GateNLP/gateplugin-LearningFramework).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software. If not, see .
*/
package gate.plugin.learningframework.export;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import gate.plugin.learningframework.Globals;
import gate.plugin.learningframework.data.Attribute;
import gate.plugin.learningframework.data.Attributes;
import gate.plugin.learningframework.data.CorpusRepresentationMallet;
import gate.plugin.learningframework.engines.Info;
import gate.plugin.learningframework.features.CodeAs;
import gate.plugin.learningframework.features.Datatype;
import gate.plugin.learningframework.features.FeatureExtractionMalletSparse;
import gate.plugin.learningframework.features.MissingValueTreatment;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
/**
*
* @author johann
*/
public class CorpusExporterMRARFF extends CorpusExporterMR {
@Override
public Info getInfo() {
Info info = new Info();
info.algorithmClass = "gate.plugin.learningframework.engines.AlgorithmClassification";
info.algorithmName = "DUMMY";
info.engineClass = "gate.plugin.learningframework.engines.EngineWekaExternal";
info.modelClass = "DUMMY";
return info;
}
@Override
public void export() {
exportMeta();
CorpusRepresentationMallet crm = (CorpusRepresentationMallet)corpusRepresentation;
InstanceList malletInstances = crm.getRepresentationMallet();
Pipe pipe = malletInstances.getPipe();
Attributes attrs = new Attributes(pipe,instanceType);
// We create two files: one with just the header and no instances and
// one with everything
File headerOnlyFile = new File(dataDirFile,Globals.headerBasename+".arff");
File dataFile = new File(dataDirFile,Globals.dataBasename+".arff");
PrintStream headerOut = null;
try {
headerOut = new PrintStream(new FileOutputStream(headerOnlyFile));
} catch (FileNotFoundException ex) {
throw new RuntimeException("Could not open "+headerOnlyFile.getAbsolutePath(),ex);
}
PrintStream dataOut = null;
try {
dataOut = new PrintStream(new FileOutputStream(dataFile));
} catch (FileNotFoundException ex) {
throw new RuntimeException("Could not open "+dataFile.getAbsolutePath(),ex);
}
headerOut.println("@RELATION GateLearningFramework");
dataOut.println("@RELATION GateLearningFramework");
for(Attribute attr : attrs) {
headerOut.print("@ATTRIBUTE ");
dataOut.print("@ATTRIBUTE ");
// get the name, if necessary, escape it properly
String name = escape4Arff(attr.name);
headerOut.print(name);
dataOut.print(name);
headerOut.print(" ");
dataOut.print(" ");
if(attr.datatype == Datatype.numeric) {
headerOut.print("NUMERIC");
dataOut.print("NUMERIC");
} else if(attr.datatype == Datatype.nominal && attr.codeAs == CodeAs.number ||
attr.datatype == Datatype.bool) {
if(attr.alphabet == null) {
throw new RuntimeException("Attribute is not numeric but no alphabet: "+attr);
}
String vals = alphabet2Arff(attr.alphabet,attr.mvTreatment);
headerOut.print(vals);
dataOut.print(vals);
} else {
// fall back is NUMERIC
headerOut.print("NUMERIC");
dataOut.print("NUMERIC");
}
headerOut.println();
dataOut.println();
} // for attr : attrs
// Now one more line for the target
Attribute target = attrs.getTargetAttribute();
headerOut.print("@ATTRIBUTE ");
dataOut.print("@ATTRIBUTE ");
// get the name, if necessary, escape it properly
String name = escape4Arff(target.name);
headerOut.print(name);
dataOut.print(name);
headerOut.print(" ");
dataOut.print(" ");
if(target.datatype == Datatype.numeric) {
headerOut.print("NUMERIC");
dataOut.print("NUMERIC");
} else {
if(target.alphabet == null) {
throw new RuntimeException("target is not numeric but no alphabet: "+target);
}
String vals = alphabet2Arff(target.alphabet,null);
headerOut.print(vals);
dataOut.print(vals);
}
headerOut.println();
dataOut.println();
headerOut.println("@DATA");
dataOut.println("@DATA");
try {
headerOut.close();
} catch(Exception ex) {
//
}
// export the actual data in sparse format
// TODO: make sure we respect the flag to ignore an instance with missing values
// TODO: if the instance has a weight, also output the weight!!
for(Instance inst : malletInstances) {
String line = instance2WekaArffLine(inst,attrs);
dataOut.println(line);
}
try {
dataOut.close();
} catch(Exception ex) {
//
}
} // export
/**
* Escape characters as needed for the ARFF format.
* If the string contains a quote character, a percent character or
* any whitespace those characters are escaped with a backslash.
* Also, a backslash is escaped with a backslash.
* If any character needed to be escaped, the whole string is quoted.
* The string is also quoted if it contains curly braces.
*
* @param what the string to escape
* @return escaped string
*/
public static String escape4Arff(String what) {
if(what == null) {
what = "";
}
if(what.trim().isEmpty()) {
return "'" + what + "'";
}
int len = what.length();
what = what.replaceAll("([\"'%\\n\\r \\t\\\\])", "\\\\$1");
if(what.length()!=len || what.contains("{") || what.contains("}") || what.contains(",")) {
what = "'" + what + "'";
}
return what;
}
/**
* Convert alphabet to ARFF declaration string.
* @param alph Mallet alphabet
* @param mvt missing value treatment setting
* @return ARFF declaration
*/
public String alphabet2Arff(Alphabet alph, MissingValueTreatment mvt) {
// NOTE: mvt can be null, if this is used for a target!!
StringBuilder sb = new StringBuilder();
sb.append("{");
for(int i=0; i0) sb.append(",");
String val = alph.lookupObject(i).toString();
sb.append(escape4Arff(val));
}
// TODO: we may need to add the definition for the missing value here,
// but by default, we do not do that.
sb.append("}");
return sb.toString();
}
/**
* Convert an instance to the ARFF representation.
*
* This does not include a final new-line character!
* NOTE: this returns null if the instance is flagged that it should
* be ignored because it contains a missing value.
*
* @param inst instance
* @param attrs attributes
* @return arff data line
*/
public String instance2WekaArffLine(Instance inst, Attributes attrs) {
return instance2WekaArffLine(inst,attrs,true);
}
private String instance2WekaArffLine(Instance inst, Attributes attrs, boolean filterMVs) {
StringBuilder sb = new StringBuilder();
if(filterMVs) {
Object ignore = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV);
// If the flag says the instance should get ignored, return null
// to indicate to the caller that this is an ignored instance.
if(ignore != null && ignore.equals(true)) {
return null;
}
}
Double instanceWeight = (Double)inst.getProperty("instanceWeight");
Object data = inst.getData();
if(data instanceof FeatureVector) {
FeatureVector vector = (FeatureVector)data;
sb.append("{");
boolean first = true;
// TODO: maybe it is easier to do
// for(int idx : vector.getIndices)
for(int i=0; i