gate.plugin.learningframework.export.CorpusExporterMR Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of learningframework Show documentation
Show all versions of learningframework Show documentation
A GATE plugin that provides many different machine learning
algorithms for a wide range of NLP-related machine learning tasks like
text classification, tagging, or chunking.
/*
* Copyright (c) 2015-2016 The University Of Sheffield.
*
* This file is part of gateplugin-LearningFramework
* (see https://github.com/GateNLP/gateplugin-LearningFramework).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
package gate.plugin.learningframework.export;
import gate.plugin.learningframework.ScalingMethod;
import gate.plugin.learningframework.data.CorpusRepresentationMallet;
import gate.plugin.learningframework.data.CorpusRepresentationMalletTarget;
import gate.plugin.learningframework.engines.Info;
import gate.plugin.learningframework.mallet.LFPipe;
import java.util.ArrayList;
/**
 * Common base class of all Mallet-related exporters.
 *
 * <p>It supplies a default corpus representation (see {@link #initWhenCreating()})
 * and the shared {@link #exportMeta()} step that writes the info file and the
 * pipe next to the exported data.
 *
 * @author johann
 */
public abstract class CorpusExporterMR extends CorpusExporter {

  /**
   * Creates the Mallet corpus representation used by this exporter.
   *
   * <p>This default implementation creates a target-based representation from
   * {@code featureInfo} and {@code targetType}. According to the original
   * design note, sequence exporters are expected to override this method to
   * create a sequence representation instead.
   */
  @Override
  public void initWhenCreating() {
    // TODO: need to properly support scaling when exporting!
    corpusRepresentation = new CorpusRepresentationMalletTarget(
            featureInfo,
            targetType);
  }

  /**
   * Writes the info file and the pipe for the current corpus representation
   * into {@code dataDirFile}.
   *
   * <p>Each {@code export()} implementation of a Mallet-related exporter is
   * expected to call this method as its first step. Because scaling has to
   * happen before exporting, {@code finishAdding()} is always invoked here;
   * per the original author's note, that method does nothing on any call after
   * the first one.
   */
  public void exportMeta() {
    CorpusRepresentationMallet crm = (CorpusRepresentationMallet) corpusRepresentation;
    crm.finishAdding();

    // Start from the pre-filled info object and complete it with what is
    // known about the exported data.
    Info info = getInfo();
    // NOTE(review): this stores the literal string "null", not a null
    // reference; kept as-is because readers of the info file may rely on it.
    info.classAnnotationType = "null";

    LFPipe lfpipe = crm.getPipe();
    if (lfpipe.getTargetAlphabet() == null) {
      // No target alphabet: there are no class labels to record.
      info.classLabels = null;
    } else {
      Object[] targetEntries = lfpipe.getTargetAlphabet().toArray();
      info.nrTargetValues = targetEntries.length;
      // Store the labels as strings, in the order the alphabet returns them.
      ArrayList<String> labels = new ArrayList<>(targetEntries.length);
      for (Object entry : targetEntries) {
        labels.add(entry.toString());
      }
      info.classLabels = labels;
    }

    info.nrTrainingDimensions = lfpipe.getDataAlphabet().size();
    info.nrTrainingDocuments = 0;
    info.nrTrainingInstances = crm.getRepresentationMallet().size();
    info.targetFeature = "class";
    info.task = "CLASSIFIER";
    info.trainerClass = "";
    info.trainingCorpusName = "";
    info.save(dataDirFile);

    // Finally save the pipe of the Mallet corpus representation.
    crm.savePipe(dataDirFile);
  }
}