All downloads are free. Search and download functionality uses the official Maven repository.

gate.plugin.learningframework.export.CorpusExporterMRLibSVM Maven / Gradle / Ivy

Go to download

A GATE plugin that provides many different machine learning algorithms for a wide range of NLP-related machine learning tasks like text classification, tagging, or chunking.

There is a newer version: 4.2
Show newest version
/*
 * Copyright (c) 2015-2016 The University Of Sheffield.
 *
 * This file is part of gateplugin-LearningFramework 
 * (see https://github.com/GateNLP/gateplugin-LearningFramework).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software. If not, see <http://www.gnu.org/licenses/>.
 */

package gate.plugin.learningframework.export;

import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import gate.plugin.learningframework.data.CorpusRepresentationMallet;
import gate.plugin.learningframework.engines.Info;
import gate.plugin.learningframework.features.FeatureExtractionMalletSparse;
import gate.util.GateRuntimeException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.text.DecimalFormat;

/**
 *
 * @author johann
 */
public class CorpusExporterMRLibSVM extends CorpusExporterMR {

  /**
   * Describe this exporter through a freshly created {@link Info} record.
   *
   * The algorithm class is always set to the classification algorithm class name;
   * the algorithm name, engine class and model class are set to the placeholder
   * string "DUMMY".
   *
   * @return a new Info instance with the four fields above filled in
   */
  @Override
  public Info getInfo() {
    // Placeholder value assigned to the name, engine and model fields.
    final String placeholder = "DUMMY";
    final Info exporterInfo = new Info();
    // TODO: we should check for regression vs classification here; for now the
    // classification algorithm class is always reported.
    exporterInfo.algorithmClass =
        "gate.plugin.learningframework.engines.AlgorithmClassification";
    exporterInfo.algorithmName = placeholder;
    exporterInfo.engineClass = placeholder;
    exporterInfo.modelClass = placeholder;
    return exporterInfo;
  }

  @Override
  public void export() {
    exportMeta();
    CorpusRepresentationMallet crm = (CorpusRepresentationMallet)corpusRepresentation;
    PrintStream out = null;
    File outFile = new File(dataDirFile, "data.libsvm");
    try {
      out = new PrintStream(outFile);
    } catch (FileNotFoundException ex) {
      throw new GateRuntimeException("Could not open output file "+outFile,ex);
    }
    // For libSVM format, all features and the target are always represented
    // as numeric values and NaN is not allowed. 
    // If an instance has been marked to be skipped because of a MV we do that,
    // otherwise if a MV is encountered we replace it with 0 without warning
    
    // IMPORTANT! according to LibSVM README the libsvm sparse vector representation
    // in the external file must start with location 1 and have increasing locations.
    // However, in the code for read_problem() this does not seem to be the case and
    // precomputed kernels are only used if a parameter is specified.
    // However the examples all use 1 as the first index, so we do this here too
    
    
    InstanceList instances = crm.getRepresentationMallet();
    DecimalFormat DFf = new DecimalFormat("#.##########");
    for(Instance instance : instances) {
      Boolean ignoreInstance = (Boolean)instance.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV);
      if(ignoreInstance) {
        continue;
      }
      Boolean haveMV = (Boolean)instance.getProperty(FeatureExtractionMalletSparse.PROP_HAVE_MV);      
      Object targetObj = instance.getTarget();
      double target = 0.0;
      if(targetObj == null) {
        // TODO: NOTE: we export instances with missing targets in the ARFF exporter, so 
        // we do it here too. However, other than there we always have to output a target
        // because the format requires it. We therefore always output target 0.0 
        // which is the default value, so nothing to do        
      } else if (targetObj instanceof Double) {
        target = (Double)targetObj;
      } else if(targetObj instanceof Label) {
        Label label = (Label)targetObj;
        target = label.getIndex();
      }
      out.print(DFf.format(target));
      out.print(" ");
      // Now print all the feature values, but no MVs which will be ignored (as this is a sparse 
      // representation and we want to replace them with zero here).
      // NOTE: if the values were transformed, zero may not be the correct value to use,
      // this problem is ignored for now!!!
      Object fvObj = instance.getData();
      if(fvObj instanceof FeatureVector) {
        FeatureVector fv = (FeatureVector)fvObj;
        //for(int idx : fv.getIndices()) {
        for(int loc=0; loc




© 2015 - 2024 Weber Informatics LLC | Privacy Policy