All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.plugin.learningframework.export.CorpusExporterMRCSV Maven / Gradle / Ivy

Go to download

A GATE plugin that provides many different machine learning algorithms for a wide range of NLP-related machine learning tasks like text classification, tagging, or chunking.

There is a newer version: 4.2
Show newest version
/*
 * Copyright (c) 2015-2016 The University Of Sheffield.
 *
 * This file is part of gateplugin-LearningFramework 
 * (see https://github.com/GateNLP/gateplugin-LearningFramework).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software. If not, see <http://www.gnu.org/licenses/>.
 */

package gate.plugin.learningframework.export;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import gate.plugin.learningframework.Globals;
import gate.plugin.learningframework.data.Attribute;
import gate.plugin.learningframework.data.Attributes;
import gate.plugin.learningframework.data.CorpusRepresentationMallet;
import gate.plugin.learningframework.engines.Info;
import gate.plugin.learningframework.engines.Parms;
import gate.plugin.learningframework.features.CodeAs;
import gate.plugin.learningframework.features.Datatype;
import gate.plugin.learningframework.features.FeatureExtractionMalletSparse;
import gate.plugin.learningframework.mallet.NominalTargetWithCosts;
import gate.util.GateRuntimeException;
import gate.util.Strings;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;

/**
 * Exporter for comma separated values and tab separated values formats.
 * 
 * @author johann
 */
public class CorpusExporterMRCSV extends CorpusExporterMR {

  /**
   * Creates the {@link Info} record for this exporter.
   *
   * The algorithm class is set to the classification algorithm class name,
   * the engine class to the {@code EngineDUMMY} class name, and both the
   * algorithm name and the model class to the literal string "DUMMY".
   *
   * @return a freshly created, populated Info instance
   */
  @Override
  public Info getInfo() {
    final Info result = new Info();

    // Class names are stored as plain strings.
    result.engineClass = "gate.plugin.learningframework.engines.EngineDUMMY";
    result.algorithmClass = "gate.plugin.learningframework.engines.AlgorithmClassification";

    // Algorithm name and model class both carry the literal "DUMMY".
    result.algorithmName = "DUMMY";
    result.modelClass = "DUMMY";

    return result;
  }

  @Override
  public void export() {   
    exportMeta();
    CorpusRepresentationMallet crm = (CorpusRepresentationMallet)corpusRepresentation;
    InstanceList malletInstances = crm.getRepresentationMallet();
    Pipe pipe = malletInstances.getPipe();
    Attributes attrs = new Attributes(pipe,instanceType);
    // We create either one or two files: if the parameter -twofiles or -t
    // is specified, then there will be one file for the independent variables
    // and one file for the targets, otherwise the targets will be at the
    // end of each record. 
    // By default, the first row is a header row that contains all the names
    // of the features and "target" for the target column.
    // The header line is suppressed with the option -n or -noheader
    // The format of the file is CSV by default, but can be switched to 
    // TSV if -T or -TSV is specified.
    // TODO: If the first instance has an instance weight, then an additional file
    // for the instance weights is created.
    // TODO: add option for how to treat missing values: filter instance or 
    // use whatever value is stored or set to 0.0 or set to NaN etc??
    // TODO: add parameter that will output the target label and not the target 
    // index
    
    Parms ps = new Parms(parms, "t:twofiles:b", "n:noheader:b","T:TSV:b","s:separator:s","S:string:b");
    boolean twofiles = (boolean)ps.getValueOrElse("twofiles", false);
    boolean noheader = (boolean)ps.getValueOrElse("noheader", false);
    boolean tsv = (boolean)ps.getValueOrElse("TSV",false);
    // TODO: 
    boolean filterMV = false;
    String defaultSep = tsv ? "\\t" : ",";
    String separator = (String)ps.getValueOrElse("separator",defaultSep);
    separator = Strings.unescape(separator);
    String extension = tsv ? ".tsv" : ".csv";
    boolean asString = (boolean) ps.getValueOrElse("string", false);
    if(asString && !tsv) {
      throw new GateRuntimeException("Option S/string only supported for TSV format (option T/TSV)");
    }
    System.err.println("DEBUG: writing nominal values as string: "+asString);
    
    PrintStream dataOut = null;
    File dataFile = null;
    try {
      String basename = Globals.dataBasename;
      if(twofiles) {
        basename = "indep";
      }
      dataFile = new File(dataDirFile,basename+extension);
      dataOut = new PrintStream(new FileOutputStream(dataFile));
    } catch (FileNotFoundException ex) {
      throw new RuntimeException("Could not open "+dataFile.getAbsolutePath(),ex);
    }
    
    PrintStream targetOut = null;
    File targetFile = null;
    if(twofiles) {
      try {
        targetFile = new File(dataDirFile,"dep"+extension);
        targetOut = new PrintStream(new FileOutputStream(targetFile));
        //System.err.println("DEBUG: opened dep file "+targetFile.getAbsolutePath());
      } catch (FileNotFoundException ex) {
        throw new RuntimeException("Could not open "+targetFile.getAbsolutePath(),ex);
      }
    } else {
      targetOut = dataOut;
    }
    if(!noheader) {
      boolean firstField = true;
      for (Attribute attr : attrs) {
        if (firstField) {
          firstField = false;
        } else {
          dataOut.print(separator);
        }
        // get the name, if necessary, escape it properly
        String name = tsv ? prepare4TSV(attr.name) : escape4CSV(attr.name);
        dataOut.print(name);
      } // for attr : attrs
      // Now add the header for the target column if we just have one file,
      // otherwise write it into the target file
      if (twofiles) {
        targetOut.println("target");
      } else {
        targetOut.print(separator);
        targetOut.println("target");
      }
    }
    int nrFeatures = pipe.getDataAlphabet().size();
    // export the actual data in dense format
    // TODO: make sure we respect the flag to ignore an instance with missing values
    // TODO: if the instance has a weight, also output the weight!!
    for(Instance inst : malletInstances) {
      // TODO
      if(filterMV) {
        Object ignore = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV);    
        // If the flag says the instance should get ignored, return null
        // to indicate to the caller that this is an ignored instance.
        if(ignore != null && ignore.equals(true)) {
          continue;
        }        
      } // filterMV
      
      /////////////////////////////////////////////////////
      
      Double instanceWeight = (Double)inst.getProperty("instanceWeight");
      Object data = inst.getData();
      if(data instanceof FeatureVector) {
        FeatureVector vector = (FeatureVector)data;
        boolean first = true;
        for(int i=0; i




© 2015 - 2024 Weber Informatics LLC | Privacy Policy