All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.plugin.learningframework.mbstats.FVStatsMeanVarAll Maven / Gradle / Ivy

Go to download

A GATE plugin that provides many different machine learning algorithms for a wide range of NLP-related machine learning tasks like text classification, tagging, or chunking.

There is a newer version: 4.2
Show newest version
/*
 * Copyright (c) 2015-2016 The University Of Sheffield.
 *
 * This file is part of gateplugin-LearningFramework 
 * (see https://github.com/GateNLP/gateplugin-LearningFramework).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software. If not, see <http://www.gnu.org/licenses/>.
 */

package gate.plugin.learningframework.mbstats;

import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import java.util.ArrayList;
import java.util.List;

/**
 * Stats object that calculates the means and variances of all features in the 
 * vectors.
 * <p>
 * Statistics are accumulated per feature index while feature vectors are 
 * added. A feature that does not occur in a (sparse) feature vector, or whose
 * value is NaN, contributes nothing to the sum and sum of squares but the 
 * vector is still counted, i.e. it is treated like a value of 0 for the 
 * purpose of mean and variance. The variance is the population variance 
 * (divisor is the number of vectors). Min, max and the binary flag only 
 * reflect the explicitly stored, non-NaN values.
 * 
 * @author Johann Petrak
 */
public class FVStatsMeanVarAll implements FeatureVectorStats {

  /** 
   * Per-feature statistics, indexed by feature index. Entries for feature
   * indices for which no non-NaN value has been seen are null.
   */
  protected List<PerFeatureStats> pfs = new ArrayList<>();
  /** Number of feature vectors added so far. */
  protected int nrInstances = 0;
  /** Set by finish(): once true, no more feature vectors can be added. */
  protected boolean immutable = false;

  /**
   * Constructor from instance list.
   * 
   * Adds the feature vector of every instance and then finishes the stats
   * object, so no more vectors can be added afterwards.
   * 
   * @param instances instances
   */
  public FVStatsMeanVarAll(InstanceList instances) {
    for(Instance instance : instances) {
      FeatureVector fv = (FeatureVector)instance.getData();
      addFeatureVector(fv);
    }
    finish();
  }
  
  /**
   * Update the statistics with the values from the given feature vector.
   * 
   * @param fv the feature vector to add
   * @throws IllegalStateException if finish() has already been called
   */
  @Override
  public void addFeatureVector(FeatureVector fv) {
    if(immutable) {
      throw new IllegalStateException("Stats object is immutable cannot add feature Vector");
    }
    nrInstances++;
    // Use the location based accessors instead of indexing the raw arrays
    // returned by getIndices()/getValues(), so vectors which do not store an 
    // explicit index or value array are handled by the vector itself.
    int nrLocations = fv.numLocations();
    for (int j = 0; j < nrLocations; j++) {
      int index = fv.indexAtLocation(j);
      double value = fv.valueAtLocation(j);
      // If the value is a NaN, we disregard it: the mean and variance are
      // unaffected because we still count this feature vector (and not adding
      // anything to sum and sum of squares is the same as adding 0).
      if(Double.isNaN(value)) {
        continue;
      }
      
      // to accommodate index i we need size i+1
      while(pfs.size() <= index) {
        pfs.add(null);
      }
      PerFeatureStats pf = pfs.get(index);
      if(pf==null) {        
        pf = new PerFeatureStats();
        pfs.set(index, pf);
      }
      
      // now do the actual stats collection: if the sum in pf is still NaN
      // this is the first value for the feature, so initialize everything, 
      // otherwise update. We only check sum since all values get set together.
      if(Double.isNaN(pf.sum)) {
        pf.sum = value;
        pf.sumOfSquares = value*value;
        pf.min = value;
        pf.max = value;
        pf.binary = value == 0.0 || value == 1.0;
      } else {
        pf.sum += value;
        pf.sumOfSquares += value*value;
        pf.min = Math.min(pf.min, value);
        pf.max = Math.max(pf.max, value);
        if(pf.binary && value != 0.0 && value != 1.0) {
          pf.binary = false;
        }
      }
      // keep mean and variance of the touched feature roughly up to date;
      // the final values for all features get calculated in finish()
      updateMeanAndVariance(pf);
    } // for locations

  }

  /**
   * Finish collecting statistics.
   * 
   * Recalculates mean and variance of every feature using the final number of
   * feature vectors: features which were missing from the vectors added last
   * would otherwise keep values based on an outdated count. After this, 
   * no more feature vectors can be added.
   */
  @Override
  public void finish() {
    for(PerFeatureStats pf : pfs) {
      if(pf != null) {
        updateMeanAndVariance(pf);
      }
    }
    immutable = true;
  }
  
  /**
   * Get the per-feature statistics.
   * 
   * @return the internal list of statistics, indexed by feature index; 
   * entries may be null for features without any non-NaN value
   */
  public List<PerFeatureStats> getStats() {
    return pfs;
  }

  /**
   * Set mean and population variance of pf from its sum and sum of squares 
   * and the current number of feature vectors.
   */
  private void updateMeanAndVariance(PerFeatureStats pf) {
    pf.mean = pf.sum / nrInstances;
    // var = E[x^2] - E[x]^2; rounding can make this slightly negative, so 
    // clamp at 0
    pf.var = Math.max(0.0, pf.sumOfSquares / nrInstances - pf.mean * pf.mean);
  }
  
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("FVStatsMeanVarAll{");
    sb.append("n=").append(nrInstances);
    sb.append(",");
    int i = 0;
    for(PerFeatureStats pf : pfs) {
      if(i!=0) {
        sb.append(",");
      }
      sb.append(i++);
      sb.append("=");
      if(pf==null) {
        sb.append("(null)");
      } else {
        sb.append(pf.toString());
      }
    }
    sb.append("}");
    return sb.toString();
  }
  
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy