All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.stemmers.Stemming Maven / Gradle / Ivy

Go to download

The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This is the stable version. Apart from bugfixes, this version does not receive any other updates.

There is a newer version: 3.8.6
Show newest version
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Stemming.java
 * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stemmers;

import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.Enumeration;
import java.util.Vector;

/**
 * A helper class for using the stemmers. Run with option '-h' to list
 * all the available options.
 *
 * @author  FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.3 $
 */
public class Stemming
  implements RevisionHandler {

  /**
   * lists all the options on the command line
   *
   * @param stemmer     the stemmer to list the parameters for
   * @return 		the option string
   */
  protected static String makeOptionsString(Stemmer stemmer) {
    Vector          options;
    Enumeration     enm;
    StringBuffer    result;

    options = new Vector();
    
    // general options
    options.add(
        new Option(
          "\tDisplays this help.",
          "h", 0, "-h"));

    options.add(
        new Option(
          "\tThe file to process.",
          "i", 1, "-i "));

    options.add(
        new Option(
          "\tThe file to output the processed data to (default stdout).",
          "o", 1, "-o "));

    options.add(
        new Option(
          "\tUses lowercase strings.",
          "l", 0, "-l"));

    // stemmer options?
    if (stemmer instanceof OptionHandler) {
      enm = ((OptionHandler) stemmer).listOptions();
      while (enm.hasMoreElements())
        options.add(enm.nextElement());
    }

    // print options
    result = new StringBuffer();
    result.append("\nStemmer options:\n\n");
    enm = options.elements();
    while (enm.hasMoreElements()) {
      Option option = (Option) enm.nextElement();
      result.append(option.synopsis() + "\n");
      result.append(option.description() + "\n");
    }

    return result.toString();
  }
  
  /**
   * Applies the given stemmer according to the given options. '-h' lists
   * all the available options for the given stemmer.
   *
   * @param stemmer     the stemmer to use
   * @param options     the options for the stemmer
   * @throws Exception	if something goes wrong
   */
  public static void useStemmer(Stemmer stemmer, String[] options) 
    throws Exception {

    Reader          reader;
    StringBuffer    input;
    Writer          output;
    String          tmpStr;
    boolean         lowerCase;
    
    // help?
    if (Utils.getFlag('h', options)) {
      System.out.println(makeOptionsString(stemmer));
      return;
    }

    // input file
    tmpStr = Utils.getOption('i', options);
    if (tmpStr.length() == 0)
      throw new IllegalArgumentException(
          "No input file defined!" + makeOptionsString(stemmer));
    else
      reader = new BufferedReader(
                  new InputStreamReader(new FileInputStream(tmpStr)));

    input = new StringBuffer();

    // output file?
    tmpStr = Utils.getOption('o', options);
    if (tmpStr.length() == 0)
      output = new BufferedWriter(
                  new OutputStreamWriter(System.out));
    else
      output = new BufferedWriter(
                  new OutputStreamWriter(new FileOutputStream(tmpStr)));

    // lowercase?
    lowerCase = Utils.getFlag('l', options);

    // stemmer options
    if (stemmer instanceof OptionHandler)
      ((OptionHandler) stemmer).setOptions(options);
    
    // unknown options?
    try {
      Utils.checkForRemainingOptions(options);
    }
    catch (Exception e) {
      System.out.println(e.getMessage());
      System.out.println(makeOptionsString(stemmer));
      return;
    }
    
    // process file
    int character;
    while ((character = reader.read()) != -1) {
      char ch = (char) character;
      if (Character.isWhitespace((char) ch)) {
        if (input.length() > 0) {
          output.write(stemmer.stem(input.toString()));
          input = new StringBuffer();
        }
        output.write(ch);
      } 
      else {
        if (lowerCase)
          input.append(Character.toLowerCase(ch));
        else
          input.append(ch);
      }
    }
    output.flush();
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.3 $");
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy