All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.stemmers.SnowballStemmer Maven / Gradle / Ivy

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * SnowballStemmer.java
 * Copyright (C) 2005 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stemmers;

import weka.core.ClassDiscovery;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.gui.GenericObjectEditor;

import java.lang.reflect.Method;
import java.util.Enumeration;
import java.util.Vector;

/**
 
 * A wrapper class for the Snowball stemmers. Only available if the Snowball classes are in the classpath.
* If the class discovery is not dynamic, i.e., the property 'UseDynamic' in the props file 'weka/gui/GenericPropertiesCreator.props' is 'false', then the property 'org.tartarus.snowball.SnowballProgram' in the 'weka/gui/GenericObjectEditor.props' file has to be uncommented as well. If necessary you have to discover and fill in the snowball stemmers manually. You can use the 'weka.core.ClassDiscovery' for this:
* java weka.core.ClassDiscovery org.tartarus.snowball.SnowballProgram org.tartarus.snowball.ext
*
* For more information visit these web sites:
* http://weka.wikispaces.com/Stemmers
* http://snowball.tartarus.org/
*

* * Valid options are:

* *

 -S <name>
 *  The name of the snowball stemmer (default 'porter').
 *  available stemmers:
 *     danish, dutch, english, finnish, french, german, italian, 
 *     norwegian, porter, portuguese, russian, spanish, swedish
 * 
* * * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 5836 $ */ public class SnowballStemmer implements Stemmer, OptionHandler { /** for serialization. */ static final long serialVersionUID = -6111170431963015178L; /** the package name for snowball. */ public final static String PACKAGE = "org.tartarus.snowball"; /** the package name where the stemmers are located. */ public final static String PACKAGE_EXT = PACKAGE + ".ext"; /** the snowball program, all stemmers are derived from. */ protected final static String SNOWBALL_PROGRAM = PACKAGE + ".SnowballProgram"; /** whether the snowball stemmers are in the Classpath. */ protected static boolean m_Present = false; /** contains the all the found stemmers (language names). */ protected static Vector m_Stemmers; /** the current stemmer. */ protected Object m_Stemmer; /** the stem method. */ protected transient Method m_StemMethod; /** the setCurrent method. */ protected transient Method m_SetCurrentMethod; /** the getCurrent method. */ protected transient Method m_GetCurrentMethod; /** check for Snowball statically (needs only to be done once) */ static { checkForSnowball(); } /** * initializes the stemmer ("porter"). */ public SnowballStemmer() { this("porter"); initStemmers(); } /** * initializes the stemmer with the given stemmer. * * @param name the name of the stemmer */ public SnowballStemmer(String name) { super(); setStemmer(name); } /** * checks whether Snowball is present in the classpath. */ private static void checkForSnowball() { try { Class.forName(SNOWBALL_PROGRAM); m_Present = true; } catch (Exception e) { m_Present = false; } } /** * Returns a string describing the stemmer. * * @return a description suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "A wrapper class for the Snowball stemmers. Only available if the " + "Snowball classes are in the classpath.\n" + "If the class discovery is not dynamic, i.e., the property 'UseDynamic' " + "in the props file 'weka/gui/GenericPropertiesCreator.props' is 'false', " + "then the property 'org.tartarus.snowball.SnowballProgram' in the " + "'weka/gui/GenericObjectEditor.props' file has to be uncommented " + "as well. If necessary you have to discover and fill in the snowball " + "stemmers manually. You can use the 'weka.core.ClassDiscovery' for this:\n" + " java weka.core.ClassDiscovery org.tartarus.snowball.SnowballProgram org.tartarus.snowball.ext\n" + "\n" + "For more information visit these web sites:\n" + " http://weka.wikispaces.com/Stemmers\n" + " http://snowball.tartarus.org/\n"; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector result; result = new Vector(); result.addElement(new Option( "\tThe name of the snowball stemmer (default 'porter').\n" + "\tavailable stemmers:\n" + getStemmerList(65, "\t "), "S", 1, "-S ")); return result.elements(); } /** * Parses the options.

* * Valid options are:

* *

 -S <name>
   *  The name of the snowball stemmer (default 'porter').
   *  available stemmers:
   *     danish, dutch, english, finnish, french, german, italian, 
   *     norwegian, porter, portuguese, russian, spanish, swedish
   * 
* * * @param options the options to parse * @throws Exception if parsing fails */ public void setOptions(String[] options) throws Exception { String tmpStr; tmpStr = Utils.getOption('S', options); if (tmpStr.length() != 0) setStemmer(tmpStr); else setStemmer("porter"); } /** * Gets the current settings of the classifier. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; result = new Vector(); if (getStemmer() != null) { result.add("-S"); result.add("" + getStemmer()); } return (String[]) result.toArray(new String[result.size()]); } /** * extracts the stemmer name form the classname. * * @param classname the full classname of the stemmer * @return the name of the stemmer */ private static String getStemmerName(String classname) { return classname.replaceAll(".*\\.", "").replaceAll("Stemmer$", ""); } /** * returns the full classname of the stemmer. * * @param name the name of the stemmer * @return the full classname of the stemmer * @see #PACKAGE_EXT */ private static String getStemmerClassname(String name) { return PACKAGE_EXT + "." + name + "Stemmer"; } /** * retrieves the language names of the availabel stemmers. */ private static void initStemmers() { Vector classnames; int i; if (m_Stemmers != null) return; m_Stemmers = new Vector(); if (!m_Present) return; classnames = GenericObjectEditor.getClassnames(SNOWBALL_PROGRAM); // try dynamic discovery if not in props file if (classnames.size() == 0) { classnames = ClassDiscovery.find(SNOWBALL_PROGRAM, PACKAGE_EXT); for (i = 0; i < classnames.size(); i++) m_Stemmers.add(getStemmerName(classnames.get(i).toString())); } } /** * returns whether Snowball is present or not, i.e. whether the classes are * in the classpath or not * * @return whether Snowball is available */ public static boolean isPresent() { return m_Present; } /** * returns an enumeration over all currently stored stemmer names. * * @return all available stemmers */ public static Enumeration listStemmers() { initStemmers(); return m_Stemmers.elements(); } /** * generates a comma list of the available stemmers. * * @param lineLength the max line length, before a linefeed is inserted * (0 is unlimited) * @param indention the indention of a line * @return the generated list */ private static String getStemmerList(int lineLength, String indention) { String result; Enumeration enm; String name; String line; result = ""; line = ""; enm = listStemmers(); while (enm.hasMoreElements()) { name = enm.nextElement().toString(); if (line.length() > 0) line += ", "; if ( (lineLength > 0) && (line.length() + name.length() > lineLength) ) { result += indention + line + "\n"; line = ""; } line += name; } if (line.length() > 0) result += indention + line + "\n"; return result; } /** * returns the name of the current stemmer, null if none is set. * * @return the name of the stemmer */ public String getStemmer() { initStemmers(); if (m_Stemmer == null) return null; else return getStemmerName(m_Stemmer.getClass().getName()); } /** * sets the stemmer with the given name, e.g., "porter". * * @param name the name of the stemmer, e.g., "porter" */ public void setStemmer(String name) { Class snowballClass; Class[] argClasses; initStemmers(); if (m_Stemmers.contains(name)) { try { snowballClass = Class.forName(getStemmerClassname(name)); m_Stemmer = snowballClass.newInstance(); // methods argClasses = new Class[0]; m_StemMethod = snowballClass.getMethod("stem", argClasses); argClasses = new Class[1]; argClasses[0] = String.class; m_SetCurrentMethod = snowballClass.getMethod("setCurrent", argClasses); argClasses = new Class[0]; m_GetCurrentMethod = snowballClass.getMethod("getCurrent", argClasses); } catch (Exception e) { System.out.println( "Error initializing stemmer '" + name + "'!" + e.getMessage()); m_Stemmer = null; } } else { System.err.println("Stemmer '" + name + "' unknown!"); m_Stemmer = null; } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String stemmerTipText() { return "The Snowball stemmer to use, available: " + getStemmerList(0, ""); } /** * Returns the word in its stemmed form. * * @param word the unstemmed word * @return the stemmed word */ public String stem(String word) { String result; Object[] args; if (m_Stemmer == null) { result = new String(word); } else { // after de-serialization, the methods are null and need to be // re-initialized if (m_SetCurrentMethod == null) setStemmer(getStemmer()); try { // set word args = new Object[1]; args[0] = word; m_SetCurrentMethod.invoke(m_Stemmer, args); // stem word args = new Object[0]; m_StemMethod.invoke(m_Stemmer, args); // get word args = new Object[0]; result = (String) m_GetCurrentMethod.invoke(m_Stemmer, args); } catch (Exception e) { e.printStackTrace(); result = word; } } return result; } /** * returns a string representation of the stemmer. * * @return a string representation of the stemmer */ public String toString() { String result; result = getClass().getName(); result += " " + Utils.joinOptions(getOptions()); return result.trim(); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 5836 $"); } /** * Runs the stemmer with the given options. * * @param args the options */ public static void main(String[] args) { try { Stemming.useStemmer(new SnowballStemmer(), args); } catch (Exception e) { e.printStackTrace(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy