All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.stemmers.PTStemmer Maven / Gradle / Ivy

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * PTStemmer.java
 * Copyright (C) 2009 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stemmers;

import java.io.File;
import java.util.Enumeration;
import java.util.Vector;

import ptstemmer.exceptions.PTStemmerException;
import ptstemmer.implementations.OrengoStemmer;
import ptstemmer.implementations.PorterStemmer;
import ptstemmer.implementations.SavoyStemmer;
import ptstemmer.support.PTStemmerUtilities;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;

/**
 * A wrapper for PTStemmer (developed by Pedro Oliveira):<br>
 * http://code.google.com/p/ptstemmer/
 * <p>
 * Valid options are:
 *
 * <pre> -S &lt;ORENGO|PORTER|SAVOY&gt;
 *  The type of stemmer algorithm to use:
 *  ORENGO = Orengo
 *  PORTER = Porter
 *  SAVOY = Savoy
 *  (default: ORENGO)</pre>
 *
 * <pre> -N &lt;file&gt;
 *  The file with the named entities to ignore (optional).
 *  File format: simple text file with one entity per line.
 *  (default: none)</pre>
 *
 * <pre> -W &lt;file&gt;
 *  The file with the stopwords (optional).
 *  File format: simple text file with one stopword per line.
 *  (default: none)</pre>
 *
 * <pre> -C &lt;int&gt;
 *  The size of the cache. Disable with 0.
 *  (default: 1000)</pre>
 *
* * * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 1179 $ */ public class PTStemmer implements Stemmer, OptionHandler { /** for serialization. */ static final long serialVersionUID = -6113024782588197L; /** orengo stemmer. */ public static final int STEMMER_ORENGO = 0; /** porter stemmer. */ public static final int STEMMER_PORTER = 1; /** savoy stemmer. */ public static final int STEMMER_SAVOY = 2; /** stemmers. */ public static final Tag[] TAGS_STEMMERS = { new Tag(STEMMER_ORENGO, "orengo", "Orengo"), new Tag(STEMMER_PORTER, "porter", "Porter"), new Tag(STEMMER_SAVOY, "savoy", "Savoy") }; /** the type of stemmer to use. */ protected int m_Stemmer = STEMMER_ORENGO; /** the named entities. */ protected File m_NamedEntities = new File("."); /** the stopwords. */ protected File m_Stopwords = new File("."); /** the cache size. */ protected int m_Cache = 1000; /** the actual stemmer. */ protected ptstemmer.Stemmer m_ActualStemmer; /** * Returns a string describing the stemmer. * * @return a description suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "A wrapper for PTStemmer (developed by Pedro Oliveira):\n" + "http://code.google.com/p/ptstemmer/"; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector




© 2015 - 2025 Weber Informatics LLC | Privacy Policy