All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.stopwords.Rainbow Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see .
 */

/*
 * Rainbow.java
 * Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 */

package weka.core.stopwords;

import java.util.HashSet;

/**
 
 * Stopwords list based on Rainbow:
* http://www.cs.cmu.edu/~mccallum/bow/rainbow/ *

* * Valid options are:

* *

 -D
 *  If set, stopword scheme is run in debug mode and
 *  may output additional info to the console
* * * @author fracpete (fracpete at waikato dot ac dot nz) * @version $Revision: 10978 $ */ public class Rainbow extends AbstractStopwords { /** for serialization. */ private static final long serialVersionUID = -722795295494945193L; /** The hash set containing the list of stopwords. */ protected HashSet m_Words; /** * Returns a string describing the stopwords scheme. * * @return a description suitable for displaying in the gui */ @Override public String globalInfo() { return "Stopwords list based on Rainbow:\n" + "http://www.cs.cmu.edu/~mccallum/bow/rainbow/"; } /** * Performs intialization of the scheme. */ @Override protected void initialize() { super.initialize(); m_Words = new HashSet(); m_Words.add("a"); m_Words.add("able"); m_Words.add("about"); m_Words.add("above"); m_Words.add("according"); m_Words.add("accordingly"); m_Words.add("across"); m_Words.add("actually"); m_Words.add("after"); m_Words.add("afterwards"); m_Words.add("again"); m_Words.add("against"); m_Words.add("all"); m_Words.add("allow"); m_Words.add("allows"); m_Words.add("almost"); m_Words.add("alone"); m_Words.add("along"); m_Words.add("already"); m_Words.add("also"); m_Words.add("although"); m_Words.add("always"); m_Words.add("am"); m_Words.add("among"); m_Words.add("amongst"); m_Words.add("an"); m_Words.add("and"); m_Words.add("another"); m_Words.add("any"); m_Words.add("anybody"); m_Words.add("anyhow"); m_Words.add("anyone"); m_Words.add("anything"); m_Words.add("anyway"); m_Words.add("anyways"); m_Words.add("anywhere"); m_Words.add("apart"); m_Words.add("appear"); m_Words.add("appreciate"); m_Words.add("appropriate"); m_Words.add("are"); m_Words.add("around"); m_Words.add("as"); m_Words.add("aside"); m_Words.add("ask"); m_Words.add("asking"); m_Words.add("associated"); m_Words.add("at"); m_Words.add("available"); m_Words.add("away"); m_Words.add("awfully"); m_Words.add("b"); m_Words.add("be"); m_Words.add("became"); m_Words.add("because"); m_Words.add("become"); m_Words.add("becomes"); m_Words.add("becoming"); m_Words.add("been"); m_Words.add("before"); m_Words.add("beforehand"); m_Words.add("behind"); m_Words.add("being"); m_Words.add("believe"); m_Words.add("below"); m_Words.add("beside"); m_Words.add("besides"); m_Words.add("best"); m_Words.add("better"); m_Words.add("between"); m_Words.add("beyond"); m_Words.add("both"); m_Words.add("brief"); m_Words.add("but"); m_Words.add("by"); m_Words.add("c"); m_Words.add("came"); m_Words.add("can"); m_Words.add("cannot"); m_Words.add("cant"); m_Words.add("cause"); m_Words.add("causes"); m_Words.add("certain"); m_Words.add("certainly"); m_Words.add("changes"); m_Words.add("clearly"); m_Words.add("co"); m_Words.add("com"); m_Words.add("come"); m_Words.add("comes"); m_Words.add("concerning"); m_Words.add("consequently"); m_Words.add("consider"); m_Words.add("considering"); m_Words.add("contain"); m_Words.add("containing"); m_Words.add("contains"); m_Words.add("corresponding"); m_Words.add("could"); m_Words.add("course"); m_Words.add("currently"); m_Words.add("d"); m_Words.add("definitely"); m_Words.add("described"); m_Words.add("despite"); m_Words.add("did"); m_Words.add("different"); m_Words.add("do"); m_Words.add("does"); m_Words.add("doing"); m_Words.add("done"); m_Words.add("down"); m_Words.add("downwards"); m_Words.add("during"); m_Words.add("e"); m_Words.add("each"); m_Words.add("edu"); m_Words.add("eg"); m_Words.add("eight"); m_Words.add("either"); m_Words.add("else"); m_Words.add("elsewhere"); m_Words.add("enough"); m_Words.add("entirely"); m_Words.add("especially"); m_Words.add("et"); m_Words.add("etc"); m_Words.add("even"); m_Words.add("ever"); m_Words.add("every"); m_Words.add("everybody"); m_Words.add("everyone"); m_Words.add("everything"); m_Words.add("everywhere"); m_Words.add("ex"); m_Words.add("exactly"); m_Words.add("example"); m_Words.add("except"); m_Words.add("f"); m_Words.add("far"); m_Words.add("few"); m_Words.add("fifth"); m_Words.add("first"); m_Words.add("five"); m_Words.add("followed"); m_Words.add("following"); m_Words.add("follows"); m_Words.add("for"); m_Words.add("former"); m_Words.add("formerly"); m_Words.add("forth"); m_Words.add("four"); m_Words.add("from"); m_Words.add("further"); m_Words.add("furthermore"); m_Words.add("g"); m_Words.add("get"); m_Words.add("gets"); m_Words.add("getting"); m_Words.add("given"); m_Words.add("gives"); m_Words.add("go"); m_Words.add("goes"); m_Words.add("going"); m_Words.add("gone"); m_Words.add("got"); m_Words.add("gotten"); m_Words.add("greetings"); m_Words.add("h"); m_Words.add("had"); m_Words.add("happens"); m_Words.add("hardly"); m_Words.add("has"); m_Words.add("have"); m_Words.add("having"); m_Words.add("he"); m_Words.add("hello"); m_Words.add("help"); m_Words.add("hence"); m_Words.add("her"); m_Words.add("here"); m_Words.add("hereafter"); m_Words.add("hereby"); m_Words.add("herein"); m_Words.add("hereupon"); m_Words.add("hers"); m_Words.add("herself"); m_Words.add("hi"); m_Words.add("him"); m_Words.add("himself"); m_Words.add("his"); m_Words.add("hither"); m_Words.add("hopefully"); m_Words.add("how"); m_Words.add("howbeit"); m_Words.add("however"); m_Words.add("i"); m_Words.add("ie"); m_Words.add("if"); m_Words.add("ignored"); m_Words.add("immediate"); m_Words.add("in"); m_Words.add("inasmuch"); m_Words.add("inc"); m_Words.add("indeed"); m_Words.add("indicate"); m_Words.add("indicated"); m_Words.add("indicates"); m_Words.add("inner"); m_Words.add("insofar"); m_Words.add("instead"); m_Words.add("into"); m_Words.add("inward"); m_Words.add("is"); m_Words.add("it"); m_Words.add("its"); m_Words.add("itself"); m_Words.add("j"); m_Words.add("just"); m_Words.add("k"); m_Words.add("keep"); m_Words.add("keeps"); m_Words.add("kept"); m_Words.add("know"); m_Words.add("knows"); m_Words.add("known"); m_Words.add("l"); m_Words.add("last"); m_Words.add("lately"); m_Words.add("later"); m_Words.add("latter"); m_Words.add("latterly"); m_Words.add("least"); m_Words.add("less"); m_Words.add("lest"); m_Words.add("let"); m_Words.add("like"); m_Words.add("liked"); m_Words.add("likely"); m_Words.add("little"); m_Words.add("ll"); // added to avoid words like you'll,I'll etc. m_Words.add("look"); m_Words.add("looking"); m_Words.add("looks"); m_Words.add("ltd"); m_Words.add("m"); m_Words.add("mainly"); m_Words.add("many"); m_Words.add("may"); m_Words.add("maybe"); m_Words.add("me"); m_Words.add("mean"); m_Words.add("meanwhile"); m_Words.add("merely"); m_Words.add("might"); m_Words.add("more"); m_Words.add("moreover"); m_Words.add("most"); m_Words.add("mostly"); m_Words.add("much"); m_Words.add("must"); m_Words.add("my"); m_Words.add("myself"); m_Words.add("n"); m_Words.add("name"); m_Words.add("namely"); m_Words.add("nd"); m_Words.add("near"); m_Words.add("nearly"); m_Words.add("necessary"); m_Words.add("need"); m_Words.add("needs"); m_Words.add("neither"); m_Words.add("never"); m_Words.add("nevertheless"); m_Words.add("new"); m_Words.add("next"); m_Words.add("nine"); m_Words.add("no"); m_Words.add("nobody"); m_Words.add("non"); m_Words.add("none"); m_Words.add("noone"); m_Words.add("nor"); m_Words.add("normally"); m_Words.add("not"); m_Words.add("nothing"); m_Words.add("novel"); m_Words.add("now"); m_Words.add("nowhere"); m_Words.add("o"); m_Words.add("obviously"); m_Words.add("of"); m_Words.add("off"); m_Words.add("often"); m_Words.add("oh"); m_Words.add("ok"); m_Words.add("okay"); m_Words.add("old"); m_Words.add("on"); m_Words.add("once"); m_Words.add("one"); m_Words.add("ones"); m_Words.add("only"); m_Words.add("onto"); m_Words.add("or"); m_Words.add("other"); m_Words.add("others"); m_Words.add("otherwise"); m_Words.add("ought"); m_Words.add("our"); m_Words.add("ours"); m_Words.add("ourselves"); m_Words.add("out"); m_Words.add("outside"); m_Words.add("over"); m_Words.add("overall"); m_Words.add("own"); m_Words.add("p"); m_Words.add("particular"); m_Words.add("particularly"); m_Words.add("per"); m_Words.add("perhaps"); m_Words.add("placed"); m_Words.add("please"); m_Words.add("plus"); m_Words.add("possible"); m_Words.add("presumably"); m_Words.add("probably"); m_Words.add("provides"); m_Words.add("q"); m_Words.add("que"); m_Words.add("quite"); m_Words.add("qv"); m_Words.add("r"); m_Words.add("rather"); m_Words.add("rd"); m_Words.add("re"); m_Words.add("really"); m_Words.add("reasonably"); m_Words.add("regarding"); m_Words.add("regardless"); m_Words.add("regards"); m_Words.add("relatively"); m_Words.add("respectively"); m_Words.add("right"); m_Words.add("s"); m_Words.add("said"); m_Words.add("same"); m_Words.add("saw"); m_Words.add("say"); m_Words.add("saying"); m_Words.add("says"); m_Words.add("second"); m_Words.add("secondly"); m_Words.add("see"); m_Words.add("seeing"); m_Words.add("seem"); m_Words.add("seemed"); m_Words.add("seeming"); m_Words.add("seems"); m_Words.add("seen"); m_Words.add("self"); m_Words.add("selves"); m_Words.add("sensible"); m_Words.add("sent"); m_Words.add("serious"); m_Words.add("seriously"); m_Words.add("seven"); m_Words.add("several"); m_Words.add("shall"); m_Words.add("she"); m_Words.add("should"); m_Words.add("since"); m_Words.add("six"); m_Words.add("so"); m_Words.add("some"); m_Words.add("somebody"); m_Words.add("somehow"); m_Words.add("someone"); m_Words.add("something"); m_Words.add("sometime"); m_Words.add("sometimes"); m_Words.add("somewhat"); m_Words.add("somewhere"); m_Words.add("soon"); m_Words.add("sorry"); m_Words.add("specified"); m_Words.add("specify"); m_Words.add("specifying"); m_Words.add("still"); m_Words.add("sub"); m_Words.add("such"); m_Words.add("sup"); m_Words.add("sure"); m_Words.add("t"); m_Words.add("take"); m_Words.add("taken"); m_Words.add("tell"); m_Words.add("tends"); m_Words.add("th"); m_Words.add("than"); m_Words.add("thank"); m_Words.add("thanks"); m_Words.add("thanx"); m_Words.add("that"); m_Words.add("thats"); m_Words.add("the"); m_Words.add("their"); m_Words.add("theirs"); m_Words.add("them"); m_Words.add("themselves"); m_Words.add("then"); m_Words.add("thence"); m_Words.add("there"); m_Words.add("thereafter"); m_Words.add("thereby"); m_Words.add("therefore"); m_Words.add("therein"); m_Words.add("theres"); m_Words.add("thereupon"); m_Words.add("these"); m_Words.add("they"); m_Words.add("think"); m_Words.add("third"); m_Words.add("this"); m_Words.add("thorough"); m_Words.add("thoroughly"); m_Words.add("those"); m_Words.add("though"); m_Words.add("three"); m_Words.add("through"); m_Words.add("throughout"); m_Words.add("thru"); m_Words.add("thus"); m_Words.add("to"); m_Words.add("together"); m_Words.add("too"); m_Words.add("took"); m_Words.add("toward"); m_Words.add("towards"); m_Words.add("tried"); m_Words.add("tries"); m_Words.add("truly"); m_Words.add("try"); m_Words.add("trying"); m_Words.add("twice"); m_Words.add("two"); m_Words.add("u"); m_Words.add("un"); m_Words.add("under"); m_Words.add("unfortunately"); m_Words.add("unless"); m_Words.add("unlikely"); m_Words.add("until"); m_Words.add("unto"); m_Words.add("up"); m_Words.add("upon"); m_Words.add("us"); m_Words.add("use"); m_Words.add("used"); m_Words.add("useful"); m_Words.add("uses"); m_Words.add("using"); m_Words.add("usually"); m_Words.add("uucp"); m_Words.add("v"); m_Words.add("value"); m_Words.add("various"); m_Words.add("ve"); // added to avoid words like I've,you've etc. m_Words.add("very"); m_Words.add("via"); m_Words.add("viz"); m_Words.add("vs"); m_Words.add("w"); m_Words.add("want"); m_Words.add("wants"); m_Words.add("was"); m_Words.add("way"); m_Words.add("we"); m_Words.add("welcome"); m_Words.add("well"); m_Words.add("went"); m_Words.add("were"); m_Words.add("what"); m_Words.add("whatever"); m_Words.add("when"); m_Words.add("whence"); m_Words.add("whenever"); m_Words.add("where"); m_Words.add("whereafter"); m_Words.add("whereas"); m_Words.add("whereby"); m_Words.add("wherein"); m_Words.add("whereupon"); m_Words.add("wherever"); m_Words.add("whether"); m_Words.add("which"); m_Words.add("while"); m_Words.add("whither"); m_Words.add("who"); m_Words.add("whoever"); m_Words.add("whole"); m_Words.add("whom"); m_Words.add("whose"); m_Words.add("why"); m_Words.add("will"); m_Words.add("willing"); m_Words.add("wish"); m_Words.add("with"); m_Words.add("within"); m_Words.add("without"); m_Words.add("wonder"); m_Words.add("would"); m_Words.add("would"); m_Words.add("x"); m_Words.add("y"); m_Words.add("yes"); m_Words.add("yet"); m_Words.add("you"); m_Words.add("your"); m_Words.add("yours"); m_Words.add("yourself"); m_Words.add("yourselves"); m_Words.add("z"); m_Words.add("zero"); } /** * Returns true if the given string is a stop word. * * @param word the word to test * @return true if the word is a stopword */ @Override protected boolean is(String word) { return m_Words.contains(word.trim().toLowerCase()); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy