cc.mallet.pipe.TokenSequenceRemoveStopwords Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
There is a newer version: 2.0.12
Show newest version
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */





package cc.mallet.pipe;


import java.util.HashSet;
import java.util.ArrayList;
import java.io.*;

import cc.mallet.types.FeatureSequenceWithBigrams;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
/**
 * Remove tokens from the token sequence in the data field whose text is in the stopword list.
 * <p>
 * A token is dropped when its text (lower-cased first unless the pipe is
 * case sensitive) is contained in {@link #stoplist}.  When deletion marking
 * is enabled, the text of each dropped token is recorded as the
 * {@code FeatureSequenceWithBigrams.deletionMark} property of the closest
 * preceding token that was kept.
 * <p>
 * The stoplist starts out as the built-in English list ({@link #stopwords})
 * unless the file-based constructor is told otherwise, and can be edited
 * afterwards with the {@code addStopWords}/{@code removeStopWords} methods.
 *
 * @author Andrew McCallum
 */

public class TokenSequenceRemoveStopwords extends Pipe implements Serializable
{
	// xxx Use a gnu.trove collection instead
	// Kept as a raw HashSet: the field is package-visible and is written
	// to / read from the serialized form as-is.  It only ever holds Strings.
	HashSet stoplist = null;
	// When false, token text is lower-cased before the stoplist lookup.
	boolean caseSensitive = true;
	// When true, removed tokens are recorded on the preceding kept token.
	boolean markDeletions = false;

	/** Build a fresh, mutable set containing every entry of {@link #stopwords}. */
	private HashSet newDefaultStopList ()
	{
		HashSet sl = new HashSet();
		for (int i = 0; i < stopwords.length; i++)
			sl.add (stopwords[i]);
		return sl;
	}


	/**
	 *  Create a pipe that uses the default English stoplist.
	 *  @param caseSensitive  If false, token text is lower-cased before lookup
	 *  @param markDeletions  If true, record removed tokens on the preceding kept token
	 */
	public TokenSequenceRemoveStopwords (boolean caseSensitive, boolean markDeletions)
	{
		stoplist = newDefaultStopList();
		this.caseSensitive = caseSensitive;
		this.markDeletions = markDeletions;
	}

	/**
	 *  Create a pipe that uses the default English stoplist and does not mark deletions.
	 *  @param caseSensitive  If false, token text is lower-cased before lookup
	 */
	public TokenSequenceRemoveStopwords (boolean caseSensitive)
	{
		this (caseSensitive, false);
	}

	/** Create a case-insensitive pipe with the default English stoplist and no deletion marking. */
	public TokenSequenceRemoveStopwords ()
	{
		this (false);
	}

	/**
	 *  Load a stoplist from a file.
	 *  @param stoplistFile    The file to load
	 *  @param encoding        The encoding of the stoplist file (eg UTF-8);
	 *                         if null the file is read with the platform default encoding
	 *  @param includeDefault  Whether to include the standard mallet English stoplist
	 *  @param caseSensitive   If false, token text is lower-cased before lookup
	 *  @param markDeletions   If true, record removed tokens on the preceding kept token
	 *  @throws IllegalArgumentException if the file cannot be read
	 */
	public TokenSequenceRemoveStopwords(File stoplistFile, String encoding, boolean includeDefault,
										boolean caseSensitive, boolean markDeletions) {
		if (! includeDefault) { stoplist = new HashSet(); }
		else { stoplist = newDefaultStopList(); }

		addStopWords (fileToStringArray(stoplistFile, encoding));

		this.caseSensitive = caseSensitive;
		this.markDeletions = markDeletions;
	}

	/** Set whether lookups are case sensitive.  Returns this pipe so calls can be chained. */
	public TokenSequenceRemoveStopwords setCaseSensitive (boolean flag)
	{
		this.caseSensitive = flag;
		return this;
	}

	/** Set whether removed tokens are recorded on the preceding kept token.  Returns this pipe. */
	public TokenSequenceRemoveStopwords setMarkDeletions (boolean flag)
	{
		this.markDeletions = flag;
		return this;
	}

	/** Add every element of "words" to the stoplist.  Returns this pipe. */
	public TokenSequenceRemoveStopwords addStopWords (String[] words)
	{
		for (int i = 0; i < words.length; i++)
			stoplist.add (words[i]);
		return this;
	}


	/** Remove every element of "words" from the stoplist.  Returns this pipe. */
	public TokenSequenceRemoveStopwords removeStopWords (String[] words)
	{
		for (int i = 0; i < words.length; i++)
			stoplist.remove (words[i]);
		return this;
	}

	/**
	 *  Remove whitespace-separated tokens in file "wordlist" from the stoplist.
	 *  The file is read with the platform default encoding.  A null file is
	 *  ignored, matching {@link #addStopWords(File)}.
	 *  @throws IllegalArgumentException if the file cannot be read
	 */
	public TokenSequenceRemoveStopwords removeStopWords (File wordlist)
	{
		if (wordlist != null)
			this.removeStopWords (fileToStringArray(wordlist, null));
		return this;
	}

	/**
	 *  Add whitespace-separated tokens in file "wordlist" to the stoplist.
	 *  The file is read with the platform default encoding.  A null file is ignored.
	 *  @throws IllegalArgumentException if the file cannot be read
	 */
	public TokenSequenceRemoveStopwords addStopWords (File wordlist)
	{
		if (wordlist != null)
			this.addStopWords (fileToStringArray(wordlist, null));
		return this;
	}


	/**
	 *  Read a file and return its whitespace-separated tokens, in file order.
	 *  Empty strings (produced by blank lines or leading whitespace) are skipped.
	 *  @param f         The file to read
	 *  @param encoding  Character encoding name, or null for the platform default
	 *  @throws IllegalArgumentException if the file cannot be opened or read,
	 *          or the encoding is not supported; the IOException is kept as the cause
	 */
	private String[] fileToStringArray (File f, String encoding)
	{
		ArrayList wordarray = new ArrayList();

		// Tracked separately so the file handle is released even if wrapping
		// it in a reader fails (e.g. unsupported encoding name).
		InputStream rawStream = null;
		BufferedReader input = null;

		try {

			if (encoding == null) {
				input = new BufferedReader (new FileReader (f));
			}
			else {
				rawStream = new FileInputStream (f);
				input = new BufferedReader (new InputStreamReader (rawStream, encoding));
			}
			String line;

			while (( line = input.readLine()) != null) {
				String[] words = line.split ("\\s+");
				for (int i = 0; i < words.length; i++) {
					// split() yields "" for a blank line or leading whitespace
					if (words[i].length() > 0)
						wordarray.add (words[i]);
				}
			}

		} catch (IOException e) {
			throw new IllegalArgumentException("Trouble reading file "+f, e);
		} finally {
			// Closing the reader also closes the stream it wraps.
			if (input != null)
				closeQuietly (input);
			else
				closeQuietly (rawStream);
		}
		return (String[]) wordarray.toArray(new String[wordarray.size()]);
	}

	/** Close "c" if it is non-null, ignoring any failure to close. */
	private static void closeQuietly (Closeable c)
	{
		if (c == null)
			return;
		try {
			c.close();
		} catch (IOException ignored) {
			// Nothing useful can be done; the data has already been read (or reading already failed).
		}
	}
	
	/**
	 *  Replace the carrier's data (a TokenSequence) with a new TokenSequence
	 *  containing only the tokens whose text is not in the stoplist.
	 *  The kept Token objects are reused, not copied.
	 *  @param carrier  Instance whose data is a TokenSequence
	 *  @return the same carrier, with its data replaced
	 */
	public Instance pipe (Instance carrier)
	{
		TokenSequence ts = (TokenSequence) carrier.getData();
		// xxx This doesn't seem so efficient.  Perhaps have TokenSequence
		// use a LinkedList, and remove Tokens from it? -?
		// But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
		TokenSequence ret = new TokenSequence ();
		// Most recent token that was kept; deletion marks are attached to it.
		Token prevToken = null;
		for (int i = 0; i < ts.size(); i++) {
			Token t = ts.get(i);
			String text = t.getText();
			// NOTE: toLowerCase() uses the JVM default locale.
			String key = caseSensitive ? text : text.toLowerCase();
			if (! stoplist.contains (key)) {
				// xxx Should we instead make and add a copy of the Token?
				ret.add (t);
				prevToken = t;
			} else if (markDeletions && prevToken != null) {
				// Stopwords before the first kept token have nowhere to be recorded.
				prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, text);
			}
		}
		carrier.setData(ret);
		return carrier;
	}

	// Serialization 
	
	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 2;
	
	/** Write the format version, the two flags, and the stoplist, in that order. */
	private void writeObject (ObjectOutputStream out) throws IOException {
		out.writeInt (CURRENT_SERIAL_VERSION);
		out.writeBoolean(caseSensitive);
		out.writeBoolean(markDeletions);
		out.writeObject(stoplist); // New as of CURRENT_SERIAL_VERSION 2
	}
	
	/**
	 *  Read the fields written by writeObject, honouring older format versions:
	 *  version 0 has only caseSensitive, version 1 adds markDeletions,
	 *  version 2 adds the stoplist.
	 */
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
		int version = in.readInt ();
		caseSensitive = in.readBoolean();
		if (version > 0)
			markDeletions = in.readBoolean();
		if (version > 1) {
			stoplist = (HashSet) in.readObject();
		}
		if (stoplist == null) {
			// Streams older than version 2 carry no stoplist, which would leave
			// the field null and make pipe() fail.  Fall back to the built-in
			// list -- presumably what such instances used; TODO confirm.
			stoplist = newDefaultStopList();
		}

	}

	
	// Default English stoplist; copied into a HashSet by newDefaultStopList().
	static final String[] stopwords =
	{
		"a",
		"able",
		"about",
		"above",
		"according",
		"accordingly",
		"across",
		"actually",
		"after",
		"afterwards",
		"again",
		"against",
		"all",
		"allow",
		"allows",
		"almost",
		"alone",
		"along",
		"already",
		"also",
		"although",
		"always",
		"am",
		"among",
		"amongst",
		"an",
		"and",
		"another",
		"any",
		"anybody",
		"anyhow",
		"anyone",
		"anything",
		"anyway",
		"anyways",
		"anywhere",
		"apart",
		"appear",
		"appreciate",
		"appropriate",
		"are",
		"around",
		"as",
		"aside",
		"ask",
		"asking",
		"associated",
		"at",
		"available",
		"away",
		"awfully",
		"b",
		"be",
		"became",
		"because",
		"become",
		"becomes",
		"becoming",
		"been",
		"before",
		"beforehand",
		"behind",
		"being",
		"believe",
		"below",
		"beside",
		"besides",
		"best",
		"better",
		"between",
		"beyond",
		"both",
		"brief",
		"but",
		"by",
		"c",
		"came",
		"can",
		"cannot",
		"cant",
		"cause",
		"causes",
		"certain",
		"certainly",
		"changes",
		"clearly",
		"co",
		"com",
		"come",
		"comes",
		"concerning",
		"consequently",
		"consider",
		"considering",
		"contain",
		"containing",
		"contains",
		"corresponding",
		"could",
		"course",
		"currently",
		"d",
		"definitely",
		"described",
		"despite",
		"did",
		"different",
		"do",
		"does",
		"doing",
		"done",
		"down",
		"downwards",
		"during",
		"e",
		"each",
		"edu",
		"eg",
		"eight",
		"either",
		"else",
		"elsewhere",
		"enough",
		"entirely",
		"especially",
		"et",
		"etc",
		"even",
		"ever",
		"every",
		"everybody",
		"everyone",
		"everything",
		"everywhere",
		"ex",
		"exactly",
		"example",
		"except",
		"f",
		"far",
		"few",
		"fifth",
		"first",
		"five",
		"followed",
		"following",
		"follows",
		"for",
		"former",
		"formerly",
		"forth",
		"four",
		"from",
		"further",
		"furthermore",
		"g",
		"get",
		"gets",
		"getting",
		"given",
		"gives",
		"go",
		"goes",
		"going",
		"gone",
		"got",
		"gotten",
		"greetings",
		"h",
		"had",
		"happens",
		"hardly",
		"has",
		"have",
		"having",
		"he",
		"hello",
		"help",
		"hence",
		"her",
		"here",
		"hereafter",
		"hereby",
		"herein",
		"hereupon",
		"hers",
		"herself",
		"hi",
		"him",
		"himself",
		"his",
		"hither",
		"hopefully",
		"how",
		"howbeit",
		"however",
		"i",
		"ie",
		"if",
		"ignored",
		"immediate",
		"in",
		"inasmuch",
		"inc",
		"indeed",
		"indicate",
		"indicated",
		"indicates",
		"inner",
		"insofar",
		"instead",
		"into",
		"inward",
		"is",
		"it",
		"its",
		"itself",
		"j",
		"just",
		"k",
		"keep",
		"keeps",
		"kept",
		"know",
		"knows",
		"known",
		"l",
		"last",
		"lately",
		"later",
		"latter",
		"latterly",
		"least",
		"less",
		"lest",
		"let",
		"like",
		"liked",
		"likely",
		"little",
		"look",
		"looking",
		"looks",
		"ltd",
		"m",
		"mainly",
		"many",
		"may",
		"maybe",
		"me",
		"mean",
		"meanwhile",
		"merely",
		"might",
		"more",
		"moreover",
		"most",
		"mostly",
		"much",
		"must",
		"my",
		"myself",
		"n",
		"name",
		"namely",
		"nd",
		"near",
		"nearly",
		"necessary",
		"need",
		"needs",
		"neither",
		"never",
		"nevertheless",
		"new",
		"next",
		"nine",
		"no",
		"nobody",
		"non",
		"none",
		"noone",
		"nor",
		"normally",
		"not",
		"nothing",
		"novel",
		"now",
		"nowhere",
		"o",
		"obviously",
		"of",
		"off",
		"often",
		"oh",
		"ok",
		"okay",
		"old",
		"on",
		"once",
		"one",
		"ones",
		"only",
		"onto",
		"or",
		"other",
		"others",
		"otherwise",
		"ought",
		"our",
		"ours",
		"ourselves",
		"out",
		"outside",
		"over",
		"overall",
		"own",
		"p",
		"particular",
		"particularly",
		"per",
		"perhaps",
		"placed",
		"please",
		"plus",
		"possible",
		"presumably",
		"probably",
		"provides",
		"q",
		"que",
		"quite",
		"qv",
		"r",
		"rather",
		"rd",
		"re",
		"really",
		"reasonably",
		"regarding",
		"regardless",
		"regards",
		"relatively",
		"respectively",
		"right",
		"s",
		"said",
		"same",
		"saw",
		"say",
		"saying",
		"says",
		"second",
		"secondly",
		"see",
		"seeing",
		"seem",
		"seemed",
		"seeming",
		"seems",
		"seen",
		"self",
		"selves",
		"sensible",
		"sent",
		"serious",
		"seriously",
		"seven",
		"several",
		"shall",
		"she",
		"should",
		"since",
		"six",
		"so",
		"some",
		"somebody",
		"somehow",
		"someone",
		"something",
		"sometime",
		"sometimes",
		"somewhat",
		"somewhere",
		"soon",
		"sorry",
		"specified",
		"specify",
		"specifying",
		"still",
		"sub",
		"such",
		"sup",
		"sure",
		"t",
		"take",
		"taken",
		"tell",
		"tends",
		"th",
		"than",
		"thank",
		"thanks",
		"thanx",
		"that",
		"thats",
		"the",
		"their",
		"theirs",
		"them",
		"themselves",
		"then",
		"thence",
		"there",
		"thereafter",
		"thereby",
		"therefore",
		"therein",
		"theres",
		"thereupon",
		"these",
		"they",
		"think",
		"third",
		"this",
		"thorough",
		"thoroughly",
		"those",
		"though",
		"three",
		"through",
		"throughout",
		"thru",
		"thus",
		"to",
		"together",
		"too",
		"took",
		"toward",
		"towards",
		"tried",
		"tries",
		"truly",
		"try",
		"trying",
		"twice",
		"two",
		"u",
		"un",
		"under",
		"unfortunately",
		"unless",
		"unlikely",
		"until",
		"unto",
		"up",
		"upon",
		"us",
		"use",
		"used",
		"useful",
		"uses",
		"using",
		"usually",
		"uucp",
		"v",
		"value",
		"various",
		"very",
		"via",
		"viz",
		"vs",
		"w",
		"want",
		"wants",
		"was",
		"way",
		"we",
		"welcome",
		"well",
		"went",
		"were",
		"what",
		"whatever",
		"when",
		"whence",
		"whenever",
		"where",
		"whereafter",
		"whereas",
		"whereby",
		"wherein",
		"whereupon",
		"wherever",
		"whether",
		"which",
		"while",
		"whither",
		"who",
		"whoever",
		"whole",
		"whom",
		"whose",
		"why",
		"will",
		"willing",
		"wish",
		"with",
		"within",
		"without",
		"wonder",
		"would",
		"would",
		"x",
		"y",
		"yes",
		"yet",
		"you",
		"your",
		"yours",
		"yourself",
		"yourselves",
		"z",
		"zero",
		// stop words for paper abstracts
		//		"abstract",
		//"paper",
		//"presents",
		//"discuss",
		//"discusses",
		//"conclude",
		//"concludes",
		//"based",
		//"approach"
	};	
		//stopwords for french, added by Limin Yao
		// Not referenced by any method of this class.
		// NOTE(review): many entries look like fragments of accented words
		// (e.g. "tait", "apr", "sident") -- presumably the accented characters
		// were lost when the list was built; verify before relying on it.
	static final String[] stopwordsFrench = {
		"fut",
		"S",
		"ces",
		"ral",
		"new",
		"tr",
		"arm",
		"y",
		"autres",
		"o",
		"tait",
		"dont",
		"ann",
		"apr",
		"sous",
		"ans",
		"cette",
		"politique",
		"of",
		"c",
		"contre",
		"leur",
		"ville",
		"fait",
		"res",
		"on",
		"deux",
		"cle",
		"v",
		"publique",
		"france",
		"te",
		"guerre",
		"sident",
		"unis",
		"mais",
		"entre",
		"aussi",
		"tat",
		"ais",
		"ses",
		"sa",
		"ont",
		"tre",
		"d",
		"pays",
		"en",
		"Il",
		"tats",
		"comme",
		"am",
		"si",
		"c",
		"fran",
		"pas",
		"g",
		"qu",
		"R",
		"aux",
		"ce",
		"f",
		"p",
		"ne",
		"son",
		"me",
		"avec",
		"l",
		"se",
		"ou",
		"sont",
		"il",
		"Les",
		"re",
		"plus",
		"m",
		"es",
		"pr",
		"la",
		"sur",
		"que",
		"pour",
		"modifier",
		"a",
		"qui",
		"Le",
		"t",
		"n",
		"au",
		"dans",
		"une",
		"par",
		"un",
		"r",
		"est",
		"e",
		"du",
		"s",
		"les",
		"en",
		"des",
		"le",
		"et",
		"l",
		"d",
		"la",
		"de",

	};


}