
cc.mallet.share.upenn.ner.ListMember Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
package cc.mallet.share.upenn.ner;
import java.io.*;
import java.util.*;
import cc.mallet.pipe.*;
import cc.mallet.types.*;
import gnu.trove.*;
/**
* Checks membership in a lexicon in a text file. Multi-token items are supported,
* but only if the tokens are uniformly separated or not separated by spaces: that is,
* U.S.A. is acceptable, as is San Francisco, but not St. Petersburg.
*/
public class ListMember extends Pipe implements java.io.Serializable {
// Feature name supplied to the constructor.
String name;
// Lexicon entries loaded from the lexicon file; entries are stored lower-cased when ignoreCase is true.
Set lexicon;
// Whether lexicon entries are lower-cased as they are loaded.
boolean ignoreCase;
// Smallest and largest token count (as reported by countTokens) among the loaded lexicon entries.
// With an empty lexicon they keep their initial values of 99999 and -1.
int min, max;
/**
 * Loads a lexicon from a text file, one entry per line, and records the
 * smallest and largest token counts found among its entries.
 *
 * Each line is trimmed; blank lines are skipped. When ignoreCase is true the
 * entries are stored lower-cased.
 *
 * @param featureName name stored in this pipe's name field
 * @param lexFile     text file containing the lexicon, one entry per line
 * @param ignoreCase  whether entries are lower-cased before being stored
 * @throws IllegalArgumentException if lexFile does not exist
 */
public ListMember (String featureName, File lexFile, boolean ignoreCase) {
    this.name = featureName;
    this.ignoreCase = ignoreCase;
    if (!lexFile.exists())
        throw new IllegalArgumentException("File "+lexFile+" not found.");
    try {
        lexicon = new THashSet();
        min = 99999;
        max = -1;
        BufferedReader br = new BufferedReader(new FileReader(lexFile));
        try {
            String line;
            // readLine() returning null is the reliable end-of-file signal;
            // ready() only reports whether a read would block.
            while ((line = br.readLine()) != null) {
                String s = line.trim();
                if (s.equals("")) continue; // ignore blank lines
                int count = countTokens(s);
                if (count < min) min = count;
                if (count > max) max = count;
                if (ignoreCase)
                    lexicon.add(s.toLowerCase());
                else
                    lexicon.add(s);
            }
        } finally {
            // Always release the file handle, including when reading fails.
            br.close();
        }
    } catch (IOException e) {
        // NOTE(review): terminating the JVM with status 0 on an I/O failure is kept
        // for compatibility with existing callers; consider throwing instead.
        System.err.println("Problem with "+lexFile+": "+e);
        System.exit(0);
    }
}
// Reads the carrier's data as a TokenSequence and allocates one boolean flag per token
// (marked). The visible fragment tests lexicon membership of two candidate strings
// (test, tests) and calls markFrom(i, j, marked) on a match.
// NOTE(review): the text of this method is incomplete as shown here -- see the notes
// below. Restore the missing code from the original source before compiling.
public Instance pipe (Instance carrier) {
TokenSequence seq = (TokenSequence)carrier.getData();
boolean[] marked = new boolean[seq.size()];
// NOTE(review): the next line is not valid Java as written; the loop header runs
// straight into a comparison against min, so text appears to be missing in between
// (including the definitions of j, test and tests). TODO restore.
for (int i=0; i= min && (lexicon.contains(test) || lexicon.contains(tests)))
markFrom(i, j, marked);
}
}
// NOTE(review): the next line is also truncated. It ends with what looks like the tail
// of a delimiter string and a `true` argument, and the following return uses a variable
// (wordst) that is not declared in the visible text. Presumably the end of pipe, the
// markFrom helper and the start of countTokens were lost here -- confirm against the
// original source.
for (int i=0; i?/ \t\n\r", true);
return wordst.countTokens();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy