
cc.mallet.share.upenn.ner.LongRegexMatches Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.share.upenn.ner;
import java.util.regex.*;
import cc.mallet.pipe.*;
import cc.mallet.types.*;
/**
* Matches a regular expression which spans several tokens.
*/
public class LongRegexMatches extends Pipe implements java.io.Serializable {
// Feature name supplied to the constructor (presumably the feature set on
// matched tokens -- confirm against the body of pipe).
String name;
// Pattern that must match the entire concatenated text of a token run.
Pattern regex;
// Smallest number of consecutive tokens whose joined text is tested for a match.
int min; // how many tokens to merge for a match
// Largest number of consecutive tokens joined into one candidate string.
int max;
/**
 * Creates a pipe that tests a pattern against runs of consecutive tokens.
 *
 * @param featureName stored as this pipe's {@code name}
 * @param regex pattern stored for matching against joined token text
 * @param min stored as the lower bound on the number of tokens in a candidate run
 * @param max stored as the upper bound on the number of tokens in a candidate run
 */
public LongRegexMatches (String featureName, Pattern regex, int min, int max) {
    // Token-count bounds first, then the pattern and the feature name;
    // the assignments are independent, so their order does not matter.
    this.min = min;
    this.max = max;
    this.regex = regex;
    this.name = featureName;
}
public Instance pipe (Instance carrier) {
TokenSequence ts = (TokenSequence) carrier.getData();
boolean[] marked = new boolean[ts.size()]; // avoid setting features twice
for (int i=0; i < ts.size(); i++) {
// On reaching a new token, test all strings with at least
// min tokens which end in the new token.
StringBuffer sb = new StringBuffer();
// start by testing rightmost suffix, and grow leftward
for (int length = 1; length <= max; length++) {
int loc = i - length + 1;
if (loc < 0) break; // take another token
sb.insert(0, ts.get(loc).getText()); // else prepend token
// On a match, mark all participating tokens.
if (length >= min && regex.matcher(sb.toString()).matches()) {
for (int j=0; j
© 2015 - 2025 Weber Informatics LLC | Privacy Policy