All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.share.upenn.ner.LongRegexMatches Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.share.upenn.ner;


import java.util.regex.*;

import cc.mallet.pipe.*;
import cc.mallet.types.*;

/**
 * Matches a regular expression which spans several tokens.
 */
public class LongRegexMatches extends Pipe implements java.io.Serializable {

    /** Feature name, as supplied to the constructor. */
    String name;
    /** Pattern that must match the whole concatenated text of a run of consecutive tokens. */
    Pattern regex;
    /** Fewest tokens a run must contain before it is tested against the pattern. */
    int min; // lower bound on how many tokens are merged for a match
    /** Most tokens a run may contain; longer runs are never tested. */
    int max; // upper bound on how many tokens are merged for a match

    /**
     * Creates a pipe that tests a pattern against runs of consecutive tokens.
     *
     * @param featureName the feature name to store for this pipe
     * @param regex       the pattern tested against the concatenated token text
     * @param min         the fewest tokens a run must contain to be tested
     * @param max         the most tokens a run may contain
     */
    public LongRegexMatches (String featureName, Pattern regex, int min, int max) {
        // Span limits first, then what is matched and the name it is stored under.
        this.min = min;
        this.max = max;
        this.regex = regex;
        this.name = featureName;
    }

    public Instance pipe (Instance carrier) {
        TokenSequence ts = (TokenSequence) carrier.getData();
        boolean[] marked = new boolean[ts.size()]; // avoid setting features twice

        for (int i=0; i < ts.size(); i++) {
            // On reaching a new token, test all strings with at least
            // min tokens which end in the new token.
            StringBuffer sb = new StringBuffer();
            // start by testing rightmost suffix, and grow leftward
            for (int length = 1; length <= max; length++) {
                int loc = i - length + 1;
                if (loc < 0) break; // take another token
                sb.insert(0, ts.get(loc).getText()); // else prepend token
                // On a match, mark all participating tokens.
                if (length >= min && regex.matcher(sb.toString()).matches()) {
                    for (int j=0; j




© 2015 - 2025 Weber Informatics LLC | Privacy Policy