cc.mallet.share.upenn.ner.ListMember Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12

Show newest version

package cc.mallet.share.upenn.ner;


import java.io.*;
import java.util.*;

import cc.mallet.pipe.*;
import cc.mallet.types.*;

import gnu.trove.*;

/**
 * Checks membership in a lexicon stored in a text file.  Multi-token items are
 * supported, but only if the tokens of an item are joined uniformly: either every
 * pair of adjacent tokens is separated by a space, or none is.  That is,
 * U.S.A. is acceptable, as is San Francisco, but not St. Petersburg (which mixes
 * the two).
 */
public class ListMember extends Pipe implements java.io.Serializable {
    
    String name;
    Set lexicon;
    boolean ignoreCase;
    int min, max;

    /**
     * Builds the pipe by loading a lexicon from a text file, one entry per line.
     *
     * <p>Each line is trimmed and blank lines are skipped.  Entries are stored
     * lower-cased when {@code ignoreCase} is true.  While loading, the smallest
     * and largest per-entry token counts (as reported by {@code countTokens})
     * are recorded in {@code min} and {@code max}.
     *
     * <p>If reading the file fails with an {@link IOException}, a message is
     * printed to {@code System.err} and the JVM is terminated.
     *
     * @param featureName value stored in the {@code name} field
     * @param lexFile     text file holding the lexicon entries
     * @param ignoreCase  whether entries are lower-cased before being stored
     * @throws IllegalArgumentException if {@code lexFile} does not exist
     */
    public ListMember (String featureName, File lexFile, boolean ignoreCase) {
        this.name = featureName;
        this.ignoreCase = ignoreCase;

        if (!lexFile.exists())
            throw new IllegalArgumentException("File "+lexFile+" not found.");

        try {
            lexicon = new THashSet();
            // Sentinels: any real token count replaces them on the first entry.
            min = 99999;
            max = -1;
            BufferedReader br = new BufferedReader(new FileReader(lexFile));
            try {
                // A null from readLine() is the end-of-stream signal.  ready() only
                // reports whether a read would block, so it is not a safe loop test.
                String line;
                while ((line = br.readLine()) != null) {
                    String s = line.trim();
                    if (s.equals("")) continue; // ignore blank lines

                    int count = countTokens(s);
                    if (count < min) min = count;
                    if (count > max) max = count;
                    if (ignoreCase)
                        lexicon.add(s.toLowerCase());
                    else
                        lexicon.add(s);
                }
            } finally {
                // Always release the file handle, even if reading fails part-way.
                br.close();
            }
        } catch (IOException e) {
            System.err.println("Problem with "+lexFile+": "+e);
            // NOTE(review): terminates the JVM with status 0 even though this is a
            // failure path; kept as-is because callers may depend on it.
            System.exit(0);
        }
    }

    /**
     * Marks tokens of the carrier's token sequence that belong to a lexicon entry.
     *
     * <p>The carrier's data is cast to a {@code TokenSequence}, and one boolean
     * flag per token is allocated in {@code marked}.  The surviving part of the
     * search loop calls {@code markFrom(i, j, marked)} when either candidate
     * string ({@code test} or {@code tests}) is contained in {@code lexicon}.
     *
     * <p>NOTE(review): this copy of the file has lost text inside this method and
     * in the helpers that follow it (see the notes below), so the full loop
     * structure, the use made of {@code marked}, and the return statement are not
     * visible here.  Restore the missing text from the upstream source before
     * editing the logic.
     *
     * @param carrier instance whose data is a {@code TokenSequence}
     */
    public Instance pipe (Instance carrier) {
        TokenSequence seq = (TokenSequence)carrier.getData();
        boolean[] marked = new boolean[seq.size()];
        // NOTE(review): the next line is truncated -- the outer loop header runs
        // straight into the tail of a condition involving `min`.  Presumably a
        // minimum-token-count check; confirm against the upstream source.
        for (int i=0; i= min && (lexicon.contains(test) || lexicon.contains(tests)))
                    markFrom(i, j, marked);
            }
        }

        // NOTE(review): the next line is also truncated -- a loop header runs
        // straight into the tail of a delimiter string passed to a tokenizer
        // constructor with `true` as the last argument.  The remaining lines return
        // wordst.countTokens(); presumably they are the end of the countTokens
        // helper called by the constructor -- confirm.
        for (int i=0; i?/ \t\n\r", true);
        return wordst.countTokens();
    }
}