gov.nih.nlm.nls.lvg.Flows.ToStripStopWords Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class strips stop words from a specified term.  A stop word is:
* 
*  - a high frequency word, such as a preposition.
* 
*  - a grammar word, which does not contribute much to the meaning of the
*    sentence.
* 
*  The default stop words include:
* 
* "of", "and", "with", "for", "nos", "to", "in", "by", "on", "the",
* "(non mesh)".
* The stop words list is configurable by modifying the configuration file.
*
* 
History:
* 

* 
*
* @author NLM NLS Development Team
*
* @see 
* Design Document 
*
* @version    V-2010
****************************************************************************/
public class ToStripStopWords extends Transformation implements Cloneable
{
    // public methods
    /**
    * Performs the mutation of this flow component: strips stop words from
    * the source term of the input LexItem.
    *
    * <p>Stripping is done in two passes.  First every token of the term
    * that equals (ignoring case) an entry of the stop word list is removed.
    * Then the stop words that contain a space, such as "(non mesh)", are
    * handed to <code>Strip.StripStrings</code> to be removed from the
    * remaining term.
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   stopWords   Vector - stop words list; a null list results in
    * no stop word being stripped by this class
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  the results from this flow component - a collection (Vector) 
    * of LexItems; it always holds exactly one LexItem
    */
    public static Vector<LexItem> Mutate(LexItem in, Vector<String> stopWords,
        boolean detailsFlag, boolean mutateFlag)
    {
        // mutate the term: strip single-token stop words
        String term = StripStopWords(in.GetSourceTerm(), stopWords);

        // strip multiple-word stop words, such as (non mesh)
        Vector<String> multipleStopWords = GetMultipleStopWords(stopWords);
        term = Strip.StripStrings(term, multipleStopWords, false);

        // details & mutate information are only filled in on request
        String details = (detailsFlag ? INFO : null);
        String mutate = (mutateFlag ? Transformation.NO_MUTATE_INFO : null);

        // update target
        Vector<LexItem> out = new Vector<LexItem>();
        LexItem temp = UpdateLexItem(in, term, Flow.STRIP_STOP_WORDS, 
            Transformation.UPDATE, Transformation.UPDATE, details, mutate);
        out.addElement(temp);

        return out;
    }

    /**
    * Reads in stop words from the stop word file named in the configuration.
    *
    * <p>The file name is the concatenation of the configured LVG directory
    * and the configured stop word file.  Every line that is not empty and
    * does not start with '#' is taken, unmodified, as one stop word.
    * If the file can not be opened or read, an error message is printed to
    * standard error and the stop words read so far (possibly none) are
    * returned.
    *
    * @param   config   Configuration object
    *
    * @return   Vector - stop words, never null
    */
    public static Vector<String> GetStopWordsFromFile(Configuration config)
    {
        String fName = config.GetConfiguration(Configuration.LVG_DIR)
            + config.GetConfiguration(Configuration.STOP_WORD_FILE);
        Vector<String> stopWords = new Vector<String>();
        BufferedReader in = null;

        try        // load stop words from file
        {
            in = new BufferedReader(new FileReader(fName));
            String line = null; 

            // read in line by line from a file
            while((line = in.readLine()) != null)
            {
                // skip the line if it is empty or comments (#)
                if((line.length() > 0) && (line.charAt(0) != '#'))
                {
                    stopWords.addElement(line);
                }
            }
        }
        catch (IOException e)
        {
            System.err.println("Exception: " + e.toString());
            System.err.println(
                "** Error: problem of opening/reading stop words file: '" + 
                fName + "'.");
        }
        finally
        {
            // always release the file handle, also when reading failed
            if(in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException ignored)
                {
                    // the stream was only read from; a failure to close it
                    // does not affect the stop words already collected
                }
            }
        }

        return stopWords;
    }

    /**
    * A unit test driver for this flow component.
    *
    * @param   args   command line arguments; they are passed to GetTestStr
    * together with the default term "On the Top" to obtain the term to be
    * tested
    */
    public static void main(String[] args)
    {
        // load config file
        Configuration conf = new Configuration("data.config.lvg", true);
        String testStr = GetTestStr(args, "On the Top");
        Vector<String> stopWords = GetStopWordsFromFile(conf);

        // mutate
        LexItem in = new LexItem(testStr);
        Vector<LexItem> outs = ToStripStopWords.Mutate(in, stopWords, true, 
            true);

        PrintResults(in, outs);     // print out results
    }

    // private method
    /**
    * Strips the single-token stop words from a string.
    *
    * <p>The string is tokenized by StripToken.Tokenize.  Every token of type
    * TOKEN whose text matches a stop word (ignoring case) is replaced by a
    * single space token of type STRIPPED.  The list is then passed through
    * StripToken.CleanUpToken, composed back into a string and trimmed.
    *
    * @param   inStr   the string to strip stop words from
    * @param   stopWords   stop words list; null means nothing is stripped
    *
    * @return  the trimmed string composed from the remaining tokens
    */
    private static String StripStopWords(String inStr, 
        Vector<String> stopWords)
    {
        Vector<StrTokenObject> list = StripToken.Tokenize(inStr); 
        // work on a copy so the tokenized list itself is left untouched
        Vector<StrTokenObject> newList = new Vector<StrTokenObject>(list);

        // Strip stop words
        for(int i = 0; i < list.size(); i++)
        {
            StrTokenObject cur = list.elementAt(i);
            if((cur.GetTokenType() == StrTokenObject.TOKEN)
            && IsContain(stopWords, cur.GetTokenStr()))
            {
                // change the type of token to be stripped
                StrTokenObject temp = 
                    new StrTokenObject(" ", StrTokenObject.STRIPPED);
                newList.setElementAt(temp, i);
            }
        }

        // compose the string
        Vector<StrTokenObject> cleanList = StripToken.CleanUpToken(newList);
        return (StripToken.ComposeString(cleanList)).trim();
    }

    /**
    * Collects the stop words that consist of more than one word, that is,
    * the entries of the list that contain at least one space.
    *
    * @param   stopWords   stop words list, may be null
    *
    * @return  a new Vector with the entries containing a space, in their
    * original order; an empty Vector if the input list is null
    */
    private static Vector<String> GetMultipleStopWords(
        Vector<String> stopWords)
    {
        Vector<String> out = new Vector<String>();
        if(stopWords == null)
        {
            // hand back an empty list rather than null so that the caller
            // can pass the result on without a null check
            return out;
        }

        for(int i = 0; i < stopWords.size(); i++)
        {
            String cur = stopWords.elementAt(i);
            if(cur.indexOf(" ") != -1)
            {
                out.addElement(cur);
            }
        }

        return out;
    }

    /**
    * Checks if the input string equals, ignoring case, any element of the
    * specified list.
    *
    * @param   list   the list of strings to compare with, may be null
    * @param   inStr   the string to look for
    *
    * @return  true if the list is not null and holds an element equal to
    * the input string when case is ignored; false otherwise
    */
    private static boolean IsContain(Vector<String> list, String inStr)
    {
        if(list == null)
        {
            return false;
        }

        for(int i = 0; i < list.size(); i++)
        {
            if(inStr.equalsIgnoreCase(list.elementAt(i)))
            {
                return true;
            }
        }

        return false;
    }

    // data members
    private static final String INFO = "Strip Stop Words";
}