/*
 * gov.nih.nlm.nls.lvg.Flows.ToNormUninflectWords (Maven / Gradle / Ivy)
 *
 * Listing taken from the lvg2010dist artifact (the newest version).
 * LVG tools is used by Apache cTAKES.
 *
 * Page navigation from the hosting site: Go to download; Show more of this
 * group; Show more artifacts with this name; Show all versions of
 * lvg2010dist; Show documentation.
 */
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.sql.*;
import gov.nih.nlm.nls.lvg.Lib.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Db.*;
import gov.nih.nlm.nls.lvg.Trie.*;
/*****************************************************************************
* This class gets normalized uninflected variants (citation forms) of all words 
* of a specified term and returns every combination of lexical names.  
* The lexical names are generated by Lvg facts.  If a word is not found in
* the lexicon, an uninflected term is generated by rule (picking the first 
* one in alphabetical order).
* A heuristic rule is applied when more than ten (configurable) output 
* combinations are generated by the rules.  In such a case, the original 
* term is used to replace all output combinations.
*
* History:
* 

* 
*
* @author NLM NLS Development Team
*
* @see 
* Design Document 
*
* @version    V-2010
****************************************************************************/
public class ToNormUninflectWords extends Transformation
{
    // public methods
    /**
    * Performs the mutation of this flow component.
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   maxTerm   the maxinum number of permutation term (uninflect)
    * @param   conn   LVG database connection
    * @param   trie   LVG persistent trie
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  Vector - results from this flow component 
    *
    * @exception SQLException if errors occurr while connect to LVG database.
    *
    * @see DbBase
    */
    public static Vector Mutate(LexItem in, int maxTerm, 
        Connection conn, RamTrie trie, boolean detailsFlag, boolean mutateFlag) 
        throws SQLException
    {
        // mutate the term
        Vector termList = NormUninflectWords(in.GetSourceTerm(), 
            maxTerm, conn, trie);
        // update target LexItem
        Vector out = new Vector();
        for(int i = 0; i < termList.size(); i++)
        {
            String term = termList.elementAt(i);
            // details & mutate
            String details = null;
            String mutate = null;
            if(detailsFlag == true)
            {
                details = INFO;
            }
            if(mutateFlag == true)
            {
                mutate = Transformation.NO_MUTATE_INFO;
            }
            LexItem temp = UpdateLexItem(in, term, Flow.NORM_UNINFLECT_WORDS, 
                Category.ALL_BIT_VALUE, 
                Inflection.GetBitValue(Inflection.BASE_BIT),
                details, mutate);
            out.addElement(temp);
        }
        return out;
    }
    /**
    * A unit test driver for this flow component.
    */
    public static void main(String[] args)
    {
        // load config file
        Configuration conf = new Configuration("data.config.lvg", true);
        String testStr = GetTestStr(args, "Color colour");
        int minTermLen = Integer.parseInt(
            conf.GetConfiguration(Configuration.MIN_TERM_LENGTH));
        String lvgDir = conf.GetConfiguration(Configuration.LVG_DIR);
        int maxTerm = Integer.parseInt(
            conf.GetConfiguration(Configuration.MAX_UNINFLS));
        // Mutate: connect to DB
        LexItem in = new LexItem(testStr);
        Vector outs = new Vector();
        try
        {
            Connection conn = DbBase.OpenConnection(conf);
            boolean isInflection = true;
            RamTrie trie = new RamTrie(isInflection, minTermLen, lvgDir, 0);
            if(conn != null)
            {
                outs = ToNormUninflectWords.Mutate(in, maxTerm, conn, trie, 
                    true, true);
            }
            DbBase.CloseConnection(conn, conf);
        }
        catch (Exception e)
        {
            System.err.println(e.getMessage());
        }
        PrintResults(in, outs);     // print out results
    }
    // private methods
    /**
    * Get uninflected variants of each word of a specified term and return
    * all combinations of these uninflected variants.
    *
    * @param   inStr   a input term for finding it's uninflection
    * @param   conn   LVG database connection
    * @param   trie   LVG persistent trie
    *
    * @return  Vector - results from this flow component 
    *
    * @see DbBase
    */
    private static Vector NormUninflectWords(String inStr, int maxTerm,
        Connection conn, RamTrie trie)
    {
        // tokenize wrods form the input term
        StringTokenizer buf = new StringTokenizer(inStr, " \t");
        Vector out = new Vector();
        Vector> strList = new Vector>();  // all lexical names 
        // Each element is a Vector, contains all lexical names for a wrod 
        long totalNum = 1;        // total number of permutations
        // get uninflections for all wrods
        while(buf.hasMoreTokens() == true)
        {
            String curStr = buf.nextToken();
            try
            {
                // Fact: get lexical names from database
                Vector factList 
                    = DbCitation.GetCitations(curStr, conn);
                Vector wordList = new Vector();  // lexical names for a word
                for(int i = 0; i < factList.size(); i++)
                {
                    InflectionRecord record = factList.elementAt(i);
                    String citationTerm = record.GetCitationTerm();
                    String citationTermLc = citationTerm.toLowerCase();
                    if(wordList.contains(citationTermLc) == false)
                    {
                        wordList.addElement(citationTermLc);
                    }
                }
                // apply Trie rules to get uninflected if no lexical name found
                if(factList.size() == 0)
                {
                    Vector ruleList 
                        = trie.GetUninflectedTermsByRules(curStr,
                        Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE, true);
                    // form the list from the result of Trie
                    // This flow is for LuiNorm, thus only one word is needed
                    String smallestStr = null;
                    for(int i = 0; i < ruleList.size(); i++)
                    {
                        RuleResult result = ruleList.elementAt(i);
                        String uninflectedTerm = result.GetOutTerm();
                        String uninflectedTermLc = 
                            uninflectedTerm.toLowerCase();
                        // check if the uninflected term exist in Lexicon
                        if(DbUninflection.IsExistUninflectedTerm(
                            uninflectedTermLc, conn) == false)
                        {
                            if(smallestStr == null)
                            {
                                smallestStr = uninflectedTermLc;
                            }
                            else if((uninflectedTermLc.compareTo(smallestStr) 
                                < 0)
                            && (DbUninflection.IsExistUninflectedTerm(
                                uninflectedTerm, conn) == false))
                            {
                                smallestStr = uninflectedTermLc;
                            }
                        }
                    }
                    wordList.addElement(smallestStr);
                }
                // apply heuristic rules: use original term if size is too big
                totalNum = totalNum*wordList.size();
                if(totalNum > maxTerm)
                {
                    out.removeAllElements();
                    out.addElement(inStr);
                    return out;
                }
                strList.addElement(wordList);        // add wordList if legal
            }
            catch (Exception e)
            {
                break;
            }
        }
        // do the permutation & sorting
        out = ToUninflectWords.FormCombinations(strList); // form the String
        // re-sort by Dictionary order
        if(strList.size() >= 1)
        {
            LvgComparator lc = new LvgComparator();
            lc.SetLengthFlag(true);       // needed for luiNorm
            lc.SetCase(false);
            Collections.sort(out, lc);    // sort output in dictionary order
        }
        return out;
    }
    // data members
    private static final String INFO = "Normalize Uninflect Words";
}