All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gov.nih.nlm.nls.lvg.Flows.ToNormUninflectWords Maven / Gradle / Ivy

The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.sql.*;
import gov.nih.nlm.nls.lvg.Lib.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Db.*;
import gov.nih.nlm.nls.lvg.Trie.*;
/*****************************************************************************
* This class gets normalized uninflected variants (citation forms) of all words 
* of a specified term and returns every combination of lexical names.  
* The lexical names are generated by Lvg facts.  If a word is not found in
* the lexicon, an uninflected term is generated by rule (the first one in
* alphabetical order is picked).
* A heuristic rule is applied when more than ten (configurable) output
* combinations are generated by the rules.  In such a case, the original term
* is used to replace all output combinations.
*
* 

History: *

    *
*
* @author NLM NLS Development Team
*
* @see "Design Document"
*
* @version V-2010
****************************************************************************/
public class ToNormUninflectWords extends Transformation
{
    // public methods
    /**
    * Performs the mutation of this flow component.
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   maxTerm   the maximum number of permutation terms (uninflect);
    *          when the number of combinations exceeds it, the source term
    *          itself is the only result
    * @param   conn   LVG database connection
    * @param   trie   LVG persistent trie
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  Vector&lt;LexItem&gt; - results from this flow component, one
    *          LexItem per generated term
    *
    * @exception  SQLException declared for callers.  Note that failures
    *             raised while looking up individual words are handled inside
    *             NormUninflectWords (processing stops at the failing word).
    *
    * @see DbBase
    */
    public static Vector<LexItem> Mutate(LexItem in, int maxTerm,
        Connection conn, RamTrie trie, boolean detailsFlag,
        boolean mutateFlag) throws SQLException
    {
        // mutate the term
        Vector<String> termList = NormUninflectWords(in.GetSourceTerm(),
            maxTerm, conn, trie);

        // details & mutate are identical for every output: compute them once
        String details = (detailsFlag ? INFO : null);
        String mutate = (mutateFlag ? Transformation.NO_MUTATE_INFO : null);

        // update target LexItem
        Vector<LexItem> out = new Vector<LexItem>();
        for(String term : termList)
        {
            LexItem temp = UpdateLexItem(in, term, Flow.NORM_UNINFLECT_WORDS,
                Category.ALL_BIT_VALUE,
                Inflection.GetBitValue(Inflection.BASE_BIT),
                details, mutate);
            out.addElement(temp);
        }

        return out;
    }

    /**
    * A unit test driver for this flow component.
    *
    * @param   args   command line arguments; handed to GetTestStr together
    *          with the default test string "Color colour"
    */
    public static void main(String[] args)
    {
        // load config file
        Configuration conf = new Configuration("data.config.lvg", true);
        String testStr = GetTestStr(args, "Color colour");
        int minTermLen = Integer.parseInt(
            conf.GetConfiguration(Configuration.MIN_TERM_LENGTH));
        String lvgDir = conf.GetConfiguration(Configuration.LVG_DIR);
        int maxTerm = Integer.parseInt(
            conf.GetConfiguration(Configuration.MAX_UNINFLS));

        // Mutate: connect to DB
        LexItem in = new LexItem(testStr);
        Vector<LexItem> outs = new Vector<LexItem>();
        try
        {
            Connection conn = DbBase.OpenConnection(conf);
            try
            {
                boolean isInflection = true;
                RamTrie trie = new RamTrie(isInflection, minTermLen,
                    lvgDir, 0);
                if(conn != null)
                {
                    outs = ToNormUninflectWords.Mutate(in, maxTerm, conn,
                        trie, true, true);
                }
            }
            finally
            {
                // release the connection even if the trie cannot be built
                // or the mutation fails
                DbBase.CloseConnection(conn, conf);
            }
        }
        catch (Exception e)
        {
            System.err.println(e.getMessage());
        }

        PrintResults(in, outs);     // print out results
    }

    // private methods
    /**
    * Get uninflected variants of each word of a specified term and return
    * all combinations of these uninflected variants.
    *
    * <p>Each whitespace (space or tab) separated word is looked up through
    * DbCitation; the lower-cased citation terms found are its lexical names.
    * If none is found, the trie rules are applied and a single term is kept:
    * the alphabetically smallest lower-cased rule result that is not reported
    * by DbUninflection.IsExistUninflectedTerm.  If the rules provide no such
    * term, the lower-cased word itself is used.
    *
    * @param   inStr   an input term for finding its uninflection
    * @param   maxTerm   the maximum number of combinations; as soon as the
    *          running product of per-word variants exceeds it, the result
    *          is just the original term
    * @param   conn   LVG database connection
    * @param   trie   LVG persistent trie
    *
    * @return  Vector&lt;String&gt; - the sorted combinations, or the original
    *          term alone when there are too many combinations
    *
    * @see DbBase
    */
    private static Vector<String> NormUninflectWords(String inStr,
        int maxTerm, Connection conn, RamTrie trie)
    {
        // tokenize words from the input term
        StringTokenizer buf = new StringTokenizer(inStr, " \t");
        Vector<String> out = new Vector<String>();

        // all lexical names: each element is a Vector that contains all
        // lexical names for one word
        Vector<Vector<String>> strList = new Vector<Vector<String>>();
        long totalNum = 1;          // total number of permutations

        // get uninflections for all words
        while(buf.hasMoreTokens())
        {
            String curStr = buf.nextToken();
            try
            {
                // Fact: get lexical names from database
                Vector<InflectionRecord> factList
                    = DbCitation.GetCitations(curStr, conn);
                Vector<String> wordList = new Vector<String>();
                for(InflectionRecord record : factList)
                {
                    String citationTermLc
                        = record.GetCitationTerm().toLowerCase();
                    if(!wordList.contains(citationTermLc))
                    {
                        wordList.addElement(citationTermLc);
                    }
                }

                // apply Trie rules to get uninflected if no lexical name found
                if(factList.isEmpty())
                {
                    Vector<RuleResult> ruleList
                        = trie.GetUninflectedTermsByRules(curStr,
                            Category.ALL_BIT_VALUE,
                            Inflection.ALL_BIT_VALUE, true);

                    // This flow is for LuiNorm, thus only one word is needed
                    String smallestStr = null;
                    for(RuleResult result : ruleList)
                    {
                        String uninflectedTerm = result.GetOutTerm();
                        String uninflectedTermLc
                            = uninflectedTerm.toLowerCase();

                        // skip rule results that exist in the Lexicon
                        if(DbUninflection.IsExistUninflectedTerm(
                            uninflectedTermLc, conn))
                        {
                            continue;
                        }

                        if(smallestStr == null)
                        {
                            smallestStr = uninflectedTermLc;
                        }
                        // NOTE(review): the second lookup uses the term in
                        // its original case; kept as in the original code
                        else if((uninflectedTermLc.compareTo(smallestStr) < 0)
                            && (DbUninflection.IsExistUninflectedTerm(
                                uninflectedTerm, conn) == false))
                        {
                            smallestStr = uninflectedTermLc;
                        }
                    }

                    // never put null into the word list: when the rules give
                    // no usable term, keep the (lower-cased) word itself
                    if(smallestStr == null)
                    {
                        smallestStr = curStr.toLowerCase();
                    }
                    wordList.addElement(smallestStr);
                }

                // apply heuristic rules: use original term if size is too big
                totalNum = totalNum * wordList.size();
                if(totalNum > maxTerm)
                {
                    out.removeAllElements();
                    out.addElement(inStr);
                    return out;
                }

                strList.addElement(wordList);   // add wordList if legal
            }
            catch (Exception e)
            {
                // best effort (original behavior): a failure on one word
                // stops processing; the combinations are formed from the
                // words collected so far
                break;
            }
        }

        // do the permutation & sorting
        out = ToUninflectWords.FormCombinations(strList);  // form the String

        // re-sort by Dictionary order
        if(strList.size() >= 1)
        {
            // raw type kept on purpose: the declaration of LvgComparator
            // (and any type parameter it may take) is not visible here
            LvgComparator lc = new LvgComparator();
            lc.SetLengthFlag(true);     // needed for luiNorm
            lc.SetCase(false);
            Collections.sort(out, lc);  // sort output in dictionary order
        }

        return out;
    }

    // data members
    private static final String INFO = "Normalize Uninflect Words";
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy