![JAR search and dependency download from the Maven repository](/logo.png)
gov.nih.nlm.nls.lvg.Flows.ToNormUninflectWords Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.sql.*;
import gov.nih.nlm.nls.lvg.Lib.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Db.*;
import gov.nih.nlm.nls.lvg.Trie.*;
/*****************************************************************************
* This class gets normalized uninflected variants (citation forms) of all words
* of a specified term and returns every combination of lexical names.
* The lexical names are generated by Lvg facts. If a word is not found in the
* lexicon, an uninflected term is generated by rule (picking the first one
* in alphabetical order).
* A heuristic rule is applied when more than ten (configurable) output
* combinations are generated by the rules. In such a case, the original term is
* used to replace all output combinations.
*
* History:
*
*
*
* @author NLM NLS Development Team
*
* @see
* Design Document
*
* @version V-2010
****************************************************************************/
public class ToNormUninflectWords extends Transformation
{
// public methods
/**
* Performs the mutation of this flow component.
*
* @param in a LexItem as the input for this flow component
* @param maxTerm the maxinum number of permutation term (uninflect)
* @param conn LVG database connection
* @param trie LVG persistent trie
* @param detailsFlag a boolean flag for processing details information
* @param mutateFlag a boolean flag for processing mutate information
*
* @return Vector - results from this flow component
*
* @exception SQLException if errors occurr while connect to LVG database.
*
* @see DbBase
*/
public static Vector Mutate(LexItem in, int maxTerm,
Connection conn, RamTrie trie, boolean detailsFlag, boolean mutateFlag)
throws SQLException
{
// mutate the term
Vector termList = NormUninflectWords(in.GetSourceTerm(),
maxTerm, conn, trie);
// update target LexItem
Vector out = new Vector();
for(int i = 0; i < termList.size(); i++)
{
String term = termList.elementAt(i);
// details & mutate
String details = null;
String mutate = null;
if(detailsFlag == true)
{
details = INFO;
}
if(mutateFlag == true)
{
mutate = Transformation.NO_MUTATE_INFO;
}
LexItem temp = UpdateLexItem(in, term, Flow.NORM_UNINFLECT_WORDS,
Category.ALL_BIT_VALUE,
Inflection.GetBitValue(Inflection.BASE_BIT),
details, mutate);
out.addElement(temp);
}
return out;
}
/**
* A unit test driver for this flow component.
*/
public static void main(String[] args)
{
// load config file
Configuration conf = new Configuration("data.config.lvg", true);
String testStr = GetTestStr(args, "Color colour");
int minTermLen = Integer.parseInt(
conf.GetConfiguration(Configuration.MIN_TERM_LENGTH));
String lvgDir = conf.GetConfiguration(Configuration.LVG_DIR);
int maxTerm = Integer.parseInt(
conf.GetConfiguration(Configuration.MAX_UNINFLS));
// Mutate: connect to DB
LexItem in = new LexItem(testStr);
Vector outs = new Vector();
try
{
Connection conn = DbBase.OpenConnection(conf);
boolean isInflection = true;
RamTrie trie = new RamTrie(isInflection, minTermLen, lvgDir, 0);
if(conn != null)
{
outs = ToNormUninflectWords.Mutate(in, maxTerm, conn, trie,
true, true);
}
DbBase.CloseConnection(conn, conf);
}
catch (Exception e)
{
System.err.println(e.getMessage());
}
PrintResults(in, outs); // print out results
}
// private methods
/**
* Get uninflected variants of each word of a specified term and return
* all combinations of these uninflected variants.
*
* @param inStr a input term for finding it's uninflection
* @param conn LVG database connection
* @param trie LVG persistent trie
*
* @return Vector - results from this flow component
*
* @see DbBase
*/
private static Vector NormUninflectWords(String inStr, int maxTerm,
Connection conn, RamTrie trie)
{
// tokenize wrods form the input term
StringTokenizer buf = new StringTokenizer(inStr, " \t");
Vector out = new Vector();
Vector> strList = new Vector>(); // all lexical names
// Each element is a Vector, contains all lexical names for a wrod
long totalNum = 1; // total number of permutations
// get uninflections for all wrods
while(buf.hasMoreTokens() == true)
{
String curStr = buf.nextToken();
try
{
// Fact: get lexical names from database
Vector factList
= DbCitation.GetCitations(curStr, conn);
Vector wordList = new Vector(); // lexical names for a word
for(int i = 0; i < factList.size(); i++)
{
InflectionRecord record = factList.elementAt(i);
String citationTerm = record.GetCitationTerm();
String citationTermLc = citationTerm.toLowerCase();
if(wordList.contains(citationTermLc) == false)
{
wordList.addElement(citationTermLc);
}
}
// apply Trie rules to get uninflected if no lexical name found
if(factList.size() == 0)
{
Vector ruleList
= trie.GetUninflectedTermsByRules(curStr,
Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE, true);
// form the list from the result of Trie
// This flow is for LuiNorm, thus only one word is needed
String smallestStr = null;
for(int i = 0; i < ruleList.size(); i++)
{
RuleResult result = ruleList.elementAt(i);
String uninflectedTerm = result.GetOutTerm();
String uninflectedTermLc =
uninflectedTerm.toLowerCase();
// check if the uninflected term exist in Lexicon
if(DbUninflection.IsExistUninflectedTerm(
uninflectedTermLc, conn) == false)
{
if(smallestStr == null)
{
smallestStr = uninflectedTermLc;
}
else if((uninflectedTermLc.compareTo(smallestStr)
< 0)
&& (DbUninflection.IsExistUninflectedTerm(
uninflectedTerm, conn) == false))
{
smallestStr = uninflectedTermLc;
}
}
}
wordList.addElement(smallestStr);
}
// apply heuristic rules: use original term if size is too big
totalNum = totalNum*wordList.size();
if(totalNum > maxTerm)
{
out.removeAllElements();
out.addElement(inStr);
return out;
}
strList.addElement(wordList); // add wordList if legal
}
catch (Exception e)
{
break;
}
}
// do the permutation & sorting
out = ToUninflectWords.FormCombinations(strList); // form the String
// re-sort by Dictionary order
if(strList.size() >= 1)
{
LvgComparator lc = new LvgComparator();
lc.SetLengthFlag(true); // needed for luiNorm
lc.SetCase(false);
Collections.sort(out, lc); // sort output in dictionary order
}
return out;
}
// data members
private static final String INFO = "Normalize Uninflect Words";
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy