gov.nih.nlm.nls.lvg.Trie.RamTrie Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Trie;
import java.util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
import gov.nih.nlm.nls.lvg.Util.*;
/*****************************************************************************
* This class builds the LVG rule trie from flat rule files and holds it in RAM.
*
* History:
*
*
*
* @author NLM NLS Development Team
*
* @see
* Design Document
*
* @version V-2010
****************************************************************************/
final public class RamTrie
{
// public constructors
/**
 * Builds an in-memory LVG rule trie for either inflections or derivations.
 *
 * <p>The rule file ({@code im.rul} for inflections, {@code dm.rul} for
 * derivations) is read from {@code dir + "/data/rules/"} and loaded into
 * the trie twice, once per flag combination shown below.
 *
 * @param isInflection true to load the inflection rules, false to load
 *        the derivation rules
 * @param minTermLength minimum length of a rule-generated output term
 * @param dir the top directory of LVG
 * @param minTrieStemLength minimum legal stem length in the trie
 */
public RamTrie(boolean isInflection, int minTermLength, String dir,
    int minTrieStemLength)
{
    this.isInflection_ = isInflection;
    this.minTermLength_ = minTermLength;
    this.minTrieStemLength_ = minTrieStemLength;

    // the rule directory is always relative to the LVG top directory
    final String ruleDir = dir + "/data/rules/";

    // inflection trie uses im.rul, derivation trie uses dm.rul
    final String ruleFile = isInflection ? "im.rul" : "dm.rul";

    // the two passes are labelled forward and reverse by the original author
    trie_.LoadRulesFromFile(ruleDir, ruleFile, false, true);    // forward
    trie_.LoadRulesFromFile(ruleDir, ruleFile, true, false);    // reverse
}
// public methods
/**
 * Finds every trie node whose suffix pattern matches the term, applies the
 * rules stored on those nodes, and returns the resulting terms.
 *
 * <p>As a side effect the number of matched nodes is recorded and can be
 * read back through {@link #GetMatchedNodeNum()}.
 *
 * @param term the term to be mutated
 * @param showAll true to keep the results of every matching node;
 *        false to keep only the results coming from the matching node(s)
 *        with the highest level
 * @param inCategory the input category
 * @param inInflection the input inflection
 * @param outCategory the output category
 * @param outInflection the output inflection
 *
 * @return the rule results whose output term is at least the minimum term
 *         length long, or is equal to the input term; never null
 */
private Vector<RuleResult> Mutate(String term, boolean showAll,
    long inCategory, long inInflection, long outCategory,
    long outInflection)
{
    Vector<RuleResult> resultList = new Vector<RuleResult>();

    // go through trie to find the nodes whose suffix matches
    Vector<TrieNode> matchNodeList = trie_.FindRule(term);
    matchedNodeNum_ = matchNodeList.size();

    // highest node level seen so far (only used when showAll is false)
    int curLevel = 0;
    for(TrieNode node : matchNodeList)
    {
        // test the node before dereferencing it, otherwise the guard
        // below could never be reached for a null node
        Vector<InflectionRule> rules
            = (node == null) ? null : node.GetRules();
        if(rules == null)
        {
            System.err.println("** Error: null in TrieNode or it's rules");
            break;
        }

        // go through all matched rules
        for(InflectionRule rule : rules)
        {
            // derivations only: skip rules that leave too short a stem
            if((isInflection_ == false)
            && (minTrieStemLength_ > 0)
            && (GetStemLength(term, rule) <= minTrieStemLength_))
            {
                continue;
            }

            // rules do not apply to their listed exceptions
            if(IsException(term, rule) == true)
            {
                continue;
            }

            RuleResult result = ApplyRules(term, rule,
                inCategory, inInflection, outCategory, outInflection);
            if(result == null)    // category/inflection did not match
            {
                continue;
            }

            if(showAll == true)
            {
                // add results from all nodes on the traversal path
                resultList.add(result);
            }
            else if(node.GetLevel() > curLevel)
            {
                // a higher level node replaces everything collected so far
                curLevel = node.GetLevel();
                resultList.removeAllElements();
                resultList.add(result);
            }
            else if(node.GetLevel() == curLevel)
            {
                // same level as the current best: keep alongside
                resultList.add(result);
            }
        }
    }

    // heuristic rule: drop output terms that are too short, unless the
    // output is the input term itself
    Vector<RuleResult> out = new Vector<RuleResult>();
    for(RuleResult candidate : resultList)
    {
        String outTerm = candidate.GetOutTerm();
        if((outTerm.length() >= minTermLength_)
        || (outTerm.equals(term)))
        {
            out.addElement(candidate);
        }
    }
    return out;
}
/**
 * Get uninflected terms for a specific inflected term from LVG trie rules.
 *
 * @param term the term to be found for uninflections
 * @param inCat the input category
 * @param inInfl the input inflection
 * @param showAll set false to get uninflected terms only from the
 *        matching node(s) with the highest level; set true to get
 *        uninflected terms from all nodes that have a matching suffix
 *
 * @return Vector of rule results holding the uninflected terms, in the
 *         order the rules produced them (not sorted)
 */
public Vector<RuleResult> GetUninflectedTermsByRules(String term,
    long inCat, long inInfl, boolean showAll)
{
    // restrict the output inflection to the legal base forms
    Vector<RuleResult> resultList = Mutate(term, showAll, inCat, inInfl,
        PersistentTrie.LEGAL_CATEGORY, PersistentTrie.LEGAL_BASE);

    // TBD (carried over from the original): sort the result list by the
    // order of noun, verb, adj, and adv with a RuleResultComparator
    return resultList;
}
/**
 * Get the combined categories and inflections for a term from trie rules.
 *
 * <p>For every matching rule that does not list the term as an exception,
 * the rule's input category and input inflection are OR-ed into the
 * result, provided both are contained in the requested {@code inCat} and
 * {@code inInfl}. The number of matched nodes is recorded as a side
 * effect (see {@link #GetMatchedNodeNum()}).
 *
 * @param term the term to be found all possible categories and inflections
 * @param inCat input categories
 * @param inInfl input inflections
 *
 * @return a record of combined categories and inflections; both values
 *         are 0 when no rule qualifies
 */
public CatInfl GetCatInflByRules(String term, long inCat, long inInfl)
{
    long cat = 0;
    long infl = 0;

    // go through trie to find the nodes whose suffix matches
    Vector<TrieNode> matchNodeList = trie_.FindRule(term);
    matchedNodeNum_ = matchNodeList.size();

    for(TrieNode node : matchNodeList)
    {
        // test the node before dereferencing it, otherwise the guard
        // below could never be reached for a null node
        Vector<InflectionRule> rules
            = (node == null) ? null : node.GetRules();
        if(rules == null)
        {
            System.err.println("** Error: null in TrieNode or it's rules");
            break;
        }

        // go through all matched rules
        for(InflectionRule rule : rules)
        {
            if(IsException(term, rule) == true)
            {
                continue;
            }
            long curCat = rule.GetInCategory();
            long curInfl = rule.GetInInflection();
            if((Bit.Contain(inCat, curCat) == true)
            && (Bit.Contain(inInfl, curInfl) == true))
            {
                cat = cat | curCat;
                infl = infl | curInfl;
            }
        }
    }
    return new CatInfl(cat, infl);
}
/**
 * Get every category/inflection pair for a term from trie rules.
 *
 * <p>One {@code CatInfl} is produced per matching rule that does not list
 * the term as an exception, built from the rule's input category and
 * input inflection. The number of matched nodes is recorded as a side
 * effect (see {@link #GetMatchedNodeNum()}).
 *
 * @param term the term to be found all possible categories and inflections
 * @param inCat input categories (currently not used to filter the result)
 * @param inInfl input inflections (currently not used to filter the
 *        result)
 *
 * @return Vector - categories and inflections, one entry per applicable
 *         rule; never null
 */
public Vector<CatInfl> GetCatInflsByRules(String term, long inCat,
    long inInfl)
{
    Vector<CatInfl> outs = new Vector<CatInfl>();

    // go through trie to find the nodes whose suffix matches
    Vector<TrieNode> matchNodeList = trie_.FindRule(term);
    matchedNodeNum_ = matchNodeList.size();

    for(TrieNode node : matchNodeList)
    {
        // test the node before dereferencing it, otherwise the guard
        // below could never be reached for a null node
        Vector<InflectionRule> rules
            = (node == null) ? null : node.GetRules();
        if(rules == null)
        {
            System.err.println("** Error: null in TrieNode or it's rules");
            break;
        }

        // go through all matched rules
        for(InflectionRule rule : rules)
        {
            // rules do not apply to their listed exceptions
            if(IsException(term, rule) == false)
            {
                outs.addElement(new CatInfl(rule.GetInCategory(),
                    rule.GetInInflection()));
            }
        }
    }
    return outs;
}
/**
 * Get inflected terms for a specific term from LVG trie rules.
 *
 * <p>The term is first reduced to its base forms; each base form is then
 * mutated again to produce its inflections. The merged list is sorted
 * with a {@code RuleResultComparator} before it is returned.
 *
 * @param term the term to be found for inflections
 * @param inCat the input category
 * @param inInfl the input inflection
 * @param showAll set false to get inflected terms only from the matching
 *        node(s) with the highest level; set true to get inflected terms
 *        from all nodes that have a matching suffix
 *
 * @return Vector of rule results holding the inflected terms
 */
public Vector<RuleResult> GetInflectedTermsByRules(String term, long inCat,
    long inInfl, boolean showAll)
{
    // step 1: get all uninflected terms (output inflection is BASE)
    Vector<RuleResult> uninflectedList = Mutate(term, showAll, inCat,
        inInfl, PersistentTrie.LEGAL_CATEGORY, PersistentTrie.LEGAL_BASE);

    // step 2: get all inflected terms by going through every base term
    Vector<RuleResult> resultList = new Vector<RuleResult>();
    String lastTerm = null;
    long lastCat = -1;
    for(RuleResult base : uninflectedList)
    {
        String baseTerm = base.GetOutTerm();
        long cat = Category.ToValue(base.GetOutCategory());    // out Cat

        // only mutate again when this base differs (by category or by
        // spelling) from the previous one; otherwise nothing is added
        Vector<RuleResult> inflections = new Vector<RuleResult>();
        if((cat != lastCat)
        || (baseTerm.equals(lastTerm) != true))
        {
            inflections = Mutate(baseTerm, showAll, cat,
                PersistentTrie.LEGAL_INFLECTION,
                PersistentTrie.LEGAL_CATEGORY,
                PersistentTrie.LEGAL_INFLECTION);
        }
        lastTerm = baseTerm;
        lastCat = cat;

        // heuristic rule: if the result is an uninflected term with
        // different spelling from the base, it should be dropped
        Vector<RuleResult> legalInflections
            = RemoveIllegalTerms(baseTerm, inflections);

        // add the new results to resultList if they are not there yet
        resultList = PersistentTrie.AddRusultsToInflectList(
            resultList, legalInflections);
    }

    // sort result list by the order of noun, verb, adj, and adv
    RuleResultComparator rrc = new RuleResultComparator();
    Collections.sort(resultList, rrc);
    return resultList;
}
/**
 * Get derivations for a specific term from LVG trie rules.
 *
 * @param term the term to be found for derivations
 * @param inCat the input category
 * @param inInfl the input inflection
 * @param showAll set false to get derivations only from the matching
 *        node(s) with the highest level; set true to get derivations from
 *        all nodes that have a matching suffix
 *
 * @return Vector - of rule results holding the derivations
 */
public Vector<RuleResult> GetDerivationsByRules(String term, long inCat,
    long inInfl, boolean showAll)
{
    // restrict the output inflection to the legal base forms
    return Mutate(term, showAll, inCat, inInfl,
        PersistentTrie.LEGAL_CATEGORY, PersistentTrie.LEGAL_BASE);
}
/**
 * Print out a collection of trie rule results to standard output.
 *
 * <p>The first line shows the number of nodes matched by the most recent
 * trie lookup; each following line shows one result as
 * {@code inTerm --> outTerm ... Rule: ruleString}.
 *
 * @param resultList A vector of ruleResult to be printed out.
 */
public void PrintResults(Vector<RuleResult> resultList)
{
    System.out.println("-- matchNodeList size: " + GetMatchedNodeNum());
    for(RuleResult result : resultList)
    {
        System.out.println(result.GetInTerm() + " --> " +
            result.GetOutTerm() + " ... Rule: " +
            result.GetRuleString());
    }
}
/**
 * Gives access to the underlying trie tree.
 *
 * @return the trie tree this object operates on
 */
public TrieTree GetTrie()
{
    return this.trie_;
}
/**
 * Reports how many trie nodes matched the suffix of the term in the most
 * recent trie lookup.
 *
 * @return the number of matched nodes recorded by the last lookup
 */
public int GetMatchedNodeNum()
{
    return this.matchedNodeNum_;
}
/**
 * Changes the minimum length that a rule-generated term must have.
 *
 * @param minTermLength minimum term length used in morphology
 */
public void SetMinTermLength(int minTermLength)
{
    this.minTermLength_ = minTermLength;
}
/**
 * This is the executable program for using LVG rule trie through RAM.
 * In other words, this program reads all information of LVG rules and
 * loads them up into RAM. The command for running this program is:
 *
 * <p>{@code java RamTrie <term> <-i/-d> <-ps>}
 *
 * <ul>
 * <li>{@code term}: input term for testing</li>
 * <li>{@code -i/-d}: use the inflection (-i) or derivation (-d) rules;
 *     anything other than {@code -d} selects inflection</li>
 * <li>{@code -p}: print details, rule, and exceptions</li>
 * <li>{@code -s}: mutate with all branch rules applied</li>
 * </ul>
 *
 * <p>The third argument is parsed but not used: every query in this test
 * driver is run with showAll set to true.
 *
 * @param args exactly three arguments as described above; with any other
 *        count the usage text is printed
 */
public static void main(String[] args)
{
    if(args.length != 3)
    {
        System.out.println("Usage: java RamTrie <term> <-i/-d> <-ps>");
        System.out.println(" term: input term for testing");
        System.out.println(
            " -i/-d: use inflection (-i) or derivation (-d) rules");
        System.out.println(" -p: print details, rule, & exceptions");
        System.out.println(" -s: mutate with all branch rules applied");
        return;
    }

    String inStr = args[0];
    Configuration conf = new Configuration("data.config.lvg", true);

    // the constructor expects the LVG top directory and appends
    // "/data/rules/" itself, so it must not be appended here as well
    String dir = conf.GetConfiguration(Configuration.LVG_DIR);

    // inflection or derivation
    boolean isInflection = true;
    if(args[1].equals("-d") == true)
    {
        isInflection = false;
    }

    // show all
    boolean showAll = false;
    if(args[2].equals("-s") == true)
    {
        showAll = true;    // not function in this test driver
    }

    int minTrieStemLength = Integer.parseInt(
        conf.GetConfiguration(Configuration.DIR_TRIE_STEM_LENGTH));
    RamTrie trie = new RamTrie(isInflection, 3, dir, minTrieStemLength);

    if(isInflection == true)
    {
        System.out.println("-------- Uninflected Terms ----------");
        Vector<RuleResult> result = trie.GetUninflectedTermsByRules(inStr,
            Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE, true);
        trie.PrintResults(result);

        System.out.println("-------- Inflected Terms ------------");
        result = trie.GetInflectedTermsByRules(inStr,
            Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE, true);
        trie.PrintResults(result);

        System.out.println("------ Category & Inflection -----");
        CatInfl catInfl = trie.GetCatInflByRules(inStr,
            Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE);
        System.out.println(catInfl.GetCategory() + ", " +
            catInfl.GetInflection());

        System.out.println("------ Categories & Inflections -----");
        Vector<CatInfl> result2 = trie.GetCatInflsByRules(inStr,
            Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE);
        for(CatInfl cur : result2)
        {
            System.out.println(cur.GetCategory() + ", "
                + cur.GetInflection());
        }
    }
    else
    {
        System.out.println("---------- Derivations -------------");
        Vector<RuleResult> result = trie.GetDerivationsByRules(inStr,
            Category.ALL_BIT_VALUE, Inflection.ALL_BIT_VALUE, true);
        trie.PrintResults(result);
    }
}
// private methods
/**
 * Filters a list of generated inflections against their base term.
 *
 * <p>A result is dropped when its output inflection is one of the legal
 * base inflections and its spelling differs from the base; every other
 * result is kept.
 *
 * @param base the base term the inflections were generated from
 * @param inflections the rule results to be filtered
 *
 * @return a new Vector with the surviving results, in their original
 *         order
 */
private Vector<RuleResult> RemoveIllegalTerms(String base,
    Vector<RuleResult> inflections)
{
    Vector<RuleResult> out = new Vector<RuleResult>();
    for(RuleResult candidate : inflections)
    {
        String outTerm = candidate.GetOutTerm();
        long infl = Inflection.ToValue(candidate.GetOutInflection());

        // drop it if it is an uninflected form && different spelling
        // add it if it is not an uninflected form || same spelling
        if((Inflection.Contains(PersistentTrie.LEGAL_BASE, infl) == false)
        || (base.equals(outTerm) == true))
        {
            out.addElement(candidate);
        }
    }
    return out;
}
/**
 * Tells whether a term is listed among the exceptions of a rule.
 *
 * @param inStr the term to look up
 * @param rule the rule whose exception table is consulted
 *
 * @return true if the rule has an exception table and the term is one of
 *         its keys; false otherwise
 */
private boolean IsException(String inStr, InflectionRule rule)
{
    final Hashtable<?, ?> exceptionTable = rule.GetExceptions();

    // a rule without an exception table has no exceptions at all
    return (exceptionTable != null) && exceptionTable.containsKey(inStr);
}
/**
 * Computes the stem length used by the minimum stem length check.
 *
 * @param term the input term
 * @param rule the rule whose input suffix is measured against the term
 *
 * @return the term length plus one, minus the length of the rule's input
 *         suffix
 */
private int GetStemLength(String term, InflectionRule rule)
{
    final int inSuffixLength = rule.GetInSuffix().length();
    return term.length() + 1 - inSuffixLength;
}
/**
 * Applies a single rule to a term.
 *
 * <p>The term is extended with a trailing '$', the part covered by the
 * rule's input suffix is replaced with the text produced by
 * {@code WildCard.GetSuffix}, and the trailing character is removed
 * again. The result is only reported when the rule's categories and
 * inflections are contained in the requested ones.
 *
 * @param inStr the input term
 * @param rule the rule to apply
 * @param inCategory the allowed input categories
 * @param inInflection the allowed input inflections
 * @param outCategory the allowed output categories
 * @param outInflection the allowed output inflections
 *
 * @return the rule result, or null when the rule's categories or
 *         inflections are not among the allowed ones
 */
private RuleResult ApplyRules(String inStr, InflectionRule rule,
    long inCategory, long inInflection, long outCategory,
    long outInflection)
{
    // work on a copy of the term that carries the '$' end marker
    final String marked = inStr + '$';
    final String ruleInSuffix = rule.GetInSuffix();
    final String ruleOutSuffix = rule.GetOutSuffix();

    final long ruleInCat = rule.GetInCategory();
    final long ruleOutCat = rule.GetOutCategory();
    final long ruleInInfl = rule.GetInInflection();
    final long ruleOutInfl = rule.GetOutInflection();

    // keep the leading part, swap the suffix, then strip the last char
    final String kept
        = marked.substring(0, marked.length() - ruleInSuffix.length());
    final String rewritten
        = kept + WildCard.GetSuffix(ruleInSuffix, ruleOutSuffix, marked);
    final String outStr = rewritten.substring(0, rewritten.length() - 1);

    // check category and inflection
    final boolean legal = Category.Contains(inCategory, ruleInCat)
        && Category.Contains(outCategory, ruleOutCat)
        && Inflection.Contains(inInflection, ruleInInfl)
        && Inflection.Contains(outInflection, ruleOutInfl);
    if(!legal)
    {
        return null;
    }

    // report the new term together with the rule that produced it
    return new RuleResult(inStr, outStr, rule.GetRuleStr());
}
// data members
// rule trie that every lookup in this class goes through; filled by the
// constructor. NOTE(review): the meaning of the 'true' argument is defined
// by TrieTree and is not visible here - confirm.
private TrieTree trie_ = new TrieTree(true);
// minimum length a generated output term must have to be returned by
// Mutate (unless the output equals the input term)
private int minTermLength_ = 3;
// number of trie nodes matched by the most recent lookup
private int matchedNodeNum_ = 0;
// derivations only: rules whose stem length is not above this value are
// skipped; the check is disabled when the value is 0 or less
private int minTrieStemLength_ = 0;
// true when this object was built for inflection rules, false for
// derivation rules
private boolean isInflection_ = true;
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy