/*
 * gov.nih.nlm.nls.lvg.Flows.ToLuiNormalize (Maven / Gradle / Ivy)
 *
 * Web-listing residue from the lvg2010dist artifact page (download and
 * navigation links omitted). LVG tools are used by Apache cTAKES.
 */
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.sql.*;
import gov.nih.nlm.nls.lvg.Lib.*;
import gov.nih.nlm.nls.lvg.Db.*;
import gov.nih.nlm.nls.lvg.Trie.*;
/*****************************************************************************
* This class provides features of generating luiNorm for a specified term.
* This flow component consists of 10 other flow components in a serial order.
* They are: q7, g, rs, o, t, l, B, C, q8, w.
*
* If the flow component B generates multiple outputs in LuiNorm,
* the results are determined as shown in the following two cases: 
* <ul>
* <li>Fact:
*   <br> All citation forms are in the database and thus the results will 
*   all be the same (after being canonicalized).  The program will use the
*   first output as the input for the next flow component, Canonicalize.
* <li>Uninflected by rules:
*   <br> In most cases, the uninflected forms will not be in the Lvg database.
*   Thus, different results will be generated by canonicalization
*   depending on which uninflected term the program picks.  In this release, 
*   the results from uninflected terms (by rules) are sorted in alphabetical 
*   order.  Thus, this program picks the first one from the uninflected 
*   terms for canonicalization since it is unique and consistent.
* </ul>
* Accordingly, the output should only contain one LexItem.
*
* History:
* 

* 
*
* @author NLM NLS Development Team
*
* @see "Design Document"
* @see ToUnicodeCoreNorm
* @see ToRemoveGenitive
* @see ToRemoveS
* @see ToReplacePunctuationWithSpace
* @see ToStripStopWords
* @see ToLowerCase
* @see ToUninflectWords
* @see ToCanonicalize
* @see ToStripMapUnicode
* @see ToSortWordsByOrder
*
* @version    V-2010
****************************************************************************/
public class ToLuiNormalize extends Transformation implements Cloneable
{
    // public methods
    /**
    * Performs the mutation of this flow component by running the input
    * through the 10 sub-flows in the order: q7, g, rs, o, t, l, B, C, q8, w.
    * The outputs of each step are converted with
    * {@code LexItem.TargetToSource} and used as the inputs of the next step.
    * Before returning, the mutate information of every result is reset to
    * {@code Transformation.NO_MUTATE_INFO}.
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   maxTerm   the maximum number of permutation terms (uninflect)
    * @param   stopWords   Vector - stop words list
    * @param   conn   LVG database connection
    * @param   trie   LVG ram trie
    * @param   symbolMap   a hash table contains the unicode symbols mapping
    * @param   unicodeMap   a hash table contains the unicode mapping
    * @param   ligatureMap   a hash table contains the mapping of ligatures
    * @param   diacriticMap  a hash table contains the mapping of diacritics
    * @param   nonStripMap   a hash table contains the non-Strip map unicode
    * @param   removeSTree   a reverse trie tree of removeS pattern rules
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  the results from this flow component - a collection (Vector)
    * of LexItems
    *
    * @exception SQLException if errors occur while connecting to the LVG
    * database.
    *
    * @see DbBase
    */
    // NOTE(review): the element types of stopWords and of the Hashtable
    // parameters are not visible in this file, so they are left raw here;
    // parameterize them once the sub-flow signatures are confirmed.
    public static Vector<LexItem> Mutate(LexItem in, int maxTerm, 
        Vector stopWords, Connection conn, RamTrie trie, 
        Hashtable symbolMap,
        Hashtable unicodeMap,
        Hashtable ligatureMap, 
        Hashtable diacriticMap,
        Hashtable nonStripMap,
        RTrieTree removeSTree, boolean detailsFlag, boolean mutateFlag) 
        throws SQLException
    {
        Vector<LexItem> outList = new Vector<LexItem>();
        Vector<LexItem> inList = new Vector<LexItem>();
        inList.addElement(in);
        // go through all 10 flow components: q7, g, rs, o, t, l, B, C, q8, w
        for(int i = 0; i < LUI_NORM_STEPS; i++)
        {
            outList = GetLuiNormBySteps(i, inList, maxTerm, stopWords, conn, 
                trie, symbolMap, unicodeMap, ligatureMap, diacriticMap,
                nonStripMap, removeSTree, detailsFlag, mutateFlag);
            // convert current out to next in
            inList.removeAllElements();
            for(int j = 0; j < outList.size(); j++)
            {
                LexItem out = outList.elementAt(j);
                LexItem temp = LexItem.TargetToSource(out);
                inList.addElement(temp);
            }
        }
        // no need to update history since it's done in each flow component
        // reset mutate information
        for(int i = 0; i < outList.size(); i++)
        {
            outList.elementAt(i).SetMutateInformation(
                Transformation.NO_MUTATE_INFO);
        }
        return outList;
    }
    /**
    * A unit test driver for this flow component.
    * Reads the configuration, loads the stop words, mapping tables and
    * removeS trie, opens the LVG database connection, runs
    * {@link #Mutate} on the test string and prints the results.
    *
    * @param   args   command line arguments, passed to GetTestStr with
    * "fingers" as the other argument
    */
    public static void main(String[] args)
    {
        // read in configuration file
        Configuration conf = new Configuration("data.config.lvg", true);
        String testStr = GetTestStr(args, "fingers");
        int minTermLen = Integer.parseInt(
            conf.GetConfiguration(Configuration.MIN_TERM_LENGTH));
        String lvgDir = conf.GetConfiguration(Configuration.LVG_DIR);
        int maxTerm = Integer.parseInt(
            conf.GetConfiguration(Configuration.MAX_UNINFLS));
        Vector stopWords = ToStripStopWords.GetStopWordsFromFile(conf);
        Hashtable symbolMap
            = ToMapSymbolToAscii.GetSymbolMapFromFile(conf);
        Hashtable unicodeMap
            = ToMapUnicodeToAscii.GetUnicodeMapFromFile(conf);
        Hashtable ligatureMap 
            = ToSplitLigatures.GetLigatureMapFromFile(conf);
        Hashtable diacriticMap 
            = ToStripDiacritics.GetDiacriticMapFromFile(conf);
        Hashtable nonStripMap
            = ToStripMapUnicode.GetNonStripMapFromFile(conf);
        RTrieTree removeSTree = ToRemoveS.GetRTrieTreeFromFile(conf);
        // connect to DB
        LexItem in = new LexItem(testStr);
        Vector<LexItem> outs = new Vector<LexItem>();
        try
        {
            Connection conn = DbBase.OpenConnection(conf);
            try
            {
                boolean isInflection = true;
                RamTrie trie 
                    = new RamTrie(isInflection, minTermLen, lvgDir, 0);
                if(conn != null)
                {
                    outs = ToLuiNormalize.Mutate(in, maxTerm, stopWords, 
                        conn, trie, symbolMap, unicodeMap, ligatureMap, 
                        diacriticMap, nonStripMap, removeSTree, true, true);
                }
            }
            finally
            {
                // always release the connection, even if the trie
                // construction or the mutation above throws
                DbBase.CloseConnection(conn, conf);
            }
        }
        catch (Exception e)
        {
            System.err.println(e.getMessage());
        }
        // print out results
        PrintResults(in, outs);
    }
    // private method
    /**
    * Applies one step of the luiNorm flow to the given inputs and collects
    * all outputs of that step.
    *
    * Step numbers map to sub-flows as: 0 = q7, 1 = g, 2 = rs, 3 = o, 4 = t,
    * 5 = l, 6 = B, 7 = C, 8 = q8, 9 = w.  Any other step number produces no
    * outputs.  For the Canonicalize step only the first input is processed.
    *
    * @param   step   index of the sub-flow to apply
    * @param   ins   the LexItems used as inputs for this step
    *
    * @return  the LexItems generated by the selected sub-flow
    *
    * @exception SQLException if errors occur while accessing the LVG
    * database.
    */
    private static Vector<LexItem> GetLuiNormBySteps(int step, 
        Vector<LexItem> ins, int maxTerm, Vector stopWords, 
        Connection conn, RamTrie trie,
        Hashtable symbolMap,
        Hashtable unicodeMap,
        Hashtable ligatureMap,
        Hashtable diacriticMap, 
        Hashtable nonStripMap,
        RTrieTree removeSTree, boolean detailsFlag, boolean mutateFlag) 
        throws SQLException
    {
        Vector<LexItem> outs = new Vector<LexItem>();
        // go through all elements for the ins
        for(int i = 0; i < ins.size(); i++)
        {
            LexItem in = ins.elementAt(i);
            Vector<LexItem> tempOuts = new Vector<LexItem>();
            switch(step)
            {
                case 0:         // -f:q7
                    tempOuts = ToUnicodeCoreNorm.Mutate(in, symbolMap,
                        unicodeMap, ligatureMap, diacriticMap,
                        detailsFlag, mutateFlag);
                    break;
                case 1:         // -f:g
                    tempOuts = ToRemoveGenitive.Mutate(in, detailsFlag, 
                        mutateFlag);
                    break;
                case 2:         // -f:rs
                    tempOuts = ToRemoveS.Mutate(in, removeSTree,
                        detailsFlag, mutateFlag);
                    break;
                case 3:         // -f:o
                    tempOuts = ToReplacePunctuationWithSpace.Mutate(in, 
                        detailsFlag, mutateFlag);
                    break;
                case 4:         // -f:t
                    tempOuts = ToStripStopWords.Mutate(in, stopWords,
                        detailsFlag, mutateFlag);
                    break;
                case 5:         // -f:l
                    tempOuts = ToLowerCase.Mutate(in, detailsFlag, mutateFlag);
                    break;
                case 6:         // -f:B
                    tempOuts = ToUninflectWords.Mutate(in, maxTerm, conn, trie, 
                        detailsFlag, mutateFlag);
                    break;
                case CANONICALIZE_STEP:         // -f:C
                    tempOuts = ToCanonicalize.Mutate(in, conn, detailsFlag, 
                        mutateFlag);
                    break;
                case 8:         // -f:q8
                    tempOuts = ToStripMapUnicode.Mutate(in, 
                        nonStripMap, detailsFlag, mutateFlag);
                    break;
                case 9:         // -f:w
                    tempOuts = ToSortWordsByOrder.Mutate(in, detailsFlag, 
                        mutateFlag);
                    break;
            }
            outs.addAll(tempOuts);
            // only take the first output (of the previous step) for
            // Canonicalize: stop after the first input has been processed
            if(step == CANONICALIZE_STEP)
            {
                break;
            }
        }
        return outs;
    }
    // data members
    private final static int LUI_NORM_STEPS = 10;
    // index of the Canonicalize (-f:C) step within the luiNorm flow
    private final static int CANONICALIZE_STEP = 7;
}