gov.nih.nlm.nls.lvg.Flows.ToNormalize Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.sql.*;
import gov.nih.nlm.nls.lvg.Lib.*;
import gov.nih.nlm.nls.lvg.Db.*;
import gov.nih.nlm.nls.lvg.Trie.*;
/*****************************************************************************
* This class provides features of the normalize flow component.  This flow 
* component consists of 10 other flow components in a serial order.
* They are: q7, g, rs, o, t, l, B, Ct, w, q8.
*
* History:
* 

* 
*
* @author NLM NLS Development Team
*
* @see 
* Design Document 
* @see ToMapSymbolToAscii
* @see ToRemoveGenitive
* @see ToRemoveS
* @see ToReplacePunctuationWithSpace
* @see ToStripStopWords
* @see ToLowerCase
* @see ToUninflectWords
* @see ToCitation
* @see ToUnicodeCoreNorm
* @see ToStripMapUnicode
* @see ToSortWordsByOrder
*
* @version    V-2010
****************************************************************************/
public class ToNormalize extends Transformation implements Cloneable
{
    // public methods
    /**
    * Performs the mutation of this flow component.  Normalize flow component
    * is significantly changed in release 2008. 
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   maxTerm   the maxinum number of permutation term (uninflect)
    * @param   stopWords   Vector - stop wrods list
    * @param   conn   LVG database connection
    * @param   trie   LVG Ram trie
    * @param   symbolMap   a hash table contains the unicode symbols mapping
    * @param   unicodeMap   a hash table contains the unicode mapping
    * @param   ligatureMap   a hash table contains the mapping of ligatures
    * @param   diacriticMap  a hash table contains the mapping of diacritics
    * @param   nonStripMap   a hash table contains the non-Strip map unicode
    * @param   removeSTree   a reverse trie tree of removeS pattern rules
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  the results from this flow component - a collection (Vector)
    * of LexItems
    *
    * @exception SQLException if errors occurr while connect to LVG database.
    *
    * @see DbBase
    * @see PersistentTrie
    */
    public static Vector Mutate(LexItem in, int maxTerm, 
        Vector stopWords, Connection conn, RamTrie trie, 
        Hashtable symbolMap,
        Hashtable unicodeMap,
        Hashtable ligatureMap, 
        Hashtable diacriticMap, 
        Hashtable nonStripMap,
        RTrieTree removeSTree, boolean detailsFlag, boolean mutateFlag) 
        throws SQLException
    {
        Vector outList = new Vector();
        Vector inList = new Vector();
        inList.addElement(in);
        // go through 10 flow components: q7, g, rs, o, t, l, B, Ct, w, q8
        for(int i = 0; i < NORM_STEPS; i++)
        {
            outList = GetNormBySteps(i, inList, maxTerm, stopWords, conn, trie, 
                symbolMap, unicodeMap, ligatureMap, diacriticMap,
                nonStripMap, removeSTree, detailsFlag, mutateFlag);
            // convert the output of current lexItem to input of next LexItem
            inList.removeAllElements();
            for(int j = 0; j < outList.size(); j++)
            {
                LexItem out = outList.elementAt(j);
                LexItem temp = LexItem.TargetToSource(out);
                inList.addElement(temp);
            }
        }
        // no need to update history since it's done in each flow component
        // reset mutate information
        for(int i = 0; i < outList.size(); i++)
        {
            outList.elementAt(i).SetMutateInformation(
                Transformation.NO_MUTATE_INFO);
        }
        
        return outList;
    }
    /**
    * A unit test driver for this flow component.
    */
    public static void main(String[] args)
    {
        // read in configuration file
        Configuration conf = new Configuration("data.config.lvg", true);
        String testStr = GetTestStr(args, "Left");
        int minTermLen = Integer.parseInt(
            conf.GetConfiguration(Configuration.MIN_TERM_LENGTH));
        String lvgDir = conf.GetConfiguration(Configuration.LVG_DIR);
        int maxTerm = Integer.parseInt(
            conf.GetConfiguration(Configuration.MAX_UNINFLS));
        Vector stopWords = ToStripStopWords.GetStopWordsFromFile(conf);
        Hashtable symbolMap
            = ToMapSymbolToAscii.GetSymbolMapFromFile(conf);
        Hashtable unicodeMap
            = ToMapUnicodeToAscii.GetUnicodeMapFromFile(conf);
        Hashtable ligatureMap 
            = ToSplitLigatures.GetLigatureMapFromFile(conf);
        Hashtable diacriticMap 
            = ToStripDiacritics.GetDiacriticMapFromFile(conf);
        Hashtable nonStripMap
            = ToStripMapUnicode.GetNonStripMapFromFile(conf);
        RTrieTree removeSTree = ToRemoveS.GetRTrieTreeFromFile(conf);
        // Mutate: connect to DB
        LexItem in = new LexItem(testStr);
        Vector outs = new Vector();
        try
        {
            Connection conn = DbBase.OpenConnection(conf);
            boolean isInflection = true;
            RamTrie trie = new RamTrie(isInflection, minTermLen, lvgDir, 0);
            if(conn != null)
            {
                outs = ToNormalize.Mutate(in, maxTerm, stopWords, conn, trie, 
                    symbolMap, unicodeMap, ligatureMap, diacriticMap,
                    nonStripMap, removeSTree, true, true);
            }
            DbBase.CloseConnection(conn, conf);
        }
        catch (Exception e)
        {
            System.err.println(e.getMessage());
        }
        // print out results
        PrintResults(in, outs);
    }
    // private method
    private static Vector GetNormBySteps(int step, Vector ins,
        int maxTerm, Vector stopWords, Connection conn, RamTrie trie, 
        Hashtable symbolMap,
        Hashtable unicodeMap,
        Hashtable ligatureMap, 
        Hashtable diacriticMap, 
        Hashtable nonStripMap,
        RTrieTree removeSTree, boolean detailsFlag, boolean mutateFlag) 
        throws SQLException
    {
        Vector outs = new Vector();
        int index = 0;
        // go through all elements for the ins
        for(int i = 0; i < ins.size(); i++)
        {
            LexItem in = ins.elementAt(i);
            Vector tempOuts = new Vector();
            switch(step)
            {
                case 0:         // -f:q0
                    tempOuts = ToMapSymbolToAscii.Mutate(in, symbolMap,
                        detailsFlag, mutateFlag);
                    break;
                case 1:         // -f:g
                    tempOuts = ToRemoveGenitive.Mutate(in, detailsFlag, 
                        mutateFlag);
                    break;
                case 2:         // -f:rs
                    tempOuts = ToRemoveS.Mutate(in, removeSTree,
                        detailsFlag, mutateFlag);
                    break;
                case 3:         // -f:o
                    tempOuts = ToReplacePunctuationWithSpace.Mutate(in, 
                        detailsFlag, mutateFlag);
                    break;
                case 4:         // -f:t
                    tempOuts = ToStripStopWords.Mutate(in, stopWords,
                        detailsFlag, mutateFlag);
                    break;
                case 5:         // -f:l
                    tempOuts = ToLowerCase.Mutate(in, detailsFlag, mutateFlag);
                    break;
                case 6:         // -f:B
                    tempOuts = ToUninflectWords.Mutate(in, maxTerm, conn, trie, 
                        detailsFlag, mutateFlag);
                    break;
                case 7:     // -f:Ct, don't use -f:ct, varon has non-ASCII Ct
                    tempOuts = GetCitation(in, conn, diacriticMap, ligatureMap);
                    break;
                case 8:         // -f:q7
                    tempOuts = ToUnicodeCoreNorm.Mutate(in, symbolMap,
                        unicodeMap, ligatureMap, diacriticMap,
                        detailsFlag, mutateFlag);
                    break;
                case 9:         // -f:q8
                    tempOuts = ToStripMapUnicode.Mutate(in, 
                        nonStripMap, detailsFlag, mutateFlag);
                    break;
                case 10:         // -f:w
                    tempOuts = ToSortWordsByOrder.Mutate(in, detailsFlag, 
                        mutateFlag);
                    break;
            }
            outs.addAll(tempOuts);
        }
        return outs;
    }
    // Use this method instead of -f:Ct, lowcases and trim after Ct.
    private static Vector GetCitation(LexItem in, Connection conn,
        Hashtable diacriticMap, 
        Hashtable ligatureMap) throws SQLException
    {
        Vector outs = new Vector();
        String inStr = in.GetSourceTerm();
        String delim = " \t-({[)}]_!@#%&*\\:;\"',.?/~+=|<>$`^";
        StringTokenizer buf = new StringTokenizer(inStr, delim);
        String newCit = new String();
        while(buf.hasMoreTokens() == true)
        {
            String inBase = buf.nextToken();
            Vector citations = 
                DbCitation.GetCitationsFromBase(inBase, conn);
            // sort citation to ensure consistency accross different database
            CitationComparator cc 
                = new CitationComparator();
            Collections.sort(citations, cc);
            String citation = inBase;
            if((citations != null) && (citations.size() > 0))
            {
                InflectionRecord record = citations.elementAt(0);
                citation = record.GetCitationTerm();
            }
            newCit += citation + " ";
        }
        // strip diacritics, lowercase, trim
        //newCit = ToStripDiacritics.StripDiacritics(newCit, diacriticMap);
        newCit = newCit.toLowerCase().trim();
        String details = "Citation";
        String mutate = new String();
        LexItem temp = UpdateLexItem(in, newCit, Flow.CITATION,
            in.GetSourceCategory().GetValue(), 
            in.GetSourceInflection().GetValue(), details, mutate);
        outs.add(temp);
        return outs;
    }
    // data members
    private final static int NORM_STEPS = 11;       // current norm
}