gov.nih.nlm.nls.lvg.Flows.ToNormUnicodeWithSynonym Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.*;
import com.ibm.icu.lang.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class normalizes Unicode characters to ASCII with synonym option.
* The normalization includes
* 
*  get unicode synonym base (-f:q4)
* 
 get core unicode Norm (-f:q7)
* 
 get unicode name (-f:q3)
* 
* In other words, this flow compoment is composed by -f:q4:q7:q3.
*
* History:
* 

* 
*
* @author NLM NLS Development Team
*
* @see 
* Design Document 
* @see  
* Norm Unicode characters to ASCII with Synonym options
*
* @version    V-2010
****************************************************************************/
public class ToNormUnicodeWithSynonym extends Transformation implements Cloneable
{
    // public methods
    /**
    * Performs the mutation of this flow component.
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   unicodeSynonymMap   a hash table contains Unicode synonym 
    * @param   symbolMap   a hash table contain the Unicode symbols mapping
    * @param   unicodeMap   a hash table contain the Unicode mapping
    * @param   ligatureMap   a hash table contains the mapping of ligatures
    * @param   diacriticMap  a hash table contains the mapping of diacritics
    * @param   startTag   the starting tag for symbol name (default: ![ )
    * @param   endTag     the ending tag for symbol name (default: ]! )
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  the results from this flow component - a collection (Vector)
    * of LexItems
    */
    public static Vector Mutate(LexItem in, 
        Hashtable unicodeSynonymMap,
        Hashtable symbolMap,
        Hashtable unicodeMap,
        Hashtable ligatureMap,
        Hashtable diacriticMap,
        String startTag, String endTag, boolean detailsFlag, boolean mutateFlag)
    {
        // mutate the term: Unicode coreNorm 
        String synonymTerm = ToGetUnicodeSynonyms.GetUnicodeSynonym(
            in.GetSourceTerm(), unicodeSynonymMap);
        // mutate the term: Unicode coreNorm 
        String coreNormTerm = ToUnicodeCoreNorm.GetCoreNormStr(
            synonymTerm, symbolMap, unicodeMap, ligatureMap, diacriticMap);
        // mutate the term: symbol name 
        String term = ToGetUnicodeNames.GetUnicodeName(coreNormTerm, startTag,
            endTag);
        // details & mutate
        String details = null;
        String mutate = null;
        if(detailsFlag == true)
        {
            details = INFO;
        }
        if(mutateFlag == true)
        {
            mutate = Transformation.NO_MUTATE_INFO;
        }
        // updatea target 
        Vector out = new Vector();
        LexItem temp = UpdateLexItem(in, term, Flow.NORM_UNICODE_WITH_SYNONYM, 
            Transformation.UPDATE, Transformation.UPDATE, details, mutate);
        out.addElement(temp);
        return out;
    }
    /**
    * A unit test driver for this flow component.
    */
    public static void main(String[] args)
    {
        // load config file
        String testStr = GetTestStr(args, "\u00A9 and \u00B5");
        Configuration conf = new Configuration("data.config.lvg", true);
        Hashtable unicodeSynonyms 
            = ToGetUnicodeSynonyms.GetUnicodeSynonymMapFromFile(conf);
        Hashtable symbolMap
            = ToMapSymbolToAscii.GetSymbolMapFromFile(conf);
        Hashtable unicodeMap
            = ToMapUnicodeToAscii.GetUnicodeMapFromFile(conf);
        Hashtable ligatureMap 
            = ToSplitLigatures.GetLigatureMapFromFile(conf);
        Hashtable diacriticMap 
            = ToStripDiacritics.GetDiacriticMapFromFile(conf);
        String startTag = conf.GetConfiguration(Configuration.START_TAG);
        String endTag = conf.GetConfiguration(Configuration.END_TAG);
        // mutate
        LexItem in = new LexItem(testStr);
        Vector outs = ToNormUnicodeWithSynonym.Mutate(in, 
            unicodeSynonyms, symbolMap, unicodeMap, ligatureMap, diacriticMap,    
            startTag, endTag, true, true);
        PrintResults(in, outs);     // print out results
    }
    // private method
    // data members
    private static final String INFO = "Unicode Norm With Synonym option";
}