All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gov.nih.nlm.nls.lvg.Flows.ToNormUnicode Maven / Gradle / Ivy

The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.*;
import com.ibm.icu.lang.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class normalizes Unicode characters to ASCII in a term. 
* The normalization includes
* 
    *
  • Unicode core norm (-f:q7) *
  • get unicode name (-f:q3) *
* In other words, this flow compoment is composed by -f:q7:q3. * *

History: *

    *
* * @author NLM NLS Development Team * * @see * Design Document * @see * Norm Unicode characters to ASCII * * @version V-2010 ****************************************************************************/ public class ToNormUnicode extends Transformation implements Cloneable { // public methods /** * Performs the mutation of this flow component. * * @param in a LexItem as the input for this flow component * @param symbolMap a hash table contain the unicode symbols mapping * @param unicodeMap a hash table contain the unicode mapping * @param ligatureMap a hash table contains the mapping of ligatures * @param diacriticMap a hash table contains the mapping of diacritics * @param startTag the starting tag for symbol name (default: ![ ) * @param endTag the ending tag for symbol name (default: ]! ) * @param detailsFlag a boolean flag for processing details information * @param mutateFlag a boolean flag for processing mutate information * * @return the results from this flow component - a collection (Vector) * of LexItems */ public static Vector Mutate(LexItem in, Hashtable symbolMap, Hashtable unicodeMap, Hashtable ligatureMap, Hashtable diacriticMap, String startTag, String endTag, boolean detailsFlag, boolean mutateFlag) { // mutate the term: Unicode coreNorm String coreNormTerm = ToUnicodeCoreNorm.GetCoreNormStr( in.GetSourceTerm(), symbolMap, unicodeMap, ligatureMap, diacriticMap); // mutate the term: symbol name String term = ToGetUnicodeNames.GetUnicodeName(coreNormTerm, startTag, endTag); // details & mutate String details = null; String mutate = null; if(detailsFlag == true) { details = INFO; } if(mutateFlag == true) { mutate = Transformation.NO_MUTATE_INFO; } // updatea target Vector out = new Vector(); LexItem temp = UpdateLexItem(in, term, Flow.NORM_UNICODE, Transformation.UPDATE, Transformation.UPDATE, details, mutate); out.addElement(temp); return out; } /** * A unit test driver for this flow component. */ public static void main(String[] args) { // load config file String testStr = GetTestStr(args, "\u00A9 and \u00B5"); Configuration conf = new Configuration("data.config.lvg", true); Hashtable symbolMap = ToMapSymbolToAscii.GetSymbolMapFromFile(conf); Hashtable unicodeMap = ToMapUnicodeToAscii.GetUnicodeMapFromFile(conf); Hashtable ligatureMap = ToSplitLigatures.GetLigatureMapFromFile(conf); Hashtable diacriticMap = ToStripDiacritics.GetDiacriticMapFromFile(conf); String startTag = conf.GetConfiguration(Configuration.START_TAG); String endTag = conf.GetConfiguration(Configuration.END_TAG); // mutate LexItem in = new LexItem(testStr); Vector outs = ToNormUnicode.Mutate(in, symbolMap, unicodeMap, ligatureMap, diacriticMap, startTag, endTag, true, true); PrintResults(in, outs); // print out results } // private method // data members private static final String INFO = "Normalize Unicode To ASCII"; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy