All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gov.nih.nlm.nls.lvg.Flows.ToStripDiacritics Maven / Gradle / Ivy

The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class strips diacritics characters from a term. Diacritics includes: 
* 
    *
  • grave accent *
  • acute accent *
  • circumflex accent *
  • tilde *
  • umlaut *
  • ring *
  • cedilla *
  • slash *
  • etc. *
*

Diacritic chractrers are in ISO Latin I character set. * In other words, it is Unicode Latin-1 supplement block (U+0080 ~ U+00FF). * It also in other unicode blocks, such as Latin Extend-A and Latin Extend-B. * The diacritics mapping list is configurable by modifying the configuration * file (${LVG}/data/Unicode/diacriticMap.data). * *

History: *

    *
* * @author NLM NLS Development Team * * @see * Design Document * @see * Strip Diacritics * * @version V-2010 ****************************************************************************/ public class ToStripDiacritics extends Transformation implements Cloneable { // public methods /** * Performs the mutation of this flow component. * * @param in a LexItem as the input for this flow component * @param diacriticMap a hash table contain the mapping of diacritics * @param detailsFlag a boolean flag for processing details information * @param mutateFlag a boolean flag for processing mutate information * * @return the results from this flow component - a collection (Vector) * of LexItems */ public static Vector Mutate(LexItem in, Hashtable diacriticMap, boolean detailsFlag, boolean mutateFlag) { Vector out = StripDiacritics(in, diacriticMap, INFO, detailsFlag, mutateFlag); return out; } /** * read in diacritics mapping list from configuration file * * @param config Configuratin object * * @return a hash table of diacritics */ public static Hashtable GetDiacriticMapFromFile( Configuration config) { String fName = config.GetConfiguration(Configuration.LVG_DIR) + config.GetConfiguration(Configuration.DIACRITICS_FILE); String line = null; Hashtable diacriticMap = new Hashtable(); try // load diacritics from file { // read in line by line from a file BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(fName), "UTF-8")); while((line = in.readLine()) != null) { // skip the line if it is empty or comments (#) if((line.length() > 0) && (line.charAt(0) != '#')) { // use ' ' and '\t' as delimiter to parse token StringTokenizer buf = new StringTokenizer(line, "|"); // readin fields 1 & 2 char inChar = (char) UnicodeUtil.UnicodeHexToNum(buf.nextToken()); char mapChar = buf.nextToken().charAt(0); Character diacritic = new Character(inChar); Character nDiacritic = new Character(mapChar); // Check fields 1 & 2 if((UnicodeUtil.IsAsciiChar(inChar) == true) || (UnicodeUtil.IsAsciiChar(mapChar) == false)) { System.err.println( "** Warning: Illegal format in diacritics file: '" + fName + "'."); System.err.println(line); } else { diacriticMap.put(diacritic, nDiacritic); } } } in.close(); } catch (Exception e) { System.err.println( "** ERR: problem of opening/reading diacritics file: '" + fName + "'."); System.err.println("Exception: " + e.toString()); } return diacriticMap; } /** * Strip diacritic for an input character * * @param inChar input character for stripping diacritic * @param diacriticMap user defined diacritics mapping * * @return a character of stripped diacritic */ public static char StripDiacritic(char inChar, Hashtable diacriticMap) { // use local mapping if inChar in the mapping table Character key = new Character(inChar); char outChar = inChar; if(diacriticMap.containsKey(key) == true) { outChar = (diacriticMap.get(key)).charValue(); } else // use unicode normalization NFD algorithm { // get the UniCode normalized String String normStr = Normalizer.normalize(inChar, Normalizer.NFD); // remove diacritics in Combinging diacritics Marks // For Unicode U+0000 ~ U+FFFF: // there are 12660 unicode has different results after Norm NFD // 11812 are CJK, Hebrew, etc other languages (no diacritics) // 4 are diacritics themselves, U+0340, U+0341, U+0343, U+0344 // 844 can be strip diacritics by this algorithm // NFKD does more than strip diacritics, it also split ligature .. // Thus, we still use NFD if((normStr.length() > 1) // themselves && (ContainDiacritics(normStr) == true)) // not other language { outChar = normStr.charAt(0); } } return outChar; } /** * Strip diacritic for an input string * * @param inStr input string for stripping diacritic * @param diacriticMap user defined diacritics mapping * * @return a string of stripped diacritic */ public static String StripDiacritics(String inStr, Hashtable diacriticMap) { StringBuffer buffer = new StringBuffer(); // strip each character in the string for(int i = 0; i < inStr.length(); i++) { char curChar = inStr.charAt(i); if(UnicodeUtil.IsAsciiChar(curChar) == true) { buffer.append(curChar); } else { buffer.append(StripDiacritic(curChar, diacriticMap)); } } return buffer.toString(); } /** * A unit test driver for this flow component. */ public static void main(String[] args) { // load config file Configuration conf = new Configuration("data.config.lvg", true); String testStr = GetTestStr(args, "resum\u00E9"); Hashtable diacriticMap = GetDiacriticMapFromFile(conf); // mutate LexItem in = new LexItem(testStr); Vector outs = ToStripDiacritics.Mutate(in, diacriticMap, true, true); PrintResults(in, outs); // print out results } // private method private static Vector StripDiacritics(LexItem in, Hashtable diacriticMap, String infoStr, boolean detailsFlag, boolean mutateFlag) { // details & mutate String details = null; String mutate = null; if(detailsFlag == true) { details = infoStr; } if(mutateFlag == true) { mutate = new String(); } // mutate the term: strip diacritics String inStr = in.GetSourceTerm(); String fs = GlobalBehavior.GetFieldSeparator(); StringBuffer buffer = new StringBuffer(); for(int i = 0; i < inStr.length(); i++) { char curChar = inStr.charAt(i); // strip each character in the string // use local mapping if inChar in the mapping table String opStr = NO_OPERATION + fs; Character key = new Character(curChar); char outChar = curChar; if(diacriticMap.containsKey(key) == true) { outChar = (diacriticMap.get(key)).charValue(); opStr = MAPPING + fs; } else // use unicode normalization NFD algorithm { // get the UniCode normalized String String normStr = Normalizer.normalize(curChar, Normalizer.NFD); // remove diacritics in Combinging diacritics Marks if((normStr.length() > 1) && (ContainDiacritics(normStr) == true)) { // non-diacritic character always on the first outChar = normStr.charAt(0); opStr = NORM_NFD + fs; } } buffer.append(outChar); // update mutate information if(mutateFlag == true) { mutate += opStr; } } String term = buffer.toString(); // updatea target Vector out = new Vector(); LexItem temp = UpdateLexItem(in, term, Flow.STRIP_DIACRITICS, Transformation.UPDATE, Transformation.UPDATE, details, mutate); out.addElement(temp); return out; } private static boolean ContainDiacritics(String inStr) { boolean flag = false; for(int i = 0; i < inStr.length(); i++) { char curChar = inStr.charAt(i); int curInt = UnicodeUtil.CharToNum(curChar); // Combining Diacritics Marks: U+0300 ~ U+036F if((curInt > 767) && (curInt < 880)) { flag = true; } } return flag; } // data members private static final String INFO = "Strip Diacritics"; final private static String NO_OPERATION = "NO"; final private static String MAPPING = "MP"; final private static String NORM_NFD = "NFD"; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy