/*
 * gov.nih.nlm.nls.lvg.Flows.ToStripDiacritics Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of lvg2010dist Show documentation
 * LVG tools is used by Apache cTAKES.
 * The newest version!
 */
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class strips diacritic characters from a term. Diacritics include:
* <ul>
* <li> grave accent
* <li> acute accent
* <li> circumflex accent
* <li> tilde
* <li> umlaut
* <li> ring
* <li> cedilla
* <li> slash
* <li> etc.
* </ul>
* <p> Diacritic characters are in the ISO Latin 1 character set.
* In other words, the Unicode Latin-1 Supplement block (U+0080 ~ U+00FF).
* They are also in other Unicode blocks, such as Latin Extended-A and
* Latin Extended-B.
* The diacritics mapping list is configurable by modifying the configuration
* file (${LVG}/data/Unicode/diacriticMap.data).
*
* 
* History:
*
*
* @author NLM NLS Development Team
*
* @see "Design Document"
* @see "Strip Diacritics"
*
* @version    V-2010
****************************************************************************/
public class ToStripDiacritics extends Transformation implements Cloneable
{
    // public methods
    /**
    * Performs the mutation of this flow component.
    *
    * @param   in   a LexItem as the input for this flow component
    * @param   diacriticMap a hash table containing the user defined mapping 
    * from a diacritic character to its stripped replacement
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  the results from this flow component - a collection (Vector) 
    * of LexItems; it always holds exactly one element
    */
    public static Vector<LexItem> Mutate(LexItem in, 
        Hashtable<Character, Character> diacriticMap,
        boolean detailsFlag, boolean mutateFlag)
    {
        return StripDiacritics(in, diacriticMap, INFO,
            detailsFlag, mutateFlag);
    }
    /**
    * Reads in the diacritics mapping list from the configuration file.
    * The file name is the concatenation of the LVG_DIR and DIACRITICS_FILE
    * configuration values and the file is read as UTF-8.
    *
    * <p>Empty lines and lines starting with '#' are skipped. Every other 
    * line is split on '|': field 1 is the hex code of the diacritic 
    * character and the first character of field 2 is its replacement. 
    * A line is rejected with a warning on stderr (and loading continues 
    * with the next line) when it has fewer than two fields, when field 1 
    * cannot be converted, when the diacritic is an ASCII character, or when 
    * the replacement is not an ASCII character.
    *
    * <p>This method never throws for file problems: an error is printed to
    * stderr and the mappings read so far (possibly none) are returned.
    *
    * @param   config   Configuration object
    * 
    * @return  a hash table of diacritics (never null)
    */
    public static Hashtable<Character, Character> GetDiacriticMapFromFile(
        Configuration config)
    {
        String fName = 
            config.GetConfiguration(Configuration.LVG_DIR) + 
            config.GetConfiguration(Configuration.DIACRITICS_FILE);
        Hashtable<Character, Character> diacriticMap 
            = new Hashtable<Character, Character>();
        BufferedReader reader = null;
        try        // load diacritics from file
        {
            // read in line by line from a file
            reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(fName), "UTF-8"));
            String line = null; 
            while((line = reader.readLine()) != null)
            {
                // skip the line if it is empty or comments (#)
                if((line.length() == 0) || (line.charAt(0) == '#'))
                {
                    continue;
                }
                // a bad line is reported but must not stop the loading of 
                // the remaining mappings
                if(!AddDiacriticMapping(line, diacriticMap))
                {
                    System.err.println(
                        "** Warning: Illegal format in diacritics file: '"
                        + fName + "'.");
                    System.err.println(line);
                }
            }
        }
        catch (Exception e)
        {
            // best effort loader: report and return what has been read
            ReportFileError(fName, e);
        }
        finally
        {
            // release the file handle even when reading failed half way
            if(reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException e)
                {
                    ReportFileError(fName, e);
                }
            }
        }
        return diacriticMap;
    }
    /**
    * Strip diacritic for an input character. The user defined mapping has
    * priority; if the character is not in the mapping, the Unicode 
    * normalization (NFD) of the character is used.
    *
    * @param   inChar   input character for stripping diacritic
    * @param   diacriticMap   user defined diacritics mapping
    * 
    * @return  a character of stripped diacritic, or the input character 
    * itself if neither the mapping nor the normalization applies
    */
    public static char StripDiacritic(char inChar, 
        Hashtable<Character, Character> diacriticMap)
    {
        // use local mapping if inChar in the mapping table
        // (a Hashtable never stores null, so null means "no such key")
        Character mapped = diacriticMap.get(Character.valueOf(inChar));
        if(mapped != null)
        {
            return mapped.charValue();
        }
        // otherwise, use unicode normalization NFD algorithm:
        // get the UniCode normalized String 
        String normStr = Normalizer.normalize(inChar, Normalizer.NFD);
        // remove diacritics in Combining diacritics Marks
        // For Unicode U+0000 ~ U+FFFF:
        // there are 12660 unicode has different results after Norm NFD
        // 11812 are CJK, Hebrew, etc other languages (no diacritics)
        // 4 are diacritics themselves, U+0340, U+0341, U+0343, U+0344
        // 844 can be strip diacritics by this algorithm
        // NFKD does more than strip diacritics, it also split ligature .. 
        // Thus, we still use NFD
        if((normStr.length() > 1)    // themselves
        && ContainDiacritics(normStr))    // not other language
        {
            // non-diacritic character always on the first
            return normStr.charAt(0);
        }
        return inChar;
    }
    /**
    * Strip diacritic for an input string. Characters reported as ASCII by 
    * UnicodeUtil.IsAsciiChar are copied unchanged without consulting the 
    * mapping; every other character goes through 
    * {@link #StripDiacritic(char, Hashtable)}.
    *
    * @param   inStr   input string for stripping diacritic
    * @param   diacriticMap   user defined diacritics mapping
    * 
    * @return  a string of stripped diacritic, same length as the input
    */
    public static String StripDiacritics(String inStr, 
        Hashtable<Character, Character> diacriticMap)
    {
        StringBuilder buffer = new StringBuilder(inStr.length());
        // strip each character in the string
        for(int i = 0; i < inStr.length(); i++)
        {
            char curChar = inStr.charAt(i);
            if(UnicodeUtil.IsAsciiChar(curChar))
            {
                buffer.append(curChar);
            }
            else
            {
                buffer.append(StripDiacritic(curChar, diacriticMap));
            }
        }
        return buffer.toString();
    }
    /**
    * A unit test driver for this flow component.
    *
    * @param   args   command line arguments, passed to GetTestStr to 
    * select the test string
    */
    public static void main(String[] args)
    {
        // load config file
        Configuration conf = new Configuration("data.config.lvg", true);
        String testStr = GetTestStr(args, "resum\u00E9");
        Hashtable<Character, Character> diacriticMap 
            = GetDiacriticMapFromFile(conf);
        // mutate
        LexItem in = new LexItem(testStr);
        Vector<LexItem> outs 
            = ToStripDiacritics.Mutate(in, diacriticMap, true, true);
        PrintResults(in, outs);     // print out results
    }
    // private methods
    /**
    * Strips diacritics from the source term of a LexItem and records, per
    * character, which operation was applied (no operation, mapping or NFD),
    * each followed by the global field separator.
    *
    * @param   in   the input LexItem
    * @param   diacriticMap   user defined diacritics mapping
    * @param   infoStr   the details information used if detailsFlag is true
    * @param   detailsFlag   if false, the details passed on is null
    * @param   mutateFlag   if false, the mutate information passed on is null
    *
    * @return  a Vector holding the single updated LexItem
    */
    private static Vector<LexItem> StripDiacritics(LexItem in,
        Hashtable<Character, Character> diacriticMap, String infoStr,
        boolean detailsFlag, boolean mutateFlag)
    {
        // details & mutate
        String details = (detailsFlag ? infoStr : null);
        // mutate information is only collected on request
        StringBuilder mutateBuf = (mutateFlag ? new StringBuilder() : null);
        // mutate the term: strip diacritics
        String inStr = in.GetSourceTerm();
        String fs = GlobalBehavior.GetFieldSeparator();
        StringBuilder buffer = new StringBuilder(inStr.length());
        for(int i = 0; i < inStr.length(); i++)
        {
            char curChar = inStr.charAt(i);
            // strip each character in the string
            String op = NO_OPERATION;
            char outChar = curChar;
            // use local mapping if curChar in the mapping table
            Character mapped = diacriticMap.get(Character.valueOf(curChar));
            if(mapped != null)
            {
                outChar = mapped.charValue();
                op = MAPPING;
            }
            else    // use unicode normalization NFD algorithm
            {
                // get the UniCode normalized String 
                String normStr = Normalizer.normalize(curChar, Normalizer.NFD);
                // remove diacritics in Combining diacritics Marks
                if((normStr.length() > 1) && ContainDiacritics(normStr))
                {
                    // non-diacritic character always on the first
                    outChar = normStr.charAt(0);
                    op = NORM_NFD;
                }
            }
            buffer.append(outChar);
            // update mutate information
            if(mutateBuf != null)
            {
                mutateBuf.append(op).append(fs);
            }
        }
        String term = buffer.toString();
        String mutate = ((mutateBuf == null) ? null : mutateBuf.toString());
        // update target 
        Vector<LexItem> out = new Vector<LexItem>();
        LexItem temp = UpdateLexItem(in, term, Flow.STRIP_DIACRITICS, 
            Transformation.UPDATE, Transformation.UPDATE, details, mutate);
        out.addElement(temp);
        return out;
    }
    /**
    * Parses one data line of the diacritics file and, if it is legal, 
    * adds its mapping to the table.
    *
    * @param   line   a non-empty, non-comment line: fields separated by '|'
    * @param   diacriticMap   the table to add the mapping to
    *
    * @return  true if the mapping was added, false if the line is illegal
    */
    private static boolean AddDiacriticMapping(String line,
        Hashtable<Character, Character> diacriticMap)
    {
        // use '|' as delimiter to parse token
        StringTokenizer buf = new StringTokenizer(line, "|");
        if(buf.countTokens() < 2)
        {
            return false;
        }
        // read in fields 1 & 2 (tokens of StringTokenizer are never empty)
        char inChar;
        try
        {
            inChar = (char) UnicodeUtil.UnicodeHexToNum(buf.nextToken());
        }
        catch (RuntimeException e)
        {
            // NOTE(review): UnicodeHexToNum is defined elsewhere; any 
            // unchecked failure on field 1 is treated as an illegal line
            return false;
        }
        char mapChar = buf.nextToken().charAt(0);
        // Check fields 1 & 2: non-ASCII diacritic, ASCII replacement
        if(UnicodeUtil.IsAsciiChar(inChar) 
        || !UnicodeUtil.IsAsciiChar(mapChar))
        {
            return false;
        }
        diacriticMap.put(Character.valueOf(inChar), 
            Character.valueOf(mapChar));
        return true;
    }
    /**
    * Prints the error message for a failure of opening, reading or closing 
    * the diacritics file to stderr.
    */
    private static void ReportFileError(String fName, Exception e)
    {
        System.err.println(
            "** ERR: problem of opening/reading diacritics file: '" + 
            fName + "'.");
        System.err.println("Exception: " + e.toString());
    }
    /**
    * Checks if a string contains at least one character of the Combining 
    * Diacritical Marks range (U+0300 ~ U+036F).
    */
    private static boolean ContainDiacritics(String inStr)
    {
        for(int i = 0; i < inStr.length(); i++)
        {
            int curInt = UnicodeUtil.CharToNum(inStr.charAt(i));
            if((curInt >= COMBINING_MARK_FIRST) 
            && (curInt <= COMBINING_MARK_LAST))
            {
                return true;    // one hit is enough
            }
        }
        return false;
    }
    // data members
    private static final String INFO = "Strip Diacritics";
    final private static String NO_OPERATION = "NO"; 
    final private static String MAPPING = "MP"; 
    final private static String NORM_NFD = "NFD"; 
    // Combining Diacritics Marks: U+0300 ~ U+036F
    final private static int COMBINING_MARK_FIRST = 0x0300; 
    final private static int COMBINING_MARK_LAST = 0x036F; 
}