gov.nih.nlm.nls.lvg.Flows.ToStripMapUnicode Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class strips or maps non-ASCII Unicode characters.
* This is a pure table-mapping method with a simple algorithm to:
* <ul>
* <li>strip non-ASCII Unicode characters if they are not in the mapping table
* <li>convert non-ASCII Unicode characters to ASCII if they are in the
* mapping table
* </ul>
* This flow is used to perform final tune up in normalizing Unicode to ASCII
* after using other Unicode normalization flows.
*
* 
* Users may define their own mapping in $LVG/data/Unicode/nonStripMap.data.
*
* 
History:
* 

* 
*
* @author NLM NLS Development Team
*
* @see 
* Design Document 
* @see  
* Strip or Map Unicode
*
* @version    V-2010
****************************************************************************/
public class ToStripMapUnicode extends Transformation implements Cloneable
{
    // public methods
    /**
    * Runs the strip-or-map Unicode flow on one input item.
    * The work is delegated to the private StripMapUnicodeToAscii overload,
    * which is given this flow's INFO string as the details text.
    *
    * @param   in   the LexItem used as the input of this flow component
    * @param   nonStripMap   a hash table holding the non-strip map Unicode
    * @param   detailsFlag   a boolean flag for processing details information
    * @param   mutateFlag   a boolean flag for processing mutate information
    *
    * @return  the results of this flow component - a collection (Vector) 
    * of LexItems
    */
    public static Vector Mutate(LexItem in, 
        Hashtable nonStripMap, boolean detailsFlag, 
        boolean mutateFlag)
    {
        return StripMapUnicodeToAscii(in, nonStripMap, INFO,
            detailsFlag, mutateFlag);
    }
    /**
    * Reads the non-strip map Unicode list from the file named by the
    * configuration (LVG_DIR followed by NON_STRIP_MAP_UNICODE_FILE).
    *
    * The file is read as UTF-8, one entry per line. Empty lines and lines 
    * starting with '#' are skipped. Fields are separated by '|': field 1 is
    * the Unicode value in hex (converted by UnicodeUtil.UnicodeHexToNum) and
    * field 2 is the ASCII string it maps to.
    *
    * A line is rejected with a warning on System.err, and loading continues
    * with the next line, when it cannot be parsed, when field 1 is an ASCII
    * character, or when field 2 is not an ASCII string.
    * If the file cannot be opened or read, an error is printed on System.err
    * and the entries loaded so far are returned. The reader is always closed.
    *
    * @param   config   Configuration object
    * 
    * @return  a hash table of non-strip map Unicode list
    * (key: Character, value: the mapped ASCII String)
    */
    public static Hashtable GetNonStripMapFromFile(
        Configuration config)
    {
        String fName = 
            config.GetConfiguration(Configuration.LVG_DIR) + 
            config.GetConfiguration(Configuration.NON_STRIP_MAP_UNICODE_FILE);
        Hashtable nonStripMap 
            = new Hashtable();
        BufferedReader in = null;
        try        // load non-strip map from file
        {
            // read in line by line from a file
            in = new BufferedReader(new InputStreamReader(
                new FileInputStream(fName), "UTF-8"));
            String line = null; 
            while((line = in.readLine()) != null)
            {
                // skip the line if it is empty or comments (#)
                if((line.length() == 0) || (line.charAt(0) == '#'))
                {
                    continue;
                }
                char inChar = 0;
                String mapStr = null;
                boolean legalFlag = false;
                try
                {
                    // use '|' as delimiter; delimiters are returned as tokens
                    StringTokenizer buf = new StringTokenizer(line, "|", true);
                    // readin fields 1 & 2
                    // NOTE(review): the cast to char keeps only 16 bits, so a
                    // value above FFFF would be truncated - confirm the data
                    // file only holds such characters
                    inChar = 
                        (char) UnicodeUtil.UnicodeHexToNum(buf.nextToken());
                    buf.nextToken();    // 1st delimiter
                    mapStr = GetMapStrField(buf);
                    // Check fields 1 & 2
                    legalFlag = (UnicodeUtil.IsAsciiChar(inChar) == false)
                        && (UnicodeUtil.IsAsciiStr(mapStr) == true);
                }
                catch (RuntimeException e)
                {
                    // a line that can't be parsed (e.g. no delimiter) is
                    // reported like any other illegal line instead of 
                    // aborting the load of the remaining lines
                    legalFlag = false;
                }
                if(legalFlag == true)
                {
                    nonStripMap.put(Character.valueOf(inChar), mapStr);
                }
                else
                {
                    System.err.println(
                        "** Warning: Illegal format in nonStripMap file: '"
                        + fName + "'.");
                    System.err.println(line);
                }
            }
        }
        catch (Exception e)
        {
            System.err.println(
                "** Error: problem of opening/reading nonStripMap file: '" 
                + fName + "'.");
            System.err.println("Exception: " + e.toString());
        }
        finally
        {
            // release the file handle even if reading failed half way
            if(in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException e)
                {
                    System.err.println(
                        "** Error: problem of opening/reading nonStripMap file: '" 
                        + fName + "'.");
                    System.err.println("Exception: " + e.toString());
                }
            }
        }
        return nonStripMap;
    }
    /**
    * Extracts field 2 (the mapped string) from the tokens that follow the
    * 1st delimiter of a line.
    *
    * If the next token is not "|", it is field 2. Otherwise field 2 is made
    * of "|" characters: in the run of "|" tokens found before the next
    * non-"|" token, the last one is taken as the field delimiter and the 
    * others form the mapped string. If the line ends before a non-"|" token
    * is found, the mapped string is empty.
    *
    * @param   buf   the tokenizer of the line, positioned after the 1st 
    * delimiter
    *
    * @return  the mapped string of field 2
    */
    private static String GetMapStrField(StringTokenizer buf)
    {
        int numBars = 0;    // "|" tokens seen before the 1st non-"|" token
        while(buf.hasMoreTokens() == true)
        {
            String curToken = buf.nextToken();
            if(curToken.equals("|") == false)
            {
                if(numBars == 0)    // 2nd field is not "|"
                {
                    return curToken;
                }
                // 2nd field is "|"s: all bars but the last (the delimiter)
                StringBuilder bars = new StringBuilder();
                for(int i = 0; i < numBars-1; i++)
                {
                    bars.append('|');
                }
                return bars.toString();
            }
            numBars++;
        }
        return "";
    }
    /**
    * Strips or maps a single character to ASCII.
    *
    * @param   inChar   an input character
    * @param   nonStripMap   a hash table, keyed by Character, that holds the
    * replacement for each non-ASCII character to be mapped
    *
    * @return  inChar itself (as a string) if it is ASCII; otherwise the string
    * form of the value mapped to inChar in nonStripMap, or an empty string
    * if inChar is not a key of nonStripMap (stripped)
    */
    public static String StripMapUnicodeToAscii(char inChar, 
        Hashtable nonStripMap)
    {
        // return original ASCII
        if(UnicodeUtil.IsAsciiChar(inChar) == true)
        {
            return String.valueOf(inChar);
        }
        // non-ASCII: a single lookup decides between mapping and stripping.
        // A Hashtable never stores null, so null means "not in the table".
        Object mapped = nonStripMap.get(Character.valueOf(inChar));
        if(mapped == null)
        {
            return "";    // strip
        }
        // toString() rather than an assignment: the table is declared 
        // without type arguments, so get() is only known to return Object
        return mapped.toString();    // map
    }
    /**
    * A unit test driver for this flow component.
    * It obtains the test term from GetTestStr (given the command line
    * arguments and a built-in sample holding two non-ASCII characters),
    * loads the non-strip map through the "data.config.lvg" configuration,
    * runs Mutate with both the details and mutate flags on, and prints 
    * the results.
    *
    * @param   args   command line arguments, handed to GetTestStr
    */
    public static void main(String[] args)
    {
        // test term first, then configuration and mapping table
        String term = GetTestStr(args, "\u00A9 and \u00B5");
        Configuration config = new Configuration("data.config.lvg", true);
        Hashtable unicodeMap = GetNonStripMapFromFile(config);
        // mutate and print out results
        LexItem input = new LexItem(term);
        PrintResults(input, Mutate(input, unicodeMap, true, true));
    }
    // private method
    /**
    * Strips or maps every non-ASCII character of the source term of a 
    * LexItem and wraps the outcome in a one element Vector.
    *
    * Each character of the source term is handled as follows: an ASCII 
    * character is copied as is (operation NO_OPERATION); a non-ASCII 
    * character that is a key of nonStripMap is replaced by its mapped value
    * (operation MAPPING); any other non-ASCII character is dropped
    * (operation STRIPPING). When mutateFlag is true, the mutate information
    * is the sequence of these operations, one per input character, each 
    * followed by the global field separator.
    *
    * @param   in   a LexItem as the input
    * @param   nonStripMap   a hash table, keyed by Character, of the 
    * non-ASCII characters to be mapped
    * @param   infoStr   the details information used if detailsFlag is true
    * @param   detailsFlag   true to pass infoStr as details, false to pass 
    * null
    * @param   mutateFlag   true to pass the operations as mutate information,
    * false to pass null
    *
    * @return  a Vector with the single LexItem returned by UpdateLexItem
    */
    private static Vector StripMapUnicodeToAscii(LexItem in,
        Hashtable nonStripMap,
        String infoStr, boolean detailsFlag, boolean mutateFlag)
    {
        String inStr = in.GetSourceTerm();
        String fs = GlobalBehavior.GetFieldSeparator();
        // the operation tags do not change per character: build them once
        String noOpStr = NO_OPERATION + fs;
        String mapOpStr = MAPPING + fs;
        String stripOpStr = STRIPPING + fs;
        // builders keep the per character appends linear in the term length
        StringBuilder termBuf = new StringBuilder(inStr.length());
        StringBuilder mutateBuf = new StringBuilder();
        for(int i = 0; i < inStr.length(); i++)
        {
            // strip or map unicode to ASCII Str 
            char curChar = inStr.charAt(i);
            String opStr = noOpStr;
            // ASCII: no operation
            if(UnicodeUtil.IsAsciiChar(curChar) == true)
            {
                termBuf.append(curChar);
            }
            else    // NON-ASCII: Stripping or Mapping
            {
                // a Hashtable never stores null: null means "not mapped"
                Object mapped = nonStripMap.get(Character.valueOf(curChar));
                if(mapped != null)    // map
                {
                    termBuf.append(mapped);
                    opStr = mapOpStr;
                }
                else    // strip
                {
                    opStr = stripOpStr;
                }
            }
            // update mutate information
            if(mutateFlag == true)
            {
                mutateBuf.append(opStr);
            }
        }
        // details & mutate: null when the matching flag is off
        String details = (detailsFlag == true) ? infoStr : null;
        String mutate = (mutateFlag == true) ? mutateBuf.toString() : null;
        // update target 
        Vector out = new Vector();
        LexItem temp = UpdateLexItem(in, termBuf.toString(), 
            Flow.STRIP_MAP_UNICODE, Transformation.UPDATE, 
            Transformation.UPDATE, details, mutate);
        out.addElement(temp);
        return out;
    }
    // data members
    /** Details text handed to the flow by Mutate. */
    private static final String INFO = "Strip or Map Unicode to ASCII";
    /** Mutate tag for an ASCII character, which is kept unchanged. */
    private static final String NO_OPERATION = "NO";
    /** Mutate tag for a non-ASCII character found in the mapping table. */
    private static final String MAPPING = "MP";
    /** Mutate tag for a non-ASCII character not in the mapping table. */
    private static final String STRIPPING = "SP";
}