All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gov.nih.nlm.nls.lvg.Flows.ToSplitLigatures Maven / Gradle / Ivy

The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class splits ligature characters from a term. 
* Ligatures are defined in different unicode blocks. 
* This flow is also used to normalize fullwidth chracters.
*
* 

In addition to ligatures defined in uniCode, users may define their own * ligatures and mapping characters. The ligatures mapping list is configurable * by modifying the configuration file. * *

History: *

    *
* * @author NLM NLS Development Team * * @see * Design Document * @see * Split Ligatures * * @version V-2010 ****************************************************************************/ public class ToSplitLigatures extends Transformation implements Cloneable { // public methods /** * Performs the mutation of this flow component. * * @param in a LexItem as the input for this flow component * @param ligatureMap a hash table contains the mapping of ligatures * @param detailsFlag a boolean flag for processing details information * @param mutateFlag a boolean flag for processing mutate information * * @return the results from this flow component - a collection (Vector) * of LexItems */ public static Vector Mutate(LexItem in, Hashtable ligatureMap, boolean detailsFlag, boolean mutateFlag) { Vector out = SplitLigatures(in, ligatureMap, INFO, detailsFlag, mutateFlag); return out; } /** * Read in ligatures mapping list from configuration file * * @param config Configuratin object * * @return a hash table of ligatures */ public static Hashtable GetLigatureMapFromFile( Configuration config) { String fName = config.GetConfiguration(Configuration.LVG_DIR) + config.GetConfiguration(Configuration.LIGATURES_FILE); String line = null; Hashtable ligatureMap = new Hashtable(); try // load ligature from file { // read in line by line from a file BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(fName), "UTF-8")); while((line = in.readLine()) != null) { // skip the line if it is empty or comments (#) if((line.length() > 0) && (line.charAt(0) != '#')) { // use ' ' and '\t' as delimiter to parse token StringTokenizer buf = new StringTokenizer(line, "|"); // readin fields 1 & 2 char inChar = (char) UnicodeUtil.UnicodeHexToNum(buf.nextToken()); String splitStr = buf.nextToken(); Character ligature = new Character(inChar); // Check fields 1 & 2 if(UnicodeUtil.IsAsciiChar(inChar) == true) { System.err.println( "** Warning: Illegal format in ligatures file: '" + fName + "'."); } else { ligatureMap.put(ligature, splitStr); } } } in.close(); } catch (Exception e) { System.err.println("Exception: " + e.toString()); System.err.println( "** Error: problem of opening/reading ligature file: '" + fName + "'."); } return ligatureMap; } /** * Split ligatures for an input character * * @param inChar input character for spliting ligature * @param ligatureMap user defined ligatures mapping * * @return split ligature string */ public static String SplitLigature(char inChar, Hashtable ligatureMap) { // use local mapping if inChar in the mapping table Character key = new Character(inChar); String outStr = new String(); if(ligatureMap.containsKey(key) == true) { outStr = ligatureMap.get(key); } else // use unicode normalization NFKC algorithm { outStr = Normalizer.normalize(inChar, Normalizer.NFKC); } // remove space character from both ends if(outStr.length() > 1) { outStr = outStr.trim(); } return outStr; } /** * A unit test driver for this flow component. */ public static void main(String[] args) { // load config file Configuration conf = new Configuration("data.config.lvg", true); String testStr = GetTestStr(args, "sp\u00E6lsau"); Hashtable ligatureMap = GetLigatureMapFromFile(conf); // mutate LexItem in = new LexItem(testStr); Vector outs = ToSplitLigatures.Mutate(in, ligatureMap, true, true); PrintResults(in, outs); // print out results } // private method private static Vector SplitLigatures(LexItem in, Hashtable ligatureMap, String infoStr, boolean detailsFlag, boolean mutateFlag) { // details & mutate String details = null; String mutate = null; if(detailsFlag == true) { details = infoStr; } if(mutateFlag == true) { mutate = new String(); } // mutate the term: split ligatures String inStr = in.GetSourceTerm(); String fs = GlobalBehavior.GetFieldSeparator(); StringBuffer buffer = new StringBuffer(); for(int i = 0; i < inStr.length(); i++) { char curChar = inStr.charAt(i); // strip each character in the string String opStr = NO_OPERATION + fs; Character key = new Character(curChar); String outStr = UnicodeUtil.CharToStr(curChar); if(ligatureMap.containsKey(key) == true) { outStr = ligatureMap.get(key); opStr = MAPPING + fs; } else // use unicode normalization NFKC algorithm { // get the UniCode normalized String String normStr = Normalizer.normalize(curChar, Normalizer.NFKC); // remove space character from both ends if(normStr.length() > 1) { outStr = normStr.trim(); } else { outStr = normStr; } // check if normalized if(UnicodeUtil.CharToStr(curChar).equals(outStr) == false) { opStr = NORM_NFKC + fs; } } buffer.append(outStr); // update mutate information if(mutateFlag == true) { mutate += opStr; } } String term = buffer.toString(); // updatea target Vector out = new Vector(); LexItem temp = UpdateLexItem(in, term, Flow.SPLIT_LIGATURES, Transformation.UPDATE, Transformation.UPDATE, details, mutate); out.addElement(temp); return out; } // data members private static final String INFO = "Split Ligatures"; final private static String NO_OPERATION = "NO"; final private static String MAPPING = "MP"; final private static String NORM_NFKC = "NFKC"; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy