All downloads are free. The search and download functionality uses the official Maven repository.

gov.nih.nlm.nls.lvg.Flows.ToGetUnicodeNames Maven / Gradle / Ivy

The newest version!
package gov.nih.nlm.nls.lvg.Flows;
import java.util.*;
import java.io.*;
import com.ibm.icu.text.*;
import com.ibm.icu.lang.*;
import gov.nih.nlm.nls.lvg.Util.*;
import gov.nih.nlm.nls.lvg.Lib.*;
/*****************************************************************************
* This class converts characters from a term into Unicode names. 
* The default format of the unicode name is ![unicode name]!  
*
* 

Users may have their own format by defining starting tag and ending tag. * These tags file is configurable by modifying the configuration file. * *

History: *

    *
* * @author NLM NLS Development Team * * @see * Design Document * @see * Get Unicode Names * * @version V-2010 ****************************************************************************/ public class ToGetUnicodeNames extends Transformation implements Cloneable { // public methods /** * Performs the mutation of this flow component. * * @param in a LexItem as the input for this flow component * @param startTag the starting tag for symbol name (default: ![ ) * @param endTag the ending tag for symbol name (default: ]! ) * @param detailsFlag a boolean flag for processing details information * @param mutateFlag a boolean flag for processing mutate information * * @return Vector - results from this flow component */ public static Vector Mutate(LexItem in, String startTag, String endTag, boolean detailsFlag, boolean mutateFlag) { Vector out = GetUnicodeNames(in, startTag, endTag, detailsFlag, mutateFlag); return out; } /** * Convert a symbol to it's ![unicode name]! * * @param inChar input character for getting symbol * @param startTag the starting tag for symbol name (default: ![ ) * @param endTag the ending tag for symbol name (default: ]! ) * * @return unicode name of a character in ![unicode name]! format */ public static String GetUnicodeName(char inChar, String startTag, String endTag) { String outStr = startTag + UCharacter.getName(inChar) + endTag; return outStr; } /** * Convert a symbol to it's ![unicode name]! * * @param inStr input string for getting symbol * @param startTag the starting tag for symbol name (default: ![ ) * @param endTag the ending tag for symbol name (default: ]! ) * * @return unicode name of a character in ![unicode name]! 
format */ public static String GetUnicodeName(String inStr, String startTag, String endTag) { StringBuffer buf = new StringBuffer(); // go through all chracters in the inStr for(int i = 0; i < inStr.length(); i++) { char curChar = inStr.charAt(i); // if ASCII, no change if(UnicodeUtil.IsAsciiChar(curChar) == true) { buf.append(curChar); } else // if no-ASCII, use ![unicode name]! { buf.append(GetUnicodeName(curChar, startTag, endTag)); } } return buf.toString(); } /** * Convert a ![unicode name]! to a character * * @param unicodeName character in ![unicode name]! format * @param startTag the starting tag for symbol name (default: ![ ) * @param endTag the ending tag for symbol name (default: ]! ) * * @return the character to the unicode name */ public static char GetCharFromUnicodeName(String unicodeName, String startTag, String endTag) { if(unicodeName == null) { return (char) -1; } // remove start and end tags: ![ and ]! if(IsLegalUnicodeNameFormat(unicodeName, startTag, endTag) == true) { unicodeName = GetUnicodeNameWithoutTag(unicodeName, startTag, endTag); char out = (char) UCharacter.getCharFromExtendedName(unicodeName); return out; } return (char) -1; } /** * A unit test driver for this flow component. */ public static void main(String[] args) { // load config file String testStr = "© and µ"; Configuration conf = new Configuration("data.config.lvg", true); String startTag = conf.GetConfiguration(Configuration.START_TAG); String endTag = conf.GetConfiguration(Configuration.END_TAG); // mutate LexItem in = new LexItem(testStr); Vector outs = ToGetUnicodeNames.Mutate(in, startTag, endTag, true, true); PrintResults(in, outs); // print out results } // private method /** * Convert non-ASCII char to ![unicode name]! * * @param in a LexItem as the input for this flow component * @param startTag the starting tag for symbol name (default: ![ ) * @param endTag the ending tag for symbol name (default: ]! 
) * @param detailsFlag a boolean flag for processing details information * @param mutateFlag a boolean flag for processing mutate information * * @return Vector - results from this flow component */ private static Vector GetUnicodeNames(LexItem in, String startTag, String endTag, boolean detailsFlag, boolean mutateFlag) { // details & mutate String details = null; String mutate = null; if(detailsFlag == true) { details = INFO; } if(mutateFlag == true) { mutate = new String(); } // mutate the term: ![unicode name]! String inStr = in.GetSourceTerm(); String fs = GlobalBehavior.GetFieldSeparator(); StringBuffer buf = new StringBuffer(); for(int i = 0; i < inStr.length(); i++) { char curChar = inStr.charAt(i); // if ASCII, no change if(UnicodeUtil.IsAsciiChar(curChar) == true) { buf.append(curChar); } else // if no-ASCII, use ![unicode name]! { buf.append(GetUnicodeName(curChar, startTag, endTag)); } // update mutate information if(mutateFlag == true) { mutate += UnicodeUtil.GetUnicodeInfoXNB(curChar) + fs; } } String term = buf.toString(); // updatea target Vector out = new Vector(); LexItem temp = UpdateLexItem(in, term, Flow.GET_UNICODE_NAME, Transformation.UPDATE, Transformation.UPDATE, details, mutate); out.addElement(temp); return out; } private static boolean IsLegalUnicodeNameFormat(String inStr, String startTag, String endTag) { if((inStr.length() < (startTag.length() + endTag.length() + 1)) || (inStr.startsWith(startTag) == false) || (inStr.endsWith(endTag) == false)) { return false; } return true; } private static String GetUnicodeNameWithoutTag(String inStr, String startTag, String endTag) { String outStr = inStr.substring(startTag.length(), inStr.length()-endTag.length()); return outStr; } // data members private static final String INFO = "Get Unicode Names"; }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy