All downloads are free. Search and download functionality uses the official Maven repository.

gov.nih.nlm.nls.lvg.Util.StripToken Maven / Gradle / Ivy

The newest version!
package gov.nih.nlm.nls.lvg.Util;
import java.util.*;
/*****************************************************************************
* This class tokenizes a string into Vector.
*
* 

History: *

    *
* * @author NLM NLS Development Team * * @see StrTokenObject * @see Design document * * @version V-2010 ***************************************************************************/ public class StripToken { // public methods /** * Tokenize a string and assigned token type to each token. This method * uses the default delimiters defined in StrTokenObject. They are * briefly described as follows: *
    *
  • restoreD: A String contains delimiters that will be restored * after tokenized *
  • strippingD: A String contains delimiters that will be restored and * it's previous token will be stripped out if they are in conflict * list or their type are stripped. *
  • strippableD: A String contains delimiters that will be restored * (stripped) if it's previous token is not stripped out (stripped). *
  • confList: char that can not be used before a delim of strippingD *
  • spaceD: space deliminator for nor-restore *
* No overlap is allowed between restoreD, strippingD, & strippableD. * * @param inStr a String to be tokenized * * @return Vector tokenized elements */ public static Vector Tokenize(String inStr) { return Tokenize(inStr, StrTokenObject.RESTORE_D_STR, StrTokenObject.STRIPPING_D_STR, StrTokenObject.STRIPABLE_D_STR, StrTokenObject.SPACE_D_STR); } /** * Tokenize a string and assigned token type to each token. * * @param inStr a String to be tokenized * @param restoreD a String contains delimiters that will be restored * after tokenized * @param strippingD a String contains delimiters that it's previous * token will be stripped out if they are in conflict list or their * type are stripped. * @param strippableD a String contains delimiters that will be restored * if it's previous token is not stripped out. * @param spaceD a String contains space delimiters. * * @return Vector - tokenized elements */ public static Vector Tokenize(String inStr, String restoreD, String strippingD, String strippableD, String spaceD) { // use space delim to tokenize the input string Vector tokenList = new Vector(); StringTokenizer buf = new StringTokenizer(inStr, spaceD); while(buf.hasMoreTokens() == true) { String cur = buf.nextToken(); // handle each token in details Vector curList = Tokenize(cur, restoreD, strippingD, strippableD); tokenList.addAll(curList); // put tokens back tokenList.addElement(SPACE_DELIM); // add a space between } // remove the last SPACE_DELIM int lastIndex = tokenList.size()-1; if(lastIndex >= 0) { tokenList.remove(lastIndex); } return tokenList; } /** * Tokenize a string and assigned token type to each token. * * @param inStr a String to be tokenized * @param restoreD a String contains delimiters that will be restored * after tokenized * @param strippingD a String contains delimiters that it's previous * token will be stripped out if they are in conflict list or their * type are stripped. 
* @param strippableD a String contains delimiters that will be restored * if it's previous token is not stripped out. * * @return Vector - tokenized elements */ public static Vector Tokenize(String inStr, String restoreD, String strippingD, String strippableD) { Vector tokenList = new Vector(); String allDelim = restoreD + strippingD + strippableD; // tokenize the input and assign the type for each token StringTokenizer buf = new StringTokenizer(inStr, allDelim, true); while(buf.hasMoreTokens() == true) { String cur = buf.nextToken(); if(restoreD.indexOf(cur) != -1) { AddToList(cur, StrTokenObject.RESTORE_D, tokenList); } else if(strippingD.indexOf(cur) != -1) { AddToList(cur, StrTokenObject.STRIPPING_D, tokenList); } else if(strippableD.indexOf(cur) != -1) { AddToList(cur, StrTokenObject.STRIPABLE_D, tokenList); } else { AddToList(cur, StrTokenObject.TOKEN, tokenList); } } return tokenList; } /** * Clean up elements in the tokenized Vector. This method includes lots of * smart operations while restoring tokens into a string. This is used * when strip with punctuations. In such case, some punctuations need to be * stripped automatically. This method uses the defualt conflict string * " -,:;". * * @param list a tokenized vector. * * @return a vector with clean-up tokenized elements. * * @see StrTokenObject */ public static Vector CleanUpToken( Vector list) { return CleanUpToken(list, StrTokenObject.CONFLICT_STR); } /** * Clean up elements in the tokenized Vector. This method includes lots of * smart operations while restoring tokens into a string. This is used * when strip with punctuations. In such case, some punctuations need to be * stripped automatically. * * @param list a tokenized vector. * @param confList a string contians all conflict characters. * A conflict character is a character need to be stripped if * the following token is stripping type. * * @return a vector with clean-up tokenized elements. 
*/ public static Vector CleanUpToken( Vector list, String confList) { Vector newList = new Vector(); int lastDelimType = StrTokenObject.NONE; // type of last token // go through the list for(int i = 0; i < list.size(); i++) { StrTokenObject cur = list.elementAt(i); String curToken = cur.GetTokenStr(); int curDelimType = cur.GetTokenType(); switch(curDelimType) { // add the element if it is a token type case StrTokenObject.TOKEN: newList.addElement(cur); lastDelimType = curDelimType; break; // add the element if it is a space type and the previous token // is a token, restored, stripping, or stripable delimiter case StrTokenObject.SPACE_D: if((lastDelimType == StrTokenObject.TOKEN) || (lastDelimType == StrTokenObject.RESTORE_D) || (lastDelimType == StrTokenObject.STRIPPING_D) || (lastDelimType == StrTokenObject.STRIPABLE_D)) { newList.addElement(cur); lastDelimType = curDelimType; } break; // Don't add (remove) the element if it is a stripped type case StrTokenObject.STRIPPED: lastDelimType = curDelimType; break; // Add the element if it is a restore type case StrTokenObject.RESTORE_D: newList.addElement(cur); lastDelimType = curDelimType; break; case StrTokenObject.STRIPPING_D: //remove conflict list if(newList.size() > 0) // check if any char before it { int prevIndex = newList.size() - 1; StrTokenObject prev = newList.elementAt(prevIndex); int prevTokenType = prev.GetTokenType(); String prevTokenStr = prev.GetTokenStr(); // remove previous element if the previous element is // stripped type or a character in conflict list while((prevTokenType == StrTokenObject.STRIPPED) || ((prevTokenStr.length() == 1) && (confList.indexOf(prevTokenStr.charAt(0)) != -1))) { newList.remove(prevIndex); prevIndex = newList.size() -1; prev = newList.elementAt(prevIndex); prevTokenStr = prev.GetTokenStr(); prevTokenType = prev.GetTokenType(); } } // add the element if the newList is empty newList.addElement(cur); lastDelimType = curDelimType; break; // Add the element if the previous 
token is stripped type case StrTokenObject.STRIPABLE_D: if(lastDelimType != StrTokenObject.STRIPPED) { newList.addElement(cur); lastDelimType = curDelimType; } break; } } return newList; } /** * Remove the end elements of a tokenized Vector if it is * a character in the bad end list string defined in StrTokenObject. * * @param list a vector of tokenized StrTokenObjects that will be removed * end element (character) if it is in a badEndList. * * @return Vector - tokenized StrTokenObject without any * bad end character at the end. * * @see StrTokenObject */ public static Vector CleanUpEnd(Vector list) { return CleanUpEnd(list, StrTokenObject.BAD_END_STR); } /** * Remove the end elements of a tokenized Vector if it is * a character in the bad end list string. * * @param list a vector of tokenized StrTokenObjects that will be removed * end element (character) if it is in a badEndList. * @param badEndList a String that contains all bad end characters. * The default value for badEndList is " -,:;". * * @return a vector of tokenized StrTokenObject without any bad end * character at the end. */ public static Vector CleanUpEnd(Vector list, String badEndList) { Vector newList = new Vector(list); if(newList.size() > 0) { int lastIndex = newList.size() - 1; StrTokenObject last = newList.elementAt(lastIndex); String lastTokenStr = last.GetTokenStr(); // if the end character is in badEndList, remove it while((lastTokenStr.length() == 1) && (badEndList.indexOf(lastTokenStr.charAt(0)) != -1)) { newList.remove(lastIndex); lastIndex = newList.size() -1; last = newList.elementAt(lastIndex); lastTokenStr = last.GetTokenStr(); } } return newList; } /** * Compose a string from a list of tokenized StrTokenObjects. * * @param list Vector * * @return a string that is composed from the input list. 
*/ public static String ComposeString(Vector list) { // compose String String out = new String(); for(int i = 0; i < list.size(); i++) { StrTokenObject temp = list.elementAt(i); out += temp.GetTokenStr(); } return out; } /** * Test driver for this class */ public static void main(String[] args) { if(args.length != 1) { System.out.println("** Usage: java Str "); } else { String inWord = args[0]; Vector tokenList = StripToken.Tokenize(inWord); System.out.println("---------------------------------------------"); System.out.println("in: '" + inWord + "'"); System.out.println("out: '" + StripToken.ComposeString(tokenList) + "'"); Vector newList = StripToken.CleanUpToken(tokenList); System.out.println("new: '" + StripToken.ComposeString(newList) + "'"); } } // private methods // Add strTokenObj into a Vector (list) private static void AddToList(String tokenStr, int tokenType, Vector list) { if(tokenStr != null) { StrTokenObject strTokenObj = new StrTokenObject(tokenStr, tokenType); list.addElement(strTokenObj); } } // print out all elements from a Vector (list) private static void PrintTokenVector(Vector list) { System.out.println("==============================================="); for(int i = 0; i < list.size(); i++) { StrTokenObject temp = list.elementAt(i); System.out.println(i + ": [" + temp.GetTokenType() + "]-[" + temp.GetTokenStr() + "]"); } } // data member final private static StrTokenObject SPACE_DELIM = new StrTokenObject(" ", StrTokenObject.SPACE_D); // space delimiter }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy