gov.nih.nlm.nls.lvg.Util.StripToken Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lvg2010dist Show documentation
LVG tools is used by Apache cTAKES.
The newest version!
package gov.nih.nlm.nls.lvg.Util;
import java.util.*;
/*****************************************************************************
* This class tokenizes a string into a Vector of StrTokenObject elements
* and provides helpers to clean up and recompose such token lists.
*
* <p><b>History:</b>
* <ul>
* </ul>
*
* @author NLM NLS Development Team
*
* @see    StrTokenObject
* @see    Design document
*
* @version    V-2010
***************************************************************************/
public class StripToken
{
    // public methods
    /**
    * Tokenize a string and assigned token type to each token.  This method 
    * uses the default delimiters defined in StrTokenObject.  They are 
    * briefly described as follows:
    * 
    * restoreD: A String contains delimiters that will be restored
    *     after tokenized
    * 
strippingD: A String contains delimiters that will be restored and
    *     it's previous token will be stripped out if they are in conflict 
    *     list or their type are stripped.
    * 
strippableD: A String contains delimiters that will be restored 
    *     (stripped) if it's previous token is not stripped out (stripped).
    * 
confList: char that can not be used before a delim of strippingD
    * 
spaceD:    space deliminator for nor-restore
    * 
    * No overlap is allowed between restoreD, strippingD, & strippableD.
    *
    * @param   inStr  a String to be tokenized
    *
    * @return  Vector tokenized elements
    */
    public static Vector Tokenize(String inStr)
    {
        return Tokenize(inStr, StrTokenObject.RESTORE_D_STR, 
            StrTokenObject.STRIPPING_D_STR, StrTokenObject.STRIPABLE_D_STR,
            StrTokenObject.SPACE_D_STR);
    }
    /**
    * Tokenize a string and assigned token type to each token.
    *
    * @param   inStr  a String to be tokenized
    * @param   restoreD  a String contains delimiters that will be restored 
    *          after tokenized
    * @param   strippingD  a String contains delimiters that it's previous
    *          token will be stripped out if they are in conflict list or their
    *          type are stripped.
    * @param   strippableD  a String contains delimiters that will be restored
    *          if it's previous token is not stripped out.
    * @param   spaceD  a String contains space delimiters. 
    *
    * @return  Vector - tokenized elements
    */
    public static Vector Tokenize(String inStr, String restoreD,
        String strippingD, String strippableD, String spaceD)
    {
        // use space delim to tokenize the input string
        Vector tokenList = new Vector();
        StringTokenizer buf = new StringTokenizer(inStr, spaceD);
        while(buf.hasMoreTokens() == true)
        {
            String cur = buf.nextToken(); 
            // handle each token in details
            Vector curList 
                = Tokenize(cur, restoreD, strippingD, strippableD);
            
            tokenList.addAll(curList);                // put tokens back 
            tokenList.addElement(SPACE_DELIM);        // add a space between
        }
        // remove the last SPACE_DELIM
        int lastIndex = tokenList.size()-1;
        if(lastIndex >= 0)
        {
            tokenList.remove(lastIndex);
        }
        return tokenList;
    }
    /**
    * Tokenize a string and assigned token type to each token.
    *
    * @param   inStr  a String to be tokenized
    * @param   restoreD  a String contains delimiters that will be restored 
    *          after tokenized
    * @param   strippingD  a String contains delimiters that it's previous
    *          token will be stripped out if they are in conflict list or their
    *          type are stripped.
    * @param   strippableD  a String contains delimiters that will be restored
    *          if it's previous token is not stripped out.
    *
    * @return  Vector - tokenized elements
    */
    public static Vector Tokenize(String inStr, String restoreD,
        String strippingD, String strippableD)
    {
        Vector tokenList = new Vector();
        String allDelim = restoreD + strippingD + strippableD;
        // tokenize the input and assign the type for each token
        StringTokenizer buf = new StringTokenizer(inStr, allDelim, true);
        while(buf.hasMoreTokens() == true)
        {
            String cur = buf.nextToken();
            if(restoreD.indexOf(cur) != -1)
            {
                AddToList(cur, StrTokenObject.RESTORE_D, tokenList);
            }
            else if(strippingD.indexOf(cur) != -1)
            {
                AddToList(cur, StrTokenObject.STRIPPING_D, tokenList);
            }
            else if(strippableD.indexOf(cur) != -1)
            {
                AddToList(cur, StrTokenObject.STRIPABLE_D, tokenList);
            }
            else
            {
                AddToList(cur, StrTokenObject.TOKEN, tokenList);
            }
        }
        return tokenList;
    }
    /**
    * Clean up elements in the tokenized Vector.  This method includes lots of
    * smart operations while restoring tokens into a string.  This is used
    * when strip with punctuations.  In such case, some punctuations need to be
    * stripped automatically.  This method uses the defualt conflict string
    * " -,:;".
    *
    * @param   list  a tokenized vector.
    *
    * @return  a vector with clean-up tokenized elements.
    *
    * @see    StrTokenObject
    */
    public static Vector CleanUpToken(
        Vector list)
    {
        return CleanUpToken(list, StrTokenObject.CONFLICT_STR);
    }
    /**
    * Clean up elements in the tokenized Vector.  This method includes lots of
    * smart operations while restoring tokens into a string.  This is used
    * when strip with punctuations.  In such case, some punctuations need to be
    * stripped automatically.
    *
    * @param   list  a tokenized vector.
    * @param   confList   a string contians all conflict characters.  
    *          A conflict character is a character need to be stripped if
    *          the following token is stripping type.
    *
    * @return  a vector with clean-up tokenized elements.
    */
    public static Vector CleanUpToken(
        Vector list, String confList)
    {
        Vector newList = new Vector();
        int lastDelimType = StrTokenObject.NONE;    // type of last token
        // go through the list
        for(int i = 0; i < list.size(); i++)
        {
            StrTokenObject cur = list.elementAt(i);
            String curToken = cur.GetTokenStr();
            int curDelimType = cur.GetTokenType();
            switch(curDelimType)
            {
                // add the element if it is a token type
                case StrTokenObject.TOKEN:        
                    newList.addElement(cur);
                    lastDelimType = curDelimType;
                    break;
                // add the element if it is a space type and the previous token
                // is a token, restored, stripping, or stripable delimiter
                case StrTokenObject.SPACE_D:
                    if((lastDelimType == StrTokenObject.TOKEN)
                    || (lastDelimType == StrTokenObject.RESTORE_D)
                    || (lastDelimType == StrTokenObject.STRIPPING_D)
                    || (lastDelimType == StrTokenObject.STRIPABLE_D))
                    {
                        newList.addElement(cur);
                        lastDelimType = curDelimType;
                    }
                    break;
                // Don't add (remove) the element if it is a stripped type
                case StrTokenObject.STRIPPED:
                    lastDelimType = curDelimType;
                    break;
                // Add the element if it is a restore type
                case StrTokenObject.RESTORE_D:
                    newList.addElement(cur);
                    lastDelimType = curDelimType;
                    break;
                case StrTokenObject.STRIPPING_D:
                    //remove conflict list
                    if(newList.size() > 0)        // check if any char before it 
                    {
                        int prevIndex = newList.size() - 1;
                        StrTokenObject prev = newList.elementAt(prevIndex);
                        int prevTokenType = prev.GetTokenType();
                        String prevTokenStr = prev.GetTokenStr();
                        // remove previous element if the previous element is 
                        // stripped type or a character in conflict list
                        while((prevTokenType == StrTokenObject.STRIPPED)
                        || ((prevTokenStr.length() == 1)
                        && (confList.indexOf(prevTokenStr.charAt(0)) != -1))) 
                        {
                            newList.remove(prevIndex);
                            prevIndex = newList.size() -1;
                            prev = newList.elementAt(prevIndex);
                            prevTokenStr = prev.GetTokenStr();
                            prevTokenType = prev.GetTokenType();
                        }
                    }
                    // add the element if the newList is empty
                    newList.addElement(cur);
                    lastDelimType = curDelimType;
                    break;
                // Add the element if the previous token is stripped type
                case StrTokenObject.STRIPABLE_D:
                    if(lastDelimType != StrTokenObject.STRIPPED)
                    {
                        newList.addElement(cur);
                        lastDelimType = curDelimType;
                    }
                    break;
            }
        }
        return newList;
    }
    /**
    * Remove the end elements of a tokenized Vector if it is 
    * a character in the bad end list string defined in StrTokenObject.
    *
    * @param   list  a vector of tokenized StrTokenObjects that will be removed
    *          end element (character) if it is in a badEndList.
    *
    * @return  Vector - tokenized StrTokenObject without any 
    *          bad end character at the end.
    *
    * @see    StrTokenObject
    */
    public static Vector CleanUpEnd(Vector list)
    {
        return CleanUpEnd(list, StrTokenObject.BAD_END_STR);
    }
    /**
    * Remove the end elements of a tokenized Vector if it is 
    * a character in the bad end list string.
    *
    * @param   list  a vector of tokenized StrTokenObjects that will be removed
    *          end element (character) if it is in a badEndList.
    * @param   badEndList  a String that contains all bad end characters.
    *          The default value for badEndList is " -,:;".
    *
    * @return  a vector of tokenized StrTokenObject without any bad end 
    *          character at the end.
    */
    public static Vector CleanUpEnd(Vector list,
        String badEndList)
    {
        Vector newList = new Vector(list);
        if(newList.size() > 0)
        {
            int lastIndex = newList.size() - 1;
            StrTokenObject last = newList.elementAt(lastIndex);
            String lastTokenStr = last.GetTokenStr();
            // if the end character is in badEndList, remove it 
            while((lastTokenStr.length() == 1)
            && (badEndList.indexOf(lastTokenStr.charAt(0)) != -1))
            {
                newList.remove(lastIndex);
                lastIndex = newList.size() -1;
                last = newList.elementAt(lastIndex);
                lastTokenStr = last.GetTokenStr();
            }
        }
        return newList;
    }
    /**
    * Compose a string from a list of tokenized StrTokenObjects.
    *
    * @param   list  Vector
    *
    * @return  a string that is composed from the input list.
    */
    public static String ComposeString(Vector list)
    {
        // compose String
        String out = new String();
        for(int i = 0; i < list.size(); i++)
        {
            StrTokenObject temp = list.elementAt(i);
            out += temp.GetTokenStr();
        }
        return out;
    }
    /**
    *    Test driver for this class
    */
    public static void main(String[] args)
    {
        if(args.length != 1)
        {
            System.out.println("** Usage: java Str ");
        }
        else
        {
            String inWord = args[0];
            Vector tokenList = StripToken.Tokenize(inWord);
            System.out.println("---------------------------------------------");
            System.out.println("in:  '" + inWord + "'");
            System.out.println("out: '" + StripToken.ComposeString(tokenList) 
                + "'");
            Vector newList = StripToken.CleanUpToken(tokenList);
            System.out.println("new: '" + StripToken.ComposeString(newList) 
                + "'");
        }
    }
    // private methods
    // Add strTokenObj into a Vector (list)
    private static void AddToList(String tokenStr, int tokenType, 
        Vector list)
    {
        if(tokenStr != null)
        {
            StrTokenObject strTokenObj = 
                new StrTokenObject(tokenStr, tokenType);
            list.addElement(strTokenObj);
        }
    }
    // print out all elements from a Vector (list)
    private static void PrintTokenVector(Vector list)
    {
        System.out.println("===============================================");
        for(int i = 0; i < list.size(); i++)
        {
            StrTokenObject temp = list.elementAt(i);
            System.out.println(i + ": [" + temp.GetTokenType() + "]-["
                + temp.GetTokenStr() + "]");
        }
    }
    // data member
    final private static StrTokenObject SPACE_DELIM = 
        new StrTokenObject(" ", StrTokenObject.SPACE_D);    // space delimiter
}