org.dmd.util.parsing.ClassifierNew Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of dark-matter-data Show documentation
The dark-matter project provides a mechanism to define concepts associated with Domain Specific Languages (DSLs) and to generate code that can be extended with business logic supporting the purpose of the given DSL.
There is a newer version: 3.1.15
Show newest version
//	---------------------------------------------------------------------------
//	dark-matter-data
//	Copyright (c) 2010 dark-matter-data committers
//	---------------------------------------------------------------------------
//	This program is free software; you can redistribute it and/or modify it
//	under the terms of the GNU Lesser General Public License as published by the
//	Free Software Foundation; either version 3 of the License, or (at your
//	option) any later version.
//	This program is distributed in the hope that it will be useful, but WITHOUT
//	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
//	FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
//	more details.
//	You should have received a copy of the GNU Lesser General Public License along
//	with this program; if not, see <http://www.gnu.org/licenses/>.
//	---------------------------------------------------------------------------
package org.dmd.util.parsing;

import java.util.ArrayList;
import java.util.StringTokenizer;

/**
 * The Classifier class takes an input string, breaks it into
 * tokens and classifies each token as either a basic token type
 * eg. (Token.TEXT, Token.INTEGER, Token.FLOAT) or as a
 * custom token type defined by the user. This approach is
 * especially useful when an entire line of input has to be parsed
 * and checked against some allowable format.
 * @see org.dmd.util.parsing.Token
 * @see org.dmd.util.parsing.TokenArrayList 
 */

public class ClassifierNew {
    /**
     * The default whitespace delimiters used to break down the strings
     * to be parsed.
     */
    private static final String DEFAULT_DELIMITERS = " \t\r\n\f";

    /**
     * This dictionary stores the tokens that are to be considered keywords
     * by the classifier.
     */
    private final Dictionary dict;

    /**
     * These are the additional characters that are to be treated as separators
     * during the parsing of the string. However, unlike the default whitespace
     * delimiters, these separators have an integer type associated with them.
     */
    private final Dictionary separators;

    /**
     * The complete set of delimiter characters handed to the tokenizer:
     * the default whitespace (when enabled) plus every separator added
     * through {@link #addSeparator(String, int)}.
     */
    private final StringBuilder delimiters;

    /**
     * Creates a classifier with an empty keyword dictionary that uses the
     * default whitespace delimiters.
     */
    public ClassifierNew() {
        this(new Dictionary(), true);
    }

    /**
     * Creates a new classifier that recognizes the tokens specified in the
     * given dictionary and uses the default whitespace delimiters.
     * @param d The dictionary that contains keyword tokens.
     */
    public ClassifierNew(Dictionary d) {
        this(d, true);
    }

    /**
     * Creates a new classifier that recognizes the tokens specified in the
     * given dictionary.
     * @param d The dictionary that contains keyword tokens.
     * @param useDefaults A flag that indicates whether or not we should use the
     * default delimiter characters ' ', \t \r \n \f when tokenizing a string. If
     * this flag is set to false, you should add your own separator characters using
     * the addSeparator() method.
     */
    public ClassifierNew(Dictionary d, boolean useDefaults) {
        dict = d;
        separators = new Dictionary();
        delimiters = useDefaults ? new StringBuilder(DEFAULT_DELIMITERS) : new StringBuilder();
    }

    /**
     * Adds a new keyword to the classifier's dictionary.
     *
     * The text should be 2 or more characters in length. If you want to recognize
     * a single character as a token, you should add it using the addSeparator()
     * method.
     * @param k The text to be recognized as a keyword.
     * @param i Its unique numeric identifier.
     * @return the result of adding the keyword to the dictionary: true if the
     * keyword and its identifier are unique and false otherwise.
     */
    public boolean addKeyword(String k, int i) {
        return dict.add(k, i);
    }

    /**
     * Tries to find the keyword in our dictionary.
     * @param k the string to find.
     * @return true if it's a keyword and false otherwise.
     */
    public boolean isKeyWord(String k) {
        return dict.find(k) != null;
    }

    /**
     * Adds a new separator to the classifier.
     *
     * The separator must be exactly one character long, because
     * {@link #classify(String, boolean)} only looks up single-character
     * tokens as separators. A null, empty or longer string is rejected
     * without touching the delimiter set.
     *
     * @param c A string containing a single character to be recognized as a separator.
     * @param i Its unique numeric identifier.
     * @return true if the separator was added; false if {@code c} is not a
     * single character or if the separator or its identifier is not unique.
     */
    public boolean addSeparator(String c, int i) {
        // Appending a null or multi-character string would silently turn each
        // of its characters into an untyped delimiter.
        if (c == null || c.length() != 1) {
            return false;
        }

        if (!separators.add(c, i)) {
            return false;
        }

        // Keep the keyword dictionary's maximum ID in step with separator IDs.
        if (i > dict.getMaxID()) {
            dict.setMaxID(i);
        }
        delimiters.append(c);
        return true;
    }

    /**
     * The classify function breaks down the specified string into tokens and
     * passes back a list of classified tokens.
     *
     * Whitespace characters that act as delimiters are always discarded.
     * Every returned token has its position set to the character offset in
     * {@code s} at which its text starts.
     *
     * @param s The string to be parsed. A null string yields an empty list.
     * @param returnSeps Set to true if the separators should be returned as tokens.
     * @return the classified tokens in the order they appear in {@code s}.
     */
    public ArrayList<Token> classify(String s, boolean returnSeps) {
        ArrayList<Token> tokens = new ArrayList<Token>();
        if (s == null) {
            return tokens;
        }

        // Offset in s at which the current piece of text starts.
        int tokenPos = 0;

        // The tokenizer is given the complete delimiter set and told to hand
        // the delimiters back as tokens, so every character of s is seen
        // exactly once and the running offset stays accurate.
        StringTokenizer st = new StringTokenizer(s, delimiters.toString(), true);

        while (st.hasMoreTokens()) {
            String curr = st.nextToken();
            Token classified = classifyText(curr, returnSeps);

            if (classified != null) {
                classified.setPos(tokenPos);
                tokens.add(classified);
            }

            // Advance by the text actually consumed, whether or not it
            // produced a token.
            tokenPos += curr.length();
        }

        return tokens;
    }

    /**
     * Classifies a single piece of text produced by the tokenizer.
     * @param text The text to classify; never empty.
     * @param returnSeps Whether separators should produce a token.
     * @return a new token for the text, or null if the text is whitespace or
     * a separator that should not be returned.
     */
    private Token classifyText(String text, boolean returnSeps) {
        if (text.length() == 1) {
            // Whitespace is dropped even if it was registered as a separator.
            if (Character.isWhitespace(text.charAt(0))) {
                return null;
            }

            Token sep = separators.find(text);
            if (sep != null) {
                return returnSeps ? new Token(sep) : null;
            }
        }

        Token key = dict.find(text);
        if (key != null) {
            return new Token(key);
        }

        return new Token(text, getType(text));
    }

    /**
     * Returns the basic type of a piece of text that isn't a separator or keyword.
     * @param s The string to be tested.
     * @return INTEGER if the text represents an integer, FLOAT if it represents
     * a float and TEXT otherwise.
     */
    private int getType(String s) {
        if (isInteger(s)) {
            return Token.INTEGER;
        }
        if (isFloat(s)) {
            return Token.FLOAT;
        }
        return Token.TEXT;
    }

    /**
     * Checks to see if the specified string is an integer, i.e. whether
     * {@link Integer#parseInt(String)} accepts it. Whole numbers outside the
     * int range are therefore not considered integers.
     * @param s The string to be tested.
     * @return true if the string represents an integer and false otherwise
     */
    private boolean isInteger(String s) {
        try {
            Integer.parseInt(s);
            return true;
        }
        catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * Checks to see if the specified string is a float value, i.e. whether
     * {@link Float#parseFloat(String)} accepts it.
     * @param s The string to be tested.
     * @return true if the string represents a float value and false otherwise
     */
    private boolean isFloat(String s) {
        try {
            Float.parseFloat(s);
            return true;
        }
        catch (NumberFormatException e) {
            return false;
        }
    }

}