gov.sandia.cognition.text.token.AbstractCharacterBasedTokenizer Maven / Gradle / Ivy

Go to download
/*
 * File:                AbstractCharacterBasedTokenizer.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright March 03, 2009, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 * See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.text.token;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;

/**
 * An abstract implementation of a tokenizer that considers each character
 * individually. It takes care of most of the work and lets the subclasses
 * define what a valid token member character is.
 * 
 * @author  Justin Basilico
 * @since   3.0
 */
public abstract class AbstractCharacterBasedTokenizer
    extends AbstractTokenizer
{

    /**
     * Creates a new {@code AbstractCharacterBasedTokenizer}.
     */
    public AbstractCharacterBasedTokenizer()
    {
        super();
    }

    /**
     * Splits the characters supplied by the given reader into tokens. A token
     * is a maximal run of consecutive characters for which
     * {@link #isTokenMember(char)} returns true; every other character acts
     * as a separator and is discarded.
     *
     * Each token is created as a {@code DefaultToken} from the token text and
     * the zero-based index, counted in {@code char} values read from the
     * reader, of the first character of the token.
     *
     * The reader is wrapped in a {@code BufferedReader} for the duration of
     * the call and is read until its end. It is not closed by this method.
     *
     * @param   reader
     *      The reader to read the characters to tokenize from.
     * @return
     *      The tokens in the order they were read, or null if an
     *      {@code IOException} occurred while reading.
     */
    public Iterable tokenize(
        final Reader reader)
    {
        // Create a buffered reader so that reading one character at a time
        // does not hit the underlying reader on every call.
        final BufferedReader br = new BufferedReader(reader);
        final LinkedList<DefaultToken> result =
            new LinkedList<DefaultToken>();

        try
        {
            // We keep track of the current index into the reader plus the
            // starting point of the token being built. The characters of the
            // current token are accumulated in the string builder.
            int index = 0;
            int start = 0;
            final StringBuilder text = new StringBuilder();

            // BufferedReader.read() returns the next character as an int, or
            // -1 once the end of the stream has been reached.
            int read;
            while ((read = br.read()) >= 0)
            {
                final char c = (char) read;

                if (this.isTokenMember(c))
                {
                    if (text.length() == 0)
                    {
                        // First character of a new token: remember where
                        // the token begins.
                        start = index;
                    }

                    // This is a part of the current token, so add it.
                    text.append(c);
                }
                else if (text.length() > 0)
                {
                    // A separator ends the token that was being built.
                    result.add(new DefaultToken(text.toString(), start));

                    // Reuse the builder for the next token.
                    text.setLength(0);
                }

                index++;
            }

            // Take care of the last token, if there is one.
            if (text.length() > 0)
            {
                result.add(new DefaultToken(text.toString(), start));
            }
        }
        catch (IOException e)
        {
            // Error during tokenization. Callers of this method receive null
            // in this case, so that behavior is preserved here.
            return null;
        }

        return result;
    }

    /**
     * Determines if the given character is considered to be part of a token.
     *
     * @param   c
     *      A character.
     * @return
     *      True if the character can be part of a token; otherwise, false.
     */
    public abstract boolean isTokenMember(
        final char c);
    
}