gov.sandia.cognition.text.token.AbstractCharacterBasedTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gov-sandia-cognition-text-core Show documentation
Show all versions of gov-sandia-cognition-text-core Show documentation
Algorithms and components for text analysis and information retrieval.
/*
* File: AbstractCharacterBasedTokenizer.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright March 03, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.token;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
/**
* An abstract implementation of a tokenizer that considers each character
* individually. It takes care of most of the work and lets the subclasses
* define what a valid token member character is.
*
* @author Justin Basilico
* @since 3.0
*/
public abstract class AbstractCharacterBasedTokenizer
    extends AbstractTokenizer
{

    /**
     * Creates a new {@code AbstractCharacterBasedTokenizer}.
     */
    public AbstractCharacterBasedTokenizer()
    {
        super();
    }

    /**
     * Splits the characters supplied by the given reader into tokens. Each
     * maximal run of consecutive characters for which
     * {@link #isTokenMember(char)} returns true becomes one token; every
     * other character acts as a separator and is discarded. Each token is
     * created with its text and the zero-based offset, counted in
     * characters read from the reader, of its first character.
     *
     * The reader is wrapped in a {@code BufferedReader} for reading but is
     * not closed by this method.
     *
     * @param   reader
     *      The reader to read the characters to tokenize from.
     * @return
     *      The tokens in the order they were encountered, or {@code null} if
     *      an {@code IOException} occurred while reading.
     */
    public Iterable tokenize(
        final Reader reader)
    {
        // Buffer the reader so that reading one character at a time is cheap.
        final BufferedReader br = new BufferedReader(reader);
        final LinkedList result = new LinkedList();

        try
        {
            // The builder accumulates the characters of the token currently
            // being read and start remembers where that token began.
            final StringBuilder text = new StringBuilder();
            int start = 0;

            // read() returns the next character as an int, or -1 once the
            // end of the stream has been reached.
            int read;
            for (int index = 0; (read = br.read()) != -1; index++)
            {
                final char c = (char) read;

                if (this.isTokenMember(c))
                {
                    if (text.length() == 0)
                    {
                        // First character of a new token.
                        start = index;
                    }

                    text.append(c);
                }
                else if (text.length() > 0)
                {
                    // A separator ends the token that was being built.
                    result.add(new DefaultToken(text.toString(), start));

                    // Reuse the builder rather than allocating a new one.
                    text.setLength(0);
                }
            }

            // Take care of the last token, if the input ended inside one.
            if (text.length() > 0)
            {
                result.add(new DefaultToken(text.toString(), start));
            }
        }
        catch (IOException e)
        {
            // NOTE(review): the failure is reported to the caller only as a
            // null result and the cause is dropped. This is kept as-is so
            // that existing callers checking for null keep working; consider
            // propagating the exception in a future API revision.
            return null;
        }

        return result;
    }

    /**
     * Determines if the given character is considered to be part of a token.
     *
     * @param   c
     *      A character.
     * @return
     *      True if the character can be part of a token; otherwise, false.
     */
    public abstract boolean isTokenMember(
        final char c);

}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy