org.carrot2.text.analysis.ExtendedWhitespaceTokenizer Maven / Gradle / Ivy
/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.analysis;
import java.io.IOException;
import java.io.Reader;
import org.carrot2.text.util.MutableCharArray;
/**
 * An {@link ITokenizer} that splits its input on whitespace while still recognizing
 * more elaborate tokens, such as URLs, e-mail addresses and sentence delimiters.
 * All scanning work is delegated to a JFlex-generated parser.
 */
public final class ExtendedWhitespaceTokenizer implements ITokenizer
{
    /**
     * The JFlex parser that performs the actual splitting of the input into tokens.
     */
    private final ExtendedWhitespaceTokenizerImpl parser;

    /**
     * Creates a tokenizer whose parser is constructed over a <code>null</code> reader;
     * the input to parse is supplied afterwards through {@link #reset(Reader)}.
     */
    public ExtendedWhitespaceTokenizer()
    {
        this.parser = new ExtendedWhitespaceTokenizerImpl((Reader) null);
    }

    /**
     * Re-targets this tokenizer at another character stream by resetting the
     * underlying parser with it.
     *
     * @param input the stream to be tokenized next
     */
    @Override
    public void reset(Reader input)
    {
        parser.yyreset(input);
    }

    /**
     * Fetches the next token code from the parser, narrowed to <code>short</code>.
     *
     * @return {@link ITokenizer#TT_EOF} when the parser reports its own end-of-input
     *         marker (<code>YYEOF</code>), otherwise the code returned by the parser
     * @throws IOException if the parser fails while reading the input
     */
    @Override
    public short nextToken() throws IOException
    {
        final short tokenCode = (short) this.parser.getNextToken();

        // The parser has its own end-of-input constant; callers expect the ITokenizer one.
        if (tokenCode == ExtendedWhitespaceTokenizerImpl.YYEOF)
        {
            return ITokenizer.TT_EOF;
        }
        return tokenCode;
    }

    /**
     * Resets the given array with the parser's buffer, start position and length
     * (presumably delimiting the text of the most recently returned token -- confirm
     * against the parser implementation).
     *
     * @param array the array to be reset with the parser's buffer, start and length
     */
    @Override
    public void setTermBuffer(MutableCharArray array)
    {
        array.reset(
            this.parser.yybuffer(),
            this.parser.yystart(),
            this.parser.yylength());
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy