All downloads are free. The search and download features use the official Maven repository.

org.carrot2.text.analysis.ExtendedWhitespaceTokenizer Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.analysis;

import java.io.IOException;
import java.io.Reader;

import org.carrot2.text.util.MutableCharArray;

/**
 * Whitespace-based tokenizer that, in addition to splitting input on whitespace, is
 * able to extract more complex tokens such as URLs, e-mail addresses and sentence
 * delimiters. All scanning work is delegated to a JFlex parser instance.
 */
public final class ExtendedWhitespaceTokenizer implements ITokenizer
{
    /**
     * The JFlex parser that performs the actual splitting of input into tokens.
     */
    private final ExtendedWhitespaceTokenizerImpl parser;

    /**
     * Creates a tokenizer whose parser is constructed with a <code>null</code> reader;
     * the input to tokenize is supplied afterwards through {@link #reset(Reader)}.
     */
    public ExtendedWhitespaceTokenizer()
    {
        // A typed null selects the Reader-accepting constructor of the parser.
        final Reader noInput = null;
        parser = new ExtendedWhitespaceTokenizerImpl(noInput);
    }

    /**
     * Points this tokenizer at a new input so that another stream can be parsed.
     *
     * @param input the reader handed over to the underlying parser
     */
    @Override
    public void reset(Reader input)
    {
        parser.yyreset(input);
    }

    /**
     * Fetches the next token type from the parser, translating the parser's own
     * end-of-input marker into {@link ITokenizer#TT_EOF}.
     *
     * @return the token type narrowed to <code>short</code>, or
     *         {@link ITokenizer#TT_EOF} when the parser reports
     *         <code>YYEOF</code>
     * @throws IOException declared because the underlying parser call may throw it
     */
    @Override
    public short nextToken() throws IOException
    {
        final short tokenType = (short) parser.getNextToken();
        if (tokenType == ExtendedWhitespaceTokenizerImpl.YYEOF)
        {
            return ITokenizer.TT_EOF;
        }
        return tokenType;
    }

    /**
     * Resets the provided array using the buffer, start offset and length currently
     * reported by the parser (presumably describing the most recently matched token;
     * verify against the generated parser).
     *
     * @param array the array to be reset onto the parser's buffer region
     */
    @Override
    public void setTermBuffer(MutableCharArray array)
    {
        final char [] buffer = parser.yybuffer();
        final int start = parser.yystart();
        final int length = parser.yylength();
        array.reset(buffer, start, length);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy