org.carrot2.text.analysis.ITokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.analysis;
import java.io.IOException;
import java.io.Reader;
import org.carrot2.text.preprocessing.LanguageModelStemmer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.preprocessing.StopListMarker;
import org.carrot2.text.util.MutableCharArray;
/**
 * Breaks a stream of input characters into tokens such as words, digits, acronyms
 * or punctuation. Two pieces of information are available for every token:
 * <dl>
 * <dt>token type</dt>
 * <dd>The kind of the token: numbers, URIs, punctuation, acronyms and others. The
 * available types are the constants of this interface declared with the
 * <code>TT_</code> prefix, e.g. {@link #TT_TERM}.</dd>
 * <dt>token flags</dt>
 * <dd>Additional markers attached to a token, for example an indication that a
 * punctuation token is a sentence delimiter ({@link #TF_SEPARATOR_SENTENCE}).</dd>
 * </dl>
 *
 * @see TokenTypeUtils
 */
public interface ITokenizer {
    // ---------------------------------------------------------------------------
    // Token types. Every TT_ value below except TT_EOF fits in the low four bits
    // selected by TYPE_MASK (0x000f).
    // ---------------------------------------------------------------------------

    /** Bit mask selecting the token type portion of a token's type-and-flags value. */
    int TYPE_MASK = 0x000f;

    int TT_TERM = 0x0001;
    int TT_NUMERIC = 0x0002;
    int TT_PUNCTUATION = 0x0003;
    int TT_EMAIL = 0x0004;
    int TT_ACRONYM = 0x0005;
    int TT_FULL_URL = 0x0006;
    int TT_BARE_URL = 0x0007;
    int TT_FILE = 0x0008;
    int TT_HYPHTERM = 0x0009;

    /**
     * Marks the end of the token stream. Unlike the other token type constants,
     * this is a negative sentinel value.
     */
    int TT_EOF = -1;

    // ---------------------------------------------------------------------------
    // Additional token flags. Each flag is a single bit within the 0xFF00 mask.
    // ---------------------------------------------------------------------------

    /** The current token separates sentences. */
    short TF_SEPARATOR_SENTENCE = 0x0100;

    /** The current token separates documents (never returned from parsing). */
    short TF_SEPARATOR_DOCUMENT = 0x0200;

    /** The current token separates logical fields of a document. */
    short TF_SEPARATOR_FIELD = 0x0400;

    /** The current token terminates the input (never returned from parsing). */
    short TF_TERMINATOR = 0x0800;

    // ---------------------------------------------------------------------------
    // Flags assigned by processing steps that run after tokenization. They share
    // storage with the token type to save some memory and may not be available
    // directly from the tokenizer.
    // ---------------------------------------------------------------------------

    /**
     * Marks the current token as a common word. The tokenizer itself does not
     * provide this flag directly.
     *
     * @see AllWords#type
     * @see StopListMarker
     */
    short TF_COMMON_WORD = 0x1000;

    /**
     * Marks the current token as being part of the query. The tokenizer itself does
     * not provide this flag directly.
     *
     * @see AllWords#type
     * @see LanguageModelStemmer
     */
    short TF_QUERY_WORD = 0x2000;

    /**
     * Prepares the tokenizer for processing a new input.
     *
     * @param reader source of the characters to tokenize. The tokenizer will not
     *            close this reader when the end of the stream is reached.
     * @throws IOException if the implementation encounters an I/O error
     */
    void reset(Reader reader) throws IOException;

    /**
     * Advances to the next token of the input stream and reports its type.
     *
     * @return the token's type, as defined by {@link #TT_TERM} and the other
     *         <code>TT_</code> constants, or {@link #TT_EOF} once the end of the
     *         data stream has been reached.
     * @throws IOException if the implementation encounters an I/O error
     * @see TokenTypeUtils
     */
    short nextToken() throws IOException;

    /**
     * Sets the current token image to the supplied buffer.
     *
     * @param array the buffer in which the image of the current token should be
     *            stored
     */
    void setTermBuffer(MutableCharArray array);
}