org.carrot2.language.Tokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-core Show documentation
Show all versions of carrot2-core Show documentation
Carrot2 Text Clustering Library
/*
* Carrot2 project.
*
* Copyright (C) 2002-2020, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* https://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.language;
import java.io.IOException;
import java.io.Reader;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.util.MutableCharArray;
/**
 * Splits input characters into tokens representing e.g. words, digits, acronyms, punctuation. For
 * each token, the following information is available:
 *
 * <dl>
 *   <dt>token type
 *   <dd>Types of tokens: numbers, URIs, punctuation, acronyms and others. See all constants in
 *       this class declared with the {@code TT_} prefix, e.g. {@link #TT_TERM}.
 *   <dt>token flags
 *   <dd>Additional token flags such as an indication whether a punctuation token is a sentence
 *       delimiter ({@link #TF_SEPARATOR_SENTENCE}).
 * </dl>
 *
 * @see TokenTypeUtils
 */
public interface Tokenizer {
  // ---------------------------------------------------------------------
  // Token types. Every TT_* code below except TT_EOF fits in the bits
  // selected by TYPE_MASK. Interface fields are implicitly
  // public static final, so the modifiers are not repeated.
  // ---------------------------------------------------------------------

  /** Token type mask: {@code 0x000f}. */
  int TYPE_MASK = 0x000f;

  int TT_TERM = 0x0001;
  int TT_NUMERIC = 0x0002;
  int TT_PUNCTUATION = 0x0003;
  int TT_EMAIL = 0x0004;
  int TT_ACRONYM = 0x0005;
  int TT_FULL_URL = 0x0006;
  int TT_BARE_URL = 0x0007;
  int TT_FILE = 0x0008;
  int TT_HYPHTERM = 0x0009;

  /** Signals that the token stream has ended. */
  int TT_EOF = -1;

  // ---------------------------------------------------------------------
  // Additional token flags, mask: 0xFF00.
  // ---------------------------------------------------------------------

  /** The current token separates sentences. */
  short TF_SEPARATOR_SENTENCE = 0x0100;

  /** The current token separates documents (never returned from parsing). */
  short TF_SEPARATOR_DOCUMENT = 0x0200;

  /** The current token separates the logical fields of a document. */
  short TF_SEPARATOR_FIELD = 0x0400;

  /** The current token terminates the input (never returned from parsing). */
  short TF_TERMINATOR = 0x0800;

  // ---------------------------------------------------------------------
  // Flags related to processing steps that run after tokenization. To
  // save some memory they are stored together with the token type, and
  // they may not be available directly from the tokenizer.
  // ---------------------------------------------------------------------

  /**
   * Marks the current token as a common word. Not directly available from the tokenizer.
   *
   * @see AllWords#type
   */
  short TF_COMMON_WORD = 0x1000;

  /**
   * Marks the current token as part of the query. Not directly available from the tokenizer.
   *
   * @see AllWords#type
   */
  short TF_QUERY_WORD = 0x2000;

  /**
   * Prepares the tokenizer for a new input, discarding whatever it was processing before.
   *
   * @param reader source of the characters to tokenize. The tokenizer does not close this reader
   *     when the end of the stream is reached; closing it remains the caller's job.
   * @throws IOException if the implementation reports an I/O failure
   */
  void reset(Reader reader) throws IOException;

  /**
   * Advances to the next token of the input stream and reports its type.
   *
   * @return the token type, expressed with {@link #TT_TERM} and the other type constants, or
   *     {@link #TT_EOF} once the end of the data stream has been reached.
   * @throws IOException if the implementation reports an I/O failure
   * @see TokenTypeUtils
   */
  short nextToken() throws IOException;

  /**
   * Sets the current token image to the provided buffer.
   *
   * @param array buffer in which the image of the current token should be stored
   */
  void setTermBuffer(MutableCharArray array);
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy