All downloads are FREE. The search and download functionality uses the official Maven repository.

org.carrot2.language.Tokenizer Maven / Gradle / Ivy

There is a newer version: 4.6.0
Show newest version
/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2021, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * https://www.carrot2.org/carrot2.LICENSE
 */
package org.carrot2.language;

import java.io.IOException;
import java.io.Reader;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.util.MutableCharArray;

/**
 * Breaks a stream of input characters into tokens such as words, digits, acronyms or punctuation.
 * Two pieces of information are reported for every token:
 *
 * <dl>
 *   <dt>token type
 *   <dd>The kind of token: numbers, URIs, punctuation, acronyms and others. The available types
 *       are the constants of this interface whose names start with {@code TT_}, e.g. {@link
 *       #TT_TERM}.
 *   <dt>token flags
 *   <dd>Extra flags, for example whether a punctuation token delimits a sentence ({@link
 *       #TF_SEPARATOR_SENTENCE}).
 * </dl>
 *
 * <p>Interface fields are implicitly {@code public static final}, so the modifiers are omitted on
 * the constants below.
 *
 * @see TokenTypeUtils
 */
public interface Tokenizer {
  /*
   * Token type mask: 0x000f
   */

  /** Bit mask selecting the token type portion of a token code. */
  int TYPE_MASK = 0x000f;

  /*
   * Token type codes. The values 0x0001 through 0x0009 all fall within TYPE_MASK.
   */

  int TT_TERM = 0x0001;
  int TT_NUMERIC = 0x0002;
  int TT_PUNCTUATION = 0x0003;
  int TT_EMAIL = 0x0004;
  int TT_ACRONYM = 0x0005;
  int TT_FULL_URL = 0x0006;
  int TT_BARE_URL = 0x0007;
  int TT_FILE = 0x0008;
  int TT_HYPHTERM = 0x0009;

  /** Signals that the token stream has ended. */
  int TT_EOF = -1;

  /*
   * Additional token flags, mask: 0xFF00
   */

  /** The current token separates sentences. */
  short TF_SEPARATOR_SENTENCE = 0x0100;

  /** The current token separates documents (never returned from parsing). */
  short TF_SEPARATOR_DOCUMENT = 0x0200;

  /** The current token separates the logical fields of a document. */
  short TF_SEPARATOR_FIELD = 0x0400;

  /** The current token terminates the input (never returned from parsing). */
  short TF_TERMINATOR = 0x0800;

  /*
   * Flags set by processing steps that run after tokenization. They share storage
   * with the token type to save some memory, and may not be available directly
   * from the tokenizer.
   */

  /**
   * Marks the current token as a common word. The tokenizer itself does not directly provide this
   * flag.
   *
   * @see AllWords#type
   */
  short TF_COMMON_WORD = 0x1000;

  /**
   * Marks the current token as being part of the query. The tokenizer itself does not directly
   * provide this flag.
   *
   * @see AllWords#type
   */
  short TF_QUERY_WORD = 0x2000;

  /**
   * Resets this tokenizer so that it processes new data.
   *
   * @param reader the input to tokenize. The tokenizer will not close this reader when the end of
   *     the stream is reached.
   */
  void reset(Reader reader) throws IOException;

  /**
   * Advances to and returns the next token of the input stream.
   *
   * @return the token's type, as defined by {@link #TT_TERM} and the other constants, or {@link
   *     #TT_EOF} once the end of the data stream has been reached.
   * @see TokenTypeUtils
   */
  short nextToken() throws IOException;

  /**
   * Sets the current token image to the given buffer.
   *
   * @param array the buffer in which the image of the current token should be stored
   */
  void setTermBuffer(MutableCharArray array);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy