All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.carrot2.text.analysis.ITokenizer Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.analysis;

import java.io.IOException;
import java.io.Reader;

import org.carrot2.text.preprocessing.LanguageModelStemmer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.preprocessing.StopListMarker;
import org.carrot2.text.util.MutableCharArray;

/**
 * Breaks a stream of input characters into tokens such as words, digits, acronyms
 * and punctuation. Two pieces of information are reported for every token:
 *
 * <dl>
 * <dt>token type</dt>
 * <dd>The kind of token: numbers, URIs, punctuation, acronyms and others. See the
 * constants of this interface whose names start with <code>TT_</code>, e.g.
 * {@link #TT_TERM}.</dd>
 *
 * <dt>token flags</dt>
 * <dd>Extra flags attached to the token, for example whether a punctuation token
 * delimits a sentence ({@link #TF_SEPARATOR_SENTENCE}).</dd>
 * </dl>
 *
 * @see TokenTypeUtils
 */
public interface ITokenizer
{
    /*
     * Token types. The type occupies the bits selected by TYPE_MASK (0x000f).
     */

    /** Bit mask selecting the token type: 0x000f. */
    int TYPE_MASK = 0x000f;

    int TT_TERM = 0x0001;
    int TT_NUMERIC = 0x0002;
    int TT_PUNCTUATION = 0x0003;
    int TT_EMAIL = 0x0004;
    int TT_ACRONYM = 0x0005;
    int TT_FULL_URL = 0x0006;
    int TT_BARE_URL = 0x0007;
    int TT_FILE = 0x0008;
    int TT_HYPHTERM = 0x0009;

    /**
     * Marks the end of the token stream.
     */
    int TT_EOF = -1;

    /*
     * Additional token flags, mask: 0xFF00.
     */

    /** The current token separates sentences. */
    short TF_SEPARATOR_SENTENCE = 0x0100;

    /** The current token separates documents (never returned from parsing). */
    short TF_SEPARATOR_DOCUMENT = 0x0200;

    /** The current token separates the logical fields of a document. */
    short TF_SEPARATOR_FIELD = 0x0400;

    /** The current token terminates the input (never returned from parsing). */
    short TF_TERMINATOR = 0x0800;

    /*
     * Flags set by processing steps that run after tokenization. They share storage
     * with the token type to save some memory, and may not be available directly
     * from the tokenizer.
     */

    /**
     * The current token is a common word. Not directly available from the tokenizer.
     *
     * @see AllWords#type
     * @see StopListMarker
     */
    short TF_COMMON_WORD = 0x1000;

    /**
     * The current token is part of the query. Not directly available from the
     * tokenizer.
     *
     * @see AllWords#type
     * @see LanguageModelStemmer
     */
    short TF_QUERY_WORD = 0x2000;

    /**
     * Resets the tokenizer so that it processes new data.
     *
     * @param reader the input to tokenize. The tokenizer does not close this reader
     *            when the end of the stream is reached.
     * @throws IOException if the implementation encounters an I/O error
     */
    void reset(Reader reader) throws IOException;

    /**
     * Advances to, and returns, the next token of the input stream.
     *
     * @return the token type, as defined by {@link #TT_TERM} and the other type
     *         constants, or {@link #TT_EOF} once the end of the data stream has
     *         been reached.
     * @throws IOException if the implementation encounters an I/O error
     * @see TokenTypeUtils
     */
    short nextToken() throws IOException;

    /**
     * Sets the current token image to the provided buffer.
     *
     * @param array buffer in which the current token image should be stored
     */
    void setTermBuffer(MutableCharArray array);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy