// org.carrot2.text.analysis.ITokenizer Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.analysis;

import java.io.IOException;
import java.io.Reader;

import org.carrot2.text.preprocessing.LanguageModelStemmer;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.preprocessing.StopListMarker;
import org.carrot2.text.util.MutableCharArray;

/**
 * Breaks a stream of input characters into tokens such as words, digits, acronyms
 * and punctuation. Every token carries the following information:
 * <dl>
 * <dt>token type</dt>
 * <dd>The kind of token: numbers, URIs, punctuation, acronyms and others. See the
 * constants declared in this interface with the <code>TT_</code> prefix, e.g.
 * {@link #TT_TERM}.</dd>
 * <dt>token flags</dt>
 * <dd>Extra token flags, for example an indication that a punctuation token is a
 * sentence delimiter ({@link #TF_SEPARATOR_SENTENCE}).</dd>
 * </dl>
 *
 * @see TokenTypeUtils
 */
public interface ITokenizer
{
    // Fields of an interface are implicitly public, static and final, and its
    // methods are implicitly public, so those modifiers are not spelled out below.

    /*
     * Token types live in the low four bits, mask: 0x000f
     */
    int TYPE_MASK = 0x000f;

    int TT_TERM = 0x0001;
    int TT_NUMERIC = 0x0002;
    int TT_PUNCTUATION = 0x0003;
    int TT_EMAIL = 0x0004;
    int TT_ACRONYM = 0x0005;
    int TT_FULL_URL = 0x0006;
    int TT_BARE_URL = 0x0007;
    int TT_FILE = 0x0008;
    int TT_HYPHTERM = 0x0009;

    /**
     * Marks the end of the token stream.
     */
    int TT_EOF = -1;

    /*
     * Additional token flags, all within mask: 0xFF00
     */

    /** The current token separates sentences. */
    short TF_SEPARATOR_SENTENCE = 0x0100;

    /** The current token separates documents (never returned from parsing). */
    short TF_SEPARATOR_DOCUMENT = 0x0200;

    /** The current token separates the logical fields of a document. */
    short TF_SEPARATOR_FIELD = 0x0400;

    /** The current token terminates the input (never returned from parsing). */
    short TF_TERMINATOR = 0x0800;

    /*
     * The flags below belong to processing steps that run after tokenization. They
     * share storage with the token type to save some memory, and may not be
     * available directly from the tokenizer.
     */

    /**
     * The current token is a common word. The tokenizer does not provide this flag
     * directly.
     *
     * @see AllWords#type
     * @see StopListMarker
     */
    short TF_COMMON_WORD = 0x1000;

    /**
     * The current token is part of the query. The tokenizer does not provide this
     * flag directly.
     *
     * @see AllWords#type
     * @see LanguageModelStemmer
     */
    short TF_QUERY_WORD = 0x2000;

    /**
     * Prepares the tokenizer for processing new data.
     *
     * @param reader the input to tokenize. The tokenizer will not close the reader
     *            when the end of stream is reached.
     * @throws IOException if an I/O error occurs
     */
    void reset(Reader reader) throws IOException;

    /**
     * Advances to the next token of the input stream and reports its type.
     *
     * @return the token's type, as defined by {@link #TT_TERM} and the other
     *         constants, or {@link #TT_EOF} once the end of the data stream has
     *         been reached.
     * @throws IOException if an I/O error occurs
     * @see TokenTypeUtils
     */
    short nextToken() throws IOException;

    /**
     * Sets the image of the current token to the provided buffer.
     *
     * @param array the buffer in which the current token image should be stored
     */
    void setTermBuffer(MutableCharArray array);
}