org.carrot2.language.Tokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-core Show documentation
Carrot2 Text Clustering Library
There is a newer version: 4.6.0
/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2021, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * https://www.carrot2.org/carrot2.LICENSE
 */
package org.carrot2.language;

import java.io.IOException;
import java.io.Reader;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.util.MutableCharArray;

/**
 * Splits input characters into tokens representing e.g. words, digits, acronyms, punctuation. For
 * each token, the following information is available:
 *
 * <dl>
 *   <dt>token type
 *   <dd>Types of tokens: numbers, URIs, punctuation, acronyms and others. See all constants in this
 *       class declared with TT_ prefix, e.g. {@link #TT_TERM}.
 *   <dt>token flags
 *   <dd>Additional token flags such as an indication whether a punctuation token is a sentence
 *       delimiter ({@link #TF_SEPARATOR_SENTENCE}).
 * </dl>
 *
 * @see TokenTypeUtils
 */
public interface Tokenizer {
  // Interface fields are implicitly public, static and final, so the modifiers are omitted.

  /** Bit mask selecting the token type: 0x000f. */
  int TYPE_MASK = 0x000f;

  // Token type codes (all within TYPE_MASK).
  int TT_TERM = 0x0001;
  int TT_NUMERIC = 0x0002;
  int TT_PUNCTUATION = 0x0003;
  int TT_EMAIL = 0x0004;
  int TT_ACRONYM = 0x0005;
  int TT_FULL_URL = 0x0006;
  int TT_BARE_URL = 0x0007;
  int TT_FILE = 0x0008;
  int TT_HYPHTERM = 0x0009;

  /** Marks the end of the token stream. */
  int TT_EOF = -1;

  // Additional token flags, mask: 0xFF00.

  /** Flag: the current token is a sentence separator. */
  short TF_SEPARATOR_SENTENCE = 0x0100;

  /** Flag: the current token is a document separator (never returned from parsing). */
  short TF_SEPARATOR_DOCUMENT = 0x0200;

  /** Flag: the current token separates the logical fields of a document. */
  short TF_SEPARATOR_FIELD = 0x0400;

  /** Flag: the current token terminates the input (never returned from parsing). */
  short TF_TERMINATOR = 0x0800;

  // The flags below belong to processing steps that run after tokenization. They are stored
  // together with the token type to save some memory and may not be available directly from
  // the tokenizer.

  /**
   * Flag: the current token is a common word. Not directly available from the tokenizer.
   *
   * @see AllWords#type
   */
  short TF_COMMON_WORD = 0x1000;

  /**
   * Flag: the current token is part of the query. Not directly available from the tokenizer.
   *
   * @see AllWords#type
   */
  short TF_QUERY_WORD = 0x2000;

  /**
   * Resets the tokenizer so that it processes new data.
   *
   * @param reader the input to tokenize. The tokenizer will not close the reader when the end of
   *     the stream is reached.
   * @throws IOException if an I/O error occurs
   */
  void reset(Reader reader) throws IOException;

  /**
   * Advances to and returns the next token of the input stream.
   *
   * @return the token's type, as defined by {@link #TT_TERM} and the other constants, or {@link
   *     #TT_EOF} once the end of the data stream has been reached.
   * @throws IOException if an I/O error occurs
   * @see TokenTypeUtils
   */
  short nextToken() throws IOException;

  /**
   * Sets the current token image to the provided buffer.
   *
   * @param array the buffer in which the image of the current token should be stored
   */
  void setTermBuffer(MutableCharArray array);
}