/*
 * Twitter Korean Text - Scala library to process Korean text
 *
 * Copyright 2014 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.openkoreantext.processor

import org.openkoreantext.processor.normalizer.KoreanNormalizer
import org.openkoreantext.processor.phrase_extractor.KoreanPhraseExtractor
import org.openkoreantext.processor.phrase_extractor.KoreanPhraseExtractor.KoreanPhrase
import org.openkoreantext.processor.stemmer.KoreanStemmer
import org.openkoreantext.processor.tokenizer.KoreanTokenizer.KoreanToken
import org.openkoreantext.processor.tokenizer._
import org.openkoreantext.processor.util.{KoreanDictionaryProvider, KoreanPos}

/**
  * OpenKoreanTextProcessor provides error- and slang-tolerant Korean tokenization.
  */
object OpenKoreanTextProcessor {
  /**
    * Normalize Korean text. Uses KoreanNormalizer.normalize().
    *
    * @param text Input text
    * @return Normalized Korean text
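    *
    * Example (a usage sketch; the normalized output shown is illustrative and
    * may vary by version):
    * {{{
    *   OpenKoreanTextProcessor.normalize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ")
    *   // 한국어를 처리하는 예시입니다ㅋㅋㅋ
    * }}}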
    */
  def normalize(text: CharSequence): CharSequence = KoreanNormalizer.normalize(text)


  /**
    * Tokenize text into a sequence of KoreanTokens, which includes part-of-speech information and
    * whether a token is an out-of-vocabulary term.
    *
    * @param text input text
    * @return A sequence of KoreanTokens.
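    *
    * Example (a usage sketch; the token rendering is illustrative):
    * {{{
    *   val tokens = OpenKoreanTextProcessor.tokenize("한국어를 처리합니다")
    *   // 한국어(Noun: 0, 3), 를(Josa: 3, 1), 처리(Noun: 5, 2), 합니다(Verb: 7, 3)
    *   // (Space tokens omitted here for readability)
    * }}}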
    */
  def tokenize(text: CharSequence): Seq[KoreanToken] = KoreanTokenizer.tokenize(text)

  /**
    * Tokenize text (with a custom profile) into a sequence of KoreanTokens,
    * which includes part-of-speech information and whether a token is an out-of-vocabulary term.
    *
    * @param text input text
    * @param profile TokenizerProfile
    * @return A sequence of KoreanTokens.
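    *
    * Example (a sketch; it assumes TokenizerProfile is a case class that can be
    * constructed with default arguments):
    * {{{
    *   val profile = TokenizerProfile()  // default scoring weights (assumption)
    *   OpenKoreanTextProcessor.tokenize("한국어를 처리합니다", profile)
    * }}}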
    */
  def tokenize(text: CharSequence,
               profile: TokenizerProfile
              ): Seq[KoreanToken] = {
    KoreanTokenizer.tokenize(text, profile)
  }

  /**
    * Tokenize text into a sequence of KoreanTokens, which includes part-of-speech
    * information and whether a token is an out-of-vocabulary term, and return the
    * top `n` tokenization candidates.
    *
    * @param text input text
    * @param n number of top candidates
    * @return The top `n` candidate tokenizations for each chunk of the input.
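    *
    * Example (a sketch; the nesting is chunk → candidate → tokens, so each chunk
    * of the input yields up to `n` candidate token sequences):
    * {{{
    *   val candidates = OpenKoreanTextProcessor.tokenizeTopN("집에 가요", 3)
    *   // candidates.head holds the top-n parses of the first chunk
    * }}}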
    */
  def tokenizeTopN(text: CharSequence,
                   n: Int
                  ): Seq[Seq[Seq[KoreanToken]]] = {
    KoreanTokenizer.tokenizeTopN(text, n)
  }

  /**
    * Tokenize text (with a custom profile) into a sequence of KoreanTokens,
    * which includes part-of-speech information and whether a token is an out-of-vocabulary term,
    * and return the top `n` tokenization candidates.
    *
    * @param text input text
    * @param n number of top candidates
    * @param profile TokenizerProfile
    * @return The top `n` candidate tokenizations for each chunk of the input.
    */
  def tokenizeTopN(text: CharSequence,
                   n: Int,
                   profile: TokenizerProfile
                  ): Seq[Seq[Seq[KoreanToken]]] = {
    KoreanTokenizer.tokenizeTopN(text, n, profile)
  }

  /**
    * Add a user-defined word list to the noun dictionary. Words containing spaces are not allowed.
    *
    * @param words Sequence of words to add.
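    *
    * Example (a sketch; the words are arbitrary placeholders):
    * {{{
    *   OpenKoreanTextProcessor.addNounsToDictionary(Seq("딥러닝", "새단어"))
    *   // subsequent tokenize() calls treat these as in-vocabulary nouns
    * }}}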
    */
  def addNounsToDictionary(words: Seq[String]): Unit = {
    KoreanDictionaryProvider.addWordsToDictionary(KoreanPos.Noun, words)
  }

  /**
    * Convert a sequence of KoreanTokens to their surface strings, excluding Space tokens.
    *
    * @param tokens Korean tokens
    * @return A sequence of token strings.
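    *
    * Example (a sketch building on tokenize(); the output shown is illustrative):
    * {{{
    *   val tokens = OpenKoreanTextProcessor.tokenize("한국어를 처리합니다")
    *   OpenKoreanTextProcessor.tokensToStrings(tokens)
    *   // List(한국어, 를, 처리, 합니다)
    * }}}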
    */
  def tokensToStrings(tokens: Seq[KoreanToken]): Seq[String] = {
    tokens.filterNot(t => t.pos == KoreanPos.Space).map(_.text.toString)
  }

  /**
    * Split input text into sentences.
    *
    * @param text input text
    * @return A sequence of sentences.
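    *
    * Example (a sketch; each Sentence is rendered here as text(start,end), which
    * is illustrative):
    * {{{
    *   OpenKoreanTextProcessor.splitSentences("가을이다! 남자는 가을을 탄다......")
    *   // 가을이다!(0,5), 남자는 가을을 탄다......(6,22)
    * }}}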
    */
  def splitSentences(text: CharSequence): Seq[Sentence] = {
    KoreanSentenceSplitter.split(text)
  }

  /**
    * Extract noun phrases from a sequence of Korean tokens.
    *
    * @param tokens         Korean tokens
    * @param filterSpam     true to filter out spam/slang terms (default: false)
    * @param enableHashtags true to include #hashtags (default: true)
    * @return A sequence of extracted phrases
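    *
    * Example (a sketch building on tokenize(); the extracted phrases shown are
    * illustrative):
    * {{{
    *   val tokens = OpenKoreanTextProcessor.tokenize("한국어 처리는 쉽다")
    *   OpenKoreanTextProcessor.extractPhrases(tokens, filterSpam = true)
    *   // 한국어(Noun: 0, 3), 한국어 처리(Noun: 0, 6), 처리(Noun: 4, 2)
    * }}}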
    */
  def extractPhrases(tokens: Seq[KoreanToken],
                     filterSpam: Boolean = false,
                     enableHashtags: Boolean = true): Seq[KoreanPhrase] = {
    KoreanPhraseExtractor.extractPhrases(tokens, filterSpam, enableHashtags)
  }

  /**
    * Detokenize the input list of words.
    *
    * @param tokens List of words.
    * @return Detokenized string.
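    *
    * Example (a sketch; the exact spacing decisions depend on the detokenizer):
    * {{{
    *   OpenKoreanTextProcessor.detokenize(List("연세", "대학교", "에", "가다"))
    *   // 연세 대학교에 가다
    * }}}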
    */
  def detokenize(tokens: Iterable[String]): String = {
    KoreanDetokenizer.detokenize(tokens)
  }
}