// com.twitter.penguin.korean.TwitterKoreanProcessorJava
// Source listing from the korean-text artifact (Maven / Gradle / Ivy):
// Scala library to process Korean text.
// Note: a newer version of this artifact is available (4.4.4).
/*
 * Twitter Korean Text - Scala library to process Korean text
 *
 * Copyright 2014 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.twitter.penguin.korean;

import java.util.LinkedList;
import java.util.List;

import scala.collection.Iterator;
import scala.collection.JavaConversions;
import scala.collection.Seq;

import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor;
import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken;
import com.twitter.penguin.korean.tokenizer.Sentence;
import com.twitter.penguin.korean.util.KoreanPos;

/**
 * Java wrapper for TwitterKoreanProcessor using Builder pattern
 */
public final class TwitterKoreanProcessorJava {

  /**
   * Normalize Korean text
   * 그랰ㅋㅋㅋㅋㅋㅋ -> 그래ㅋㅋ
   *
   * @param text Input text.
   * @return Normalized text.
   */
  public static CharSequence normalize(CharSequence text) {
    return TwitterKoreanProcessor.normalize(text);
  }

  /**
   * Tokenize with the builder options.
   *
   * @param text Input text.
   * @return A list of Korean Tokens (run tokensToJavaList to transform to Java List)
   */
  public static Seq tokenize(CharSequence text) {
    return TwitterKoreanProcessor.tokenize(
        text
    );
  }



  /**
   * Transforms the tokenization output to List
   *
   * @param tokens Korean tokens (output of tokenize(CharSequence text)).
   * @return List of KoreanTokenJava.
   */
  public static List tokensToJavaKoreanTokenList(Seq tokens, boolean keepSpace) {
    Iterator tokenized = tokens.iterator();
    List output = new LinkedList<>();
    while (tokenized.hasNext()) {
      KoreanToken token = tokenized.next();
      if (keepSpace || token.pos() != KoreanPos.Space()) {
        output.add(new KoreanTokenJava(
            token.text(),
            KoreanPosJava.valueOf(token.pos().toString()),
            token.offset(),
            token.length(),
            token.unknown()
        ));
      }
    }
    return output;
  }

  // Default behavior of keepSpace is false
  public static List tokensToJavaKoreanTokenList(Seq tokens) {
    return tokensToJavaKoreanTokenList(tokens, false);
  }


  /**
   * Tokenize with the builder options into a String Iterable.
   *
   * @param tokens Korean tokens (output of tokenize(CharSequence text)).
   * @return List of token strings.
   */
  public static List tokensToJavaStringList(Seq tokens, boolean keepSpace) {
    Iterator tokenized = tokens.iterator();
    List output = new LinkedList<>();
    while (tokenized.hasNext()) {
      final KoreanToken token = tokenized.next();

      if (keepSpace || token.pos() != KoreanPos.Space()) {
        output.add(token.text());
      }
    }
    return output;
  }

  // Default behavior of keepSpace is false
  public static List tokensToJavaStringList(Seq tokens) {
    return tokensToJavaStringList(tokens, false);
  }


  /**
   * Stem Korean Verbs and Adjectives
   *
   * @param tokens Korean tokens (output of tokenize(CharSequence text)).
   * @return StemmedTextWithTokens(text, tokens)
   */
  public static Seq stem(Seq tokens) {

    return TwitterKoreanProcessor.stem(tokens);
  }

  /**
   * Split input text into sentences.
   *
   * @param text Input text.
   * @return List of Sentence objects.
   */
  public static List splitSentences(CharSequence text) {
    return JavaConversions.seqAsJavaList(
        TwitterKoreanProcessor.splitSentences(text)
    );
  }

  /**
   * Extract phrases from Korean input text
   *
   * @param tokens Korean tokens (output of tokenize(CharSequence text)).
   * @return List of phrase CharSequences.
   */
  public static List extractPhrases(Seq tokens, boolean filterSpam, boolean includeHashtags) {
    return JavaConversions.seqAsJavaList(
        TwitterKoreanProcessor.extractPhrases(tokens, filterSpam, includeHashtags)
    );
  }

  /**
   * Detokenize the input list of words.
   *
   * @param tokens List of words.
   * @return Detokenized string.
   */
  public static String detokenize(List tokens) {
    return TwitterKoreanProcessor.detokenize(JavaConversions.iterableAsScalaIterable(tokens));
  }
}