com.twitter.penguin.korean.TwitterKoreanProcessorJava Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of korean-text Show documentation
Show all versions of korean-text Show documentation
Scala library to process Korean text
/*
* Twitter Korean Text - Scala library to process Korean text
*
* Copyright 2014 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.twitter.penguin.korean;
import java.util.LinkedList;
import java.util.List;
import scala.collection.Iterator;
import scala.collection.JavaConversions;
import scala.collection.Seq;
import com.twitter.penguin.korean.phrase_extractor.KoreanPhraseExtractor;
import com.twitter.penguin.korean.tokenizer.KoreanTokenizer.KoreanToken;
import com.twitter.penguin.korean.tokenizer.Sentence;
import com.twitter.penguin.korean.util.KoreanPos;
/**
* Java wrapper for TwitterKoreanProcessor exposing its operations as static methods
*/
public final class TwitterKoreanProcessorJava {
/**
* Normalize Korean text
* 그랰ㅋㅋㅋㅋㅋㅋ -> 그래ㅋㅋ
*
* @param text Input text.
* @return Normalized text.
*/
public static CharSequence normalize(CharSequence text) {
return TwitterKoreanProcessor.normalize(text);
}
/**
* Tokenize with the builder options.
*
* @param text Input text.
* @return A list of Korean Tokens (run tokensToJavaList to transform to Java List)
*/
public static Seq tokenize(CharSequence text) {
return TwitterKoreanProcessor.tokenize(
text
);
}
/**
* Transforms the tokenization output to List
*
* @param tokens Korean tokens (output of tokenize(CharSequence text)).
* @return List of KoreanTokenJava.
*/
public static List tokensToJavaKoreanTokenList(Seq tokens, boolean keepSpace) {
Iterator tokenized = tokens.iterator();
List output = new LinkedList<>();
while (tokenized.hasNext()) {
KoreanToken token = tokenized.next();
if (keepSpace || token.pos() != KoreanPos.Space()) {
output.add(new KoreanTokenJava(
token.text(),
KoreanPosJava.valueOf(token.pos().toString()),
token.offset(),
token.length(),
token.unknown()
));
}
}
return output;
}
// Default behavior of keepSpace is false
public static List tokensToJavaKoreanTokenList(Seq tokens) {
return tokensToJavaKoreanTokenList(tokens, false);
}
/**
* Tokenize with the builder options into a String Iterable.
*
* @param tokens Korean tokens (output of tokenize(CharSequence text)).
* @return List of token strings.
*/
public static List tokensToJavaStringList(Seq tokens, boolean keepSpace) {
Iterator tokenized = tokens.iterator();
List output = new LinkedList<>();
while (tokenized.hasNext()) {
final KoreanToken token = tokenized.next();
if (keepSpace || token.pos() != KoreanPos.Space()) {
output.add(token.text());
}
}
return output;
}
// Default behavior of keepSpace is false
public static List tokensToJavaStringList(Seq tokens) {
return tokensToJavaStringList(tokens, false);
}
/**
* Stem Korean Verbs and Adjectives
*
* @param tokens Korean tokens (output of tokenize(CharSequence text)).
* @return StemmedTextWithTokens(text, tokens)
*/
public static Seq stem(Seq tokens) {
return TwitterKoreanProcessor.stem(tokens);
}
/**
* Split input text into sentences.
*
* @param text Input text.
* @return List of Sentence objects.
*/
public static List splitSentences(CharSequence text) {
return JavaConversions.seqAsJavaList(
TwitterKoreanProcessor.splitSentences(text)
);
}
/**
* Extract phrases from Korean input text
*
* @param tokens Korean tokens (output of tokenize(CharSequence text)).
* @return List of phrase CharSequences.
*/
public static List extractPhrases(Seq tokens, boolean filterSpam, boolean includeHashtags) {
return JavaConversions.seqAsJavaList(
TwitterKoreanProcessor.extractPhrases(tokens, filterSpam, includeHashtags)
);
}
/**
* Detokenize the input list of words.
*
* @param tokens List of words.
* @return Detokenized string.
*/
public static String detokenize(List tokens) {
return TwitterKoreanProcessor.detokenize(JavaConversions.iterableAsScalaIterable(tokens));
}
}