
// com.megafarad.ve_scala.japanese.JapaneseTokenParser.scala (Maven / Gradle / Ivy)
package com.megafarad.ve_scala.japanese
import com.atilika.kuromoji.ipadic.{Token => KuromojiToken}
import java.util
/** Converts Kuromoji (IPADIC) tokens, or a surface form plus its raw
  * comma-separated feature string, into [[JapaneseToken]] values.
  *
  * Features are addressed by their position in the feature array: four
  * part-of-speech levels, inflection type, inflection form, base form,
  * reading and pronunciation. Any position that is absent from the array is
  * reported as the placeholder `"*"`.
  */
object JapaneseTokenParser {
  // Positions of the individual features inside the feature array.
  private val POS1 = 0
  private val POS2 = 1
  private val POS3 = 2
  private val POS4 = 3
  private val CTYPE = 4
  private val CFORM = 5
  private val BASIC = 6
  private val READING = 7
  private val PRONUNCIATION = 8

  /** Placeholder returned for a feature the array does not contain. */
  private val MissingFeature = "*"

  /** Reads one feature by position, falling back to `"*"` when the array is
    * too short to contain it.
    *
    * @param allFeaturesArray the feature array to read from
    * @param feature          the position to read; must lie in `0..PRONUNCIATION`
    * @return the feature at that position, or `"*"` if it is absent
    * @throws IllegalStateException if `feature` is outside the known positions
    */
  private def getFeatureSafely(allFeaturesArray: Array[String], feature: Int): String = {
    if (feature < 0 || feature > PRONUNCIATION)
      throw new IllegalStateException("Asked for a feature out of bounds.")
    if (feature < allFeaturesArray.length) allFeaturesArray(feature) else MissingFeature
  }

  /** Extracts the four part-of-speech entries from a feature array.
    *
    * @param array the full feature array
    * @return a new four-element array holding positions `POS1` to `POS4`;
    *         positions missing from a shorter input are filled with `"*"`
    */
  def getPosArray(array: Array[String]): Array[String] = {
    val posArray = util.Arrays.copyOfRange(array, POS1, POS4 + 1)
    // copyOfRange pads a short input with null; substitute the same placeholder
    // used for every other missing feature. The range is empty when the input
    // already has at least four entries.
    for (i <- array.length until posArray.length) posArray(i) = MissingFeature
    posArray
  }

  /** Builds a token from a surface form and an already-split feature array.
    * Shared by both `parse` overloads so they cannot drift apart.
    */
  private def buildToken(surface: String, allFeaturesArray: Array[String]): JapaneseToken = {
    val posArray = getPosArray(allFeaturesArray)
    JapaneseToken(
      literal = surface,
      pos = posArray(POS1),
      pos2 = posArray(POS2),
      pos3 = posArray(POS3),
      pos4 = posArray(POS4),
      // Every remaining feature is read through getFeatureSafely so a short
      // array yields "*" instead of an ArrayIndexOutOfBoundsException.
      inflectionType = getFeatureSafely(allFeaturesArray, CTYPE),
      inflectionForm = getFeatureSafely(allFeaturesArray, CFORM),
      lemma = getFeatureSafely(allFeaturesArray, BASIC),
      reading = getFeatureSafely(allFeaturesArray, READING),
      hatsuon = getFeatureSafely(allFeaturesArray, PRONUNCIATION),
      // `==` is null-safe, unlike calling `.equals` on the surface.
      sentenceEnding = surface == "。"
    )
  }

  /** Converts a Kuromoji token into a [[JapaneseToken]].
    *
    * @param kuromojiToken the token produced by the Kuromoji tokenizer
    * @return the token's surface form and features; absent features are `"*"`
    */
  def parse(kuromojiToken: KuromojiToken): JapaneseToken =
    // Fetch the feature array once rather than once per field.
    buildToken(kuromojiToken.getSurface, kuromojiToken.getAllFeaturesArray)

  /** Converts a surface form and its raw feature string into a
    * [[JapaneseToken]].
    *
    * @param surface          the literal text of the token
    * @param rawFeaturesArray the features joined by commas, in positional order
    * @return the surface form and features; absent features are `"*"`
    */
  def parse(surface: String, rawFeaturesArray: String): JapaneseToken =
    // String.split drops trailing empty fields, so the resulting array may be
    // shorter than the number of commas suggests; buildToken tolerates that.
    buildToken(surface, rawFeaturesArray.split(","))
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy