All downloads are free. Search and download functionality uses the official Maven repository.

com.mayabot.nlp.segment.plugins.pos.PosPerceptronDef.kt Maven / Gradle / Ivy

package com.mayabot.nlp.segment.plugins.pos

import com.mayabot.nlp.common.FastStringBuilder
import com.mayabot.nlp.common.utils.CharNormUtils
import com.mayabot.nlp.common.utils.Characters
import com.mayabot.nlp.perceptron.EvaluateFunction
import com.mayabot.nlp.perceptron.PerceptronDefinition
import com.mayabot.nlp.perceptron.PerceptronModel
import com.mayabot.nlp.segment.Nature

/**
 * Holds the default label set used by [PosPerceptronDef].
 */
object InnerPos {
    /**
     * Names of every [Nature] value except `newWord`, `begin` and `end`,
     * sorted in natural string order.
     */
    val natures = Nature.values()
            .filterNot { it == Nature.newWord || it == Nature.begin || it == Nature.end }
            .map { it.name }
            .sorted()
            .toTypedArray()
}

/**
 * Perceptron definition for tagging each word of a sentence with a label.
 *
 * Corpus format: `word1/label word2/label`
 *
 * NOTE(review): the generic type arguments in this class were missing from the
 * text this file was recovered from. They have been reconstructed from how the
 * values are used below (labels are `String`s, words are `String`s, a sentence
 * is a `List<String>`); confirm them against the declaration of
 * [PerceptronDefinition].
 *
 * @property labels the label set; defaults to [InnerPos.natures].
 */
class PosPerceptronDef
@JvmOverloads
constructor(
        val labels: Array<String> = InnerPos.natures

) : PerceptronDefinition<String, String, List<String>> {

    /** Returns the label array supplied to the constructor. */
    override fun labels(): Array<String> = labels

    /** Fixed value reported as the maximum feature size: 50. */
    override fun featureMaxSize(): Int = 50

    /**
     * Parses one annotated line of the form `word1/flag word2/flag`.
     *
     * The text is split on single spaces; each token is then split on `/`.
     * Tokens that do not split into exactly two parts (including the empty
     * tokens produced by consecutive spaces) are dropped.
     *
     * @param text the annotated line
     * @return the `(word, flag)` pairs in their original order
     */
    override fun parseAnnotateText(text: String): List<Pair<String, String>> =
        text.split(' ').mapNotNull { token ->
            val parts = token.split('/')
            if (parts.size == 2) parts[0] to parts[1] else null
        }

    /** The input list already is the input sequence; it is returned unchanged. */
    override fun inputList2InputSeq(list: List<String>): List<String> = list

    /**
     * Tells whether a neighbouring word should be left out of the context
     * features: a one-character word that is punctuation
     * (per [Characters.isPunctuation]) or a single space.
     *
     * Original author's rationale: punctuation is unrelated to part of speech.
     */
    private fun isIgnoredNeighbor(word: String): Boolean {
        if (word.length != 1) {
            return false
        }
        val ch = word[0]
        return Characters.isPunctuation(ch) || ch == ' '
    }

    /**
     * Writes the features of the word at [position] into [buffer], calling
     * [emit] once after each feature has been written.
     *
     * Features, in emission order:
     *  1. the previous word followed by `☺` (skipped at the first position or
     *     when the neighbour is ignored, see [isIgnoredNeighbor]);
     *  2. the current word itself, emitted twice;
     *  3. the next word followed by `♂` (skipped at the last position or when
     *     the neighbour is ignored);
     *  4. for words of length >= 2, 3 and 4 respectively: the first 1, 2 and 3
     *     characters tagged with `★`, and the last 1, 2 and 3 characters
     *     (passed last-character-first) tagged with `✆`.
     *
     * The exact characters and their order are kept as they were: presumably a
     * trained model depends on these feature strings — verify before changing.
     *
     * @param sentence the words of the sentence
     * @param size number of words to consider; bounds the look-ahead to the next word
     * @param position index of the current word in [sentence]
     * @param buffer scratch buffer each feature is written into
     * @param emit callback invoked after each feature is placed in [buffer]
     */
    override fun featureFunction(sentence: List<String>, size: Int, position: Int, buffer: FastStringBuilder, emit: () -> Unit) {

        val curWord = sentence[position]
        val preWord = if (position > 0) {
            sentence[position - 1].takeUnless { isIgnoredNeighbor(it) }
        } else {
            null
        }
        val nextWord = if (position < size - 1) {
            sentence[position + 1].takeUnless { isIgnoredNeighbor(it) }
        } else {
            null
        }

        if (preWord != null) {
            buffer.clear()
            buffer.append(preWord)
            buffer.append('☺')
            emit()
        }

        // Emit the same feature twice: the original author considers the word
        // itself to be the more important feature.
        buffer.clear()
        buffer.append(curWord)
        emit()
        emit()

        if (nextWord != null) {
            buffer.clear()
            buffer.append(nextWord)
            buffer.append('♂')
            emit()
        }

        // Prefix / suffix features only exist for words of two or more characters.
        val length = curWord.length
        if (length < 2) {
            return
        }
        val last = length - 1

        val c1 = curWord[0]
        val l1 = curWord[last]

        buffer.set2(c1, '★')
        emit()

        buffer.set2(l1, '✆')
        emit()

        if (length < 3) {
            return
        }
        val c2 = curWord[1]
        val l2 = curWord[last - 1]

        buffer.set3(c1, c2, '★')
        emit()

        // Suffix characters are passed starting from the last character.
        buffer.set3(l1, l2, '✆')
        emit()

        if (length < 4) {
            return
        }
        val c3 = curWord[2]
        val l3 = curWord[last - 2]

        buffer.set4(c1, c2, c3, '★')
        emit()

        buffer.set4(l1, l2, l3, '✆')
        emit()
    }

    /**
     * No evaluation function is provided; always returns `null`.
     *
     * NOTE(review): the return type is kept exactly as it appeared. If
     * `EvaluateFunction` is declared with type parameters they were lost as
     * well and must be restored here — confirm against its declaration.
     */
    override fun evaluateFunction(perceptron: PerceptronModel): EvaluateFunction? = null

    /** Passes every word of [input] through [CharNormUtils.convert]. */
    override fun preProcessInputSequence(input: List<String>): List<String> =
        input.map { CharNormUtils.convert(it) }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy