Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
com.mayabot.nlp.perceptron.PerceptronModel.kt Maven / Gradle / Ivy
/*
* Copyright 2018 mayabot.com authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mayabot.nlp.perceptron
import com.mayabot.nlp.algorithm.TopIntMinK
import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrie
import com.mayabot.nlp.common.hppc.IntArrayList
import java.io.DataOutputStream
import java.io.File
import java.util.*
/**
* FeatureVector最后多一位是留个转移特征使用
*/
typealias FeatureVector = IntArrayList
typealias FeatureVectorSequence = List
/**
* 感知机模型。
*
* 这个感知机面向的是向量进行计算的,提供最最底层的实现。
*
* FeatureSet中保存了所有的特征,每个特征都有自己的数组下标。
* 系统中存在一共N个特征。
*
* 感知机处理的向量是一个特征向量的长度是N,但是是觉得稀疏的向量,只有
* 某几位是1,其他的为0,所以使用有限的int[]来保存为1的向量的下标。
*
* decode中使用了int[]中的,最后一位是留个转移特征使用的。
*
* 所以抽取特征向量的时候需要注意。
* //// 最后一列留给转移特征
* result.add(0)
*
* @author jimichan
*/
interface PerceptronModel {
/**
* 返回特征集合[FeatureSet]
*/
fun featureSet(): FeatureSet
/**
* 为在线学习检测[featureId]是否已经存在,如果不存在,那么自动增加一个
*/
fun makeSureParameter(featureId: Int)
/**
* 在线学习版本的update。学习一个[data]的[TrainSample]实例
*/
fun onlineLearn(data: TrainSample)
/**
* 压缩模型
* 要求FeatureSet的文本key不能为空,在训练阶段的时候使用
* 压缩模型大小。删除权重不重要的特征。
* - [ratio]是压缩比,0.1表示压缩去掉10%的特征。
* - [threshold]特征最小得分,得分小于这个阈值就删除。
*/
fun compress(ratio: Double, threshold: Double): PerceptronModel
/**
* 解码
*
* @param featureSequence FeatureSequence 为一个浓缩的特征向量,最后一位是留给转移特征。
* @param guessLabel 结构保存在这个数组里面去
*/
fun decode(featureSequence: FeatureVectorSequence, guessLabel: IntArray)
/**
* 解码
*
* @param featureSequence FeatureSequence 稀疏特征向量的简短表示,最后一位是留给转移特征。
* @return label对应的ID数组
*/
fun decode(featureSequence: FeatureVectorSequence): IntArray {
val result = IntArray(featureSequence.size)
if (result.isEmpty()) {
return result
}
decode(featureSequence, result)
return result
}
/**
* 保存感知机模型实例到二进制文件。
* 模型讲
* @param dir File 一个空的文件夹
*/
fun save(dir: File)
fun decodeQuickMode(quick: Boolean)
fun parameterAt(index: Int): Float
}
/**
* 感知机模型实现。
*
* parameter是一个很长的float数组。
*
* [ [F0],[F1],[label_0,label_1,label_x]..[FN]] ]
* Fn 表示一个特征在每个label的权重。
* 假设Label是4个,特征总数是10000.
* 那么逻辑上存在4个长度为1万的float向量,每一个向量代表对于label的感知机参数权重。
* 也是是存在四个感知机。
*
* 一个Input,包含了多个特征,这些特征在每个每个感知机里面对应的位置之和,就是每个label的score了。
* 在不考虑转移概率和全局最后的情况下,只要选择一个当前最大的得分即可。
* 如果要考虑全局最优那么需要viterbi解码。
*
* FeatureSet中Feature的key是按照顺序排序的,假设实际数据的特征数量是1000,label是4,那么在前面插入4+1个特殊特征。
*
*
* @author jimichan
*/
class PerceptronModelImpl(
private val featureSet: FeatureSet,
val labelCount: Int,
var parameter: FloatArray
) : PerceptronModel {
override fun parameterAt(index: Int) = parameter[index]
override fun decodeQuickMode(quick: Boolean) {
decodeQuickModel = quick
}
private val MaxScore = Integer.MIN_VALUE.toDouble()
private var decodeQuickModel = false
private val labelLimitInParameter = (labelCount + 1) * labelCount
constructor(featureSet: FeatureSet, labelCount: Int) :
this(featureSet, labelCount, FloatArray(featureSet.size() * labelCount))
fun parameterSize() = parameter.size
override fun featureSet() = featureSet
override fun makeSureParameter(featureId: Int) {
var newSize = (featureId + 1) * labelCount
if (newSize > parameter.size) {
val pad = 1000
newSize = (featureId + pad) * labelCount
parameter = Arrays.copyOf(parameter, newSize)
}
}
/**
* 单线程训练时调用.
* 平均感知机
*/
fun update(data: TrainSample, total: DoubleArray, timestamp: IntArray, current: Int) {
val length = data.size
val guessLabel = IntArray(length)
decode(data.featureSequence, guessLabel)
for (i in 0 until length) {
val labels = featureToLabel(data, guessLabel, i)
updateParameter(labels.goldFeature, labels.predFeature, total, timestamp, current)
}
}
/**
* 多线程训练时调用.
* 结构化感知机
*/
fun update(data: TrainSample) {
val length = data.size
val guessLabel = IntArray(length)
decode(data.featureSequence, guessLabel)
for (i in 0 until length) {
val labels = featureToLabel(data, guessLabel, i)
updateOnline(labels.goldFeature, labels.predFeature)
}
}
override fun onlineLearn(data: TrainSample) {
val guessLabel = IntArray(data.size)
decode(data.featureSequence, guessLabel)
if (Arrays.equals(guessLabel, data.label)) {
return
}
var over = 0
for (i in 0 until 10) {
val eq = updateForOnlineLearnInner(data, 1f)
if (eq) {
over++
if (over > 1) {
return
}
}
}
}
private fun updateForOnlineLearnInner(data: TrainSample, step: Float): Boolean {
val length = data.size
val guessLabel = IntArray(length)
decode(data.featureSequence, guessLabel)
for (i in 0 until length) {
val labels = featureToLabel(data, guessLabel, i)
updateOnline2(labels.goldFeature, labels.predFeature, step)
}
decode(data.featureSequence, guessLabel)
return Arrays.equals(guessLabel, data.label)
}
private fun updateOnline2(goldIndex: IntArray, predictIndex: IntArray, step: Float) {
for (i in goldIndex.indices) {
val xii = predictIndex[i]
if (goldIndex[i] == xii)
continue
else {
val toAdd = goldIndex[i]
if (toAdd > labelLimitInParameter) {
parameter[toAdd] += step
}
if (xii >= 0 && xii < parameter.size && xii > labelLimitInParameter) {
parameter[xii] -= step
} else {
//throw IllegalArgumentException("更新参数时传入了非法的下标")
}
}
}
}
private fun updateOnline(goldIndex: IntArray, predictIndex: IntArray) {
for (i in goldIndex.indices) {
val xii = predictIndex[i]
if (goldIndex[i] == xii)
continue
else {
parameter[goldIndex[i]]++
if (xii >= 0 && xii < parameter.size)
parameter[xii]--
else {
throw IllegalArgumentException("更新参数时传入了非法的下标")
}
}
}
}
private fun featureToLabel(data: TrainSample, guessLabel: IntArray, i: Int): Labels {
//序号代替向量 替代数组
val featureVector = data.featureSequence[i]
//正确标签
val goldFeature = IntArray(featureVector.size())
//预测标签
val predFeature = IntArray(featureVector.size())
val last = featureVector.size() - 1
for (j in 0 until last) {
//特征 -> 对应标签
goldFeature[j] = featureVector[j] * labelCount + data.label[i]
predFeature[j] = featureVector[j] * labelCount + guessLabel[i]
}
goldFeature[last] = (if (i == 0) labelCount else data.label[i - 1]) * labelCount + data.label[i]
predFeature[last] = (if (i == 0) labelCount else guessLabel[i - 1]) * labelCount + guessLabel[i]
return Labels(goldFeature, predFeature)
}
private fun updateParameter(goldIndex: IntArray, predictIndex: IntArray, total: DoubleArray, timestamp: IntArray, current: Int): Boolean {
if (goldIndex.contentEquals(predictIndex)) return false
for (i in goldIndex.indices) {
if (goldIndex[i] == predictIndex[i])
continue
else {
record(goldIndex[i], 1f, total, timestamp, current)//当预测的标注和实际标注不一致时 先更新权重
if (predictIndex[i] >= 0 && predictIndex[i] < parameter.size)
record(predictIndex[i], -1f, total, timestamp, current)
else {
throw IllegalArgumentException("更新参数时传入了非法的下标")
}
}
}
return true
}
private fun record(index: Int, value: Float, total: DoubleArray, timestamp: IntArray, current: Int) {
val passed = current - timestamp[index]
total[index] += passed * parameter[index].toDouble() //权重乘以该纬度经历时间
parameter[index] += value
timestamp[index] = current
}
fun average(total: DoubleArray, timestamp: IntArray, current: Int) {
val cf = current.toFloat()
for (i in 0 until parameter.size) {
val pass = cf - timestamp[i].toFloat()
val totali = total[i].toFloat()
parameter[i] = (totali + pass * parameter[i]) / cf
}
}
/**
* 模型压缩
* @param ratio 压缩比,如0.2,那么就是去掉0.2的特征
*/
override fun compress(ratio: Double, threshold: Double): PerceptronModel {
if (ratio < 0 || ratio >= 1) {
throw IllegalArgumentException("压缩比必须介于 0 和 1 之间")
}
assert(featureSet.keys != null)
val k = if (ratio == 0.0) 0 else (ratio * featureSet.size()).toInt()
val featureList = featureSet.keys!!
fun score(id: Int): Float {
var s = 0f
for (i in id * labelCount until id * labelCount + labelCount) {
s += Math.abs(parameter[i])
}
return s
}
val filterSet = HashSet()
for (id in 0 until featureList.size) {
val s = score(id)
if (s < threshold) {
filterSet.add(id)
}
}
println("threshold filterd ${filterSet.size}")
//移除了很小的之后,还没有达到缩减的值
if (k > 0 && filterSet.size < k) {
val heap = TopIntMinK(k - filterSet.size)
println("let's filter top min ${k - filterSet.size}")
for (id in 0 until featureList.size) {
val s = score(id)
if (s >= threshold) {
heap.push(id, s)
}
}
val topMinResultIdSet = heap.result().map { it.first }.toSet()
filterSet.addAll(topMinResultIdSet)
}
println("remove ${filterSet.size} feature,real compress ${String.format("%.3f", filterSet.size * 1.0f / featureList.size)}")
val newSize = featureList.size - filterSet.size
val newFeatureList = ArrayList(newSize)
val newParameter = FloatArray(labelCount * newSize)
var cc = 0
featureList.forEachIndexed { index, s ->
if (index <= labelCount || !filterSet.contains(index)) {
// index<=labelCount 是特征构件是保留的和labelCount+1数量相等的特征,需要保留
newFeatureList += s
System.arraycopy(parameter, index * labelCount, newParameter, cc * labelCount, labelCount)
cc++
}
}
return PerceptronModelImpl(FeatureSet(DoubleArrayTrie(newFeatureList), newFeatureList),
labelCount,
newParameter
)
}
/**
* 二进制格式:
* parameter.bin => [labelCount][parameter.size][parameter array]
* feature.dat => 特征集合
*/
override fun save(dir: File) {
dir.mkdirs()
File(dir, "parameter.bin").outputStream().buffered().use {
val out = DataOutputStream(it)
out.writeInt(labelCount)
out.writeInt(parameter.size)
parameter.forEach { w -> out.writeFloat(w) }
out.flush()
}
featureSet.save(
File(dir, "feature.dat"),
File(dir, "feature.txt"))
}
/**
* 快速解码。不考虑转移概率,比较适合词性模型解码。
*/
private fun decodeQuick(featureSequence: FeatureVectorSequence, guessLabel: IntArray) {
var index = 0
val parameterArray = parameter
for (feature in featureSequence) {
val buffer = feature.buffer
val sizeM1 = feature.size() - 1
var maxScore = Float.MIN_VALUE
//这里修改为-1,一定会有什么特征都不存在的。
var maxIndex = -1
if(sizeM1 != 0) {
for (label in 0 until labelCount) {
var score = 0.0f
for (i in 0 until sizeM1) {
score += parameterArray[buffer[i] * labelCount + label]
}
if (score > maxScore) {
maxIndex = label
maxScore = score
}
}
}
guessLabel[index++] = maxIndex
}
}
/**
* viterbi
*/
override fun decode(featureSequence: FeatureVectorSequence, guessLabel: IntArray) {
val parameter = parameter
//快速模式,不考虑转移,只适用于词性标注类型的任务
if (decodeQuickModel) {
decodeQuick(featureSequence, guessLabel)
return
}
val sentenceLength = featureSequence.size
val preMatrix = IntArray(sentenceLength * labelCount)
// 二阶维特比解码算法,目的是找出每个选择带来权重之后全局最大化。
//上一回的状态
var scoreMLast = DoubleArray(labelCount)
var scoreMNow = DoubleArray(labelCount)
//first
val firstFeature = featureSequence[0]
val bos = labelCount
val bosBase = bos * labelCount
for (j in 0 until labelCount) {
preMatrix[j] = j
val score = scoreBase(firstFeature, j) + parameter[bosBase + j]
scoreMLast[j] = score
}
for (i in 1 until sentenceLength) {
val allFeature = featureSequence[i]
val base = i * labelCount
for (curLabel in 0 until labelCount) {
var maxScore = MaxScore
// baseScore的计算提取到下面for循环之外来,避免重复计算,提高性能
val baseScore = scoreBase(allFeature, curLabel)
for (preLabel in 0 until labelCount) {
val curScore = scoreMLast[preLabel] + baseScore + parameter[preLabel * labelCount + curLabel]
if (curScore > maxScore) {
maxScore = curScore
preMatrix[base + curLabel] = preLabel
scoreMNow[curLabel] = maxScore
}
}
}
//switch
val temp = scoreMLast
scoreMLast = scoreMNow
scoreMNow = temp
}
//此时scoreM0 肯定是最后一个
// 找出scoreMLast中最大的score和index
var maxIndex = 0
var maxScore = scoreMLast[0]
for (index in 1 until labelCount) {
val x = scoreMLast[index]
if (maxScore < x) {
maxIndex = index
maxScore = x
}
}
//回填每个位置的猜测的结构。
var k = (sentenceLength - 1) * labelCount
for (i in sentenceLength - 1 downTo 0) {
guessLabel[i] = maxIndex
maxIndex = preMatrix[k + maxIndex]
k -= labelCount
}
}
private fun scoreBase(featureVector: FeatureVector, currentTag: Int): Double {
val parameter = parameter
var score = 0.0
val buffer = featureVector.buffer
for (i in 0 until featureVector.size() - 1) {
val index = buffer[i]
score += parameter[index * labelCount + currentTag]
}
return score
}
class Labels(val goldFeature: IntArray, val predFeature: IntArray)
}
/**
* 感知机训练的样本
* FeatureVector最后多一位是留个转移特征使用
*/
class TrainSample(
val featureSequence: FeatureVectorSequence,
val label: IntArray) {
val size = featureSequence.size
}