// com.johnsnowlabs.nlp.annotators.parser.dep.Perceptron.scala (Maven / Gradle / Ivy)
// The newest version!
/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators.parser.dep
import com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition._
import scala.collection.mutable
/** Averaged perceptron over sparse (feature name, feature data) -> class weights.
  *
  * Weights are kept as integer counters together with enough bookkeeping (`total`, `ts`) to
  * derive the time-averaged weight lazily, so no separate "average all weights" pass is needed.
  *
  * `ClassNum`, `Score`, `Feature` and `FeatureData` are brought in by the `GreedyTransition._`
  * import of this file.
  *
  * @param nClasses
  *   number of classes; fixes the length of every score vector returned by [[score]]
  */
class Perceptron(nClasses: Int) extends Serializable {

  // These need not be visible outside the class
  type TimeStamp = Int

  /** Learning state of one (feature, class) weight.
    *
    * @param current
    *   the present value of the weight
    * @param total
    *   running sum of `current` multiplied by the number of instances it was held for, up to `ts`
    * @param ts
    *   value of `seen` when this weight was last changed
    */
  case class WeightLearner(current: Int, total: Int, ts: TimeStamp) {

    /** Returns a new learner with `change` added to the weight. Before the change is applied,
      * the old weight is credited to `total` for every instance elapsed since `ts` (using the
      * enclosing perceptron's `seen`), and the timestamp is moved to `seen`.
      */
    def addChange(change: Int): WeightLearner = {
      WeightLearner(current + change, total + current * (seen - ts), seen)
    }
  }

  // Tells us the stats of each class (if present)
  type ClassToWeightLearner = mutable.Map[ClassNum, WeightLearner]

  // Keyed on feature name, then feature data; keeps the tally of totals into each class
  // (and when they were last touched) for the TRAINING phase. Hairy and mutable...
  val learning = mutable.Map.empty[
    String, // Corresponds to Feature.name
    mutable.Map[
      String, // Corresponds to Feature.data
      ClassToWeightLearner]]

  // Number of instances seen - used to measure how 'old' each total is
  var seen: TimeStamp = 0

  type ClassVector = Vector[Score]

  /** Returns the index of the highest score in `classnumVector` (the best class guess).
    *
    * Ties are expected to resolve in vector order, which keeps the choice stable. Like the
    * underlying `maxBy`, this throws on an empty vector.
    */
  def predict(classnumVector: ClassVector): ClassNum = {
    classnumVector.zipWithIndex.maxBy(_._1)._2
  }

  /** Scoring method that uses the present (un-averaged) weight. */
  def current(w: WeightLearner): Float = w.current.toFloat

  /** Scoring method that uses the weight averaged over all `seen` instances.
    *
    * Computed dynamically as `(current * (seen - ts) + total) / seen`. The division is done in
    * floating point: integer division would truncate every averaged weight toward zero (any
    * weight with |average| < 1 would collapse to 0). The product is widened to `Long` so it
    * cannot overflow `Int`. When nothing has been seen yet there is no history to average over,
    * so the current weight is returned instead of dividing by zero.
    */
  def average(w: WeightLearner): Float =
    if (seen == 0) w.current.toFloat
    else ((w.current.toLong * (seen - w.ts) + w.total).toDouble / seen).toFloat

  /** Returns the 'dot-product' score for all classes.
    *
    * @param features
    *   features of the instance with the multiplier to apply to each; zero multipliers are
    *   skipped
    * @param scoreMethod
    *   how to turn a [[WeightLearner]] into a weight, e.g. [[current]] or [[average]]
    * @return
    *   a vector of length `nClasses`; classes with no learned weight for any feature score 0
    */
  def score(features: Map[Feature, Score], scoreMethod: WeightLearner => Float): ClassVector = {
    // A local mutable accumulator is used for speed; it never escapes this method.
    val scores = new Array[Score](nClasses) // zero-initialised
    features.foreach { case (Feature(name, data), weight) =>
      if (weight != 0) {
        // Unknown feature names / data simply contribute nothing.
        learning.get(name).flatMap(_.get(data)).foreach { byClass =>
          byClass.foreach { case (classnum, weightLearner) =>
            scores(classnum) += weight * scoreMethod(weightLearner)
          }
        }
      }
    }
    scores.toVector
  }

  /** Records one training instance and, on a wrong guess, adjusts the weights.
    *
    * `seen` is always incremented. When `guess` differs from `truth`, every feature's weight for
    * `truth` is increased by one (created at zero if absent), and its weight for `guess` is
    * decreased by one only if a weight for `guess` already exists for that feature.
    */
  def update(truth: ClassNum, guess: ClassNum, features: Iterable[Feature]): Unit = {
    seen += 1
    if (truth != guess) {
      features.foreach { feature =>
        val byClass = learning
          .getOrElseUpdate(feature.name, mutable.Map[FeatureData, ClassToWeightLearner]())
          .getOrElseUpdate(feature.data, mutable.Map[ClassNum, WeightLearner]())
        byClass.get(guess).foreach(w => byClass.update(guess, w.addChange(-1)))
        byClass.update(truth, byClass.getOrElse(truth, WeightLearner(0, 0, seen)).addChange(+1))
      }
    }
  }

  /** Serialises the perceptron as text, in the format understood by [[load]]:
    * a `perceptron.seen=[n]` line, then a `perceptron.learning={ ... }` section holding one
    * `name{ ... }` section per feature name, each with one `data[class:current,total,ts|...]`
    * line per feature data.
    */
  override def toString: String = {
    val nl = System.lineSeparator()
    val sections = learning.map { case (featureName, byData) =>
      byData
        .map { case (featureData, byClass) =>
          byClass
            .map { case (cn, w) => s"$cn:${w.current},${w.total},${w.ts}" }
            .mkString(s"${featureData}[", "|", "]" + nl)
        }
        .mkString(s"${featureName}{" + nl, "", "}" + nl)
    }
    s"perceptron.seen=[$seen]" + nl +
      sections.mkString("perceptron.learning={" + nl, "", "}" + nl)
  }

  /** Restores state from the line-based text produced by [[toString]].
    *
    * Lines are consumed from `lines` until the first one that is not understood (that line is
    * consumed too). Loaded weights are merged into `learning`, overwriting entries with the same
    * feature and class. Malformed numbers surface as `NumberFormatException`.
    */
  def load(lines: Iterator[String]): Unit = {
    val perceptronSeen = """perceptron.seen=\[(.*)\]""".r
    val perceptronFeatN = """(.*)\{""".r
    val perceptronFeatD = """(.*)\[(.*)\]""".r

    // Reads `data[class:current,total,ts|...]` lines until one does not match.
    @scala.annotation.tailrec
    def parseFeatureData(featureName: String): Unit =
      if (lines.hasNext) lines.next() match {
        case perceptronFeatD(featureData, classnumWeight) =>
          val byClass = learning(featureName)
            .getOrElseUpdate(featureData, mutable.Map[ClassNum, WeightLearner]())
          classnumWeight
            .split('|')
            .filter(_.nonEmpty) // "data[]" (an empty class map) carries no entries
            .foreach { cw =>
              val cnWT = cw.split(':').map(_.split(',').map(_.toInt))
              byClass += ((cnWT(0)(0), WeightLearner(cnWT(1)(0), cnWT(1)(1), cnWT(1)(2))))
            }
          parseFeatureData(featureName) // Go back for more featuredata lines
        case _ => () // line not understood : Finished with featuredata
      }

    // Reads `name{` sections until a line does not match.
    @scala.annotation.tailrec
    def parseFeatureName(): Unit =
      if (lines.hasNext) lines.next() match {
        case perceptronFeatN(featureName) =>
          learning.getOrElseUpdate(featureName, mutable.Map[FeatureData, ClassToWeightLearner]())
          parseFeatureData(featureName)
          parseFeatureName() // Go back for more featurename sections
        case _ => () // line not understood : Finished with featurename
      }

    // Top level: the `seen` counter and the `learning` section, in any order.
    @scala.annotation.tailrec
    def parse(): Unit =
      if (lines.hasNext) lines.next() match {
        case perceptronSeen(data) =>
          seen = data.toInt
          parse()
        case "perceptron.learning={" =>
          parseFeatureName()
          parse()
        case _ => () // line not understood : Finished with perceptron
      }

    parse()
  }
}