cc.factorie.app.classify.InfoGain.scala Maven / Gradle / Ivy
/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
http://factorie.cs.umass.edu, http://github.com/factorie
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
package cc.factorie.app.classify
import cc.factorie.util.TopN
import cc.factorie.variable._
/** Calculate the information gain between all features of Instances and the Instances' labels.
@author Andrew McCallum
@since 0.10
*/
class InfoGain[L<:DiscreteVar,F<:VectorVar](labels:Iterable[L], f:L=>F) extends cc.factorie.util.DenseDoubleSeq {
def apply(i:Int): Double = infogains(i)
def length = infogains.length
val domain: DiscreteDomain = f(labels.head).domain.dimensionDomain
private val infogains = cc.factorie.util.DoubleSeq(domain.size)
var baseEntropy: Double = 0.0
init(labels)
// TODO Currently only works for CategoricalDomain, not DiscreteDomain
override def top(n:Int): TopN[String] = new TopN[String](n, this, domain.asInstanceOf[CategoricalDomain[String]].categories)
protected def init(labels:Iterable[L]) {
val numInstances = labels.size
val instanceDomain = f(labels.head).domain
val numFeatures = instanceDomain.dimensionDomain.size
val labelDomain = labels.head.domain
val numLabels = labelDomain.size
val featureTargetProportions = Array.fill(numFeatures)(new DenseProportions1(numLabels))
val featureCount = new Array[Double](numFeatures)
val targetProportions = new DenseProportions1(numLabels)
for (label <- labels) {
val instance: VectorVar = f(label)
assert(instance.domain == instanceDomain)
assert(label.domain == labelDomain)
val labelIndex = label.intValue
targetProportions.masses.+=(labelIndex, 1.0)
//println("InfoGain "+instance.activeDomain.toSeq)
//for (featureIndex <- instance.activeDomain.asSeq)
//println("InfoGain "+instance.tensor.asInstanceOf[cc.factorie.la.GrowableSparseBinaryTensor1].toIntArray.toSeq)
assert(instance.value.activeDomain.toSeq.distinct.length == instance.value.activeDomain.toSeq.length, instance.value.activeDomain.toSeq.toString())
instance.value.activeDomain.foreach(featureIndex => {
featureTargetProportions(featureIndex).masses.+=(labelIndex, 1.0)
featureCount(featureIndex) += 1
})
}
baseEntropy = targetProportions.entropy
for (featureIndex <- 0 until numFeatures) {
//println("InfoGain feature="+instanceDomain.dimensionDomain.asInstanceOf[CategoricalDomain[String]].category(featureIndex))
//println("InfoGain targetProportions="+targetProportions.masses)
//println("InfoGain featureTargetProp="+featureTargetProportions(featureIndex).masses)
val entropyWithFeature = if (featureTargetProportions(featureIndex).masses.massTotal > 0) featureTargetProportions(featureIndex).entropy else 0.0
val normWithoutFeature = numInstances - featureCount(featureIndex)
val entropyWithoutFeature =
if (normWithoutFeature > 0) {
val noTargetProportions = new DenseMasses1(numLabels)
noTargetProportions += targetProportions.masses
noTargetProportions -= featureTargetProportions(featureIndex).masses
noTargetProportions.normalize()
noTargetProportions.entropy
//maths.entropy((0 until numLabels).map(li => (targetProportions.mass(li) - featureTargetProportions(featureIndex).mass(li))/normWithoutFeature))
} else 0.0
infogains(featureIndex) = baseEntropy - featureCount(featureIndex)/numInstances * entropyWithFeature - (numInstances-featureCount(featureIndex))/numInstances * entropyWithoutFeature
}
}
}