com.spotify.noether.NdcgAtK.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of noether-core_2.12 Show documentation
Show all versions of noether-core_2.12 Show documentation
Machine Learning Aggregators
/*
* Copyright 2018 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.noether
import com.twitter.algebird.{Aggregator, Semigroup}
/**
* Compute the average NDCG value of all the predictions, truncated at ranking position k.
* The discounted cumulative gain at position k is computed as:
* sum,,i=1,,^k^ (2^{relevance of ''i''th item}^ - 1) / log(i + 1),
* and the NDCG is obtained by dividing that DCG value by the ideal DCG computed on the ground
* truth set. In the current implementation, the relevance value is binary.
* If a query has an empty ground truth set, zero will be used as the NDCG.
*
* See the following paper for detail:
*
* IR evaluation methods for retrieving highly relevant documents. K. Jarvelin and J. Kekalainen
*
* @param k the position to compute the truncated ndcg, must be positive
*/
case class NdcgAtK[T](k: Int) extends Aggregator[RankingPrediction[T], (Double, Long), Double] {
  require(k > 0, "ranking position k should be positive")

  /**
   * Scores a single query.
   *
   * @param input the ranked predictions and the ground truth items for one query
   * @return the NDCG of this query truncated at `k`, paired with a count of 1;
   *         the score is 0.0 when the ground truth set is empty
   */
  def prepare(input: RankingPrediction[T]): (Double, Long) = {
    val relevant = input.actual.toSet
    if (relevant.isEmpty) {
      (0.0, 1L)
    } else {
      val ranked = input.predicted
      val numRanked = ranked.length
      val numRelevant = relevant.size
      // Look at as many positions as the longer of the two lists provides, but never beyond k.
      val cutoff = math.min(math.max(numRanked, numRelevant), k)

      // Discount of a 0-based rank: 1 / ln(rank + 2), so the first position gets 1 / ln(2).
      def discount(rank: Int): Double = 1.0 / math.log(rank + 2.0)

      // Gain collected at every ranked position that holds a ground truth item. Membership is
      // tested per position, so an item repeated in the ranking is counted each time it appears.
      val dcg = (0 until math.min(cutoff, numRanked)).foldLeft(0.0) { (acc, rank) =>
        if (relevant.contains(ranked(rank))) acc + discount(rank) else acc
      }

      // Best attainable gain: every ground truth item placed at the top of the ranking.
      // cutoff >= 1 and numRelevant >= 1 here, so this sum is strictly positive.
      val idealDcg = (0 until math.min(cutoff, numRelevant)).foldLeft(0.0) { (acc, rank) =>
        acc + discount(rank)
      }

      (dcg / idealDcg, 1L)
    }
  }

  /**
   * Semigroup used to merge the per-query (score, count) pairs; resolved implicitly.
   * NOTE(review): presumably the algebird tuple instance that adds both components
   * element-wise - confirm against the algebird version in use.
   */
  def semigroup: Semigroup[(Double, Long)] = implicitly[Semigroup[(Double, Long)]]

  /**
   * Turns the merged (score, count) pair into the final result.
   *
   * @param score the combined pair produced by the semigroup
   * @return the first component divided by the second
   */
  def present(score: (Double, Long)): Double = {
    val (ndcgTotal, queryCount) = score
    ndcgTotal / queryCount
  }
}