Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.ml.stat
import scala.collection.mutable
/**
* MultiClassSummarizer computes the number of distinct labels and corresponding counts,
* and validates the data to see if the labels used for k class multi-label classification
* are in the range of {0, 1, ..., k - 1} in an online fashion.
*
* Two MultilabelSummarizer can be merged together to have a statistical summary of the
* corresponding joint dataset.
*/
private[ml] class MultiClassSummarizer extends Serializable {
// The first element of value in distinctMap is the actually number of instances,
// and the second element of value is sum of the weights.
private val distinctMap = new mutable.HashMap[Int, (Long, Double)]
private var totalInvalidCnt: Long = 0L
/**
* Add a new label into this MultilabelSummarizer, and update the distinct map.
*
* @param label The label for this data point.
* @param weight The weight of this instances.
* @return This MultilabelSummarizer
*/
def add(label: Double, weight: Double = 1.0): MultiClassSummarizer = {
require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")
if (weight == 0.0) return this
if (label - label.toInt != 0.0 || label < 0) {
totalInvalidCnt += 1
}
else {
val (counts: Long, weightSum: Double) = distinctMap.getOrElse(label.toInt, (0L, 0.0))
distinctMap.put(label.toInt, (counts + 1L, weightSum + weight))
}
this
}
/**
* Merge another MultilabelSummarizer, and update the distinct map.
* (Note that it will merge the smaller distinct map into the larger one using in-place
* merging, so either `this` or `other` object will be modified and returned.)
*
* @param other The other MultilabelSummarizer to be merged.
* @return Merged MultilabelSummarizer object.
*/
def merge(other: MultiClassSummarizer): MultiClassSummarizer = {
val (largeMap, smallMap) = if (this.distinctMap.size > other.distinctMap.size) {
(this, other)
} else {
(other, this)
}
smallMap.distinctMap.foreach {
case (key, value) =>
val (counts: Long, weightSum: Double) = largeMap.distinctMap.getOrElse(key, (0L, 0.0))
largeMap.distinctMap.put(key, (counts + value._1, weightSum + value._2))
}
largeMap.totalInvalidCnt += smallMap.totalInvalidCnt
largeMap
}
/** @return The total invalid input counts. */
def countInvalid: Long = totalInvalidCnt
/** @return The number of distinct labels in the input dataset. */
def numClasses: Int = if (distinctMap.isEmpty) 0 else distinctMap.keysIterator.max + 1
/** @return The weightSum of each label in the input dataset. */
def histogram: Array[Double] = {
val result = Array.ofDim[Double](numClasses)
var i = 0
val len = result.length
while (i < len) {
result(i) = distinctMap.getOrElse(i, (0L, 0.0))._2
i += 1
}
result
}
}