
org.dianahep.histogrammar.primitives.categorize.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of histogrammar_2.10 Show documentation
Show all versions of histogrammar_2.10 Show documentation
Histogram abstraction to simplify complex aggregations in distributed environments.
The newest version!
// Copyright 2016 DIANA-HEP
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package org.dianahep
import scala.collection.mutable
import scala.language.postfixOps
import scala.language.existentials
import org.dianahep.histogrammar.json._
import org.dianahep.histogrammar.util._
package histogrammar {
//////////////////////////////////////////////////////////////// Categorize/Categorized/Categorizing
/** Split a given quantity by its categorical value and fill only one category per datum.
*
* A bar chart may be thought of as a histogram with string-valued (categorical) bins, so this is the equivalent of [[org.dianahep.histogrammar.Bin]] for bar charts. The order of the strings is deferred to the visualization stage.
*
* Unlike [[org.dianahep.histogrammar.SparselyBin]], this aggregator has the potential to use unlimited memory. A large number of ''distinct'' categories can generate many unwanted bins.
*
* Factory produces mutable [[org.dianahep.histogrammar.Categorizing]] and immutable [[org.dianahep.histogrammar.Categorized]] objects.
*/
object Categorize extends Factory {
val name = "Categorize"
val help = "Split a given quantity by its categorical value and fill only one category per datum."
val detailedHelp = """A bar chart may be thought of as a histogram with string-valued (categorical) bins, so this is the equivalent of [[org.dianahep.histogrammar.Bin]] for bar charts. The order of the strings is deferred to the visualization stage.
Unlike [[org.dianahep.histogrammar.SparselyBin]], this aggregator has the potential to use unlimited memory. A large number of ''distinct'' categories can generate many unwanted bins."""
/** Create an immutable [[org.dianahep.histogrammar.Categorized]] from arguments (instead of JSON).
*
* @param entries Weighted number of entries (sum of all observed weights).
* @param contentType Name of the intended content; used as a placeholder in cases with zero bins (due to no observed data).
* @param bins String category and the associated container of values associated with it.
*/
def ed[V <: Container[V] with NoAggregation](entries: Double, contentType: String, bins: Map[String, V]) = new Categorized(entries, None, contentType, bins)
/** Create an empty, mutable [[org.dianahep.histogrammar.Categorizing]].
*
* @param quantity Numerical function to split into bins.
* @param value New value (note the `=>`: expression is reevaluated every time a new value is needed).
*/
def apply[DATUM, V <: Container[V] with Aggregation{type Datum >: DATUM}](quantity: UserFcn[DATUM, String], value: => V = Count()) =
new Categorizing(quantity, 0.0, value, mutable.HashMap[String, V]())
/** Synonym for `apply`. */
def ing[DATUM, V <: Container[V] with Aggregation{type Datum >: DATUM}](quantity: UserFcn[DATUM, String], value: => V = Count()) =
apply(quantity, value)
import KeySetComparisons._
def fromJsonFragment(json: Json, nameFromParent: Option[String]): Container[_] with NoAggregation = json match {
case JsonObject(bins @ _*) if (bins.keySet has Set("entries", "bins:type", "bins").maybe("name").maybe("bins:name")) =>
val get = bins.toMap
val entries = get("entries") match {
case JsonNumber(x) => x
case x => throw new JsonFormatException(x, name + ".entries")
}
val quantityName = get.getOrElse("name", JsonNull) match {
case JsonString(x) => Some(x)
case JsonNull => None
case x => throw new JsonFormatException(x, name + ".name")
}
val (contentType, factory) = get("bins:type") match {
case JsonString(name) => (name, Factory(name))
case x => throw new JsonFormatException(x, name + ".bins:type")
}
val dataName = get.getOrElse("bins:name", JsonNull) match {
case JsonString(x) => Some(x)
case JsonNull => None
case x => throw new JsonFormatException(x, name + ".bins:name")
}
val thebins =
get("bins") match {
case JsonObject(categoryPairs @ _*) =>
categoryPairs map {
case (JsonString(category), value) =>
category -> factory.fromJsonFragment(value, dataName)
} toMap
case x => throw new JsonFormatException(x, name + ".bins")
}
new Categorized(entries, (nameFromParent ++ quantityName).lastOption, contentType, thebins.asInstanceOf[Map[String, C] forSome {type C <: Container[C] with NoAggregation}])
case _ => throw new JsonFormatException(json, name)
}
}
/** An accumulated quantity that was split by its categorical (string-based) values, filling only one category per datum.
*
* Use the factory [[org.dianahep.histogrammar.Categorize]] to construct an instance.
*
* @param entries Weighted number of entries (sum of all observed weights).
* @param quantityName Optional name given to the quantity function, passed for bookkeeping.
* @param contentType Name of the intended content; used as a placeholder in cases with zero bins (due to no observed data).
* @param bins String category and the associated container of values associated with it.
*/
class Categorized[V <: Container[V] with NoAggregation] private[histogrammar](val entries: Double, val quantityName: Option[String], contentType: String, val bins: Map[String, V]) extends Container[Categorized[V]] with NoAggregation with QuantityName {
type Type = Categorized[V]
type EdType = Categorized[V]
def factory = Categorize
if (entries < 0.0)
throw new ContainerException(s"entries ($entries) cannot be negative")
/** Number of `bins`. */
def size = bins.size
/** Iterable over the keys of the `bins`. */
def keys: Iterable[String] = bins.toIterable.map(_._1)
/** Iterable over the values of the `bins`. */
def values: Iterable[Container[V]] = bins.toIterable.map(_._2)
/** Set of keys among the `bins`. */
def keySet: Set[String] = keys.toSet
/** Attempt to get key `x`, throwing an exception if it does not exist. */
def apply(x: String) = bins(x)
/** Attempt to get key `x`, returning `None` if it does not exist. */
def get(x: String) = bins.get(x)
/** Attempt to get key `x`, returning an alternative if it does not exist. */
def getOrElse(x: String, default: => V) = bins.getOrElse(x, default)
def zero = new Categorized[V](0.0, quantityName, contentType, Map[String, V]())
def +(that: Categorized[V]) =
if (this.quantityName != that.quantityName)
throw new ContainerException(s"cannot add ${getClass.getName} because quantityName differs (${this.quantityName} vs ${that.quantityName})")
else
new Categorized(
this.entries + that.entries,
this.quantityName,
contentType,
(this.keySet union that.keySet).toSeq map {key =>
if ((this.bins contains key) && (that.bins contains key))
(key, this.bins(key) + that.bins(key))
else if (this.bins contains key)
(key, this.bins(key))
else
(key, that.bins(key))
} toMap)
def *(factor: Double) =
if (factor.isNaN || factor <= 0.0)
zero
else
new Categorized[V](factor * entries, quantityName, contentType, bins map {case (c, v) => (c, v * factor)})
def children = values.toList
def toJsonFragment(suppressName: Boolean) = JsonObject(
"entries" -> JsonFloat(entries),
"bins:type" -> JsonString(contentType),
"bins" -> JsonObject(bins.toSeq map {case (k, v) => (JsonString(k), v.toJsonFragment(true))}: _*)).
maybe(JsonString("name") -> (if (suppressName) None else quantityName.map(JsonString(_)))).
maybe(JsonString("bins:name") -> (bins.headOption match {case Some((k, v: QuantityName)) => v.quantityName.map(JsonString(_)); case _ => None}))
override def toString() = s""""""
override def equals(that: Any) = that match {
case that: Categorized[V] => this.entries === that.entries && this.quantityName == that.quantityName && this.bins == that.bins
case _ => false
}
override def hashCode() = (entries, quantityName, bins).hashCode()
}
/** Accumulating a quantity by splitting it by its categorical (string-based) value and filling only one category per datum.
*
* Use the factory [[org.dianahep.histogrammar.Categorize]] to construct an instance.
*
* @param quantity Numerical function to track.
* @param entries Weighted number of entries (sum of all observed weights).
* @param value New value (note the `=>`: expression is reevaluated every time a new value is needed).
* @param bins Map of string category and the associated container of values associated with it.
*/
class Categorizing[DATUM, V <: Container[V] with Aggregation{type Datum >: DATUM}] private[histogrammar](val quantity: UserFcn[DATUM, String], var entries: Double, value: => V, val bins: mutable.HashMap[String, V]) extends Container[Categorizing[DATUM, V]] with AggregationOnData with CategoricalQuantity[DATUM] {
protected val v = value
type Type = Categorizing[DATUM, V]
type EdType = Categorized[v.EdType]
type Datum = DATUM
def factory = Categorize
if (entries < 0.0)
throw new ContainerException(s"entries ($entries) cannot be negative")
/** Number of `bins`. */
def size = bins.size
/** Iterable over the keys of the `bins`. */
def keys: Iterable[String] = bins.toIterable.map(_._1)
/** Iterable over the values of the `bins`. */
def values: Iterable[V] = bins.toIterable.map(_._2)
/** Set of keys among the `bins`. */
def keySet: Set[String] = keys.toSet
/** Attempt to get key `x`, throwing an exception if it does not exist. */
def apply(x: String) = bins(x)
/** Attempt to get key `x`, returning `None` if it does not exist. */
def get(x: String) = bins.get(x)
/** Attempt to get key `x`, returning an alternative if it does not exist. */
def getOrElse(x: String, default: => V) = bins.getOrElse(x, default)
def zero = new Categorizing[DATUM, V](quantity, 0.0, value, mutable.HashMap[String, V]())
def +(that: Categorizing[DATUM, V]) =
if (this.quantity.name != that.quantity.name)
throw new ContainerException(s"cannot add ${getClass.getName} because quantity name differs (${this.quantity.name} vs ${that.quantity.name})")
else
new Categorizing[DATUM, V](
this.quantity,
this.entries + that.entries,
this.value,
mutable.HashMap[String, V]((this.keySet union that.keySet).toSeq map {key =>
if ((this.bins contains key) && (that.bins contains key))
(key, this.bins(key) + that.bins(key))
else if (this.bins contains key)
(key, this.bins(key))
else
(key, that.bins(key))
}: _*))
def *(factor: Double) =
if (factor.isNaN || factor <= 0.0)
zero
else
new Categorizing[DATUM, V](quantity, factor * entries, value, mutable.HashMap[String, V](bins.toSeq.map({case (c, v) => (c, v * factor)}): _*))
def fill[SUB <: Datum](datum: SUB, weight: Double = 1.0) {
checkForCrossReferences()
if (weight > 0.0) {
val q = quantity(datum)
if (!(bins contains q))
bins(q) = value.zero
bins(q).fill(datum, weight)
// no possibility of exception from here on out (for rollback)
entries += weight
}
}
def children = value :: values.toList
def toJsonFragment(suppressName: Boolean) = JsonObject(
"entries" -> JsonFloat(entries),
"bins:type" -> JsonString(value.factory.name),
"bins" -> JsonObject(bins.toSeq map {case (k, v) => (JsonString(k), v.toJsonFragment(true))}: _*)).
maybe(JsonString("name") -> (if (suppressName) None else quantity.name.map(JsonString(_)))).
maybe(JsonString("bins:name") -> List(value).collect({case v: AnyQuantity[_, _] => v.quantity.name}).headOption.flatten.map(JsonString(_)))
override def toString() = s""""""
override def equals(that: Any) = that match {
case that: Categorizing[DATUM, V] => this.quantity == that.quantity && this.entries === that.entries && this.bins == that.bins
case _ => false
}
override def hashCode() = (quantity, entries, bins).hashCode()
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy