
org.dianahep.histogrammar.primitives.sparselybin.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of histogrammar_2.10 Show documentation
Show all versions of histogrammar_2.10 Show documentation
Histogram abstraction to simplify complex aggregations in distributed environments.
The newest version!
// Copyright 2016 DIANA-HEP
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package org.dianahep
import scala.collection.mutable
import scala.collection.immutable.SortedMap
import scala.language.existentials
import org.dianahep.histogrammar.json._
import org.dianahep.histogrammar.util._
package histogrammar {
//////////////////////////////////////////////////////////////// SparselyBin/SparselyBinned/SparselyBinning
/** Split a quantity into equally spaced bins, creating them whenever their `entries` would be non-zero. Exactly one sub-aggregator is filled per datum.
*
* Use this when you have a distribution of known scale (bin width) but unknown domain (lowest and highest bin index).
*
* Unlike fixed-domain binning, this aggregator has the potential to use unlimited memory. A large number of ''distinct'' outliers can generate many unwanted bins.
*
* Like fixed-domain binning, the bins are indexed by integers, though they are 64-bit and may be negative. Bin indexes below `-(2**63 - 1)` are put in the `-(2**63 - 1)` bin and indexes above `(2**63 - 1)` are put in the `(2**63 - 1)` bin.
*
* Factory produces mutable [[org.dianahep.histogrammar.SparselyBinning]] and immutable [[org.dianahep.histogrammar.SparselyBinned]] objects.
*/
object SparselyBin extends Factory {
  val name = "SparselyBin"
  val help = "Split a quantity into equally spaced bins, creating them whenever their `entries` would be non-zero. Exactly one sub-aggregator is filled per datum."
  val detailedHelp = """Use this when you have a distribution of known scale (bin width) but unknown domain (lowest and highest bin index).
Unlike fixed-domain binning, this aggregator has the potential to use unlimited memory. A large number of _distinct_ outliers can generate many unwanted bins.
Like fixed-domain binning, the bins are indexed by integers, though they are 64-bit and may be negative. Bin indexes below `-(2**63 - 1)` are put in the `-(2**63 - 1)` are bin and indexes above `(2**63 - 1)` are put in the `(2**63 - 1)` bin."""

  /** Shape of a valid key in the JSON `bins` object: an optionally negative run of decimal digits. */
  private val integerPattern = "-?[0-9]+".r

  /** Create an immutable [[org.dianahep.histogrammar.SparselyBinned]] from arguments (instead of JSON).
    *
    * The resulting container has no quantity name.
    *
    * @param binWidth Width of the equally sized bins.
    * @param entries Weighted number of entries (sum of all observed weights).
    * @param contentType Name of the intended content; used as a placeholder in cases with zero bins (due to no observed data).
    * @param bins Contents of each non-empty bin, keyed by bin index.
    * @param nanflow Container for data that resulted in `NaN`.
    * @param origin Left edge of the bin whose index is zero.
    */
  def ed[V <: Container[V] with NoAggregation, N <: Container[N] with NoAggregation](binWidth: Double, entries: Double, contentType: String, bins: SortedMap[Long, V], nanflow: N, origin: Double): SparselyBinned[V, N] =
    new SparselyBinned[V, N](binWidth, entries, None, contentType, bins, nanflow, origin)

  /** Create an empty, mutable [[org.dianahep.histogrammar.SparselyBinning]].
    *
    * The new aggregator starts with zero entries and no bins.
    *
    * @param binWidth Width of the equally sized bins.
    * @param quantity Numerical function to split into bins.
    * @param value Template used to create zero values (by calling this `value`'s `zero` method).
    * @param nanflow Container for data that resulted in `NaN`.
    * @param origin Left edge of the bin whose index is zero.
    */
  def apply[DATUM, V <: Container[V] with Aggregation{type Datum >: DATUM}, N <: Container[N] with Aggregation{type Datum >: DATUM}]
    (binWidth: Double,
     quantity: UserFcn[DATUM, Double],
     value: => V = Count(),
     nanflow: N = Count(),
     origin: Double = 0.0): SparselyBinning[DATUM, V, N] =
    new SparselyBinning[DATUM, V, N](binWidth, quantity, 0.0, value, mutable.HashMap[Long, V](), nanflow, origin)

  /** Synonym for `apply`. */
  def ing[DATUM, V <: Container[V] with Aggregation{type Datum >: DATUM}, N <: Container[N] with Aggregation{type Datum >: DATUM}]
    (binWidth: Double,
     quantity: UserFcn[DATUM, Double],
     value: => V = Count(),
     nanflow: N = Count(),
     origin: Double = 0.0): SparselyBinning[DATUM, V, N] =
    apply(binWidth, quantity, value, nanflow, origin)

  /** Accessors and index arithmetic shared by the containers this factory produces. */
  trait Methods {
    /** Width of the equally sized bins. */
    def binWidth: Double
    /** Left edge of the bin whose index is zero. */
    def origin: Double
    /** The number of non-empty bins. */
    def numFilled: Int
    /** The number of bin indexes spanned from the first non-empty bin through the last non-empty bin, both inclusive. */
    def num: Long
    /** The first non-empty bin. */
    def minBin: Option[Long]
    /** The last non-empty bin. */
    def maxBin: Option[Long]
    /** Low edge of the first non-empty bin, if any. */
    def low: Option[Double]
    /** High edge of the last non-empty bin, if any. */
    def high: Option[Double]
    /** Get a sequence of filled indexes. */
    def indexes: Seq[Long]
    /** Get the low and high edge of a bin (given by index number). */
    def range(index: Long): (Double, Double)

    /** Find the bin index associated with numerical value `x`.
      *
      * @return `Long.MIN_VALUE` if `x` is `NaN`, the bin index if it is between `Long.MIN_VALUE + 1` and `Long.MAX_VALUE`, otherwise saturate at the endpoints.
      */
    def bin(x: Double): Long =
      if (nan(x))
        java.lang.Long.MIN_VALUE
      else {
        val out = Math.floor((x - origin) / binWidth)
        // Compare against the bounds as doubles. Long.MIN_VALUE + 1 widens to exactly -2^63,
        // so a strict "<" would let an index of exactly -2^63 through to toLong, which yields
        // Long.MIN_VALUE: the value reserved above for NaN.
        if (out <= java.lang.Long.MIN_VALUE.toDouble)
          java.lang.Long.MIN_VALUE + 1
        else if (out >= java.lang.Long.MAX_VALUE.toDouble)
          java.lang.Long.MAX_VALUE
        else
          out.toLong
      }

    /** Return `true` iff `x` is in the nanflow region (equal to `NaN`). */
    def nan(x: Double): Boolean = x.isNaN
  }

  import KeySetComparisons._

  /** Build an immutable [[org.dianahep.histogrammar.SparselyBinned]] from its JSON representation.
    *
    * The object must have the keys `binWidth`, `entries`, `bins:type`, `bins`, `nanflow:type`, `nanflow` and `origin`; `name` and `bins:name` are optional.
    * Each key of `bins` must be a decimal integer that fits in a `Long`.
    *
    * @param json JSON fragment to interpret.
    * @param nameFromParent Quantity name supplied by an enclosing container; a `name` found in `json` takes precedence over it.
    * @throws JsonFormatException if `json` is not an object with the expected keys or any field has the wrong type.
    */
  def fromJsonFragment(json: Json, nameFromParent: Option[String]): Container[_] with NoAggregation = json match {
    case JsonObject(pairs @ _*) if (pairs.keySet has Set("binWidth", "entries", "bins:type", "bins", "nanflow:type", "nanflow", "origin").maybe("name").maybe("bins:name")) =>
      val get = pairs.toMap

      val entries = get("entries") match {
        case JsonNumber(x) => x
        case x => throw new JsonFormatException(x, name + ".entries")
      }

      val quantityName = get.getOrElse("name", JsonNull) match {
        case JsonString(x) => Some(x)
        case JsonNull => None
        case x => throw new JsonFormatException(x, name + ".name")
      }

      val binWidth = get("binWidth") match {
        case JsonNumber(x) => x
        case x => throw new JsonFormatException(x, name + ".binWidth")
      }

      val (contentType, binsFactory) = get("bins:type") match {
        case JsonString(typeName) => (typeName, Factory(typeName))
        case x => throw new JsonFormatException(x, name + ".bins:type")
      }

      val binsName = get.getOrElse("bins:name", JsonNull) match {
        case JsonString(x) => Some(x)
        case JsonNull => None
        case x => throw new JsonFormatException(x, name + ".bins:name")
      }

      val bins = get("bins") match {
        case JsonObject(indexBins @ _*) =>
          SortedMap(indexBins map {
            case (key @ JsonString(i), v) if (integerPattern.pattern.matcher(i).matches) =>
              // The regex only checks the shape of the key; a digit string too long for a
              // Long still has to be reported as a format error rather than a raw
              // NumberFormatException.
              val index =
                try {
                  i.toLong
                }
                catch {
                  case _: NumberFormatException => throw new JsonFormatException(key, name + ".bins key must be an integer")
                }
              (index, binsFactory.fromJsonFragment(v, binsName))
            case (i, _) => throw new JsonFormatException(i, name + ".bins key must be an integer")
          }: _*)
        case x => throw new JsonFormatException(x, name + ".bins")
      }

      val nanflowFactory = get("nanflow:type") match {
        case JsonString(typeName) => Factory(typeName)
        case x => throw new JsonFormatException(x, name + ".nanflow:type")
      }
      val nanflow = nanflowFactory.fromJsonFragment(get("nanflow"), None)

      val origin = get("origin") match {
        case JsonNumber(x) => x
        case x => throw new JsonFormatException(x, name + ".origin")
      }

      // lastOption picks the name embedded in this fragment when present, else the parent's.
      new SparselyBinned(binWidth, entries, (nameFromParent ++ quantityName).lastOption, contentType, bins.asInstanceOf[SortedMap[Long, C] forSome {type C <: Container[C] with NoAggregation}], nanflow.asInstanceOf[C forSome {type C <: Container[C] with NoAggregation}], origin)

    case _ => throw new JsonFormatException(json, name)
  }
}
/** An accumulated quantity that was split into equally spaced bins, filling only one bin per datum and creating new bins as necessary.
*
* Use the factory [[org.dianahep.histogrammar.SparselyBin]] to construct an instance.
*
* @param binWidth Width of the equally sized bins.
* @param entries Weighted number of entries (sum of all observed weights).
* @param quantityName Optional name given to the quantity function, passed for bookkeeping.
* @param contentType Name of the intended content; used as a placeholder in cases with zero bins (due to no observed data).
* @param bins Contents of each non-empty bin, keyed by bin index.
* @param nanflow Container for data that resulted in `NaN`.
* @param origin Left edge of the bin whose index is zero.
*/
class SparselyBinned[V <: Container[V] with NoAggregation, N <: Container[N] with NoAggregation] private[histogrammar](val binWidth: Double, val entries: Double, val quantityName: Option[String], val contentType: String, val bins: SortedMap[Long, V], val nanflow: N, val origin: Double) extends Container[SparselyBinned[V, N]] with NoAggregation with QuantityName with SparselyBin.Methods {
  type Type = SparselyBinned[V, N]
  type EdType = SparselyBinned[V, N]
  def factory = SparselyBin

  // Constructor-time validation of the numeric parameters.
  if (entries < 0.0)
    throw new ContainerException(s"entries ($entries) cannot be negative")
  if (binWidth <= 0.0)
    throw new ContainerException(s"binWidth ($binWidth) must be greater than zero")

  /** An empty container with the same bin width, origin, names and content type, and a zeroed nanflow. */
  def zero = new SparselyBinned[V, N](binWidth, 0.0, quantityName, contentType, SortedMap[Long, V](), nanflow.zero, origin)

  /** Combine two containers bin by bin.
    *
    * Bins present on both sides are added; bins present on only one side are carried over unchanged.
    *
    * @throws ContainerException if the quantity names, bin widths or origins differ.
    */
  def +(that: SparselyBinned[V, N]) = {
    if (this.quantityName != that.quantityName)
      throw new ContainerException(s"cannot add ${getClass.getName} because quantityName differs (${this.quantityName} vs ${that.quantityName})")
    if (this.binWidth != that.binWidth)
      throw new ContainerException(s"cannot add ${getClass.getName} because binWidth differs (${this.binWidth} vs ${that.binWidth})")
    if (this.origin != that.origin)
      throw new ContainerException(s"cannot add ${getClass.getName} because origin differs (${this.origin} vs ${that.origin})")

    val newbins =
      SortedMap[Long, V]((this.bins.keySet union that.bins.keySet).toSeq map { i =>
        (this.bins.get(i), that.bins.get(i)) match {
          case (Some(v1), Some(v2)) => i -> (v1 + v2)
          case (Some(v1), None) => i -> v1
          case (None, Some(v2)) => i -> v2
          // Every index comes from the union of the two key sets, so one side always has it.
          case _ => throw new Exception("can't get here")
        }
      }: _*)

    new SparselyBinned[V, N](binWidth, this.entries + that.entries, quantityName, contentType, newbins, this.nanflow + that.nanflow, origin)
  }

  /** Scale the entries, every bin and the nanflow by `factor`; a `NaN` or non-positive factor yields `zero`. */
  def *(factor: Double) =
    if (factor.isNaN || factor <= 0.0)
      zero
    else
      new SparselyBinned[V, N](
        binWidth,
        factor * entries,
        quantityName,
        contentType,
        SortedMap[Long, V](bins.toSeq map {case (i, x) => (i, x * factor)}: _*),
        nanflow * factor,
        origin)

  def numFilled = bins.size
  def num = if (bins.isEmpty) 0L else 1L + bins.last._1 - bins.head._1
  // bins is sorted by index, so its first and last entries are the extreme bins.
  def minBin = bins.headOption.map(_._1)
  def maxBin = bins.lastOption.map(_._1)
  def low = minBin.map(i => i * binWidth + origin)
  def high = maxBin.map(i => (i + 1L) * binWidth + origin)

  /** Extract the container at a given index, if it exists. */
  def at(index: Long) = bins.get(index)   // keyed lookup instead of scanning every bin

  def indexes = bins.map(_._1).toSeq
  def range(index: Long) = (index * binWidth + origin, (index + 1) * binWidth + origin)

  /** The bin contents, in index order. */
  def values = bins.map(_._2)

  /** The nanflow followed by every bin's container. */
  def children = nanflow :: values.toList

  /** Serialize to JSON.
    *
    * `bins:type` falls back to `contentType` when there are no bins, and `bins:name` is emitted only when the first bin carries a quantity name.
    *
    * @param suppressName If `true`, omit this container's own `name` field.
    */
  def toJsonFragment(suppressName: Boolean) = JsonObject(
    "binWidth" -> JsonFloat(binWidth),
    "entries" -> JsonFloat(entries),
    "bins:type" -> JsonString(if (bins.isEmpty) contentType else bins.head._2.factory.name),
    "bins" -> JsonObject(bins.toSeq map {case (i, v) => (JsonString(i.toString), v.toJsonFragment(true))}: _*),
    "nanflow:type" -> JsonString(nanflow.factory.name),
    "nanflow" -> nanflow.toJsonFragment(false),
    "origin" -> JsonFloat(origin)).
    maybe(JsonString("name") -> (if (suppressName) None else quantityName.map(JsonString(_)))).
    maybe(JsonString("bins:name") -> (bins.headOption match {case Some((_, v: QuantityName)) => v.quantityName.map(JsonString(_)); case _ => None}))

  // NOTE(review): this method previously returned an empty string; the summary text below is a
  // reconstruction and should be confirmed against the upstream project.
  override def toString() = s"""<SparselyBinned binWidth=$binWidth bins=$contentType nanflow=${nanflow.factory.name}>"""

  override def equals(that: Any) = that match {
    case that: SparselyBinned[V, N] => this.binWidth === that.binWidth && this.entries === that.entries && this.quantityName == that.quantityName && this.bins == that.bins && this.nanflow == that.nanflow && this.origin === that.origin
    case _ => false
  }
  override def hashCode() = (binWidth, entries, quantityName, bins, nanflow, origin).hashCode
}
/** Accumulating a quantity by splitting it into equally spaced bins, filling only one bin per datum and creating new bins as necessary.
*
* Use the factory [[org.dianahep.histogrammar.SparselyBin]] to construct an instance.
*
* @param binWidth Width of the equally sized bins.
* @param quantity Numerical function to split into bins.
* @param entries Weighted number of entries (sum of all observed weights).
* @param value New value (note the `=>`: expression is reevaluated every time a new value is needed).
* @param bins Contents of each non-empty bin, keyed by bin index.
* @param nanflow Container for data that resulted in `NaN`.
* @param origin Left edge of the bin whose index is zero.
*/
class SparselyBinning[DATUM, V <: Container[V] with Aggregation{type Datum >: DATUM}, N <: Container[N] with Aggregation{type Datum >: DATUM}] private[histogrammar]
  (val binWidth: Double,
   val quantity: UserFcn[DATUM, Double],
   var entries: Double,
   value: => V,
   val bins: mutable.Map[Long, V],
   val nanflow: N,
   val origin: Double) extends Container[SparselyBinning[DATUM, V, N]] with AggregationOnData with NumericalQuantity[DATUM] with SparselyBin.Methods {

  // One evaluation of the by-name template, giving a stable path for v.EdType below.
  protected val v = value

  type Type = SparselyBinning[DATUM, V, N]
  type EdType = SparselyBinned[v.EdType, nanflow.EdType]
  type Datum = DATUM
  def factory = SparselyBin

  // Constructor-time validation of the numeric parameters.
  if (entries < 0.0)
    throw new ContainerException(s"entries ($entries) cannot be negative")
  if (binWidth <= 0.0)
    throw new ContainerException(s"binWidth ($binWidth) must be greater than zero")

  /** An empty aggregator with the same bin width, quantity, value template and origin, and a zeroed nanflow. */
  def zero = new SparselyBinning[DATUM, V, N](binWidth, quantity, 0.0, value, mutable.Map[Long, V](), nanflow.zero, origin)

  /** Combine two aggregators bin by bin into a new one.
    *
    * Bins present on both sides are added; bins present on only one side are carried over unchanged.
    *
    * @throws ContainerException if the quantity names, bin widths or origins differ.
    */
  def +(that: SparselyBinning[DATUM, V, N]) = {
    if (this.quantity.name != that.quantity.name)
      throw new ContainerException(s"cannot add ${getClass.getName} because quantity name differs (${this.quantity.name} vs ${that.quantity.name})")
    if (this.binWidth != that.binWidth)
      throw new ContainerException(s"cannot add ${getClass.getName} because binWidth differs (${this.binWidth} vs ${that.binWidth})")
    if (this.origin != that.origin)
      throw new ContainerException(s"cannot add ${getClass.getName} because origin differs (${this.origin} vs ${that.origin})")

    val newbins =
      mutable.Map[Long, V]((this.bins.keySet union that.bins.keySet).toSeq map { i =>
        (this.bins.get(i), that.bins.get(i)) match {
          case (Some(v1), Some(v2)) => i -> (v1 + v2)
          case (Some(v1), None) => i -> v1
          case (None, Some(v2)) => i -> v2
          // Every index comes from the union of the two key sets, so one side always has it.
          case _ => throw new Exception("can't get here")
        }
      }: _*)

    new SparselyBinning[DATUM, V, N](binWidth, this.quantity, this.entries + that.entries, this.value, newbins, this.nanflow + that.nanflow, origin)
  }

  /** Scale the entries, every bin and the nanflow by `factor`; a `NaN` or non-positive factor yields `zero`. */
  def *(factor: Double) =
    if (factor.isNaN || factor <= 0.0)
      zero
    else
      new SparselyBinning[DATUM, V, N](
        binWidth,
        quantity,
        factor * entries,
        value,
        mutable.Map[Long, V](bins.toSeq map {case (i, x) => (i, x * factor)}: _*),
        nanflow * factor,
        origin)

  /** Route one datum to the nanflow or to its bin, creating the bin on first use.
    *
    * Data with a non-positive (or `NaN`) weight are ignored entirely.
    *
    * @param datum Input passed to `quantity` and then on to the selected sub-aggregator.
    * @param weight Weight of this datum.
    */
  def fill[SUB <: Datum](datum: SUB, weight: Double = 1.0): Unit = {
    checkForCrossReferences()
    if (weight > 0.0) {
      val q = quantity(datum)
      if (nan(q))
        nanflow.fill(datum, weight)
      else
        // A single lookup that inserts a fresh zero only when the bin is missing.
        bins.getOrElseUpdate(bin(q), value.zero).fill(datum, weight)

      // no possibility of exception from here on out (for rollback)
      entries += weight
    }
  }

  def numFilled = bins.size
  // bins is an unordered mutable map, so the extreme indexes are found by scanning its keys.
  def num =
    if (bins.isEmpty)
      0L
    else {
      val keys = bins.keys
      1L + keys.max - keys.min
    }
  def minBin = if (bins.isEmpty) None else Some(bins.keys.min)
  def maxBin = if (bins.isEmpty) None else Some(bins.keys.max)
  def low = minBin.map(i => i * binWidth + origin)
  def high = maxBin.map(i => (i + 1L) * binWidth + origin)

  /** Extract the container at a given index, if it exists. */
  def at(index: Long) = bins.get(index)

  def indexes = bins.map(_._1).toSeq
  def range(index: Long) = (index * binWidth + origin, (index + 1) * binWidth + origin)

  /** The bin contents, in the map's iteration order. */
  def values = bins.map(_._2)

  /** The value template, the nanflow, and then every bin's container. */
  def children = value :: nanflow :: values.toList

  /** Serialize to JSON.
    *
    * `bins:type` is taken from the value template, so it is present even when no bin has been filled.
    *
    * @param suppressName If `true`, omit this aggregator's own `name` field.
    */
  def toJsonFragment(suppressName: Boolean) = JsonObject(
    "binWidth" -> JsonFloat(binWidth),
    "entries" -> JsonFloat(entries),
    "bins:type" -> JsonString(value.factory.name),
    "bins" -> JsonObject(bins.toSeq map {case (i, v) => (JsonString(i.toString), v.toJsonFragment(true))}: _*),
    "nanflow:type" -> JsonString(nanflow.factory.name),
    "nanflow" -> nanflow.toJsonFragment(false),
    "origin" -> JsonFloat(origin)).
    maybe(JsonString("name") -> (if (suppressName) None else quantity.name.map(JsonString(_)))).
    maybe(JsonString("bins:name") -> List(value).collect({case v: AnyQuantity[_, _] => v.quantity.name}).headOption.flatten.map(JsonString(_)))

  // NOTE(review): this method previously returned an empty string; the summary text below is a
  // reconstruction and should be confirmed against the upstream project.
  override def toString() = s"""<SparselyBinning binWidth=$binWidth bins=${v.factory.name} nanflow=${nanflow.factory.name}>"""

  override def equals(that: Any) = that match {
    case that: SparselyBinning[DATUM, V, N] => this.binWidth === that.binWidth && this.quantity == that.quantity && this.entries === that.entries && this.bins == that.bins && this.nanflow == that.nanflow && this.origin === that.origin
    case _ => false
  }
  override def hashCode() = (binWidth, quantity, entries, bins, nanflow, origin).hashCode
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy