org.dianahep.histogrammar.primitives.count.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of histogrammar_2.10 Show documentation
Histogram abstraction to simplify complex aggregations in distributed environments.
The newest version!
// Copyright 2016 DIANA-HEP
// 
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// 
//     http://www.apache.org/licenses/LICENSE-2.0
// 
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package org.dianahep

import scala.language.existentials

import org.dianahep.histogrammar._
import org.dianahep.histogrammar.json._
import org.dianahep.histogrammar.util._

package histogrammar {
  //////////////////////////////////////////////////////////////// Count/Counted/Counting

  /** Count entries by accumulating the sum of all observed weights or a sum of transformed weights (e.g. sum of squares of weights).
    * 
    * An optional `transform` function can be applied to the weights before summing. To accumulate the sum of squares of weights, use `{x: Double => x*x}`, for instance. This is unlike any other primitive's `quantity` function in that its domain is the ''weights'' (always double), not ''data'' (any type).
    * 
    * Factory produces mutable [[org.dianahep.histogrammar.Counting]] and immutable [[org.dianahep.histogrammar.Counted]] objects.
    */
  object Count extends Factory {
    val name = "Count"
    val help = "Count entries by accumulating the sum of all observed weights or a sum of transformed weights (e.g. sum of squares of weights)."
    val detailedHelp = """An optional `transform` function can be applied to the weights before summing. To accumulate the sum of squares of weights, use `{x: Double => x*x}`, for instance. This is unlike any other primitive's `quantity` function in that its domain is the ''weights'' (always double), not ''data'' (any type)."""

    object Identity extends UserFcn[Double, Double] {
      def name = None
      def hasCache = false
      def apply[SUB <: Double](x: SUB) = x
    }

    /** Create an immutable [[org.dianahep.histogrammar.Counted]] from arguments (instead of JSON).
      * 
      * @param entries Weighted number of entries (sum of all observed weights).
      */
    def ed(entries: Double) = new Counted(entries)

    /** Create an empty, mutable [[org.dianahep.histogrammar.Counting]].
      * 
      * @param transform Transform each weight before adding. For instance, to collect a sum of squared weights, pass {x: Double: x*x} as `transform`.
      */
    def apply(transform: UserFcn[Double, Double] = Identity) = new Counting(0.0, transform)

    /** Synonym for `apply`. */
    def ing(transform: UserFcn[Double, Double] = Identity) = apply(transform)

    /** Use [[org.dianahep.histogrammar.Counted]] in Scala pattern-matching. */
    def unapply(x: Counted) = Some(x.entries)
    /** Use [[org.dianahep.histogrammar.Counting]] in Scala pattern-matching. */
    def unapply(x: Counting) = Some(x.entries)

    import KeySetComparisons._
    def fromJsonFragment(json: Json, nameFromParent: Option[String]): Container[_] with NoAggregation = json match {
      case JsonNumber(entries) => new Counted(entries)
      case _ => throw new JsonFormatException(json, name)
    }

    // // Confidence interval formulas for counting statistics (e.g. bin-by-bin).
    // // For more information, see https://www.ine.pt/revstat/pdf/rs120203.pdf

    // /** Compute the normal confidence interval on `number` (where `number` is a count of events in a Poisson process).
    //   * 
    //   * The normal confidence interval is `number + z * sqrt(number)`. It has good properties for large numbers.
    //   * 
    //   * @param z Informally, the "number of sigmas." Formally, `z` is the `1 - alpha/2` quantile of the standard normal distribution, where `alpha` is the error quantile. For example, for a 95% confidence level, the error (`alpha`) is 5%, so z = 1.96. Note that `z` is signed. `z = 0` yields the central value, `z = -1` yields "one sigma" below the central value, and `z = 1` yields "one sigma" above the central value.
    //   */
    // def normalConfidenceInterval(number: Double, z: Double): Double = normalConfidenceInterval(number, List(z): _*).head

    // /** Compute the normal confidence interval on `number` (where `number` is a count of events in a Poisson process).
    //   * 
    //   * The normal confidence interval is `number + z * sqrt(number)`. It has good properties for large numbers.
    //   * 
    //   * @param z Informally, the "number of sigmas." Formally, `z` is the `1 - alpha/2` quantile of the standard normal distribution, where `alpha` is the error quantile. For example, for a 95% confidence level, the error (`alpha`) is 5%, so z = 1.96. Note that `z` is signed. `z = 0` yields the central value, `z = -1` yields "one sigma" below the central value, and `z = 1` yields "one sigma" above the central value.
    //   */
    // def normalConfidenceInterval(number: Double, zs: Double*): Seq[Double] =
    //   zs.map(z => number + z * Math.sqrt(number))

    // /** Compute the Wilson confidence interval on `number` (where `number` is a count of events in a Poisson process).
    //   * 
    //   * The Wilson confidence interval is `number * (1 - 1/(9*number) + z / (3 * sqrt(number)))**3`. It has good properties for small integers, but not for numbers below one.
    //   * 
    //   * @param z Informally, the "number of sigmas." Formally, `z` is the `1 - alpha/2` quantile of the standard normal distribution, where `alpha` is the error quantile. For example, for a 95% confidence level, the error (`alpha`) is 5%, so z = 1.96. Note that `z` is signed. `z = 0` yields the central value, `z = -1` yields "one sigma" below the central value, and `z = 1` yields "one sigma" above the central value.
    //   */
    // def wilsonConfidenceInterval(number: Double, z: Double): Double = wilsonConfidenceInterval(number, List(z): _*).head

    // /** Compute the Wilson confidence interval on `number` (where `number` is a count of events in a Poisson process).
    //   * 
    //   * The Wilson confidence interval is `number * (1 - 1/(9*number) + z / (3 * sqrt(number)))**3`. It has good properties for small integers, but not for numbers below one.
    //   * 
    //   * @param z Informally, the "number of sigmas." Formally, `z` is the `1 - alpha/2` quantile of the standard normal distribution, where `alpha` is the error quantile. For example, for a 95% confidence level, the error (`alpha`) is 5%, so z = 1.96. Note that `z` is signed. `z = 0` yields the central value, `z = -1` yields "one sigma" below the central value, and `z = 1` yields "one sigma" above the central value.
    //   */
    // def wilsonConfidenceInterval(number: Double, zs: Double*): Seq[Double] =
    //   zs.map(z => number * Math.pow(1.0 - 1.0/(9.0*number) + z / (3.0 * Math.sqrt(number)), 3))

  }

  /** An accumulated count (sum of weights) of data, ignoring its content.
    * 
    * Use the factory [[org.dianahep.histogrammar.Count]] to construct an instance.
    * 
    * @param entries Weighted number of entries (sum of all weights).
    */
  class Counted private[histogrammar](val entries: Double) extends Container[Counted] with NoAggregation {
    type Type = Counted
    type EdType = Counted
    def factory = Count

    if (entries < 0.0)
      throw new ContainerException(s"entries ($entries) cannot be negative")

    def zero = new Counted(0.0)
    def +(that: Counted): Counted = new Counted(this.entries + that.entries)
    def *(factor: Double) =
      if (factor.isNaN  ||  factor <= 0.0)
        zero
      else
        new Counted(factor * entries)

    def children = Nil

    def toJsonFragment(suppressName: Boolean) = JsonFloat(entries)

    override def toString() = s""
    override def equals(that: Any) = that match {
      case that: Counted => this.entries === that.entries
      case _ => false
    }
    override def hashCode() = entries.hashCode
  }

  /** Accumulating a count (sum of weights) of data, ignoring its content.
    * 
    * Use the factory [[org.dianahep.histogrammar.Count]] to construct an instance.
    * 
    * This is the only container with [[org.dianahep.histogrammar.Aggregation]] that doesn't have a configurable data type: its `Datum` is `Any`. It is primarily for the sake of this container that `Aggregation` is contravariant.
    * 
    * @param entries Weighted number of entries (sum of all observed weights).
    */
  class Counting private[histogrammar](var entries: Double, val transform: UserFcn[Double, Double]) extends Container[Counting] with Aggregation {
    type Type = Counting
    type EdType = Counted
    type Datum = Any
    def factory = Count

    if (entries < 0.0)
      throw new ContainerException(s"entries ($entries) cannot be negative")

    def zero = new Counting(0.0, transform)
    def +(that: Counting): Counting = new Counting(this.entries + that.entries, transform)
    def *(factor: Double) =
      if (!transform.isInstanceOf[Count.Identity.type])
        throw new ContainerException("Cannot scalar-multiply Counting with a non-identity transform.")
      else if (factor.isNaN  ||  factor <= 0.0)
        zero
      else
        new Counting(factor * entries, transform)

    def fill[SUB <: Any](datum: SUB, weight: Double = 1.0) {
      checkForCrossReferences()
      // no possibility of exception from here on out (for rollback)
      if (weight > 0.0)
        entries += transform(weight)
    }

    def children = Nil

    def toJsonFragment(suppressName: Boolean) = JsonFloat(entries)

    override def toString() = s""
    override def equals(that: Any) = that match {
      case that: Counting => this.entries === that.entries  &&  this.transform == that.transform
      case _ => false
    }
    override def hashCode() = (entries, transform).hashCode
  }
}




// Note on statistics:

// Could follow http://statpages.info/confint.html to provide exact Clopper-Pearson bounds for Poisson (Count) and Binomial (Fraction) confidence levels.

// Inefficient calculation (JavaScript) that assumes the number of entries is an integer (and scales with it!):

// var vTL=2.5; var vTU=2.5; var vCL=95

// function CalcCL(form) {
//     vTL = eval(form.TL.value)
//     vTU = eval(form.TU.value)
//     vCL = 100-(vTL+vTU)
//     form.CL.value = ''+vCL
// }

// function CalcTails(form) {
//     vCL = eval(form.CL.value)
//     vTU = (100-vCL)/2
//     vTL = vTU
//     form.TL.value = ''+vTL
//     form.TU.value = ''+vTU
// }

// function CalcBin(form) {
//     var vx = eval(form.x.value)
//     var vN = eval(form.N.value)
//     var vP = vx/vN
//     form.P.value = Fmt(vP)
//     if(vx==0) {
//         form.DL.value = "0.0000"
//     }
//     else {
//         var v=vP/2;
//         vsL=0;
//         vsH=vP;
//         var p=vTL/100;
//         while ((vsH-vsL)>1e-5) {
//             if(BinP(vN,v,vx,vN)>p) {
//                 vsH=v;
//                 v=(vsL+v)/2
//             }
//             else {
//                 vsL=v;
//                 v=(v+vsH)/2
//             }
//         }
//         form.DL.value = Fmt(v)
//     }
//     if(vx==vN) {
//         form.DU.value = "1.0000"
//     }
//     else {
//         var v=(1+vP)/2;
//         vsL=vP;
//         vsH=1;
//         var p=vTU/100
//         while ((vsH-vsL)>1e-5) {
//             if (BinP(vN,v,0,vx)=x1 & k<=x2) {
//             s=s+v
//         }
//         if (tot>1e30) {
//             s=s/1e30;
//             tot=tot/1e30;
//             v=v/1e30
//         }
//         k=k+1;
//         v=v*q*(N+1-k)/k
//     }
//     return s/tot
// }

// function CalcPois(form) {
//     var vZ = eval(form.Z.value)
//     if(vZ==0) {
//         form.QL.value = "0.0000"
//     }
//     else {
//         var v=0.5;
//         var dv=0.5;
//         var p=vTL/100
//         while (dv>1e-7) {
//             dv=dv/2;
//             if (PoisP((1+vZ)*v/(1-v),vZ,1e10)>p) {
//                 v=v-dv
//             }
//             else {
//                 v=v+dv
//             }
//         }
//         form.QL.value = Fmt((1+vZ)*v/(1-v))
//     }
//     if(vTU==0) {
//         form.QU.value = "Infinity"
//     }
//     else {
//         var v=0.5;
//         var dv=0.5;
//         var p=vTU/100
//         while (dv>1e-7) {
//             dv=dv/2;
//             if (PoisP((1+vZ)*v/(1-v),0,vZ)(tot*1e-10)) {
//         tot=tot+q
//         if(k>=x1 & k<=x2) {
//             s=s+q
//         }
//         if (tot>1e30) {
//             s=s/1e30;
//             tot=tot/1e30;
//             q=q/1e30
//         }
//         k=k+1;
//         q=q*Z/k
//     }
//     return s/tot
// }

// function Fmt(x) { 
//     var v
//     if (x>=0) {
//         v=''+(x+0.00005)
//     }
//     else {
//         v=''+(x-0.00005)
//     }
//     return v.substring(0,v.indexOf('.')+5)
// }