All Downloads are FREE. Search and download functionalities are using the official Maven repository.

commonMain.jetbrains.datalore.plot.base.stat.BoxplotStat.kt Maven / Gradle / Ivy

/*
 * Copyright (c) 2019. JetBrains s.r.o.
 * Use of this source code is governed by the MIT license that can be found in the LICENSE file.
 */

package jetbrains.datalore.plot.base.stat

import jetbrains.datalore.plot.base.Aes
import jetbrains.datalore.plot.base.Aes.Companion.WIDTH
import jetbrains.datalore.plot.base.DataFrame
import jetbrains.datalore.plot.base.StatContext
import jetbrains.datalore.plot.base.data.TransformVar
import jetbrains.datalore.plot.common.data.SeriesUtil
import kotlin.math.sqrt

/**
 * Calculates the components of a box-and-whisker plot.
 *
 * Creates a "stat" dataframe containing:
 *  a) "box" data-points (one per distinct x value)
 *      x
 *      y = NaN
 *      width    - width of box (added only when `computeWidth` is enabled)
 *      ymin     - lower whisker = smallest observation greater than or equal to lower hinge - coef * IQR
 *      lower    - lower hinge, 25% quantile
 *      middle   - median, 50% quantile
 *      upper    - upper hinge, 75% quantile
 *      ymax     - upper whisker = largest observation less than or equal to upper hinge + coef * IQR
 *
 *  b) "outlier" data-points (one per observation lying outside the fences)
 *      x, y, width
 *      ymin, lower... = NaN
 *
 * Here `coef` is [whiskerIQRRatio] (ggplot default: 1.5).
 *
 * Not implemented:
 * notchlower   - lower edge of notch = median - 1.58 * IQR / sqrt(n)
 * notchupper   - upper edge of notch = median + 1.58 * IQR / sqrt(n)
 *
 * @param whiskerIQRRatio multiplier of the IQR used to position the fences (ggplot: 'coef').
 * @param computeWidth whether to emit a 'width' series proportional to the square root
 *        of the number of observations in each bin (ggplot: 'varWidth').
 */
class BoxplotStat(
    private val whiskerIQRRatio: Double,    // ggplot: 'coef'
    private val computeWidth: Boolean       // ggplot: 'varWidth'
) : BaseStat(DEF_MAPPING) {

    /**
     * WIDTH has a default mapping only when the width is actually computed;
     * all other aesthetics are resolved by the base class.
     */
    override fun hasDefaultMapping(aes: Aes<*>): Boolean {
        return super.hasDefaultMapping(aes) ||
                (aes == WIDTH && computeWidth)
    }

    /**
     * Maps WIDTH to [Stats.WIDTH]; delegates every other aesthetic to the base class.
     */
    override fun getDefaultMapping(aes: Aes<*>): DataFrame.Variable {
        return if (aes == WIDTH) {
            Stats.WIDTH
        } else {
            super.getDefaultMapping(aes)
        }
    }

    /**
     * The aesthetics this stat reads from the input data.
     */
    override fun consumes(): List<Aes<*>> {
        return listOf(Aes.X, Aes.Y)
    }

    /**
     * Builds the boxplot "stat" dataframe from the transformed X/Y series of [data].
     *
     * If [data] has no X series, all observations are placed in a single bin at x = 0.0.
     * Returns the "empty stat values" dataframe when Y is missing or when no finite
     * (x, y) observation is available.
     */
    override fun apply(data: DataFrame, statCtx: StatContext, messageConsumer: (s: String) -> Unit): DataFrame {
        if (!hasRequiredValues(data, Aes.Y)) {
            return withEmptyStatValues()
        }

        val ys = data.getNumeric(TransformVar.Y)
        val xs = if (data.has(TransformVar.X)) {
            data.getNumeric(TransformVar.X)
        } else {
            List(ys.size) { 0.0 }
        }

        val statData = buildStat(xs, ys, whiskerIQRRatio)

        // COUNT is an auxiliary series: it is only used for computing 'width'
        // and must not end up in the resulting dataframe.
        // It is absent when 'buildStat' found no usable observations.
        val statCount = statData.remove(Stats.COUNT) ?: return withEmptyStatValues()
        val maxCountPerBin = statCount.maxOrNull()?.toInt() ?: 0
        if (maxCountPerBin == 0) {
            return withEmptyStatValues()
        }
        if (computeWidth) {
            // 'width' is in range 0..1
            val norm = sqrt(maxCountPerBin.toDouble())
            statData[Stats.WIDTH] = statCount.map { count -> sqrt(count) / norm }
        }

        val builder = DataFrame.Builder()
        for ((variable, series) in statData) {
            builder.putNumeric(variable, series)
        }
        return builder.build()
    }

    companion object {
        const val DEF_WHISKER_IQR_RATIO = 1.5
        const val DEF_COMPUTE_WIDTH = false

        private val DEF_MAPPING: Map<Aes<*>, DataFrame.Variable> = mapOf(
            Aes.X to Stats.X,
            Aes.Y to Stats.Y,
            Aes.YMIN to Stats.Y_MIN,
            Aes.YMAX to Stats.Y_MAX,
            Aes.LOWER to Stats.LOWER,
            Aes.MIDDLE to Stats.MIDDLE,
            Aes.UPPER to Stats.UPPER
        )

        /**
         * Computes box and outlier data-points for observations grouped by their x value.
         *
         * Pairs in which x or y is not finite are ignored.
         *
         * @param xs x values; paired with [ys] by position.
         * @param ys y values.
         * @param whiskerIQRRatio multiplier of the IQR used to position the fences.
         * @return a mutable map with the series X, Y, MIDDLE, LOWER, UPPER, Y_MIN, Y_MAX and COUNT
         *         (all of the same length), or an empty map if there is no finite (x, y) pair.
         *         COUNT holds the size of the bin each data-point belongs to.
         */
        fun buildStat(
            xs: List<Double?>,
            ys: List<Double?>,
            whiskerIQRRatio: Double
        ): MutableMap<DataFrame.Variable, List<Double>> {

            // Group finite observations by x.
            val binnedData: MutableMap<Double, MutableList<Double>> = HashMap()
            for ((x, y) in xs.zip(ys)) {
                // The explicit null checks let the compiler smart-cast x and y to non-null.
                if (x != null && y != null && SeriesUtil.allFinite(x, y)) {
                    binnedData.getOrPut(x) { ArrayList() }.add(y)
                }
            }
            if (binnedData.isEmpty()) {
                return mutableMapOf()
            }

            val statX = ArrayList<Double>()
            val statY = ArrayList<Double>()
            val statMiddle = ArrayList<Double>()
            val statLower = ArrayList<Double>()
            val statUpper = ArrayList<Double>()
            val statMin = ArrayList<Double>()
            val statMax = ArrayList<Double>()

            val statCount = ArrayList<Double>()

            for ((x, bin) in binnedData) {
                val count = bin.size.toDouble()

                val summary = FiveNumberSummary(bin)
                val middle = summary.median
                val lowerHinge = summary.firstQuartile
                val upperHinge = summary.thirdQuartile
                val iqr = upperHinge - lowerHinge
                val lowerFence = lowerHinge - iqr * whiskerIQRRatio
                val upperFence = upperHinge + iqr * whiskerIQRRatio

                // Whiskers extend to the most extreme observations lying within the fences.
                // They fall back to the fences themselves when the fences are not finite
                // or when no range could be computed for the observations within them.
                var lowerWhisker = lowerFence
                var upperWhisker = upperFence
                if (SeriesUtil.allFinite(lowerFence, upperFence)) {
                    val boxed = bin.filter { y -> y >= lowerFence && y <= upperFence }
                    val range = SeriesUtil.range(boxed)
                    if (range != null) {
                        lowerWhisker = range.lowerEnd
                        upperWhisker = range.upperEnd
                    }
                }

                // add outliers first
                val outliers = bin.filter { y -> y < lowerFence || y > upperFence }
                for (y in outliers) {
                    // 'outlier' data-point
                    statX.add(x)
                    statY.add(y)
                    // no 'box' data
                    statMiddle.add(Double.NaN)
                    statLower.add(Double.NaN)
                    statUpper.add(Double.NaN)
                    statMin.add(Double.NaN)
                    statMax.add(Double.NaN)

                    statCount.add(count)

                    // Note: outliers will also need 'width' value,
                    // for the 'dodge' positioning to work correctly for all data-points.
                }

                // add 'box' data-point
                statX.add(x)
                statY.add(Double.NaN)  // no Y for 'box' data-point
                statMiddle.add(middle)
                statLower.add(lowerHinge)
                statUpper.add(upperHinge)
                statMin.add(lowerWhisker)
                statMax.add(upperWhisker)

                statCount.add(count)
            }

            return mutableMapOf(
                Stats.X to statX,
                Stats.Y to statY,
                Stats.MIDDLE to statMiddle,
                Stats.LOWER to statLower,
                Stats.UPPER to statUpper,
                Stats.Y_MIN to statMin,
                Stats.Y_MAX to statMax,
                Stats.COUNT to statCount,
            )
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy