// All downloads are free. Search and download functionality uses the official Maven repository.
//
// commonMain.jetbrains.datalore.plot.base.DataFrame.kt (Maven / Gradle / Ivy)
//
// There is a newer version: 4.5.3-alpha1
// Show newest version
/*
 * Copyright (c) 2019. JetBrains s.r.o.
 * Use of this source code is governed by the MIT license that can be found in the LICENSE file.
 */

package jetbrains.datalore.plot.base

import jetbrains.datalore.base.interval.DoubleSpan
import jetbrains.datalore.base.logging.PortableLogging
import jetbrains.datalore.plot.common.data.SeriesUtil
import kotlin.jvm.JvmOverloads

/**
 * A table of data series (columns) keyed by [Variable]; all series have the same size.
 *
 * Instances are created through [Builder]. The constructor copies the builder's maps and
 * order specs, so later changes to the builder do not affect an already built frame.
 *
 * Besides the column data the frame keeps:
 *  - per-variable "is numeric" / "is date-time" flags,
 *  - caches of value ranges and distinct values that are filled on demand,
 *  - a list of [OrderSpec]s that define the order of distinct values for some variables.
 */
class DataFrame private constructor(builder: Builder) {
    private val myVectorByVar: Map<Variable, List<*>>
    private val myIsNumeric: MutableMap<Variable, Boolean>
    private val myIsDateTime: MutableMap<Variable, Boolean>

    // volatile variables (yet): caches populated on demand
    private val myRanges = HashMap<Variable, DoubleSpan?>()
    private val myDistinctValues = HashMap<Variable, Set<Any>>()

    /**
     * Describes how the distinct values of [variable] should be ordered.
     *
     * @property variable the variable whose distinct values are ordered.
     * @property orderBy the variable whose values are used as the sorting key.
     * @property direction a negative value reverses the sorted order.
     * @property aggregateOperation optional reduction applied to the [orderBy] values collected
     *           for each distinct value of [variable]; the result is used as the sorting key.
     */
    class OrderSpec(
        val variable: Variable,
        val orderBy: Variable,
        val direction: Int,
        val aggregateOperation: ((List<Double?>) -> Double?)? = null
    )

    private val myOrderSpecs: List<OrderSpec>

    /** `true` when the frame contains no variables at all. */
    val isEmpty: Boolean
        get() = myVectorByVar.isEmpty()

    init {
        assertAllSeriesAreSameSize(builder.myVectorByVar)
        myVectorByVar = HashMap(builder.myVectorByVar)
        myIsNumeric = HashMap(builder.myIsNumeric)
        myIsDateTime = HashMap(builder.myIsDateTime)
        // Copy the list (as is done for the maps above): the builder stays mutable after `build()`.
        myOrderSpecs = ArrayList(builder.myOrderSpecs)
        // Distinct values of variables with an order spec are pre-computed in the requested order.
        myOrderSpecs.forEach { orderSpec ->
            myDistinctValues[orderSpec.variable] = getOrderedDistinctValues(orderSpec)
        }
    }

    /**
     * Verifies that every series in [vectorByVar] has the same number of elements.
     *
     * @throws IllegalArgumentException if at least two series differ in size.
     */
    private fun assertAllSeriesAreSameSize(vectorByVar: Map<Variable, List<*>>) {
        val expectedSize = vectorByVar.values.firstOrNull()?.size ?: return
        require(vectorByVar.values.all { it.size == expectedSize }) {
            "All data series in data frame must have equal size\n" + dumpSizes(vectorByVar)
        }
    }

    /** Renders one "name : size" line per series; used in the size-mismatch error message. */
    private fun dumpSizes(vectorByVar: Map<Variable, List<*>>): String {
        return vectorByVar.entries.joinToString(separator = "") { (key, value) ->
            "${key.name} : ${value.size}\n"
        }
    }

    /** Number of elements in each series, or `0` when the frame has no variables. */
    fun rowCount(): Int {
        return myVectorByVar.values.firstOrNull()?.size ?: 0
    }

    /** `true` if the frame contains a series for [variable]. */
    fun has(variable: Variable): Boolean {
        return myVectorByVar.containsKey(variable)
    }

    /**
     * `true` if the series of [variable] has no elements.
     *
     * @throws IllegalArgumentException if [variable] is not defined in this frame.
     */
    fun isEmpty(variable: Variable): Boolean {
        return get(variable).isEmpty()
    }

    /** `true` if [variable] is either not defined in this frame or its series is empty. */
    fun hasNoOrEmpty(variable: Variable): Boolean {
        return !has(variable) || isEmpty(variable)
    }

    /**
     * Returns the series stored for [variable].
     *
     * @throws IllegalArgumentException if [variable] is not defined in this frame.
     */
    operator fun get(variable: Variable): List<*> {
        assertDefined(variable)
        return myVectorByVar.getValue(variable)
    }

    /**
     * Returns the series of [variable] typed as a list of nullable doubles.
     *
     * A non-empty series is checked with [isNumeric] first; an empty series is returned as is.
     *
     * @throws IllegalArgumentException if [variable] is not defined or its non-empty series is not numeric.
     */
    fun getNumeric(variable: Variable): List<Double?> {
        assertDefined(variable)
        val list = myVectorByVar.getValue(variable)
        if (list.isNotEmpty()) {
            assertNumeric(variable)
        }

        @Suppress("UNCHECKED_CAST")
        return list as List<Double?>
    }

    /**
     * Returns the distinct non-null values of [variable].
     *
     * For a variable that has an [OrderSpec] the pre-computed ordered set is returned; otherwise
     * the values are collected in order of first occurrence and cached for subsequent calls.
     *
     * @throws IllegalArgumentException if [variable] is not defined in this frame.
     */
    fun distinctValues(variable: Variable): Collection<Any> {
        assertDefined(variable)
        // The lambda's last expression is its result. A `return` here would be a non-local
        // return out of `distinctValues` and `getOrPut` would never store the computed set.
        return myDistinctValues.getOrPut(variable) {
            @Suppress("UNCHECKED_CAST")
            val values = LinkedHashSet(get(variable)).apply { remove(null) } as Set<Any>
            values
        }
    }

    /** The variables defined in this frame. */
    fun variables(): Set<Variable> {
        return myVectorByVar.keys
    }

    /**
     * Tells whether [variable] is numeric.
     *
     * If the flag was not set explicitly by the builder it is computed from the series via
     * `SeriesUtil.checkedDoubles(...).notEmptyAndCanBeCast()` and remembered.
     *
     * @throws IllegalArgumentException if [variable] is not defined in this frame.
     */
    fun isNumeric(variable: Variable): Boolean {
        assertDefined(variable)
        return myIsNumeric.getOrPut(variable) {
            SeriesUtil.checkedDoubles(get(variable)).notEmptyAndCanBeCast()
        }
    }

    /**
     * `true` if [variable] was registered as a date-time variable.
     *
     * @throws IllegalArgumentException if [variable] is not defined in this frame.
     */
    fun isDateTime(variable: Variable): Boolean {
        assertDefined(variable)
        return myIsDateTime.containsKey(variable)
    }

    /**
     * Returns the result of `SeriesUtil.range` for the numeric series of [variable]; the result
     * (including `null`) is cached.
     *
     * @throws IllegalArgumentException if [variable] is not defined or is not numeric.
     */
    fun range(variable: Variable): DoubleSpan? {
        // `containsKey` (not `getOrPut`) is used on purpose: a cached `null` must not be recomputed.
        if (!myRanges.containsKey(variable)) {
            myRanges[variable] = SeriesUtil.range(getNumeric(variable))
        }
        return myRanges[variable]
    }

    /** Creates a [Builder] pre-filled with the content of this frame. */
    fun builder(): Builder {
        return Builder(this)
    }

    /** Creates a new frame in which every series is sliced by [indices]. */
    fun slice(indices: Iterable<Int>): DataFrame {
        return Builder(this, indices).build()
    }

    /**
     * Ensures that [variable] is defined in this frame; logs and throws otherwise.
     *
     * @throws IllegalArgumentException if [variable] is not defined in this frame.
     */
    fun assertDefined(variable: Variable) {
        if (!has(variable)) {
            val message = undefinedVariableErrorMessage(variable.name)
            val e = IllegalArgumentException(message)
            LOG.error(e) { message }
            throw e
        }
    }

    /** Builds the "variable not found" message that also lists the names of all defined variables. */
    fun undefinedVariableErrorMessage(varName: String): String {
        val definedNames = variables().map { "'${it.name}'" }
        return "Variable not found: '$varName'. Variables in data frame: $definedNames"
    }

    /**
     * Ensures that [variable] is numeric; logs and throws otherwise.
     *
     * @throws IllegalArgumentException if [variable] is not numeric.
     */
    private fun assertNumeric(variable: Variable) {
        if (!isNumeric(variable)) {
            val message = "Not a numeric variable: '$variable'"
            val e = IllegalArgumentException(message)
            LOG.error(e) { message }
            throw e
        }
    }

    /** Creates a new frame whose series are produced by `SeriesUtil.pickAtIndices` for [indices]. */
    fun selectIndices(indices: List<Int>): DataFrame {
        return buildModified { serie -> SeriesUtil.pickAtIndices(serie, indices) }
    }

    /** Creates a new frame whose series are produced by `SeriesUtil.pickAtIndices` for [indices]. */
    fun selectIndices(indices: Set<Int>): DataFrame {
        return buildModified { serie -> SeriesUtil.pickAtIndices(serie, indices) }
    }

    /**
     * Creates a new frame whose series are produced by `SeriesUtil.skipAtIndices` for [indices].
     * Returns this very frame when [indices] is empty.
     */
    fun dropIndices(indices: Set<Int>): DataFrame {
        return if (indices.isEmpty()) this else buildModified { serie -> SeriesUtil.skipAtIndices(serie, indices) }
    }

    /**
     * Creates a new frame in which every series is replaced by the result of [serieFun].
     * Flags and order specs are carried over through [builder].
     */
    private fun buildModified(serieFun: (List<*>) -> List<*>): DataFrame {
        val builder = this.builder()
        for ((variable, serie) in myVectorByVar) {
            builder.putIntern(variable, serieFun(serie))
        }
        return builder.build()
    }

    /**
     * Computes the ordered set of distinct non-null values of `orderSpec.variable`.
     *
     * Values are sorted by the corresponding `orderBy` value (aggregated per distinct value when
     * an aggregate operation is given) and then by the value itself. A negative direction
     * reverses that order. Values whose `orderBy` counterpart is not comparable are appended last.
     *
     * @throws IllegalArgumentException if an aggregate operation is given for a non-numeric `orderBy`.
     */
    private fun getOrderedDistinctValues(orderSpec: OrderSpec): Set<Any> {
        // `null` and non-finite doubles can't take part in sorting.
        fun isValueComparable(value: Any?) = value != null && (value !is Double || value.isFinite())

        val aggregateOperation = orderSpec.aggregateOperation
        val valueToSortKey: List<Pair<Any?, Any?>> = if (aggregateOperation != null) {
            require(isNumeric(orderSpec.orderBy)) { "Can't apply aggregate operation to non-numeric values" }
            get(orderSpec.variable)
                .zip(getNumeric(orderSpec.orderBy))
                .groupBy({ (value) -> value }) { (_, byValue) -> byValue }
                .mapValues { (_, byValues) -> aggregateOperation(byValues.filter(::isValueComparable)) }
                .toList()
        } else {
            get(orderSpec.variable).zip(get(orderSpec.orderBy))
        }

        val orderedValues = valueToSortKey
            .filter { isValueComparable(it.second) && isValueComparable(it.first) }
            .sortedWith(compareBy({ it.second as Comparable<*> }, { it.first as Comparable<*> }))
            .mapNotNull { it.first }

        // the values corresponding to non-comparable values will be placed at the end of the result
        val nonComparableAppendix = get(orderSpec.variable).zip(get(orderSpec.orderBy))
            .filterNot { isValueComparable(it.second) }
            .mapNotNull { it.first }

        val directedValues = if (orderSpec.direction < 0) orderedValues.reversed() else orderedValues
        return (directedValues + nonComparableAppendix).toSet()
    }


    companion object {
        private val LOG = PortableLogging.logger(DataFrame::class)
    }


    /**
     * Identifies a series in a [DataFrame].
     *
     * The class does not override `equals`/`hashCode`, so variables are matched by identity
     * when used as map keys.
     *
     * @property name the variable name; also the result of [toString].
     * @property source where the variable comes from (see [Source]).
     * @property label a label for the variable; defaults to [name].
     */
    class Variable @JvmOverloads constructor(
        val name: String,
        val source: Source = Source.ORIGIN,
        val label: String = name
    ) {

        val isOrigin: Boolean
            get() = source == Source.ORIGIN

        val isStat: Boolean
            get() = source == Source.STAT

        val isTransform: Boolean
            get() = source == Source.TRANSFORM

        override fun toString(): String {
            // important: the plain name is embedded into messages via string templates
            return name
        }

        /** A longer description: name, quoted label and source. */
        fun toSummaryString(): String {
            return "$name, '$label' [$source]"
        }

        enum class Source {
            ORIGIN, TRANSFORM, STAT
        }

        companion object {

            /** Creates a variable with [Source.ORIGIN] as its source. */
            @JvmOverloads
            fun createOriginal(name: String, label: String = name): Variable {
                return Variable(name, Source.ORIGIN, label)
            }
        }
    }

    /**
     * Mutable builder of [DataFrame]s.
     *
     * Can start empty, from the content of an existing frame, or from an existing frame
     * with every series sliced by a set of indices.
     */
    class Builder {
        internal val myVectorByVar = HashMap<Variable, List<*>>()
        internal val myIsNumeric = HashMap<Variable, Boolean>()
        internal val myIsDateTime = HashMap<Variable, Boolean>()
        internal val myOrderSpecs = ArrayList<OrderSpec>()

        constructor()

        /** Starts from a copy of the series, flags and order specs of [data]. */
        constructor(data: DataFrame) {
            initInternals(
                data.myVectorByVar,
                data.myIsNumeric,
                data.myIsDateTime,
                data.myOrderSpecs,
            )
        }

        /** Starts from [data] with every series sliced by [indices]; flags and order specs are kept. */
        internal constructor(data: DataFrame, indices: Iterable<Int>) {
            val newVectors = data.myVectorByVar.mapValues { (_, serie) ->
                serie.slice(indices)
            }

            initInternals(
                newVectors,
                data.myIsNumeric,
                data.myIsDateTime,
                data.myOrderSpecs,
            )
        }

        private fun initInternals(
            vectorByVar: Map<Variable, List<*>>,
            isNumeric: Map<Variable, Boolean>,
            isDateTime: Map<Variable, Boolean>,
            orderSpecs: List<OrderSpec>,
        ) {
            myVectorByVar.putAll(vectorByVar)
            myIsNumeric.putAll(isNumeric)
            myIsDateTime.putAll(isDateTime)
            myOrderSpecs.addAll(orderSpecs)
        }

        /** Stores a copy of [v] for [variable] and forgets its numeric and date-time flags. */
        fun put(variable: Variable, v: List<*>): Builder {
            putIntern(variable, v)
            myIsNumeric.remove(variable)  // unknown state
            myIsDateTime.remove(variable)
            return this
        }

        /** Stores a copy of [v] for [variable] and marks the variable as numeric. */
        fun putNumeric(variable: Variable, v: List<Double?>): Builder {
            putIntern(variable, v)
            myIsNumeric[variable] = true
            return this
        }

        /** Stores a copy of [v] for [variable] and marks the variable as not numeric. */
        fun putDiscrete(variable: Variable, v: List<*>): Builder {
            putIntern(variable, v)
            myIsNumeric[variable] = false
            return this
        }

        /** Stores a copy of [v] for [variable] and marks the variable as date-time. */
        fun putDateTime(variable: Variable, v: List<*>): Builder {
            putIntern(variable, v)
            myIsDateTime[variable] = true
            return this
        }

        /** Stores a copy of [v] for [variable] without touching any flags. */
        internal fun putIntern(variable: Variable, v: List<*>) {
            myVectorByVar[variable] = ArrayList(v)
        }

        /** Removes the series of [variable] together with its numeric and date-time flags. */
        fun remove(variable: Variable): Builder {
            myVectorByVar.remove(variable)
            myIsNumeric.remove(variable)
            myIsDateTime.remove(variable)
            return this
        }

        /** Adds every spec of [orderSpecs] via [addOrderSpec]. */
        fun addOrderSpecs(orderSpecs: List<OrderSpec>): Builder {
            orderSpecs.forEach(::addOrderSpec)
            return this
        }

        /**
         * Registers [orderSpec] unless a spec with an aggregate operation already exists
         * for the same variable; an existing spec without an aggregate operation is replaced.
         */
        fun addOrderSpec(orderSpec: OrderSpec): Builder {
            val currentOrderSpec = myOrderSpecs.find { it.variable == orderSpec.variable }
            // If multiple specifications for the variable - choose a more specific one:
            if (currentOrderSpec?.aggregateOperation == null) {
                if (currentOrderSpec != null) {
                    myOrderSpecs.remove(currentOrderSpec)
                }
                myOrderSpecs.add(orderSpec)
            }
            return this
        }

        /**
         * Builds the frame.
         *
         * @throws IllegalArgumentException if the series are not all of the same size.
         */
        fun build(): DataFrame {
            return DataFrame(this)
        }

        companion object {
            /** Creates a frame without variables. */
            fun emptyFrame(): DataFrame {
                return Builder().build()
            }
        }
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy