/*
* Copyright (c) 2019 by Andrew Charneski.
*
* The author licenses this file to you under the
* Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance
* with the License. You may obtain a copy
* of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.util
import java.util.regex.Pattern
import com.fasterxml.jackson.databind.{MapperFeature, ObjectMapper, SerializationFeature}
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.simiacryptus.lang.SerializableFunction
import com.simiacryptus.notebook.{JsonQuery, MarkdownNotebookOutput, NotebookOutput, TableOutput}
import com.simiacryptus.sparkbook._
import com.simiacryptus.sparkbook.repl.{SparkRepl, SparkSessionProvider}
import com.simiacryptus.sparkbook.util.Java8Util._
import com.simiacryptus.sparkbook.util.{Logging, ScalaJson}
import com.simiacryptus.text.{CharTrieIndex, IndexNode, TrieNode}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import scala.collection.immutable
import scala.util.Random
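/**
 * Interactively builds a decision-tree-style clustering of a Spark DataFrame, rendering each
 * step as a notebook report. Data is staged into temporary views via a SparkRepl, split into
 * training and testing sets, and then recursively partitioned by candidate split rules scored
 * with an entropy-based fitness function (see split). The finished tree is validated against
 * the configured validation columns.
 */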
abstract class TreeBuilder extends SerializableFunction[NotebookOutput, Object] with Logging with SparkSessionProvider with InteractiveSetup[Object] {
private lazy val tokenizer = Option(tokenizerRegex).map(Pattern.compile(_))
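// Tunable parameters (overridable in subclasses):
//   ruleSamples            - approximate number of rows sampled for rule generation, and the
//                            maximum number of string containment rules kept per column
//   minNodeSize            - nodes with fewer rows than this are not split further
//   sampleSize             - rows sampled when scoring candidate rules (and for character entropy in stats)
//   tokenizerRegex         - regular expression used to tokenize string columns into words
//   maxTreeDepth           - maximum recursion depth of the tree
//   ngramLength            - n-gram length for the character-trie entropy (0 or less disables it)
//   branchStats/leafStats  - whether column statistics are logged at interior / leaf nodes
//   selectionEntropyFactor - weight of the route-entropy term in the rule fitness score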
val ruleSamples = 5
val minNodeSize = 1000
val sampleSize = 1000
val tokenizerRegex = "\\s+"
val maxTreeDepth: Int = 4
val ngramLength: Int = 6
val branchStats: Boolean = false
val leafStats: Boolean = false
val selectionEntropyFactor = 1.0
override def inputTimeoutSeconds = 600
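// Abstract configuration supplied by concrete subclasses:
//   dataSources       - parquet path -> temp view name, loaded during data staging
//   ruleBlacklist     - column names excluded from rule generation
//   entropySpec       - column name -> weight, used when scoring candidate rules and pruning
//   validationColumns - columns whose per-cluster distribution is checked during validation
//   sourceTableName   - name of the temporary view the tree is built from
//   statsSpec         - columns for which statistics are reported at tree nodes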
def dataSources: Map[String, String]
def ruleBlacklist: Array[String]
def entropySpec(schema: StructType): Map[String, Double]
def validationColumns: Array[String]
def sourceTableName: String
final def sourceDataFrame = if (spark.sqlContext.tableNames().contains(sourceTableName)) spark.sqlContext.table(sourceTableName) else null
def statsSpec(schema: StructType): List[String]
def extractWords(str: String): Array[String] = {
if (tokenizer.isEmpty) Array.empty else tokenizer.get.split(str)
}
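/**
 * Notebook entry point: stages the configured parquet sources as temporary views via a
 * SparkRepl, starts a background REPL sub-report for ad hoc exploration, splits the source
 * table 90/10 into training and testing data, presents the statistics and entropy
 * specifications for interactive review, builds the tree from the root node, and validates it.
 */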
override def postConfigure(log: NotebookOutput): Object = {
log.h1("Data Staging")
log.p("""First, we will stage the initial data and manually perform a data staging query:""")
new SparkRepl() {
override val defaultCmd: String =
s"""%sql
| CREATE TEMPORARY VIEW ${sourceTableName} AS
| SELECT * FROM ${dataSources.values.head}
""".stripMargin.trim
override def shouldContinue(): Boolean = {
sourceDataFrame == null
}
override def init(): Unit = {
log.run(() => {
dataSources.foreach(t => {
val (k, v) = t
val frame = spark.sqlContext.read.parquet(k).persist(StorageLevel.DISK_ONLY)
frame.createOrReplaceTempView(v)
println(s"Loaded ${frame.count()} rows to ${v}")
})
})
}
}.apply(log)
log.p("""This sub-report can be used for concurrent adhoc data exploration:""")
log.subreport("explore", (sublog: NotebookOutput) => {
val thread = new Thread(() => {
new SparkRepl().apply(sublog)
}: Unit)
thread.setName("Data Exploration REPL")
thread.setDaemon(true)
thread.start()
null
})
val Array(trainingData, testingData) = sourceDataFrame.randomSplit(Array(0.9, 0.1))
trainingData.persist(StorageLevel.MEMORY_ONLY_SER)
log.h1("""Tree Parameters""")
log.p("""Now that we have loaded the schema, here are the parameters we will use for tree building:""")
log.eval(() => {
ScalaJson.toJson(ruleBlacklist)
})
log.p("Statistics Spec:")
val objectMapper = new ObjectMapper()
.enable(SerializationFeature.INDENT_OUTPUT)
.enable(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS)
.enable(MapperFeature.USE_STD_BEAN_NAMING)
.registerModule(DefaultScalaModule)
.enableDefaultTyping()
val statsS = new JsonQuery[List[String]](log.asInstanceOf[MarkdownNotebookOutput]).setMapper({
objectMapper
}).print(statsSpec(sourceDataFrame.schema)).get()
log.p("Entropy Spec:")
val entropyConfig = new JsonQuery[Map[String, Double]](log.asInstanceOf[MarkdownNotebookOutput]).setMapper({
objectMapper
}).print(entropySpec(sourceDataFrame.schema)).get()
log.h1("Construction")
val root = split(TreeNode(
count = trainingData.count(),
parent = null,
childId = 'x',
childRule = ""
), trainingData, maxDepth = maxTreeDepth, statsS = statsS, entropyConfig = entropyConfig)(log, spark)
log.h1("Validation")
validate(log, testingData, root)
null
}
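/**
 * For each configured validation column, labels every row of the source table with its
 * cluster id (via the tree's labeling SQL), pivots cluster id against the column's values,
 * and emits the counts as a table plus a CSV attachment. For integer columns it additionally
 * reports a simple accuracy figure: the fraction of testing rows whose value matches the
 * majority value of the cluster they are routed to.
 */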
def validate(log: NotebookOutput, testingData: Dataset[Row], treeRoot: TreeNode) = {
validationColumns.filterNot(_ == null).filterNot(_.isEmpty).foreach(columnName => {
log.h2("Validate " + columnName)
val totalRows = testingData.count()
val where = treeRoot.conditions().mkString("\n AND ").replaceAll("\n", "\n ")
var sql = s"""SELECT *, ${treeRoot.labelingSql().replaceAll("\n", "\n ")} AS Cluster_ID FROM $sourceTableName AS T""".stripMargin.trim
if (where.nonEmpty) sql += s"\nWHERE ${where}"
val categoryStatsPivot = spark.sql(sql).withColumnRenamed(columnName, "raw_" + columnName)
.withColumn(columnName, concat(lit(columnName + "_"), col("raw_" + columnName)))
.drop("raw_" + columnName)
.groupBy("Cluster_ID")
.pivot(columnName)
.agg((count(lit(1))))
.sort("Cluster_ID").cache()
SparkRepl.out(categoryStatsPivot)(log)
val csv = categoryStatsPivot.schema.fields.map(_.name).mkString(",") + "\n" + categoryStatsPivot.rdd.collect().map(_.toSeq.map(value => if (null != value) value.toString else "").mkString(",")).mkString("\n")
log.p(log.file(csv, columnName + "_stats.csv", columnName + "_stats.csv"))
testingData.schema.apply(columnName).dataType match {
case IntegerType =>
def getCategory(row: Row, colname: String): Integer = {
row.getAs[Integer](colname)
}
def predict(partition: Iterable[Row]): Map[Integer, Double] = {
val classes = partition.groupBy((row: Row) => getCategory(row, columnName)).mapValues(_.size)
val total = classes.values.sum
classes.map(e => e._1 -> e._2 / total.doubleValue())
}
val predictionIndex = testingData.rdd.groupBy(row => treeRoot.route(row).id).mapValues(predict(_)).collect().toMap
val accuracy = (testingData.rdd.collect().map(row => {
if (predictionIndex(treeRoot.route(row).id).maxBy(_._2)._1.toInt == getCategory(row, columnName).toInt) 1.0 else 0.0
}).sum / testingData.rdd.count()).toString
log.eval(() => {
accuracy
})
}
})
}
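/**
 * Recursively grows the tree. At each node the DataFrame is cached, candidate rules are
 * generated for every non-blacklisted column (see ruleSuggestions), and each rule is scored
 * on a row sample with fitness = sum over branches and configured columns of
 * (column entropy * branch rows * column weight) - selectionEntropyFactor * route entropy * sample size,
 * where entropy() here is sum(p * log2 p) and therefore non-positive. The highest-scoring rule
 * partitions the data, each partition becomes a child node rendered in its own sub-report, and
 * recursion stops when maxDepth is exhausted or prune() fires.
 */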
def split(treeNode: TreeNode, dataFrame: DataFrame, maxDepth: Int, statsS: List[String], entropyConfig: Map[String, Double])(implicit log: NotebookOutput, session: SparkSession): TreeNode = {
val prevStorageLevel = dataFrame.storageLevel
dataFrame.persist(StorageLevel.MEMORY_ONLY_SER)
try {
log.h2("Context")
log.run(() => {
println(s"Current Tree Node: ${treeNode.id}\n")
println(treeNode.conditions().mkString("\n\tAND "))
})
if (maxDepth <= 0 || prune(dataFrame, entropyConfig)) {
if (leafStats) {
log.h2("Statistics")
log.eval(() => {
ScalaJson.toJson(stats(dataFrame, statsS))
})
}
treeNode
} else {
if (branchStats) {
log.h2("Statistics")
log.eval(() => {
ScalaJson.toJson(stats(dataFrame, statsS))
})
}
log.h2("Rules")
val (ruleFn, name, _, entropyDetails) = {
val dataSample = dataFrame.sparkSession.sparkContext.broadcast(dataFrame.rdd.takeSample(false, sampleSize))
try {
val suggestions = ruleSuggestions(dataFrame)
val evaluatedRules = session.sparkContext.parallelize(suggestions).map(suggestionInfo => {
val (rule, name) = suggestionInfo
val dataSampleValue = dataSample.value
val entropyValues: Seq[Map[String, Any]] = dataSampleValue.groupBy(rule).map(e1 => {
val (branchName, rows: Array[Row]) = e1
Map("rows" -> rows.length, "branchName" -> branchName) ++ entropyConfig.toList.map(e => {
val (id, weight) = e
id -> (entropyFunction(rows, id))
}).toMap
}).toList
val routeEntropy = entropy(entropyValues.toList.map(map => map("rows").asInstanceOf[Number].doubleValue().intValue()))
val entropyMap = entropyValues.toList.map(map => {
val rows = map("rows").asInstanceOf[Number].doubleValue()
map("branchName") -> entropyConfig.toList.map(e => {
val (id, weight) = e
map(id).asInstanceOf[Number].doubleValue() * rows * weight
}).sum
}).toMap
val fitness = entropyMap.values.sum - selectionEntropyFactor * routeEntropy * dataSampleValue.length
(rule, name, fitness, entropyValues.map(_ ++ Map("route_entropy" -> routeEntropy, "rule_entropy" -> fitness)))
}).collect().sortBy(-_._3)
val head = evaluatedRules.head
(head._1, head._2, head._3, evaluatedRules.toList.flatMap(_._4))
} finally {
dataSample.destroy()
}
}
val entropyTable = new TableOutput()
entropyTable.schema.put("branchName", classOf[java.lang.String])
entropyTable.schema.put("rows", classOf[java.lang.String])
entropyDetails.flatMap(_.keys).distinct.sorted.filterNot(entropyTable.schema.containsKey(_)).foreach(s => entropyTable.schema.put(s, classOf[java.lang.String]))
import scala.collection.JavaConverters._
entropyDetails.foreach(row => entropyTable.putRow(new util.HashMap[CharSequence, Object](row.map(e => e._1 -> e._2.toString).asJava)))
log.p(entropyTable.toMarkdownTable)
log.h2("Children")
val partitionedData = dataFrame.rdd.groupBy(ruleFn).persist(StorageLevel.MEMORY_ONLY_SER)
dataFrame.persist(prevStorageLevel)
val parentWithRule = treeNode.copy(
key = name,
fn = ruleFn
)
log.eval(() => {
ScalaJson.toJson(partitionedData.mapValues(_.size).collect().toMap)
})
val partitions: Array[String] = log.eval(() => {
partitionedData.keys.distinct().collect().sorted
})
(for (partitionIndex <- partitions) yield {
val frame = session.sqlContext.createDataFrame(partitionedData.filter(_._1 == partitionIndex).flatMap(_._2), dataFrame.schema)
frame.persist(StorageLevel.MEMORY_ONLY_SER)
val newChild = TreeNode(
count = frame.count(),
parent = parentWithRule,
childId = partitions.indexOf(partitionIndex).toString.charAt(0),
childRule = partitionIndex
)
log.h3("Child " + newChild.id)
log.eval(() => {
ScalaJson.toJson(Map(
"rule" -> name,
"id" -> newChild.id
))
})
val value = log.subreport(newChild.childId.toString, (child: NotebookOutput) => {
log.write()
split(newChild, frame, maxDepth - 1, statsS, entropyConfig)(child, session)
})
frame.unpersist()
value
}).foreach((node: TreeNode) => parentWithRule.children(node.childRule) = node)
partitionedData.unpersist()
log.eval(() => {
ScalaJson.toJson(parentWithRule.children.mapValues(_.count))
})
log.h2("Summary")
def extractSimpleStructure(n: TreeNode): Any = Map(
"id" -> n.id
) ++ n.children.mapValues(extractSimpleStructure(_))
log.eval(() => {
ScalaJson.toJson(extractSimpleStructure(parentWithRule))
})
log.eval(() => {
s"""SELECT *, ${parentWithRule.labelingSql().replaceAll("\n", "\n ")} AS label
|FROM $sourceTableName AS T
|WHERE ${parentWithRule.conditions().mkString("\n AND ").replaceAll("\n", "\n ")}
|""".stripMargin.trim
})
parentWithRule
}
} finally {
dataFrame.persist(prevStorageLevel)
}
}
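/** Computes per-column statistics for every column named in statsS. */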
def stats(dataFrame: DataFrame, statsS: List[String]): Map[String, Map[String, Any]] = {
val schema = dataFrame.schema
schema.filter(x => statsS.contains(x.name)).map(stats(dataFrame, _)).toMap
}
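/**
 * Column statistics keyed by field name. Integer columns report min/max, the count/sum/sum-of-squares
 * moments, mean, stddev, the top-10 most common values, and a value entropy; other numeric columns
 * report the moment statistics only; string columns additionally report length statistics, character
 * n-gram entropy, word entropy, and the most common words and values. Entropy values follow the
 * sum(p * log2 p) convention used elsewhere in this class, so they are non-positive.
 */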
def stats(dataFrame: DataFrame, field: StructField): (String, Map[String, Any]) = {
val topN = 10
val colVals = dataFrame.select(dataFrame.col(field.name)).rdd
field.dataType match {
case _: IntegerType =>
val values = colVals.map(row => Option(row.getAs[Int](0))).filter(_.isDefined).map(_.get).cache()
val cnt = values.count().doubleValue()
val entropy = values.countByValue().values.map(_ / cnt).map(x => x * Math.log(x) / Math.log(2)).sum
val sum0 = cnt
val sum1 = values.sum()
val sum2 = values.map(Math.pow(_, 2)).sum()
val mean = sum1 / sum0
val max = values.max()
val min = values.min()
values.unpersist()
field.name -> Map(
"max" -> max,
"min" -> min,
"sum0" -> sum0,
"sum1" -> sum1,
"sum2" -> sum2,
"mean" -> mean,
"stddev" -> Math.sqrt(Math.abs((sum2 / sum0) - mean * mean)),
"common_values" -> dataFrame.rdd.map(_.getAs[Integer](field.name)).countByValue().toList.sortBy(_._2).takeRight(topN).toMap,
"entropy" -> entropy
)
case _: NumericType =>
val values = colVals.map(row => Option(row.getAs[Number](0))).filter(_.isDefined).map(_.get.doubleValue()).cache()
val sum0 = values.map(Math.pow(_, 0)).sum()
val sum1 = values.sum()
val sum2 = values.map(Math.pow(_, 2)).sum()
val mean = sum1 / sum0
val max = values.max()
val min = values.min()
values.unpersist()
field.name -> Map(
"max" -> max,
"min" -> min,
"sum0" -> sum0,
"sum1" -> sum1,
"sum2" -> sum2,
"mean" -> mean,
"stddev" -> Math.sqrt(Math.abs((sum2 / sum0) - mean * mean))
)
case _: StringType =>
val strings = colVals.map(row => Option(row.getAs[String](0))).filter(_.isDefined).map(_.get.toString()).cache()
val char_entropy = entropy(index(strings.take(sampleSize)))
val allWords = strings
.flatMap(_.split(tokenizerRegex)).countByValue()
val totalWords = allWords.values.sum.doubleValue()
val word_entropy = allWords.values.map(_ / totalWords).map(x => x * Math.log(x) / Math.log(2)).sum
val words = allWords
.toList.sortBy(_._2).takeRight(10).toMap
val values = strings.map(_.length).cache()
val sum0 = values.map(Math.pow(_, 0)).sum()
val sum1 = values.sum()
val sum2 = values.map(Math.pow(_, 2)).sum()
val mean = sum1 / sum0
val max = values.max()
val min = values.min()
values.unpersist()
strings.unpersist()
field.name -> Map(
"length" -> Map(
"max" -> max,
"min" -> min,
"sum0" -> sum0,
"sum1" -> sum1,
"sum2" -> sum2,
"mean" -> mean,
"stddev" -> Math.sqrt(Math.abs((sum2 / sum0) - mean * mean))
),
"char_entropy" -> char_entropy,
"word_entropy" -> word_entropy,
"common_words" -> words,
"common_values" -> dataFrame.rdd.map(_.getAs[String](field.name)).countByValue().toList.sortBy(_._2).takeRight(topN).toMap
)
}
}
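/** Builds a character-trie n-gram index over the given strings, or returns null when ngramLength is disabled (<= 0). */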
def index(strings: Seq[String]) = {
if (0 >= ngramLength) null else {
val baseTree = new CharTrieIndex
strings.foreach(txt => baseTree.addDocument(txt))
baseTree.index(ngramLength).root()
}
}
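/**
 * Stops splitting when a node has fewer than minNodeSize rows, or when it is small enough to
 * inspect locally (< 10000 rows) and the weighted entropy of every configured column is zero.
 */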
def prune(dataFrame: DataFrame, entropyConfig: Map[String, Double]) = {
(dataFrame.count() < minNodeSize) || (
dataFrame.count() < 10000 && entropyConfig.toList.map(e => {
val (id, weight) = e
(entropyFunction(dataFrame.rdd.collect(), id)) * weight
}).sum == 0.0)
}
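/**
 * Per-column entropy of a row partition, following the sum(p * log2 p) convention.
 * Integer columns use the distribution of distinct values; string columns average a
 * per-row character n-gram entropy with a word-frequency entropy, falling back to
 * whichever term is non-zero when the other is zero.
 */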
def entropyFunction(partition: Array[Row], id: String) = {
partition.head.schema.apply(id).dataType match {
case _: IntegerType =>
val classes = partition.groupBy(_.getAs[Integer](id)).mapValues(_.size)
val values = classes.values
entropy(values.toList)
case _: StringType =>
val node = index(partition.map(_.getAs[String](id)))
val ncharEntropy = if (node != null) {
entropy(node) / partition.length
} else 0.0
val wordEntropy = {
val words = partition.flatMap(_.getAs[String](id).split(tokenizerRegex)).groupBy(x => x).mapValues(_.size)
val values = words.values
entropy(values.toList)
}
if (ncharEntropy == 0 || wordEntropy == 0) ncharEntropy + wordEntropy
else (ncharEntropy + wordEntropy) / 2
}
}
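/** Entropy of a discrete count distribution as sum(p * log2 p): zero or negative, with more negative meaning more uniform. */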
def entropy(values: Seq[Int]) = {
val totalPop = values.sum.doubleValue()
values.map(x => x / totalPop).map(x => x * Math.log(x) / Math.log(2)).sum
}
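/** Character-level entropy of a trie index, accumulated over its leaf cursor counts as sum(w * log2 w). */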
def entropy(root: IndexNode) = {
def totalSize = root.getCursorCount
var entropy = 0.0
if (null != root) root.visitFirst((n: TrieNode) => {
if (!n.hasChildren) {
val w = n.getCursorCount.doubleValue() / totalSize
entropy = entropy + w * Math.log(w) / Math.log(2)
}
})
entropy // * totalSize
}
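/** Generates candidate split rules from a small sample of the DataFrame, one batch per non-blacklisted column. */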
def ruleSuggestions(dataFrame: DataFrame): immutable.Seq[(Row => String, (List[String], Any))] = {
val sampled = dataFrame.sample(false, ruleSamples.doubleValue() / dataFrame.count()).cache()
val list = sampled.schema.filterNot(f => ruleBlacklist.contains(f.name)).flatMap(ruleSuggestions(sampled, _)).toList
sampled.unpersist()
list
}
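/**
 * Candidate rules for a single column. Integer columns yield threshold rules ("< value" vs ">= value")
 * at each sampled distinct value; string columns yield containment rules ("LIKE %term%" vs
 * "NOT LIKE %term%") for randomly chosen words and n-grams that match between 10% and 80% of the
 * sampled rows. Each rule is paired with a (columns, value) key used as its display name.
 */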
def ruleSuggestions(dataFrame: DataFrame, field: StructField): Seq[(Row => String, (List[String], Any))] = {
val colVals = dataFrame.select(dataFrame.col(field.name)).rdd
field.dataType match {
case IntegerType =>
colVals.map(_ (0).asInstanceOf[Number].doubleValue()).distinct.collect().sorted
.tail
.map(value => (
(r: Row) =>
if (r.getAs[Number](field.name).doubleValue() < value) s"""T.${field.name} < $value"""
else s"""T.${field.name} >= $value""",
List(field.name) -> value
))
case StringType =>
val sampledRows = colVals.map(_ (0).asInstanceOf[String].toString).distinct.collect().sorted
Stream.continually({
val words: Seq[String] = sampledRows.flatMap(str => Random.shuffle(extractWords(str).toList).take(1))
val ngrams: Seq[String] = if (0 >= ngramLength) Seq.empty else {
sampledRows.flatMap(str => (0 until 1).map(i => {
if (str.length <= ngramLength) str else {
str.drop(Random.nextInt(str.length - ngramLength)).take(ngramLength)
}
}))
}
words ++ ngrams
}).flatten.take(1000).distinct.filter(term => {
val matchFraction = sampledRows.filter(_.contains(term)).size / sampledRows.size.doubleValue()
(matchFraction > 0.1) && (matchFraction < 0.8)
}).take(ruleSamples).map(term => {
(
(r: Row) =>
if (r.getAs[String](field.name).contains(term)) s"""T.${field.name} LIKE "%${term}%""""
else s"""T.${field.name} NOT LIKE "%${term}%"""",
List(field.name) -> term
)
})
case _ => Seq.empty
}
}
}
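// --- Usage sketch (not part of the original source) ---
// A hypothetical, minimal subclass illustrating how the abstract configuration hooks might be
// filled in. Paths, table names, and column names are placeholders, and any additional members
// required by the mixed-in traits (SparkSessionProvider, InteractiveSetup) are omitted.
//
//   object ExampleTreeBuilder extends TreeBuilder {
//     override def dataSources: Map[String, String] =
//       Map("s3a://example-bucket/events.parquet" -> "raw_events")
//     override def sourceTableName: String = "staged_events"
//     override def ruleBlacklist: Array[String] = Array("event_id")
//     override def validationColumns: Array[String] = Array("category")
//     override def entropySpec(schema: StructType): Map[String, Double] =
//       schema.fields.filter(_.dataType == StringType).map(_.name -> 1.0).toMap
//     override def statsSpec(schema: StructType): List[String] =
//       schema.fields.map(_.name).toList
//   }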