com.github.sadikovi.spark.util.Utils.scala

/*
 * Copyright 2016 sadikovi
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.sadikovi.spark.util

import java.util.UUID

import org.apache.hadoop.conf.{Configuration => HadoopConf}
import org.apache.hadoop.fs.{Path => HadoopPath}

import org.apache.spark.network.util.JavaUtils

private[spark] object Utils {
  /**
   * Convert a byte string (e.g. 1024, 10Mb, 5Kb) into the corresponding number of bytes.
   * Copied from Apache Spark `Utils.scala`.
   * @param str string to parse
   * @return number of bytes for corresponding string
   */
  def byteStringAsBytes(str: String): Long = {
    JavaUtils.byteStringAsBytes(str)
  }
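
  // Example (illustrative sketch; the values assume Spark's binary, 1024-based suffix parsing):
  //   Utils.byteStringAsBytes("1024") // 1024, a plain number is treated as bytes
  //   Utils.byteStringAsBytes("5Kb")  // 5 * 1024 = 5120
  //   Utils.byteStringAsBytes("10Mb") // 10 * 1024 * 1024 = 10485760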

  /**
   * Create a UUID for a string as a 128-bit value, returned in string form.
   * @param str string to create uuid for
   * @return generated UUID as string
   */
  def uuidForString(str: String): String = {
    UUID.nameUUIDFromBytes(str.getBytes()).toString()
  }
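
  // Example (illustrative): the UUID is name-based (type 3), so the same input always
  // produces the same value:
  //   Utils.uuidForString("file-1") == Utils.uuidForString("file-1") // true
  //   Utils.uuidForString("file-1") == Utils.uuidForString("file-2") // false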

  /**
   * Get the context class loader of the current thread or, if not set, the default class loader
   * of this class.
   */
  def getContextClassLoader(): ClassLoader = {
    Option(Thread.currentThread().getContextClassLoader).getOrElse(getClass.getClassLoader)
  }

  /** Return a new path with the provided suffix components appended */
  def withSuffix(path: HadoopPath, suffix: String*): HadoopPath = {
    path.suffix(s"${HadoopPath.SEPARATOR}${suffix.mkString(HadoopPath.SEPARATOR)}")
  }
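
  // Example (illustrative, using a hypothetical "/tmp" path):
  //   Utils.withSuffix(new HadoopPath("/tmp"), "a", "b") // /tmp/a/b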

  /**
   * Compute the histogram mode. By default the number of buckets is the square root of the number
   * of data points, but a custom step function can be provided. Instead of computing the average
   * of the largest bucket (the one with the most elements), we return the largest element in that
   * bucket, since the bucket range can be very wide. `stepFunc` takes the min element, the max
   * element and the number of data points, and is expected to return the number of buckets.
   */
  def histogramMode(arr: Array[Long], stepFunc: Option[(Long, Long, Int) => Int] = None): Long = {
    require(arr.nonEmpty, "Expected non-empty array to compute histogram mode")
    if (arr.length == 1) {
      arr.head
    } else {
      // Sort in increasing order
      val srt = arr.sortWith(_ < _)
      val min = srt.head
      val max = srt.last

      // Compute number of buckets based on step function
      val numBuckets = stepFunc match {
        case Some(func) => func(min, max, arr.length)
        case None => Math.ceil(Math.pow(arr.length, 0.5)).toInt
      }

      val buckets = for (bucket <- 0 until numBuckets) yield {
        val start = Math.ceil((max - min) * bucket.toDouble / numBuckets).toLong + min
        val end = Math.floor((max - min) * (bucket + 1).toDouble / numBuckets).toLong + min
        (start, end)
      }

      // For every bucket, count the elements that fall into it and track the largest one
      val bucketStats = buckets.map { case (start, end) =>
        var maxElem: Long = 0
        var cnt: Int = 0

        for (elem <- srt) {
          if (elem >= start && elem <= end) {
            cnt += 1
            maxElem = Math.max(elem, maxElem)
          }
        }
        (maxElem, cnt)
      }

      // Extract largest bucket, and return maximum value from the bucket
      val largestBucket = bucketStats.sortWith(_._2 > _._2).head
      largestBucket._1
    }
  }
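
  // Worked example (illustrative): for Array(1L, 2L, 2L, 2L, 10L) there are
  // ceil(sqrt(5)) = 3 buckets over [1, 10]: (1, 4), (4, 7) and (7, 10). The first
  // bucket holds four of the five elements, so its largest element is the mode:
  //   Utils.histogramMode(Array(1L, 2L, 2L, 2L, 10L)) // 2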

  /**
   * Compute the truncated mean of the dataset based on a sample fraction. The top `1 - sample`
   * fraction of the sorted data is discarded and the mean is computed over the remaining values.
   * Currently only the tail is truncated, under the assumption that the skewness is positive.
   */
  def truncatedMean(arr: Array[Long], sample: Double): Long = {
    val n = arr.length
    require(n > 0, "Expected non-empty array to compute mean")
    require(sample > 0 && sample <= 1, s"Expected sample within (0, 1], got ${sample}")

    if (n == 1) {
      arr.head
    } else {
      val srt = arr.sortWith(_ < _)

      var sum: Long = 0
      for (i <- 0 until (sample * n).toInt) {
        sum = sum + srt(i)
      }

      Math.ceil(sum / n / sample).toLong
    }
  }
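
  // Worked example (illustrative): with sample = 0.5 the upper half of the sorted values
  // is discarded and the mean of the kept values is returned, i.e. (2 + 6) / 2:
  //   Utils.truncatedMean(Array(2L, 6L, 1000L, 2000L), 0.5) // 4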

  /** Create temporary directory on local file system */
  def createTempDir(
      root: String = System.getProperty("java.io.tmpdir"),
      namePrefix: String = "netflow"): HadoopPath = {
    val dir = Utils.withSuffix(new HadoopPath(root), namePrefix, UUID.randomUUID().toString)
    val fs = dir.getFileSystem(new HadoopConf(false))
    fs.mkdirs(dir)
    dir
  }
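
  // Example (illustrative): creates a scratch directory such as
  // ${java.io.tmpdir}/netflow/<random-uuid>; the caller is responsible for removing it
  // (see withTempDir below for automatic cleanup):
  //   val dir = Utils.createTempDir()
  //   val custom = Utils.createTempDir(namePrefix = "test")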

  /** Execute a block of code with a temporary Hadoop path, deleting the path recursively afterwards */
  private def withTempHadoopPath(path: HadoopPath)(func: HadoopPath => Unit): Unit = {
    try {
      func(path)
    } finally {
      val fs = path.getFileSystem(new HadoopConf(false))
      fs.delete(path, true)
    }
  }

  /** Execute code block with created temporary directory */
  def withTempDir(func: HadoopPath => Unit): Unit = {
    withTempHadoopPath(Utils.createTempDir())(func)
  }

  /** Execute code block with a path to a temporary file (the file itself is not pre-created) */
  def withTempFile(func: HadoopPath => Unit): Unit = {
    val file = Utils.withSuffix(Utils.createTempDir(), UUID.randomUUID().toString)
    withTempHadoopPath(file)(func)
  }
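
  // Example (illustrative): the temporary path is deleted recursively once the block
  // finishes, even if it throws:
  //   Utils.withTempDir { dir =>
  //     val fs = dir.getFileSystem(new HadoopConf(false))
  //     fs.create(Utils.withSuffix(dir, "sample.bin")).close()
  //   }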
}



