com.intel.analytics.zoo.friesian.feature.Utils.scala
Big Data AI platform for distributed TensorFlow and PyTorch on Apache Spark.
/*
* Copyright 2018 Analytics Zoo Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intel.analytics.zoo.friesian.feature
import org.apache.spark.TaskContext
import org.apache.spark.sql.{DataFrame, Row}
import scala.collection.mutable
import scala.collection.mutable.WrappedArray
import scala.util.Random
/** Feature-engineering helpers shared by the Friesian preprocessing APIs. */
private[friesian] object Utils {
  /**
   * Fill nulls in the given columns (addressed by index) with `fillVal`,
   * after casting `fillVal` to each column's type.
   */
  def fillNaIndex(df: DataFrame, fillVal: Any, columns: Array[Int]): DataFrame = {
val targetType = fillVal match {
case _: Double | _: Long | _: Int => "numeric"
case _: String => "string"
case _: Boolean => "boolean"
case _ => throw new IllegalArgumentException(
s"Unsupported value type ${fillVal.getClass.getName} ($fillVal).")
}
val schema = df.schema
    val fillValList = columns.map { idx =>
      val (matches, castedVal) =
        checkTypeAndCast(schema(idx).dataType.typeName, targetType, fillVal)
      if (!matches) {
        throw new IllegalArgumentException(s"$targetType does not match the type of column " +
          s"${schema(idx).name}")
      }
      castedVal
    }
val dfUpdated = df.rdd.map(row => {
val origin = row.toSeq.toArray
for ((idx, fillV) <- columns zip fillValList) {
if (row.isNullAt(idx)) {
origin.update(idx, fillV)
}
}
Row.fromSeq(origin)
})
val spark = df.sparkSession
spark.createDataFrame(dfUpdated, schema)
}
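  // A minimal usage sketch (the DataFrame `df` and its columns "age" and "height"
  // are hypothetical, not from the original source):
  //   val filled = Utils.fillNaIndex(df, 0, Utils.getIndex(df, Array("age", "height")))
  //   // nulls in both columns become 0, cast to each column's own numeric type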
  /**
   * Check whether a fill value of `targetType` is compatible with a column whose
   * Spark SQL type name is `schemaType`; if so, also cast it to that column's type.
   */
  def checkTypeAndCast(schemaType: String, targetType: String, fillVal: Any):
  (Boolean, Any) = {
    if (schemaType == targetType) {
      (true, fillVal)
    } else if (targetType == "numeric") {
      val fillNum = fillVal.asInstanceOf[Number]
      schemaType match {
        case "long" => (true, fillNum.longValue)
        case "integer" => (true, fillNum.intValue)
        case "double" => (true, fillNum.doubleValue)
        case _ => (false, fillVal)
      }
    } else {
      (false, fillVal)
    }
  }
  /** Cast a numeric value to the numeric type named by `typeName`; nulls pass through. */
  def castNumeric(value: Any, typeName: String): Any = {
    if (value == null) {
      null
    } else {
      if (!value.isInstanceOf[Number]) {
        throw new IllegalArgumentException(s"Unsupported data type ${value.getClass.getName}" +
          s" $value")
      }
      typeName match {
        case "long" => value.asInstanceOf[Number].longValue
        case "integer" => value.asInstanceOf[Number].intValue
        case "double" => value.asInstanceOf[Number].doubleValue
        case _ => throw new IllegalArgumentException(s"The type $typeName is not numeric")
      }
    }
  }
  /** True if `val1 > val2`, comparing as the numeric type named by `typeName`. */
  def isLarger(val1: Any, val2: Any, typeName: String): Boolean = {
typeName match {
case "long" => val1.asInstanceOf[Long] > val2.asInstanceOf[Long]
case "integer" => val1.asInstanceOf[Int] > val2.asInstanceOf[Int]
case "double" => val1.asInstanceOf[Double] > val2.asInstanceOf[Double]
case _ => throw new IllegalArgumentException(s"The type $typeName is not numeric")
}
}
  /**
   * Build a clip function for the numeric type named by `colType`; a null
   * `minVal` (or `maxVal`) means no lower (or upper) bound.
   */
  def getClipFunc[T](minVal: Any, maxVal: Any, colType: String): T => T = {
if (minVal == null) {
(value: T) => {
if (Utils.isLarger(value, maxVal, colType)) maxVal.asInstanceOf[T]
else value
}
} else if (maxVal == null) {
(value: T) => {
if (Utils.isLarger(minVal, value, colType)) minVal.asInstanceOf[T]
else value
}
} else {
if (Utils.isLarger(minVal, maxVal, colType)) {
throw new IllegalArgumentException(s"min should be smaller than max but get $minVal >" +
s" $maxVal")
}
(value: T) => {
if (Utils.isLarger(value, maxVal, colType)) maxVal.asInstanceOf[T]
else if (Utils.isLarger(minVal, value, colType)) minVal.asInstanceOf[T]
else value
}
}
}
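  // A usage sketch (hypothetical integer column "rating"; assumes `udf` and `col`
  // from org.apache.spark.sql.functions):
  //   val clip = udf(Utils.getClipFunc[Int](0, 5, "integer"))
  //   val clipped = df.withColumn("rating", clip(col("rating")))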
  /** Emit a single (partitionId, rowCount) pair per partition, nothing for empty ones. */
  def getPartitionSize(rows: Iterator[Row]): Iterator[(Int, Int)] = {
    if (rows.isEmpty) {
      Iterator.empty
    } else {
      val partId = TaskContext.get().partitionId()
      Iterator((partId, rows.size))
    }
  }
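  // Sketch: collecting (partitionId, rowCount) pairs for a hypothetical DataFrame `df`:
  //   val sizes = df.rdd.mapPartitions(Utils.getPartitionSize).collect()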
  /** True if the named column is of long, integer or double type. */
  def checkColumnNumeric(df: DataFrame, column: String): Boolean = {
val typeName = df.schema(df.columns.indexOf(column)).dataType.typeName
typeName == "long" || typeName == "integer" || typeName == "double"
}
  /** Resolve column names to their indices in `df`, failing fast on unknown names. */
  def getIndex(df: DataFrame, columns: Array[String]): Array[Int] = {
columns.map(col_n => {
val idx = df.columns.indexOf(col_n)
if (idx == -1) {
throw new IllegalArgumentException(s"The column name $col_n does not exist")
}
idx
})
}
  /** Approximate per-column medians; null where a median cannot be computed. */
  def getMedian(df: DataFrame, columns: Array[String], relativeError: Double = 0.001):
Array[Any] = {
// approxQuantile: `Array(0.5)` corresponds to the median; the inner array
// of the return value is either empty or a singleton
    df.stat.approxQuantile(columns, Array(0.5), relativeError).map(
      quantiles => if (quantiles.isEmpty) null else quantiles(0))
}
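  // Sketch (hypothetical columns): results come back positionally, one per column:
  //   val Array(ageMedian, heightMedian) = Utils.getMedian(df, Array("age", "height"))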
  /** Hash `content` into [start, start + bucketSize); double modulo keeps it non-negative. */
  def hashBucket(content: Any, bucketSize: Int = 1000, start: Int = 0): Int = {
    (content.hashCode() % bucketSize + bucketSize) % bucketSize + start
  }
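  // For example, Utils.hashBucket("user_42", bucketSize = 100, start = 1) always
  // falls in [1, 100], even when "user_42".hashCode is negative.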
  /** Build a 0/1 mask of length `maxLength`: 1 for real history entries, 0 for padding. */
  def maskArr: (Int, mutable.WrappedArray[Any]) => Seq[Int] = {
    (maxLength: Int, history: WrappedArray[Any]) => {
      val n = history.length
      if (maxLength > n) {
        Seq.fill(n)(1) ++ Seq.fill(maxLength - n)(0)
      } else {
        Seq.fill(maxLength)(1)
      }
    }
  }
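  // A sketch as a Spark UDF (hypothetical array column "history"; assumes udf,
  // col and lit from org.apache.spark.sql.functions):
  //   val mask = udf(Utils.maskArr)
  //   df.withColumn("mask", mask(lit(10), col("history")))
  //   // a history of length 4 with maxLength 10 gives [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]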
  /**
   * Pad `history` with type-matched zeros up to `maxLength`, or keep only the last
   * `maxLength` entries when it is longer. Assumes a non-empty history.
   */
  def padArr[T]: (Int, mutable.WrappedArray[T]) => mutable.Seq[T] = {
    (maxLength: Int, history: WrappedArray[T]) => {
      val n = history.length
      // a zero of the same runtime type as the history elements
      val padValue = castValueFromNum(history(0), 0)
      val pads: mutable.Seq[T] = if (maxLength > n) {
        history ++ (0 until maxLength - n).map(_ => padValue)
      } else {
        history.slice(n - maxLength, n)
      }
      pads
    }
  }
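  // Behaviour sketch: with maxLength = 5, an Int history [3, 7] becomes
  // [3, 7, 0, 0, 0]; a longer history keeps only its last 5 entries.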
  /**
   * Pad a matrix of history rows to `maxLength` rows with type-matched zero rows,
   * or keep only the last `maxLength` rows. Assumes a non-empty matrix.
   */
  def padMatrix[T]: (Int, WrappedArray[WrappedArray[T]]) => Seq[Seq[T]] = {
    (maxLength: Int, history: WrappedArray[WrappedArray[T]]) => {
      val n = history.length
      val padValue = castValueFromNum(history(0)(0), 0)
      if (maxLength > n) {
        // padding rows copy the width of the first history row
        val rowWidth = history(0).length
        val padArray =
          (0 until maxLength - n).map(_ => (0 until rowWidth).map(_ => padValue))
        history ++ padArray
      } else {
        history.slice(n - maxLength, n)
      }
    }
  }
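  // Behaviour sketch: with maxLength = 3, [[1, 2], [3, 4]] becomes
  // [[1, 2], [3, 4], [0, 0]]; padding rows match the width of the first row.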
  /** Return `value` converted to the same numeric runtime type as `num` (e.g. a typed zero). */
  def castValueFromNum[T](num: T, value: Int): T = {
val out: Any = num match {
case _: Double => value.toDouble
case _: Int => value
case _: Float => value.toFloat
case _: Long => value.toLong
case _ => throw new IllegalArgumentException(
s"Unsupported value type ${num.getClass.getName} ($num).")
}
out.asInstanceOf[T]
}
  /**
   * For an item id, generate `negNum` uniformly sampled negative items (labelled 0)
   * from [1, itemSize - 1], none equal to the item itself, plus the item (labelled 1).
   */
  def addNegtiveItem[T](negNum: Int, itemSize: Int): Int => Seq[(Int, Int)] = {
    val r = new Random()
    (itemId: Int) =>
      (1 to negNum).map(_ => {
        var negItem = 0
        do {
          negItem = 1 + r.nextInt(itemSize - 1)
        } while (negItem == itemId)
        negItem
      }).map(x => (x, 0)) ++ Seq((itemId, 1))
  }
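  // Sketch: Utils.addNegtiveItem(2, 100) maps itemId 7 to something like
  // Seq((31, 0), (58, 0), (7, 1)): two random negatives from [1, 99] plus the positive.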
  /**
   * For each item in a history sequence, generate `negNum` uniformly sampled
   * negative items from [1, itemSize - 1], none equal to the item at that position.
   */
  def addNegativeList[T](negNum: Int, itemSize: Int): mutable.WrappedArray[Int] => Seq[Seq[Int]] = {
    val r = new Random()
    (history: WrappedArray[Int]) => {
      val negItemSeq: Seq[Seq[Int]] = (0 until history.length).map(i => {
        (0 until negNum).map(_ => {
          var negItem = 0
          do {
            negItem = 1 + r.nextInt(itemSize - 1)
          } while (negItem == history(i))
          negItem
        })
      })
      negItemSeq
    }
  }
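  // Sketch as a UDF over an array column of item ids (`itemCount` is a
  // hypothetical catalog size; assumes udf and col from org.apache.spark.sql.functions):
  //   val negs = udf(Utils.addNegativeList(5, itemCount))
  //   df.withColumn("neg_items", negs(col("history")))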
  /**
   * For the row at `index`, return its `colName` value together with the `colName`
   * values of the rows in [lowerBound, index) as the history sequence.
   */
  def get1row[T](full_rows: Array[Row], colName: String, index: Int, lowerBound: Int): Seq[Any] = {
val colValue = full_rows(index).getAs[T](colName)
val historySeq = full_rows.slice(lowerBound, index).map(row => row.getAs[T](colName))
Seq(colValue, historySeq)
}
}