Maven / Gradle / Ivy
The newest version!
/*
 * Copyright 2018 Analytics Zoo Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.lang
import{Sample, TensorSample}
import{RandomGenerator, T}
import org.apache.spark.sql.functions.{max, udf}
import org.apache.spark.sql.{DataFrame, Row}
import scala.collection.mutable
import scala.collection.JavaConverters._
object Utils {
/**
 * Generate negative samples from a dataframe of positive records (label >= 2).
 *
 * @param indexed dataframe of positive records with columns userId, itemId and label.
 * @return a dataframe of negative samples (label = 1) with the same size as the indexed dataframe
 */
// Builds a DataFrame of (userId, itemId, label = 1) rows from the positive records in `indexed`.
// NOTE(review): this block has visibly lost text (closing braces, the start of two right-hand
// sides, and whatever followed the final `.toDF`); the notes below cover only what is visible.
def getNegativeSamples(indexed: DataFrame): DataFrame = {
val schema = indexed.schema
// Fail fast when any of the three expected columns is absent.
require(schema.fieldNames.contains("userId"), s"Column userId should exist")
require(schema.fieldNames.contains("itemId"), s"Column itemId should exist")
require(schema.fieldNames.contains("label"), s"Column label should exist")
// NOTE(review): the expression before `"userId"` is missing; presumably a select of these three
// columns on `indexed` — confirm against the upstream source.
val indexedDF ="userId", "itemId", "label")
// Largest userId and itemId, collected to the driver as a single row.
val minMaxRow = indexedDF.agg(max("userId"), max("itemId")).collect()(0)
// NOTE(review): userCount is not referenced in the visible remainder of this block.
val (userCount, itemCount) = (minMaxRow.getInt(0), minMaxRow.getInt(1))
// NOTE(review): the start of this expression is missing; what remains builds "user,item" string
// keys and collects them into a Set.
val sampleDict = => row(0) + "," + row(1)).collect().toSet
// NOTE(review): dfCount is not referenced in the visible remainder of this block.
val dfCount = indexedDF.count.toInt
import indexed.sqlContext.implicits._
@transient lazy val ran = RandomGenerator.RNG
val negative = indexedDF.rdd
.map(x => {
// Keep the row's first column as the user id and pair it with a randomly drawn item id.
val uid = x.getAs[Int](0)
// presumably yields an id in [1, itemCount]; depends on the bounds of RNG.uniform — TODO confirm
val iid = ran.uniform(0, itemCount).toInt + 1
(uid, iid)
// Discard pairs whose "user,item" key is already in sampleDict, then de-duplicate.
.filter(x => !sampleDict.contains(x._1 + "," + x._2)).distinct()
// Tag every surviving pair with label 1.
.map(x => (x._1, x._2, 1))
.toDF("userId", "itemId", "label")
/**
 * Build a hash-bucket function over two string columns.
 *
 * The returned function joins its arguments with "_", hashes the joined string and maps the
 * hash onto a bucket id.
 *
 * @param bucketSize number of buckets the hash is reduced into
 * @return a function from two column values to a non-negative bucket id below `bucketSize`
 */
def buckBucket(bucketSize: Int): (String, String) => Int = {
  (col1: String, col2: String) => {
    // Widen to Long before taking the absolute value: Math.abs(Int.MinValue) is still
    // negative as an Int and would otherwise produce a negative bucket id.
    val hash = (col1 + "_" + col2).hashCode.toLong
    (Math.abs(hash) % bucketSize).toInt
  }
}
/**
 * Hash any number of string columns into one bucket id.
 *
 * The columns are joined with "_" and the hash of the joined string is reduced modulo
 * `bucketSize`. An empty column list hashes the empty string and therefore lands in bucket 0.
 *
 * @param bucketSize number of buckets the hash is reduced into
 * @param col column values to combine
 * @return a non-negative bucket id below `bucketSize`
 */
def buckBuckets(bucketSize: Int)(col: String*): Int = {
  val joined = col.mkString("_")
  // Widen to Long before taking the absolute value: Math.abs(Int.MinValue) is still
  // negative as an Int and would otherwise produce a negative bucket id.
  (Math.abs(joined.hashCode.toLong) % bucketSize).toInt
}
/**
 * Build a function that maps a value to the index of the bucket it falls into.
 *
 * The bucket index is the number of leading boundaries that the value is greater than or
 * equal to, so it ranges from 0 to `boundaries.length`.
 *
 * @param boundaries bucket boundaries, scanned from the first element onwards
 * @return a function from a value to its bucket index
 */
def bucketizedColumn(boundaries: Array[Float]): Float => Int = {
  (col1: Float) => {
    // Written as !(col1 >= b) rather than col1 < b so that NaN stops at the first boundary,
    // exactly as a `while (col1 >= boundaries(index))` scan would.
    val firstAbove = boundaries.indexWhere(b => !(col1 >= b))
    if (firstAbove < 0) boundaries.length else firstAbove
  }
}
/**
 * Build a function that maps a word to its 1-based position in a vocabulary list.
 *
 * Id 0 is reserved for words that are not in the vocabulary. When a word occurs more than
 * once in `vocabList`, the position of its first occurrence is used.
 *
 * @param vocabList known vocabulary
 * @return a function from a word to its id (0 when the word is unknown)
 */
def categoricalFromVocabList(vocabList: Array[String]): (String) => Int = {
  val default: Int = 0
  val start: Int = 1
  // Build the lookup once instead of scanning the array twice on every call. Iterating in
  // reverse lets earlier entries overwrite later ones, so the first occurrence wins.
  val idByWord: Map[String, Int] =
    vocabList.zipWithIndex.reverseIterator.map { case (word, i) => word -> (i + start) }.toMap
  (sth: String) => idByWord.getOrElse(sth, default)
}
/**
 * Convert a row to a sample given the column information of a WideAndDeep model.
 *
 * @param r Row of userId, itemId, features and label
 * @param columnInfo ColumnFeatureInfo specifying the information of the different features
 * @param modelType supports "wide_n_deep", "wide" and "deep" only
 * @return TensorSample as input for a WideAndDeep model
 * @throws IllegalArgumentException if `modelType` is not one of the supported values
 */
def row2Sample(r: Row, columnInfo: ColumnFeatureInfo, modelType: String): Sample[Float] = {
  // Build only the feature tensors the requested model consumes; an unsupported model type
  // is rejected before any tensor is built.
  val features: Array[Tensor[Float]] = modelType match {
    case "wide_n_deep" =>
      Array(getWideTensor(r, columnInfo)) ++ getDeepTensors(r, columnInfo)
    case "wide" =>
      Array(getWideTensor(r, columnInfo))
    case "deep" =>
      getDeepTensors(r, columnInfo)
    case _ =>
      throw new IllegalArgumentException("unknown type")
  }

  val l = r.getAs[Int](columnInfo.label)
  val label = Tensor[Float](T(l))
  label.resize(1, 1)

  TensorSample[Float](features, Array(label))
}
/**
 * Convert a row to a sample given the column information of a WideAndDeep Sequential model.
 *
 * @param r Row of userId, itemId, features and label
 * @param columnInfo ColumnFeatureInfo specifying the information of the different features
 * @param modelType supports "wide_n_deep", "wide" and "deep" only
 * @return TensorSample as input for a WideAndDeep Sequential model
 * @throws IllegalArgumentException if `modelType` is not one of the supported values
 */
def row2SampleSequential(r: Row, columnInfo: ColumnFeatureInfo, modelType: String): Sample[Float]
= {
  // Build only the feature tensors the requested model consumes; an unsupported model type
  // is rejected before any tensor is built.
  val features: Array[Tensor[Float]] = modelType match {
    case "wide_n_deep" =>
      Array(getWideTensorSequential(r, columnInfo), getDeepTensor(r, columnInfo))
    case "wide" =>
      Array(getWideTensorSequential(r, columnInfo))
    case "deep" =>
      Array(getDeepTensor(r, columnInfo))
    case _ =>
      throw new IllegalArgumentException("unknown type")
  }

  val l = r.getAs[Int](columnInfo.label)
  val label = Tensor[Float](T(l))
  label.resize(1, 1)

  TensorSample[Float](features, Array(label))
}
/**
 * Prepare the tensor for the wide part of a WideAndDeep model based on SparseDense.
 *
 * Every wide column contributes one index: the column's own value shifted by the summed
 * dimensions of all wide columns before it. Every index carries the value 1.0f.
 *
 * @param r Row of userId, itemId, features and label
 * @param columnInfo ColumnFeatureInfo specifying the information of the different features
 * @return a tensor as input for the wide part of a WideAndDeep model
 */
def getWideTensor(r: Row, columnInfo: ColumnFeatureInfo): Tensor[Float] = {
  val wideColumns = columnInfo.wideBaseCols ++ columnInfo.wideCrossCols
  val wideDims = columnInfo.wideBaseDims ++ columnInfo.wideCrossDims

  // offsets(i) is the total dimension of the columns preceding column i (offsets(0) == 0).
  val offsets = wideDims.scanLeft(0)(_ + _)
  val indices: Array[Int] = (0 until wideColumns.length)
    .map(i => offsets(i) + r.getAs[Int](wideColumns(i)))
    .toArray

  // NOTE(review): reconstructed from the surviving fragment `=> 1.0f)`; confirm against upstream.
  val values = indices.map(_ => 1.0f)
  val shape = Array(wideDims.sum)

  Tensor.sparse(Array(indices), values, shape)
}
/**
 * Prepare the tensor for the wide part of WideAndDeep based on the sequential API and
 * LookupTableSparse.
 *
 * Every wide column contributes one index: the column's own value shifted by the summed
 * dimensions of all wide columns before it. The value stored for an index is that index
 * plus one.
 *
 * @param r Row of userId, itemId, features and label
 * @param columnInfo ColumnFeatureInfo specifying the information of the different features
 * @return a tensor as input for the wide part of a WideAndDeep model
 */
def getWideTensorSequential(r: Row, columnInfo: ColumnFeatureInfo): Tensor[Float] = {
  val wideColumns = columnInfo.wideBaseCols ++ columnInfo.wideCrossCols
  val wideDims = columnInfo.wideBaseDims ++ columnInfo.wideCrossDims

  // offsets(i) is the total dimension of the columns preceding column i (offsets(0) == 0).
  val offsets = wideDims.scanLeft(0)(_ + _)
  val indices: Array[Int] = (0 until wideColumns.length)
    .map(i => offsets(i) + r.getAs[Int](wideColumns(i)))
    .toArray

  // NOTE(review): reconstructed from the surviving fragment `+ 1.0f)`; confirm against upstream.
  val values = indices.map(_ + 1.0f)
  val shape = Array(wideDims.sum)

  Tensor.sparse(Array(indices), values, shape)
}
/**
 * Convert a row to tensors given the column feature information of a WideAndDeep model.
 *
 * @param r Row of userId, itemId, features and label
 * @param columnInfo ColumnFeatureInfo specifying the information of the different features
 * @return an array of tensors as input for the deep part of a WideAndDeep model
 */
// Builds up to three tensors (indicator, embedding, continuous) from one row and returns the
// ones whose column group is non-empty.
// NOTE(review): this block has visibly lost text (closing braces and several branch bodies);
// the notes below cover only what is visible.
def getDeepTensors(r: Row, columnInfo: ColumnFeatureInfo): Array[Tensor[Float]] = {
// Number of columns in each of the three feature groups.
val indCol = columnInfo.indicatorCols.length
val embCol = columnInfo.embedCols.length
val contCol = columnInfo.continuousCols.length
// Sized by the sum of all indicator dimensions and zero-filled.
val indTensor = Tensor[Float](columnInfo.indicatorDims.sum).fill(0)
// setup indicators
// acc accumulates the dimensions of the indicator columns already processed.
var acc = 0
// map is used here only for its side effect on indTensor.
(0 to indCol - 1).map {
i =>
val index = r.getAs[Int](columnInfo.indicatorCols(i))
// NOTE(review): the body of the `i == 0` branch is missing; presumably it yields `index`,
// as the analogous code elsewhere in this file does — confirm.
val accIndex = if (i == 0) {
else {
acc = acc + columnInfo.indicatorDims(i - 1)
acc + index
// Position accIndex + 1 is set to 1 (setValue looks 1-based here — TODO confirm).
indTensor.setValue(accIndex + 1, 1)
// One slot per embedding column, holding that column's Int value as a Float.
val embTensor = Tensor[Float](embCol).fill(0)
(0 to embCol - 1).map(i =>
embTensor.setValue(i + 1, r.getAs[Int](columnInfo.embedCols(i)).toFloat))
// One slot per continuous column.
val contTensor = Tensor[Float](contCol).fill(0)
(0 to contCol - 1).map(i => {
// The column type is not known up front, so read it as Any and convert by runtime type.
val data = r.getAs[Any](columnInfo.continuousCols(i))
val td = data match {
case n: Int => n.toFloat
case n: Long => n.toFloat
case n: Double => n.toFloat
case n: Float => n
// Any other runtime type is rejected.
case _ => throw new Exception("wrong data type")
contTensor.setValue(i + 1, td)
// Return only the tensors whose column group is non-empty, in the order
// indicator, embedding, continuous.
(indCol > 0, embCol > 0, contCol > 0) match {
case (true, true, true) =>
Array(indTensor, embTensor, contTensor)
case (false, true, true) =>
Array(embTensor, contTensor)
case (true, false, true) =>
Array(indTensor, contTensor)
case (true, true, false) =>
Array(indTensor, embTensor)
// NOTE(review): the bodies of the remaining four cases are missing; the first three
// presumably return the single non-empty tensor, and the fallback is unknown — confirm.
case (false, true, false) =>
case (false, false, true) =>
case (true, false, false) =>
case _ =>
// setup deep tensor
// Packs the indicator, embedding and continuous features of one row into a single tensor.
// NOTE(review): this block has visibly lost text (closing braces and the end of the final
// setValue call); the notes below cover only what is visible.
def getDeepTensor(r: Row, columnInfo: ColumnFeatureInfo): Tensor[Float] = {
val deepColumns1 = columnInfo.indicatorCols
val deepColumns2 = columnInfo.embedCols ++ columnInfo.continuousCols
// Total size: all indicator dimensions plus one slot per embedding/continuous column.
val deepLength = columnInfo.indicatorDims.sum + deepColumns2.length
val deepTensor = Tensor[Float](deepLength).fill(0)
// setup indicators
// acc accumulates the dimensions of the indicator columns already processed.
var acc = 0
// map is used here only for its side effect on deepTensor.
(0 to deepColumns1.length - 1).map {
i =>
val index = r.getAs[Int](columnInfo.indicatorCols(i))
// The first column uses its index directly; later ones are shifted by acc.
val accIndex = if (i == 0) index
else {
acc = acc + columnInfo.indicatorDims(i - 1)
acc + index
// Position accIndex + 1 is set to 1 (setValue looks 1-based here — TODO confirm).
deepTensor.setValue(accIndex + 1, 1)
// setup embedding and continuous
(0 to deepColumns2.length - 1).map {
i =>
// These slots start right after the indicator section.
// NOTE(review): the value argument of this call is missing — confirm against upstream.
deepTensor.setValue(i + 1 + columnInfo.indicatorDims.sum,
// Converts a row into a Sample with one feature tensor of length sessionLength and, when
// includeHistory is set, a second one of length historyLength; the label is read from
// the "label" column.
// NOTE(review): this block has visibly lost text (closing braces, the right-hand sides of both
// feature arrays and the return line); the notes below cover only what is visible.
def row2sampleSession(r: Row,
sessionLength: Int,
includeHistory: Boolean,
historyLength: Int): Sample[Float] = {
val label = Tensor[Float](T(r.getAs[Float]("label")))
// NOTE(review): the expression after `r` is missing; presumably it reads a session column
// as floats — confirm against upstream.
val rnnFeature: Array[Float] = r
val rnnTensor = Tensor(rnnFeature, Array(sessionLength))
val sample = if (includeHistory) {
// NOTE(review): the expression after `r` is missing; presumably it reads a history column
// as floats — confirm against upstream.
val mlpFeature: Array[Float] = r
val mlpTensor = Tensor(mlpFeature, Array(historyLength))
// Two feature tensors: session first, history second.
Sample[Float](Array(rnnTensor, mlpTensor), Array(label))
else {
// Session tensor only.
Sample[Float](Array(rnnTensor), Array(label))
// Returns a function from a wrapped array of boxed floats to an Array[Float], with different
// handling depending on whether the input is shorter than maxLength.
// NOTE(review): most of this body is missing (both branch bodies and all closing braces);
// the notes below cover only what is visible.
def prePadding(maxLength: Int): mutable.WrappedArray[java.lang.Float] => Array[Float] = {
(seq: mutable.WrappedArray[java.lang.Float]) => {
// NOTE(review): the surviving fragment `, 0f).reverse` suggests short inputs are padded with
// 0f and then reversed — presumably zero-padding at the front; confirm against upstream.
if (seq.array.size < maxLength) {, 0f).reverse
// NOTE(review): the body for inputs of at least maxLength elements is missing.
else {
// Expands every row of df into one (session window, purchase_history, label) row per session
// element after the first, then drops rows containing nulls.
// NOTE(review): this block has visibly lost text (closing braces and the lines that return
// featureLabel and dfSlided); the notes below cover only what is visible.
def slideSession(df: DataFrame, sessionLength: Int): DataFrame = {
val sqlContext = df.sqlContext
import sqlContext.implicits._
val dfSlided = df.rdd.flatMap(x => {
// NOTE(review): declared as Array[Float] but the visible right-hand side is a
// WrappedArray[java.lang.Float]; a trailing conversion was probably lost — confirm.
val session: Array[Float] = x.getAs[mutable.WrappedArray[java.lang.Float]]("session")
val feature2 = x.getAs[mutable.WrappedArray[java.lang.Float]]("purchase_history")
// Every element after the first is used as a label.
val featureLabel = for (label <- session.slice(1, session.size)) yield {
// indexOf returns the first position equal to label, so a value that repeats within the
// session resolves to its earliest occurrence rather than the current position.
val endIdx = session.indexOf(label)
// NOTE(review): for sessions longer than sessionLength this is negative whenever
// endIdx < sessionLength; slice then starts from the beginning.
val beginIdx = if (session.size <= sessionLength) 0 else endIdx - sessionLength
// The window of elements preceding the label.
val feature1 = session.slice(beginIdx, endIdx)
(feature1, feature2, label)
}).toDF("session", "purchase_history", "label").na.drop()
© 2015 - 2025 Weber Informatics LLC | Privacy Policy