/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.tencent.angel.sona.ml.feature
import java.lang.{Double => JDouble, Integer => JInt}
import java.util.{NoSuchElementException, Map => JMap}
import com.tencent.angel.mlcore.utils.MLException
import scala.collection.JavaConverters._
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkException
import com.tencent.angel.sona.ml.attribute.{Attribute, AttributeGroup, BinaryAttribute, NominalAttribute, NumericAttribute}
import com.tencent.angel.sona.ml.{Estimator, Model}
import org.apache.spark.linalg.{DenseVector, IntSparseVector, LongSparseVector, VectorUDT}
import com.tencent.angel.sona.ml.param.{IntParam, Param, ParamMap, ParamValidators, Params}
import com.tencent.angel.sona.ml.param.shared.{HasHandleInvalid, HasInputCol, HasOutputCol}
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.util.DatasetUtil
import org.apache.spark.util.collection.OpenHashSet
import org.apache.spark.linalg
import org.apache.spark.sql.util.SONASchemaUtils
/** Private trait for params for VectorIndexer and VectorIndexerModel */
private[sona] trait VectorIndexerParams extends Params with HasInputCol with HasOutputCol
with HasHandleInvalid {
/**
* Param for how to handle invalid data (unseen labels or NULL values).
* Note: this param only applies to categorical features, not continuous ones.
* Options are:
* 'skip': filter out rows with invalid data.
* 'error': throw an error.
   * 'keep': put invalid data in a special additional bucket, at an index equal to the
   * number of categories of the feature.
* Default value: "error"
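   *
   * E.g. (illustrative): with 'keep', a categorical feature with three known
   * categories maps any unseen value to the extra index 3.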
*
* @group param
*/
  override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid",
    "How to handle invalid data (unseen labels or NULL values). " +
    "Options are 'skip' (filter out rows with invalid data), 'error' (throw an error), " +
    "or 'keep' (put invalid data in a special additional bucket, at an index equal to " +
    "the number of categories of the feature).",
    ParamValidators.inArray(VectorIndexer.supportedHandleInvalids))
setDefault(handleInvalid, VectorIndexer.ERROR_INVALID)
/**
* Threshold for the number of values a categorical feature can take.
* If a feature is found to have {@literal >} maxCategories values, then it is declared
* continuous. Must be greater than or equal to 2.
*
* (default = 20)
*
* @group param
*/
val maxCategories = new IntParam(this, "maxCategories",
"Threshold for the number of values a categorical feature can take (>= 2)." +
" If a feature is found to have > maxCategories values, then it is declared continuous.",
ParamValidators.gtEq(2))
setDefault(maxCategories -> 20)
/** @group getParam */
def getMaxCategories: Int = $(maxCategories)
}
/**
* Class for indexing categorical feature columns in a dataset of `Vector`.
*
* This has 2 usage modes:
* - Automatically identify categorical features (default behavior)
* - This helps process a dataset of unknown vectors into a dataset with some continuous
* features and some categorical features. The choice between continuous and categorical
* is based upon a maxCategories parameter.
 * - Set maxCategories to the maximum number of categories any categorical feature should have.
* - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
* If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},
* and feature 1 will be declared continuous.
* - Index all features, if all features are categorical
* - If maxCategories is set to be very large, then this will build an index of unique
* values for all features.
* - Warning: This can cause problems if features are continuous since this will collect ALL
* unique values to the driver.
* - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.
* If maxCategories is greater than or equal to 3, then both features will be declared
* categorical.
*
* This returns a model which can transform categorical features to use 0-based indices.
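 *
 * A minimal usage sketch (the input/output column names "features" and "indexed"
 * are illustrative, not required):
 * {{{
 *   val indexer = new VectorIndexer()
 *     .setInputCol("features")
 *     .setOutputCol("indexed")
 *     .setMaxCategories(10)
 *   val model = indexer.fit(data)       // data: Dataset with a Vector column
 *   val indexed = model.transform(data) // appends the "indexed" output column
 * }}}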
*
* Index stability:
* - This is not guaranteed to choose the same category index across multiple runs.
* - If a categorical feature includes value 0, then this is guaranteed to map value 0 to index 0.
* This maintains vector sparsity.
* - More stability may be added in the future.
*
 * TODO: Future extensions:
* - Preserve metadata in transform; if a feature's metadata is already present, do not recompute.
* - Specify certain features to not index, either via a parameter or via existing metadata.
* - Add warning if a categorical feature has only 1 category.
*/
class VectorIndexer(override val uid: String)
extends Estimator[VectorIndexerModel] with VectorIndexerParams with DefaultParamsWritable {
def this() = this(Identifiable.randomUID("vecIdx"))
/** @group setParam */
def setMaxCategories(value: Int): this.type = set(maxCategories, value)
/** @group setParam */
def setInputCol(value: String): this.type = set(inputCol, value)
/** @group setParam */
def setOutputCol(value: String): this.type = set(outputCol, value)
/** @group setParam */
def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
override def fit(dataset: Dataset[_]): VectorIndexerModel = {
transformSchema(dataset.schema, logging = true)
val firstRow = dataset.select($(inputCol)).take(1)
require(firstRow.length == 1, s"VectorIndexer cannot be fit on an empty dataset.")
val numFeatures = firstRow(0).getAs[linalg.Vector](0).size.toInt
val vectorDataset = dataset.select($(inputCol)).rdd.map { case Row(v: linalg.Vector) => v }
val maxCats = $(maxCategories)
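    // Collect per-partition statistics of unique values per feature, then merge.
    // Each per-feature set is capped at maxCategories + 1 values, which is enough
    // to tell a categorical feature from a continuous one without unbounded growth.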
val categoryStats: VectorIndexer.CategoryStats = vectorDataset.mapPartitions { iter =>
val localCatStats = new VectorIndexer.CategoryStats(numFeatures, maxCats)
iter.foreach(localCatStats.addVector)
Iterator(localCatStats)
}.reduce((stats1, stats2) => stats1.merge(stats2))
val model = new VectorIndexerModel(uid, numFeatures, categoryStats.getCategoryMaps)
.setParent(this)
copyValues(model)
}
override def transformSchema(schema: StructType): StructType = {
// We do not transfer feature metadata since we do not know what types of features we will
// produce in transform().
val dataType = new VectorUDT
require(isDefined(inputCol), s"VectorIndexer requires input column parameter: $inputCol")
require(isDefined(outputCol), s"VectorIndexer requires output column parameter: $outputCol")
SONASchemaUtils.checkColumnType(schema, $(inputCol), dataType)
SONASchemaUtils.appendColumn(schema, $(outputCol), dataType)
}
override def copy(extra: ParamMap): VectorIndexer = defaultCopy(extra)
}
object VectorIndexer extends DefaultParamsReadable[VectorIndexer] {
private[sona] val SKIP_INVALID: String = "skip"
private[sona] val ERROR_INVALID: String = "error"
private[sona] val KEEP_INVALID: String = "keep"
private[sona] val supportedHandleInvalids: Array[String] =
Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)
override def load(path: String): VectorIndexer = super.load(path)
/**
* Helper class for tracking unique values for each feature.
*
* TODO: Track which features are known to be continuous already; do not update counts for them.
*
* @param numFeatures This class fails if it encounters a Vector whose length is not numFeatures.
* @param maxCategories This class caps the number of unique values collected at maxCategories.
*/
private class CategoryStats(private val numFeatures: Int, private val maxCategories: Int)
extends Serializable {
/** featureValueSets[feature index] = set of unique values */
private val featureValueSets =
Array.fill[OpenHashSet[Double]](numFeatures)(new OpenHashSet[Double]())
/** Merge with another instance, modifying this instance. */
def merge(other: CategoryStats): CategoryStats = {
featureValueSets.zip(other.featureValueSets).foreach { case (thisValSet, otherValSet) =>
otherValSet.iterator.foreach { x =>
// Once we have found > maxCategories values, we know the feature is continuous
// and do not need to collect more values for it.
if (thisValSet.size <= maxCategories) thisValSet.add(x)
}
}
this
}
/** Add a new vector to this index, updating sets of unique feature values */
def addVector(v: linalg.Vector): Unit = {
require(v.size == numFeatures, s"VectorIndexer expected $numFeatures features but" +
s" found vector of size ${v.size}.")
v match {
case dv: DenseVector => addDenseVector(dv)
case sv: IntSparseVector => addSparseVector(sv)
        case _: LongSparseVector =>
          throw MLException("VectorIndexer does not support LongSparseVector.")
        case other =>
          throw new SparkException(s"VectorIndexer does not support vector type ${other.getClass.getName}.")
}
}
/**
* Based on stats collected, decide which features are categorical,
* and choose indices for categories.
*
* Sparsity: This tries to maintain sparsity by treating value 0.0 specially.
* If a categorical feature takes value 0.0, then value 0.0 is given index 0.
*
* @return Feature value index. Keys are categorical feature indices (column indices).
     *         Values are mappings from original feature values to 0-based category indices.
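     *
     * E.g. (illustrative): unique values {-1.0, 0.0, 2.0} yield the map
     * {0.0 -> 0, -1.0 -> 1, 2.0 -> 2}: value 0.0 takes index 0, and the
     * remaining values are sorted and indexed in order.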
*/
def getCategoryMaps: Map[Int, Map[Double, Int]] = {
// Filter out features which are declared continuous.
featureValueSets.zipWithIndex.filter(_._1.size <= maxCategories).map {
case (featureValues: OpenHashSet[Double], featureIndex: Int) =>
var sortedFeatureValues = featureValues.iterator.filter(_ != 0.0).toArray.sorted
val zeroExists = sortedFeatureValues.length + 1 == featureValues.size
if (zeroExists) {
sortedFeatureValues = 0.0 +: sortedFeatureValues
}
val categoryMap: Map[Double, Int] = sortedFeatureValues.zipWithIndex.toMap
(featureIndex, categoryMap)
}.toMap
}
private def addDenseVector(dv: DenseVector): Unit = {
var i = 0
val size = dv.size
while (i < size) {
if (featureValueSets(i).size <= maxCategories) {
featureValueSets(i).add(dv(i))
}
i += 1
}
}
private def addSparseVector(sv: IntSparseVector): Unit = {
// TODO: This might be able to handle 0's more efficiently.
var vecIndex = 0 // index into vector
var k = 0 // index into non-zero elements
val size = sv.size
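      // Walk every position in the vector, not just the stored entries, so that
      // implicit zeros are also recorded as observed feature values.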
while (vecIndex < size) {
val featureValue = if (k < sv.indices.length && vecIndex == sv.indices(k)) {
k += 1
sv.values(k - 1)
} else {
0.0
}
if (featureValueSets(vecIndex).size <= maxCategories) {
featureValueSets(vecIndex).add(featureValue)
}
vecIndex += 1
}
}
}
}
/**
* Model fitted by [[VectorIndexer]]. Transform categorical features to use 0-based indices
* instead of their original values.
* - Categorical features are mapped to indices.
* - Continuous features (columns) are left unchanged.
 * This also appends metadata to the output column, marking features as Numeric (continuous),
 * Nominal (categorical), or Binary (categorical with exactly two categories).
* Non-ML metadata is not carried over from the input to the output column.
*
* This maintains vector sparsity.
*
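 * E.g. (illustrative): with categoryMaps = Map(0 -> Map(0.0 -> 0, -1.0 -> 1)),
 * an input vector [-1.0, 5.0] transforms to [1.0, 5.0]: feature 0 is mapped to
 * its category index, while the continuous feature 1 passes through unchanged.
 *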
* @param numFeatures Number of features, i.e., length of Vectors which this transforms
* @param categoryMaps Feature value index. Keys are categorical feature indices (column indices).
 *                     Values are maps from original feature values to 0-based category indices.
* If a feature is not in this map, it is treated as continuous.
*/
class VectorIndexerModel private[sona](
override val uid: String,
val numFeatures: Int,
val categoryMaps: Map[Int, Map[Double, Int]])
extends Model[VectorIndexerModel] with VectorIndexerParams with MLWritable {
import VectorIndexerModel._
/** Java-friendly version of [[categoryMaps]] */
def javaCategoryMaps: JMap[JInt, JMap[JDouble, JInt]] = {
categoryMaps.mapValues(_.asJava).asJava.asInstanceOf[JMap[JInt, JMap[JDouble, JInt]]]
}
/**
* Pre-computed feature attributes, with some missing info.
* In transform(), set attribute name and other info, if available.
*/
private val partialFeatureAttributes: Array[Attribute] = {
val attrs = new Array[Attribute](numFeatures)
var categoricalFeatureCount = 0 // validity check for numFeatures, categoryMaps
var featureIndex = 0
while (featureIndex < numFeatures) {
if (categoryMaps.contains(featureIndex)) {
// categorical feature
val rawFeatureValues: Array[String] =
categoryMaps(featureIndex).toArray.sortBy(_._1).map(_._1).map(_.toString)
val featureValues = if (getHandleInvalid == VectorIndexer.KEEP_INVALID) {
(rawFeatureValues.toList :+ "__unknown").toArray
} else {
rawFeatureValues
}
if (featureValues.length == 2 && getHandleInvalid != VectorIndexer.KEEP_INVALID) {
attrs(featureIndex) = new BinaryAttribute(index = Some(featureIndex),
values = Some(featureValues))
} else {
attrs(featureIndex) = new NominalAttribute(index = Some(featureIndex),
isOrdinal = Some(false), values = Some(featureValues))
}
categoricalFeatureCount += 1
} else {
// continuous feature
attrs(featureIndex) = new NumericAttribute(index = Some(featureIndex))
}
featureIndex += 1
}
require(categoricalFeatureCount == categoryMaps.size, "VectorIndexerModel given categoryMaps" +
s" with keys outside expected range [0,...,numFeatures), where numFeatures=$numFeatures")
attrs
}
// TODO: Check more carefully about whether this whole class will be included in a closure.
/** Per-vector transform function */
private lazy val transformFunc: linalg.Vector => linalg.Vector = {
val sortedCatFeatureIndices = categoryMaps.keys.toArray.sorted
val localVectorMap = categoryMaps
val localNumFeatures = numFeatures
val localHandleInvalid = getHandleInvalid
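    // Local copies of model fields keep the closure below from capturing the
    // whole model when it is shipped to executors.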
val f: linalg.Vector => linalg.Vector = { (v: linalg.Vector) =>
      assert(v.size == localNumFeatures, "VectorIndexerModel expected vector of length" +
        s" $localNumFeatures but found length ${v.size}")
v match {
case dv: DenseVector =>
var hasInvalid = false
val tmpv = dv.copy
localVectorMap.foreach { case (featureIndex: Int, categoryMap: Map[Double, Int]) =>
try {
tmpv.values(featureIndex) = categoryMap(tmpv(featureIndex))
} catch {
case _: NoSuchElementException =>
localHandleInvalid match {
case VectorIndexer.ERROR_INVALID =>
throw new SparkException(s"VectorIndexer encountered invalid value " +
s"${tmpv(featureIndex)} on feature index ${featureIndex}. To handle " +
s"or skip invalid value, try setting VectorIndexer.handleInvalid.")
case VectorIndexer.KEEP_INVALID =>
tmpv.values(featureIndex) = categoryMap.size
case VectorIndexer.SKIP_INVALID =>
hasInvalid = true
}
}
}
if (hasInvalid) null else tmpv
case sv: IntSparseVector =>
// We use the fact that categorical value 0 is always mapped to index 0.
var hasInvalid = false
val tmpv = sv.copy
var catFeatureIdx = 0 // index into sortedCatFeatureIndices
var k = 0 // index into non-zero elements of sparse vector
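          // Two-pointer walk over the sorted categorical feature indices and the
          // sparse vector's stored indices. Implicit zeros need no update because
          // categorical value 0.0 is always mapped to index 0.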
while (catFeatureIdx < sortedCatFeatureIndices.length && k < tmpv.indices.length) {
val featureIndex = sortedCatFeatureIndices(catFeatureIdx)
if (featureIndex < tmpv.indices(k)) {
catFeatureIdx += 1
} else if (featureIndex > tmpv.indices(k)) {
k += 1
} else {
try {
tmpv.values(k) = localVectorMap(featureIndex)(tmpv.values(k))
} catch {
case _: NoSuchElementException =>
localHandleInvalid match {
case VectorIndexer.ERROR_INVALID =>
throw new SparkException(s"VectorIndexer encountered invalid value " +
s"${tmpv.values(k)} on feature index ${featureIndex}. To handle " +
s"or skip invalid value, try setting VectorIndexer.handleInvalid.")
case VectorIndexer.KEEP_INVALID =>
tmpv.values(k) = localVectorMap(featureIndex).size
case VectorIndexer.SKIP_INVALID =>
hasInvalid = true
}
}
catFeatureIdx += 1
k += 1
}
}
if (hasInvalid) null else tmpv
        case _: LongSparseVector =>
          throw MLException("VectorIndexerModel does not support LongSparseVector.")
        case other =>
          throw new SparkException(s"VectorIndexerModel does not support vector type ${other.getClass.getName}.")
}
}
f
}
/** @group setParam */
def setInputCol(value: String): this.type = set(inputCol, value)
/** @group setParam */
def setOutputCol(value: String): this.type = set(outputCol, value)
override def transform(dataset: Dataset[_]): DataFrame = {
transformSchema(dataset.schema, logging = true)
val newField = prepOutputField(dataset.schema)
val transformUDF = udf { vector: linalg.Vector => transformFunc(vector) }
val newCol = transformUDF(dataset($(inputCol)))
val ds = DatasetUtil.withColumn(dataset, $(outputCol), newCol, newField.metadata)
if (getHandleInvalid == VectorIndexer.SKIP_INVALID) {
ds.na.drop(Array($(outputCol)))
} else {
ds
}
}
override def transformSchema(schema: StructType): StructType = {
val dataType = new VectorUDT
require(isDefined(inputCol),
s"VectorIndexerModel requires input column parameter: $inputCol")
require(isDefined(outputCol),
s"VectorIndexerModel requires output column parameter: $outputCol")
SONASchemaUtils.checkColumnType(schema, $(inputCol), dataType)
// If the input metadata specifies numFeatures, compare with expected numFeatures.
val origAttrGroup = AttributeGroup.fromStructField(schema($(inputCol)))
val origNumFeatures: Option[Int] = if (origAttrGroup.attributes.nonEmpty) {
Some(origAttrGroup.attributes.get.length)
} else {
origAttrGroup.numAttributes.map(_.toInt)
}
require(origNumFeatures.forall(_ == numFeatures), "VectorIndexerModel expected" +
s" $numFeatures features, but input column ${$(inputCol)} had metadata specifying" +
s" ${origAttrGroup.numAttributes.get} features.")
val newField = prepOutputField(schema)
val outputFields = schema.fields :+ newField
StructType(outputFields)
}
/**
* Prepare the output column field, including per-feature metadata.
*
* @param schema Input schema
* @return Output column field. This field does not contain non-ML metadata.
*/
private def prepOutputField(schema: StructType): StructField = {
val origAttrGroup = AttributeGroup.fromStructField(schema($(inputCol)))
val featureAttributes: Array[Attribute] = if (origAttrGroup.attributes.nonEmpty) {
// Convert original attributes to modified attributes
val origAttrs: Array[Attribute] = origAttrGroup.attributes.get
origAttrs.zip(partialFeatureAttributes).map {
case (origAttr: Attribute, featAttr: BinaryAttribute) =>
if (origAttr.name.nonEmpty) {
featAttr.withName(origAttr.name.get)
} else {
featAttr
}
case (origAttr: Attribute, featAttr: NominalAttribute) =>
if (origAttr.name.nonEmpty) {
featAttr.withName(origAttr.name.get)
} else {
featAttr
}
case (origAttr: Attribute, featAttr: NumericAttribute) =>
origAttr.withIndex(featAttr.index.get)
case (origAttr: Attribute, _) =>
origAttr
}
} else {
partialFeatureAttributes
}
val newAttributeGroup = new AttributeGroup($(outputCol), featureAttributes)
newAttributeGroup.toStructField
}
override def copy(extra: ParamMap): VectorIndexerModel = {
val copied = new VectorIndexerModel(uid, numFeatures, categoryMaps)
copyValues(copied, extra).setParent(parent)
}
override def write: MLWriter = new VectorIndexerModelWriter(this)
}
object VectorIndexerModel extends MLReadable[VectorIndexerModel] {
private[VectorIndexerModel]
class VectorIndexerModelWriter(instance: VectorIndexerModel) extends MLWriter {
private case class Data(numFeatures: Int, categoryMaps: Map[Int, Map[Double, Int]])
override protected def saveImpl(path: String): Unit = {
DefaultParamsWriter.saveMetadata(instance, path, sc)
val data = Data(instance.numFeatures, instance.categoryMaps)
val dataPath = new Path(path, "data").toString
sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
}
}
private class VectorIndexerModelReader extends MLReader[VectorIndexerModel] {
private val className = classOf[VectorIndexerModel].getName
override def load(path: String): VectorIndexerModel = {
val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
val dataPath = new Path(path, "data").toString
val data = sparkSession.read.parquet(dataPath)
.select("numFeatures", "categoryMaps")
.head()
val numFeatures = data.getAs[Int](0)
val categoryMaps = data.getAs[Map[Int, Map[Double, Int]]](1)
val model = new VectorIndexerModel(metadata.uid, numFeatures, categoryMaps)
metadata.getAndSetParams(model)
model
}
}
override def read: MLReader[VectorIndexerModel] = new VectorIndexerModelReader
override def load(path: String): VectorIndexerModel = super.load(path)
}