com.salesforce.op.test.TestOpVectorMetadataBuilder.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of transmogrifai-features_2.11 Show documentation
Show all versions of transmogrifai-features_2.11 Show documentation
AutoML library for building modular, reusable, strongly typed machine learning workflows on Spark with minimal hand tuning
The newest version!
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.salesforce.op.test
import com.salesforce.op.FeatureHistory
import com.salesforce.op.features.OPFeature
import com.salesforce.op.stages.OpPipelineStage
import com.salesforce.op.test.TestOpVectorColumnType._
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
/**
 * Represents a type of column associated with a feature in a vector metadata.
 *
 * The trait extends `Product with Serializable` (which every case class and case object below already
 * implements) so that an unannotated collection mixing several column types, for example
 * `List(RootCol, IndCol(Some("a")))`, infers `TestOpVectorColumnType` as its element type instead of
 * `Product with Serializable with TestOpVectorColumnType`.
 */
sealed trait TestOpVectorColumnType extends Product with Serializable

object TestOpVectorColumnType {

  /**
   * Represents just a plain column with no indicator (e.g. (featureName)_operationName).
   *
   * [[TestOpVectorMetadataBuilder]] produces a column with no grouping, no indicator value and no
   * descriptor value for it.
   */
  case object RootCol extends TestOpVectorColumnType

  /**
   * Represents a column with an indicator (e.g. (featurename)_operationName_indicatorValue).
   *
   * [[TestOpVectorMetadataBuilder]] groups the column under the parent feature's name and uses `name`
   * as the indicator value.
   *
   * @param name Name of the indicator value
   */
  case class IndCol(name: Option[String]) extends TestOpVectorColumnType

  /**
   * Represents a column with a value but no defined grouping beyond the parent feature name.
   *
   * [[TestOpVectorMetadataBuilder]] uses `name` as the indicator value and leaves the grouping empty.
   *
   * @param name Name of the indicator value
   */
  case class IndVal(name: Option[String]) extends TestOpVectorColumnType

  /**
   * Represents a column with a descriptor value but no defined grouping beyond the parent feature name.
   *
   * [[TestOpVectorMetadataBuilder]] uses `name` as the descriptor value and leaves the grouping empty.
   *
   * @param name Name of the descriptor value
   */
  case class DescVal(name: Option[String]) extends TestOpVectorColumnType

  /**
   * Represents a column with an indicator (e.g. (featurename)_operationName_indicatorValue), but the
   * resulting [[OpVectorColumnMetadata]] should not contain the indicator value (this is done in
   * [[com.salesforce.op.stages.impl.feature.OPCollectionHashingVectorizer]], for instance, where there are
   * indicators but these indicators do not represent mutually exclusive values).
   *
   * [[TestOpVectorMetadataBuilder]] does not copy `name` into the metadata: the column it produces has
   * no grouping, no indicator value and no descriptor value.
   *
   * @param name Name of the indicator value
   */
  case class PivotColNoInd(name: String) extends TestOpVectorColumnType

  /**
   * Represents a column with an indicator, but the resulting [[OpVectorColumnMetadata]] should have a
   * different group than the parent feature's name.
   *
   * @param name      Name of the indicator
   * @param groupName Name of the grouping
   */
  case class IndColWithGroup(name: Option[String], groupName: String) extends TestOpVectorColumnType

  /**
   * Represents a column with a descriptor, but the resulting [[OpVectorColumnMetadata]] should have a
   * different group than the parent feature's name.
   *
   * @param name      Name of the descriptor
   * @param groupName Name of the grouping
   */
  case class DescColWithGroup(name: Option[String], groupName: String) extends TestOpVectorColumnType
}
/**
 * Test helper that assembles the [[OpVectorMetadata]] a stage is expected to produce.
 */
object TestOpVectorMetadataBuilder {

  /**
   * Builds an [[OpVectorMetadata]] for the given stage, using the stage's own operation name for every
   * feature.
   *
   * @param stage The stage to construct from
   * @param fs    Pairs of (feature, columns the vectorizer should produce from that feature)
   * @return OpVectorMetadata
   */
  def apply(stage: OpPipelineStage[_], fs: (OPFeature, List[TestOpVectorColumnType])*): OpVectorMetadata = {
    val withStageOpName = fs.map { case (feature, columnTypes) =>
      (feature, stage.operationName, columnTypes)
    }
    withOpNames(stage, withStageOpName: _*)
  }

  /**
   * Builds an [[OpVectorMetadata]] for the given stage, letting the caller supply an operation name per
   * feature instead of assuming the stage's operation name. The history of each feature is its own
   * history merged with a history that holds only this stage's name.
   *
   * @param stage The stage to construct from
   * @param fs    Triples of (feature, operation name that produced it, columns the vectorizer should
   *              produce from that feature)
   * @return OpVectorMetadata
   */
  def withOpNames(
    stage: OpPipelineStage[_],
    fs: (OPFeature, String, List[TestOpVectorColumnType])*
  ): OpVectorMetadata = {
    // Built afresh for every feature, so nothing is read from the stage when `fs` is empty.
    def stageOnlyHistory: FeatureHistory = FeatureHistory(Seq.empty, Seq(stage.stageName))

    val historyByFeatureName = fs.map { case (feature, _, _) =>
      (feature.name, feature.history().merge(stageOnlyHistory))
    }.toMap

    withOpNamesAndHist(stage, historyByFeatureName, fs: _*)
  }

  /**
   * Builds an [[OpVectorMetadata]] for the given stage from an explicitly supplied feature history.
   * One [[OpVectorColumnMetadata]] is emitted per (feature, column type) pair, in the order given.
   *
   * @param stage The stage to construct from
   * @param hist  The history of the parent features
   * @param fs    Triples of (feature, operation name that produced it, columns the vectorizer should
   *              produce from that feature). The operation name is not used when building the columns.
   * @return OpVectorMetadata
   */
  def withOpNamesAndHist(
    stage: OpPipelineStage[_],
    hist: Map[String, FeatureHistory],
    fs: (OPFeature, String, List[TestOpVectorColumnType])*
  ): OpVectorMetadata = {
    val columns = for {
      (feature, _, columnTypes) <- fs.toArray
      columnType <- columnTypes
    } yield {
      val (group, indicator, descriptor) = columnFields(feature, columnType)
      OpVectorColumnMetadata(
        parentFeatureName = Seq(feature.name),
        parentFeatureType = Seq(feature.typeName),
        grouping = group,
        indicatorValue = indicator,
        descriptorValue = descriptor
      )
    }
    OpVectorMetadata(stage.getOutputFeatureName, columns, hist)
  }

  /**
   * Resolves the (grouping, indicator value, descriptor value) triple for one column type of a feature.
   */
  private def columnFields(
    feature: OPFeature,
    columnType: TestOpVectorColumnType
  ): (Option[String], Option[String], Option[String]) = columnType match {
    case RootCol | PivotColNoInd(_) => (None, None, None)
    case IndCol(indicator) => (Option(feature.name), indicator, None)
    case IndVal(indicator) => (None, indicator, None)
    case DescVal(descriptor) => (None, None, descriptor)
    case IndColWithGroup(indicator, group) => (Option(group), indicator, None)
    case DescColWithGroup(descriptor, group) => (Option(group), None, descriptor)
  }
}