/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.spark.sql.hive
import java.util.UUID
import org.apache.spark.annotation.Experimental
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.HivemallFeature
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.catalyst.plans.logical.{Generate, JoinTopK, LogicalPlan}
import org.apache.spark.sql.execution.UserProvidedPlanner
import org.apache.spark.sql.execution.datasources.csv.{CsvToStruct, StructToCsv}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
/**
* Hivemall wrapper and some utility functions for DataFrame. The functions below derive
* from `resources/ddl/define-all-as-permanent.hive`.
*
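* A typical usage sketch (column names are illustrative; assumes
* `spark.implicits._` is in scope):
* {{{
*   import org.apache.spark.sql.hive.HivemallOps._
*
*   // Train a classifier and average the per-feature weights into a model
*   val model = df.train_classifier(add_bias($"features"), $"label")
*     .groupBy("feature").agg("weight" -> "avg")
* }}}
*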
* @groupname regression
* @groupname classifier
* @groupname classifier.multiclass
* @groupname recommend
* @groupname topicmodel
* @groupname geospatial
* @groupname smile
* @groupname xgboost
* @groupname anomaly
* @groupname knn.similarity
* @groupname knn.distance
* @groupname knn.lsh
* @groupname ftvec
* @groupname ftvec.amplify
* @groupname ftvec.hashing
* @groupname ftvec.pairing
* @groupname ftvec.scaling
* @groupname ftvec.selection
* @groupname ftvec.conv
* @groupname ftvec.trans
* @groupname ftvec.ranking
* @groupname tools
* @groupname tools.array
* @groupname tools.bits
* @groupname tools.compress
* @groupname tools.map
* @groupname tools.text
* @groupname misc
*
* A list of unsupported functions is as follows:
* * smile
* - guess_attribute_types
* * mapred functions
* - taskid
* - jobid
* - rownum
* - distcache_gets
* - jobconf_gets
* * matrix factorization
* - mf_predict
* - train_mf_sgd
* - train_mf_adagrad
* - train_bprmf
* - bprmf_predict
* * Factorization Machine
* - fm_predict
* - train_fm
* - train_ffm
* - ffm_predict
*/
final class HivemallOps(df: DataFrame) extends Logging {
import internal.HivemallOpsImpl._
private lazy val _sparkSession = df.sparkSession
private lazy val _strategy = new UserProvidedPlanner(_sparkSession.sqlContext.conf)
/**
* @see [[hivemall.regression.GeneralRegressorUDTF]]
* @group regression
*/
@scala.annotation.varargs
def train_regressor(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.GeneralRegressorUDTF",
"train_regressor",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.AdaDeltaUDTF]]
* @group regression
*/
@scala.annotation.varargs
def train_adadelta_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AdaDeltaUDTF",
"train_adadelta_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.AdaGradUDTF]]
* @group regression
*/
@scala.annotation.varargs
def train_adagrad_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AdaGradUDTF",
"train_adagrad_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.AROWRegressionUDTF]]
* @group regression
*/
@scala.annotation.varargs
def train_arow_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AROWRegressionUDTF",
"train_arow_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.regression.AROWRegressionUDTF.AROWe]]
* @group regression
*/
@scala.annotation.varargs
def train_arowe_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AROWRegressionUDTF$AROWe",
"train_arowe_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.regression.AROWRegressionUDTF.AROWe2]]
* @group regression
*/
@scala.annotation.varargs
def train_arowe2_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.AROWRegressionUDTF$AROWe2",
"train_arowe2_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.regression.LogressUDTF]]
* @group regression
*/
@scala.annotation.varargs
def train_logistic_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.LogressUDTF",
"train_logistic_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.PassiveAggressiveRegressionUDTF]]
* @group regression
*/
@scala.annotation.varargs
def train_pa1_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.PassiveAggressiveRegressionUDTF",
"train_pa1_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.PassiveAggressiveRegressionUDTF.PA1a]]
* @group regression
*/
@scala.annotation.varargs
def train_pa1a_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.PassiveAggressiveRegressionUDTF$PA1a",
"train_pa1a_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.PassiveAggressiveRegressionUDTF.PA2]]
* @group regression
*/
@scala.annotation.varargs
def train_pa2_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.PassiveAggressiveRegressionUDTF$PA2",
"train_pa2_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.regression.PassiveAggressiveRegressionUDTF.PA2a]]
* @group regression
*/
@scala.annotation.varargs
def train_pa2a_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.regression.PassiveAggressiveRegressionUDTF$PA2a",
"train_pa2a_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.GeneralClassifierUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_classifier(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.GeneralClassifierUDTF",
"train_classifier",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.PerceptronUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_perceptron(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.PerceptronUDTF",
"train_perceptron",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.PassiveAggressiveUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_pa(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.PassiveAggressiveUDTF",
"train_pa",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.PassiveAggressiveUDTF.PA1]]
* @group classifier
*/
@scala.annotation.varargs
def train_pa1(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.PassiveAggressiveUDTF$PA1",
"train_pa1",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.PassiveAggressiveUDTF.PA2]]
* @group classifier
*/
@scala.annotation.varargs
def train_pa2(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.PassiveAggressiveUDTF$PA2",
"train_pa2",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.ConfidenceWeightedUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_cw(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.ConfidenceWeightedUDTF",
"train_cw",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.AROWClassifierUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_arow(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.AROWClassifierUDTF",
"train_arow",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.AROWClassifierUDTF.AROWh]]
* @group classifier
*/
@scala.annotation.varargs
def train_arowh(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.AROWClassifierUDTF$AROWh",
"train_arowh",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.SoftConfideceWeightedUDTF.SCW1]]
* @group classifier
*/
@scala.annotation.varargs
def train_scw(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.SoftConfideceWeightedUDTF$SCW1",
"train_scw",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.SoftConfideceWeightedUDTF.SCW2]]
* @group classifier
*/
@scala.annotation.varargs
def train_scw2(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.SoftConfideceWeightedUDTF$SCW2",
"train_scw2",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.AdaGradRDAUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_adagrad_rda(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.AdaGradRDAUDTF",
"train_adagrad_rda",
setMixServs(toHivemallFeatures(exprs)),
Seq("feature", "weight")
)
}
/**
* @see [[hivemall.classifier.KernelExpansionPassiveAggressiveUDTF]]
* @group classifier
*/
@scala.annotation.varargs
def train_kpa(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.KernelExpansionPassiveAggressiveUDTF",
"train_kpa",
setMixServs(toHivemallFeatures(exprs)),
Seq("h", "hk", "w0", "w1", "w2", "w3")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassPerceptronUDTF]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_perceptron(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassPerceptronUDTF",
"train_multiclass_perceptron",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_pa(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF",
"train_multiclass_pa",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF.PA1]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_pa1(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF$PA1",
"train_multiclass_pa1",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF.PA2]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_pa2(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassPassiveAggressiveUDTF$PA2",
"train_multiclass_pa2",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassConfidenceWeightedUDTF]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_cw(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassConfidenceWeightedUDTF",
"train_multiclass_cw",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_arow(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF",
"train_multiclass_arow",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF.AROWh]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_arowh(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassAROWClassifierUDTF$AROWh",
"train_multiclass_arowh",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF.SCW1]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_scw(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF$SCW1",
"train_multiclass_scw",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight", "conv")
)
}
/**
* @see [[hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF.SCW2]]
* @group classifier.multiclass
*/
@scala.annotation.varargs
def train_multiclass_scw2(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF$SCW2",
"train_multiclass_scw2",
setMixServs(toHivemallFeatures(exprs)),
Seq("label", "feature", "weight", "conv")
)
}
/**
* @see [[hivemall.recommend.SlimUDTF]]
* @group recommend
*/
@scala.annotation.varargs
def train_slim(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.recommend.SlimUDTF",
"train_slim",
setMixServs(toHivemallFeatures(exprs)),
Seq("j", "nn", "w")
)
}
/**
* @see [[hivemall.topicmodel.LDAUDTF]]
* @group topicmodel
*/
@scala.annotation.varargs
def train_lda(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.topicmodel.LDAUDTF",
"train_lda",
setMixServs(toHivemallFeatures(exprs)),
Seq("topic", "word", "score")
)
}
/**
* @see [[hivemall.topicmodel.PLSAUDTF]]
* @group topicmodel
*/
@scala.annotation.varargs
def train_plsa(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.topicmodel.PLSAUDTF",
"train_plsa",
setMixServs(toHivemallFeatures(exprs)),
Seq("topic", "word", "score")
)
}
/**
* @see [[hivemall.smile.regression.RandomForestRegressionUDTF]]
* @group smile
*/
@scala.annotation.varargs
def train_randomforest_regressor(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.smile.regression.RandomForestRegressionUDTF",
"train_randomforest_regressor",
setMixServs(toHivemallFeatures(exprs)),
Seq("model_id", "model_type", "pred_model", "var_importance", "oob_errors", "oob_tests")
)
}
/**
* @see [[hivemall.smile.classification.RandomForestClassifierUDTF]]
* @group smile
*/
@scala.annotation.varargs
def train_randomforest_classifier(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.smile.classification.RandomForestClassifierUDTF",
"train_randomforest_classifier",
setMixServs(toHivemallFeatures(exprs)),
Seq("model_id", "model_type", "pred_model", "var_importance", "oob_errors", "oob_tests")
)
}
/**
* :: Experimental ::
* @see [[hivemall.xgboost.regression.XGBoostRegressionUDTF]]
* @group xgboost
*/
@Experimental
@scala.annotation.varargs
def train_xgboost_regr(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.xgboost.regression.XGBoostRegressionUDTF",
"train_xgboost_regr",
setMixServs(toHivemallFeatures(exprs)),
Seq("model_id", "pred_model")
)
}
/**
* :: Experimental ::
* @see [[hivemall.xgboost.classification.XGBoostBinaryClassifierUDTF]]
* @group xgboost
*/
@Experimental
@scala.annotation.varargs
def train_xgboost_classifier(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.xgboost.classification.XGBoostBinaryClassifierUDTF",
"train_xgboost_classifier",
setMixServs(toHivemallFeatures(exprs)),
Seq("model_id", "pred_model")
)
}
/**
* :: Experimental ::
* @see [[hivemall.xgboost.classification.XGBoostMulticlassClassifierUDTF]]
* @group xgboost
*/
@Experimental
@scala.annotation.varargs
def train_xgboost_multiclass_classifier(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.xgboost.classification.XGBoostMulticlassClassifierUDTF",
"train_xgboost_multiclass_classifier",
setMixServs(toHivemallFeatures(exprs)),
Seq("model_id", "pred_model")
)
}
/**
* :: Experimental ::
* @see [[hivemall.xgboost.tools.XGBoostPredictUDTF]]
* @group xgboost
*/
@Experimental
@scala.annotation.varargs
def xgboost_predict(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.xgboost.tools.XGBoostPredictUDTF",
"xgboost_predict",
setMixServs(toHivemallFeatures(exprs)),
Seq("rowid", "predicted")
)
}
/**
* :: Experimental ::
* @see [[hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF]]
* @group xgboost
*/
@Experimental
@scala.annotation.varargs
def xgboost_multiclass_predict(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF",
"xgboost_multiclass_predict",
setMixServs(toHivemallFeatures(exprs)),
Seq("rowid", "label", "probability")
)
}
/**
* @see [[hivemall.knn.similarity.DIMSUMMapperUDTF]]
* @group knn.similarity
*/
@scala.annotation.varargs
def dimsum_mapper(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.knn.similarity.DIMSUMMapperUDTF",
"dimsum_mapper",
exprs,
Seq("j", "k", "b_jk")
)
}
/**
* @see [[hivemall.knn.lsh.MinHashUDTF]]
* @group knn.lsh
*/
@scala.annotation.varargs
def minhash(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.knn.lsh.MinHashUDTF",
"minhash",
exprs,
Seq("clusterid", "item")
)
}
/**
* @see [[hivemall.ftvec.amplify.AmplifierUDTF]]
* @group ftvec.amplify
*/
@scala.annotation.varargs
def amplify(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.amplify.AmplifierUDTF",
"amplify",
exprs,
Seq("clusterid", "item")
)
}
/**
* @see [[hivemall.ftvec.amplify.RandomAmplifierUDTF]]
* @group ftvec.amplify
*/
@scala.annotation.varargs
def rand_amplify(exprs: Column*): DataFrame = withTypedPlan {
throw new UnsupportedOperationException("`rand_amplify` not supported yet")
}
/**
* Amplifies and shuffles data inside partitions.
* @group ftvec.amplify
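*
* A usage sketch (the multiplier must be an integer literal):
* {{{
*   // Each row appears 3 times within its partition, then rows are shuffled
*   df.part_amplify(lit(3))
* }}}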
*/
def part_amplify(xtimes: Column): DataFrame = {
val xtimesInt = xtimes.expr match {
case Literal(v: Any, IntegerType) => v.asInstanceOf[Int]
case e => throw new AnalysisException("`xtimes` must be an integer literal, but got " + e)
}
val rdd = df.rdd.mapPartitions({ iter =>
val elems = iter.flatMap{ row =>
Seq.fill[Row](xtimesInt)(row)
}
// Need to check how this shuffling affects results
scala.util.Random.shuffle(elems)
}, true)
df.sqlContext.createDataFrame(rdd, df.schema)
}
/**
* Quantifies input columns, i.e., maps non-numerical values into integer indices.
* The first input column is expected to be a boolean output flag, so the result has one
* column (`c0`, `c1`, ...) per remaining input.
* @see [[hivemall.ftvec.conv.QuantifyColumnsUDTF]]
* @group ftvec.conv
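*
* A usage sketch (assuming the leading boolean is the output flag):
* {{{
*   df.quantify(lit(true), $"gender", $"country")  // yields columns c0, c1
* }}}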
*/
@scala.annotation.varargs
def quantify(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.conv.QuantifyColumnsUDTF",
"quantify",
exprs,
(0 until exprs.size - 1).map(i => s"c$i")
)
}
/**
* @see [[hivemall.ftvec.trans.BinarizeLabelUDTF]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def binarize_label(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.trans.BinarizeLabelUDTF",
"binarize_label",
exprs,
(0 until exprs.size - 1).map(i => s"c$i")
)
}
/**
* @see [[hivemall.ftvec.trans.QuantifiedFeaturesUDTF]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def quantified_features(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.trans.QuantifiedFeaturesUDTF",
"quantified_features",
exprs,
Seq("features")
)
}
/**
* @see [[hivemall.ftvec.ranking.BprSamplingUDTF]]
* @group ftvec.ranking
*/
@scala.annotation.varargs
def bpr_sampling(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.ranking.BprSamplingUDTF",
"bpr_sampling",
exprs,
Seq("user", "pos_item", "neg_item")
)
}
/**
* @see [[hivemall.ftvec.ranking.ItemPairsSamplingUDTF]]
* @group ftvec.ranking
*/
@scala.annotation.varargs
def item_pairs_sampling(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.ranking.ItemPairsSamplingUDTF",
"item_pairs_sampling",
exprs,
Seq("pos_item_id", "neg_item_id")
)
}
/**
* @see [[hivemall.ftvec.ranking.PopulateNotInUDTF]]
* @group ftvec.ranking
*/
@scala.annotation.varargs
def populate_not_in(exprs: Column*): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.ftvec.ranking.PopulateNotInUDTF",
"populate_not_in",
exprs,
Seq("item")
)
}
/**
* Splits Seq[String] into pieces.
* @group ftvec
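*
* A usage sketch:
* {{{
*   // One output row per element of the `features` array column (illustrative name)
*   df.explode_array($"features")
* }}}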
*/
def explode_array(features: Column): DataFrame = {
df.explode(features) { case Row(v: Seq[_]) =>
// Type erasure removes the component type in Seq
v.map(s => HivemallFeature(s.asInstanceOf[String]))
}
}
/**
* Splits [[Vector]] into pieces.
* @group ftvec
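*
* A usage sketch:
* {{{
*   // Expands an ML vector column into (feature, weight) rows (illustrative name)
*   df.explode_vector($"features")
* }}}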
*/
def explode_vector(features: Column): DataFrame = {
val elementSchema = StructType(
StructField("feature", StringType) :: StructField("weight", DoubleType) :: Nil)
val explodeFunc: Row => TraversableOnce[InternalRow] = (row: Row) => {
row.get(0) match {
case dv: DenseVector =>
dv.values.zipWithIndex.map {
case (value, index) =>
InternalRow(UTF8String.fromString(s"$index"), value)
}
case sv: SparseVector =>
sv.values.zip(sv.indices).map {
case (value, index) =>
InternalRow(UTF8String.fromString(s"$index"), value)
}
}
}
withTypedPlan {
Generate(
UserDefinedGenerator(elementSchema, explodeFunc, features.expr :: Nil),
join = true, outer = false, None,
generatorOutput = Nil,
df.logicalPlan)
}
}
/**
* @see [[hivemall.tools.GenerateSeriesUDTF]]
* @group tools
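*
* A usage sketch:
* {{{
*   // For each input row, emits the values 1, 2, 3 in a `generate_series` column
*   df.generate_series(lit(1), lit(3))
* }}}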
*/
def generate_series(start: Column, end: Column): DataFrame = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.tools.GenerateSeriesUDTF",
"generate_series",
start :: end :: Nil,
Seq("generate_series")
)
}
/**
* Returns `top-k` records for each `group`.
* @group misc
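*
* A usage sketch (`k` must be a non-zero integer literal):
* {{{
*   // The 2 highest-scored rows per group, with a `rank` column prepended
*   df.each_top_k(lit(2), $"score", $"group")
* }}}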
*/
def each_top_k(k: Column, score: Column, group: Column*): DataFrame = withTypedPlan {
val kInt = k.expr match {
case Literal(v: Any, IntegerType) => v.asInstanceOf[Int]
case e => throw new AnalysisException("`k` must be an integer literal, but got " + e)
}
if (kInt == 0) {
throw new AnalysisException("`k` must not be 0")
}
val clusterDf = df.repartition(group: _*).sortWithinPartitions(group: _*)
.select(score, Column("*"))
val analyzedPlan = clusterDf.queryExecution.analyzed
val inputAttrs = analyzedPlan.output
val scoreExpr = BindReferences.bindReference(analyzedPlan.expressions.head, inputAttrs)
val groupNames = group.map { _.expr match {
case ne: NamedExpression => ne.name
case ua: UnresolvedAttribute => ua.name
}}
val groupExprs = analyzedPlan.expressions.filter {
case ne: NamedExpression => groupNames.contains(ne.name)
}.map { e =>
BindReferences.bindReference(e, inputAttrs)
}
val rankField = StructField("rank", IntegerType)
Generate(
generator = EachTopK(
k = kInt,
scoreExpr = scoreExpr,
groupExprs = groupExprs,
elementSchema = StructType(
rankField +: inputAttrs.map(d => StructField(d.name, d.dataType))
),
children = inputAttrs
),
join = false,
outer = false,
qualifier = None,
generatorOutput = Seq(rankField.name).map(UnresolvedAttribute(_)) ++ inputAttrs,
child = analyzedPlan
)
}
/**
* :: Experimental ::
* Joins input two tables with the given keys and the top-k highest `score` values.
* @group misc
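*
* A usage sketch (illustrative column names):
* {{{
*   // For each join key, keep the k matches with the highest scores
*   leftDf.top_k_join(lit(3), rightDf, leftDf("key") === rightDf("key"), rightDf("score"))
* }}}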
*/
@Experimental
def top_k_join(k: Column, right: DataFrame, joinExprs: Column, score: Column)
: DataFrame = withTypedPlanInCustomStrategy {
val kInt = k.expr match {
case Literal(v: Any, IntegerType) => v.asInstanceOf[Int]
case e => throw new AnalysisException("`k` must be an integer literal, but got " + e)
}
if (kInt == 0) {
throw new AnalysisException("`k` must not be 0")
}
JoinTopK(kInt, df.logicalPlan, right.logicalPlan, Inner, Option(joinExprs.expr))(score.named)
}
private def doFlatten(schema: StructType, separator: Char, prefixParts: Seq[String] = Seq.empty)
: Seq[Column] = {
schema.fields.flatMap { f =>
val colNameParts = prefixParts :+ f.name
f.dataType match {
case st: StructType =>
doFlatten(st, separator, colNameParts)
case _ =>
col(colNameParts.mkString(".")).as(colNameParts.mkString(separator.toString)) :: Nil
}
}
}
// Converts string representation of a character to actual character
@throws[IllegalArgumentException]
private def toChar(str: String): Char = {
if (str.length == 1) {
str.charAt(0) match {
case '$' | '_' | '.' => str.charAt(0)
case _ => throw new IllegalArgumentException(
"Must use '$', '_', or '.' for separator, but got " + str)
}
} else {
throw new IllegalArgumentException(
s"Separator cannot be more than one character: $str")
}
}
/**
* Flattens a nested schema into a flat one.
* @group misc
*
* For example:
* {{{
* scala> val df = Seq((0, (1, (3.0, "a")), (5, 0.9))).toDF()
* scala> df.printSchema
* root
* |-- _1: integer (nullable = false)
* |-- _2: struct (nullable = true)
* | |-- _1: integer (nullable = false)
* | |-- _2: struct (nullable = true)
* | | |-- _1: double (nullable = false)
* | | |-- _2: string (nullable = true)
* |-- _3: struct (nullable = true)
* | |-- _1: integer (nullable = false)
* | |-- _2: double (nullable = false)
*
* scala> df.flatten(separator = "$").printSchema
* root
* |-- _1: integer (nullable = false)
* |-- _2$_1: integer (nullable = true)
* |-- _2$_2$_1: double (nullable = true)
* |-- _2$_2$_2: string (nullable = true)
* |-- _3$_1: integer (nullable = true)
* |-- _3$_2: double (nullable = true)
* }}}
*/
def flatten(separator: String = "$"): DataFrame =
df.select(doFlatten(df.schema, toChar(separator)): _*)
/**
* @see [[hivemall.dataset.LogisticRegressionDataGeneratorUDTF]]
* @group misc
*/
@scala.annotation.varargs
def lr_datagen(exprs: Column*): Dataset[Row] = withTypedPlan {
planHiveGenericUDTF(
df,
"hivemall.dataset.LogisticRegressionDataGeneratorUDTFWrapper",
"lr_datagen",
exprs,
Seq("label", "features")
)
}
/**
* Returns all the columns in this [[DataFrame]] as Seq[Column].
*/
private[sql] def cols: Seq[Column] = {
df.schema.fields.map(f => df.col(f.name)).toSeq
}
/**
* :: Experimental ::
* If the '-mix' parameter does not exist in the 3rd argument, set it from the
* environment variable 'HIVEMALL_MIX_SERVERS'.
*
* TODO: This only works if '--deploy-mode' is 'client';
* otherwise, we need to set HIVEMALL_MIX_SERVERS
* on all possible Spark workers.
*/
@Experimental
private def setMixServs(exprs: Seq[Column]): Seq[Column] = {
val mixes = System.getenv("HIVEMALL_MIX_SERVERS")
if (mixes != null && !mixes.isEmpty()) {
val groupId = df.sqlContext.sparkContext.applicationId + "-" + UUID.randomUUID
logInfo(s"set '${mixes}' as default mix servers (session: ${groupId})")
exprs.size match {
case 2 => exprs :+ Column(
Literal.create(s"-mix ${mixes} -mix_session ${groupId}", StringType))
// TODO: Add code for the case where exprs.size == 3.
case _ => exprs
}
} else {
exprs
}
}
/**
* If the input is a [[Vector]], transform it into Hivemall features.
*/
@inline private def toHivemallFeatures(exprs: Seq[Column]): Seq[Column] = {
df.select(exprs: _*).queryExecution.analyzed.schema.zip(exprs).map {
case (StructField(_, _: VectorUDT, _, _), c) => HivemallUtils.to_hivemall_features(c)
case (_, c) => c
}
}
/**
* A convenient function to wrap a logical plan and produce a DataFrame.
*/
@inline private def withTypedPlan(logicalPlan: => LogicalPlan): DataFrame = {
val queryExecution = _sparkSession.sessionState.executePlan(logicalPlan)
val outputSchema = queryExecution.sparkPlan.schema
new Dataset[Row](df.sparkSession, queryExecution, RowEncoder(outputSchema))
}
@inline private def withTypedPlanInCustomStrategy(logicalPlan: => LogicalPlan)
: DataFrame = {
// Inject custom strategies
if (!_sparkSession.experimental.extraStrategies.contains(_strategy)) {
_sparkSession.experimental.extraStrategies = Seq(_strategy)
}
withTypedPlan(logicalPlan)
}
}
object HivemallOps {
import internal.HivemallOpsImpl._
/**
* Implicitly injects [[HivemallOps]] into [[DataFrame]].
*/
implicit def dataFrameToHivemallOps(df: DataFrame): HivemallOps =
new HivemallOps(df)
/**
* @see [[hivemall.HivemallVersionUDF]]
* @group misc
*/
def hivemall_version(): Column = withExpr {
planHiveUDF(
"hivemall.HivemallVersionUDF",
"hivemall_version",
Nil
)
}
/**
* @see [[hivemall.geospatial.TileUDF]]
* @group geospatial
*/
def tile(lat: Column, lon: Column, zoom: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.TileUDF",
"tile",
lat :: lon :: zoom :: Nil
)
}
/**
* @see [[hivemall.geospatial.MapURLUDF]]
* @group geospatial
*/
@scala.annotation.varargs
def map_url(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.MapURLUDF",
"map_url",
exprs
)
}
/**
* @see [[hivemall.geospatial.Lat2TileYUDF]]
* @group geospatial
*/
def lat2tiley(lat: Column, zoom: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.Lat2TileYUDF",
"lat2tiley",
lat :: zoom :: Nil
)
}
/**
* @see [[hivemall.geospatial.Lon2TileXUDF]]
* @group geospatial
*/
def lon2tilex(lon: Column, zoom: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.Lon2TileXUDF",
"lon2tilex",
lon :: zoom :: Nil
)
}
/**
* @see [[hivemall.geospatial.TileX2LonUDF]]
* @group geospatial
*/
def tilex2lon(x: Column, zoom: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.TileX2LonUDF",
"tilex2lon",
x :: zoom :: Nil
)
}
/**
* @see [[hivemall.geospatial.TileY2LatUDF]]
* @group geospatial
*/
def tiley2lat(y: Column, zoom: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.TileY2LatUDF",
"tiley2lat",
y :: zoom :: Nil
)
}
/**
* @see [[hivemall.geospatial.HaversineDistanceUDF]]
* @group geospatial
*/
@scala.annotation.varargs
def haversine_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.geospatial.HaversineDistanceUDF",
"haversine_distance",
exprs
)
}
/**
* @see [[hivemall.smile.tools.TreePredictUDF]]
* @group smile
*/
@scala.annotation.varargs
def tree_predict(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.smile.tools.TreePredictUDF",
"tree_predict",
exprs
)
}
/**
* @see [[hivemall.smile.tools.TreeExportUDF]]
* @group smile
*/
@scala.annotation.varargs
def tree_export(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.smile.tools.TreeExportUDF",
"tree_export",
exprs
)
}
/**
* @see [[hivemall.anomaly.ChangeFinderUDF]]
* @group anomaly
*/
@scala.annotation.varargs
def changefinder(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.anomaly.ChangeFinderUDF",
"changefinder",
exprs
)
}
/**
* @see [[hivemall.anomaly.SingularSpectrumTransformUDF]]
* @group anomaly
*/
@scala.annotation.varargs
def sst(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.anomaly.SingularSpectrumTransformUDF",
"sst",
exprs
)
}
/**
* @see [[hivemall.knn.similarity.CosineSimilarityUDF]]
* @group knn.similarity
*/
@scala.annotation.varargs
def cosine_similarity(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.similarity.CosineSimilarityUDF",
"cosine_similarity",
exprs
)
}
/**
* @see [[hivemall.knn.similarity.JaccardIndexUDF]]
* @group knn.similarity
*/
@scala.annotation.varargs
def jaccard_similarity(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.similarity.JaccardIndexUDF",
"jaccard_similarity",
exprs
)
}
/**
* @see [[hivemall.knn.similarity.AngularSimilarityUDF]]
* @group knn.similarity
*/
@scala.annotation.varargs
def angular_similarity(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.similarity.AngularSimilarityUDF",
"angular_similarity",
exprs
)
}
/**
* @see [[hivemall.knn.similarity.EuclidSimilarity]]
* @group knn.similarity
*/
@scala.annotation.varargs
def euclid_similarity(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.similarity.EuclidSimilarity",
"euclid_similarity",
exprs
)
}
/**
* @see [[hivemall.knn.similarity.Distance2SimilarityUDF]]
* @group knn.similarity
*/
@scala.annotation.varargs
def distance2similarity(exprs: Column*): Column = withExpr {
// TODO: Need a wrapper class because of using unsupported types
planHiveGenericUDF(
"hivemall.knn.similarity.Distance2SimilarityUDF",
"distance2similarity",
exprs
)
}
/**
* @see [[hivemall.knn.distance.HammingDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def hamming_distance(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.distance.HammingDistanceUDF",
"hamming_distance",
exprs
)
}
/**
* @see [[hivemall.knn.distance.PopcountUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def popcnt(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.distance.PopcountUDF",
"popcnt",
exprs
)
}
/**
* @see [[hivemall.knn.distance.KLDivergenceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def kld(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.distance.KLDivergenceUDF",
"kld",
exprs
)
}
/**
* @see [[hivemall.knn.distance.EuclidDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def euclid_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.distance.EuclidDistanceUDF",
"euclid_distance",
exprs
)
}
/**
* @see [[hivemall.knn.distance.CosineDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def cosine_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.distance.CosineDistanceUDF",
"cosine_distance",
exprs
)
}
/**
* @see [[hivemall.knn.distance.AngularDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def angular_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.distance.AngularDistanceUDF",
"angular_distance",
exprs
)
}
/**
* @see [[hivemall.knn.distance.JaccardDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def jaccard_distance(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.distance.JaccardDistanceUDF",
"jaccard_distance",
exprs
)
}
/**
* @see [[hivemall.knn.distance.ManhattanDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def manhattan_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.distance.ManhattanDistanceUDF",
"manhattan_distance",
exprs
)
}
/**
* @see [[hivemall.knn.distance.MinkowskiDistanceUDF]]
* @group knn.distance
*/
@scala.annotation.varargs
def minkowski_distance(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.distance.MinkowskiDistanceUDF",
"minkowski_distance",
exprs
)
}
/**
* @see [[hivemall.knn.lsh.bBitMinHashUDF]]
* @group knn.lsh
*/
@scala.annotation.varargs
def bbit_minhash(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.knn.lsh.bBitMinHashUDF",
"bbit_minhash",
exprs
)
}
/**
* @see [[hivemall.knn.lsh.MinHashesUDFWrapper]]
* @group knn.lsh
*/
@scala.annotation.varargs
def minhashes(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.knn.lsh.MinHashesUDFWrapper",
"minhashes",
exprs
)
}
/**
* Returns new features with `1.0` (bias) appended to the input features.
* @see [[hivemall.ftvec.AddBiasUDFWrapper]]
* @group ftvec
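*
* A usage sketch:
* {{{
*   // e.g., ["1:0.5", "3:0.3"] -> ["1:0.5", "3:0.3", "0:1.0"] (bias index assumed to be 0)
*   df.select(add_bias($"features"))
* }}}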
*/
def add_bias(expr: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.AddBiasUDFWrapper",
"add_bias",
expr :: Nil
)
}
/**
* @see [[hivemall.ftvec.ExtractFeatureUDFWrapper]]
* @group ftvec
*
* TODO: This throws java.lang.ClassCastException because
* HiveInspectors.toInspector has a bug in Spark.
* Need to fix it later.
*/
def extract_feature(expr: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.ExtractFeatureUDFWrapper",
"extract_feature",
expr :: Nil
)
}.as("feature")
/**
* @see [[hivemall.ftvec.ExtractWeightUDFWrapper]]
* @group ftvec
*
* TODO: This throws java.lang.ClassCastException because
* HiveInspectors.toInspector has a bug in Spark.
* Need to fix it later.
*/
def extract_weight(expr: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.ExtractWeightUDFWrapper",
"extract_weight",
expr :: Nil
)
}.as("value")
/**
* @see [[hivemall.ftvec.AddFeatureIndexUDFWrapper]]
* @group ftvec
*/
def add_feature_index(features: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.AddFeatureIndexUDFWrapper",
"add_feature_index",
features :: Nil
)
}
/**
* @see [[hivemall.ftvec.SortByFeatureUDFWrapper]]
* @group ftvec
*/
def sort_by_feature(expr: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.SortByFeatureUDFWrapper",
"sort_by_feature",
expr :: Nil
)
}
/**
* @see [[hivemall.ftvec.hashing.MurmurHash3UDF]]
* @group ftvec.hashing
*/
def mhash(expr: Column): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.hashing.MurmurHash3UDF",
"mhash",
expr :: Nil
)
}
/**
* @see [[hivemall.ftvec.hashing.Sha1UDF]]
* @group ftvec.hashing
*/
@scala.annotation.varargs
def sha1(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.hashing.Sha1UDF",
"sha1",
exprs
)
}
/**
* @see [[hivemall.ftvec.hashing.ArrayHashValuesUDF]]
* @group ftvec.hashing
*/
@scala.annotation.varargs
def array_hash_values(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.hashing.ArrayHashValuesUDF",
"array_hash_values",
exprs
)
}
/**
* @see [[hivemall.ftvec.hashing.ArrayPrefixedHashValuesUDF]]
* @group ftvec.hashing
*/
@scala.annotation.varargs
def prefixed_hash_values(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.hashing.ArrayPrefixedHashValuesUDF",
"prefixed_hash_values",
exprs
)
}
/**
* @see [[hivemall.ftvec.hashing.FeatureHashingUDF]]
* @group ftvec.hashing
*/
@scala.annotation.varargs
def feature_hashing(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.hashing.FeatureHashingUDF",
"feature_hashing",
exprs
)
}
/**
* @see [[hivemall.ftvec.pairing.PolynomialFeaturesUDF]]
* @group ftvec.pairing
*/
@scala.annotation.varargs
def polynomial_features(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.pairing.PolynomialFeaturesUDF",
"polynomial_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.pairing.PoweredFeaturesUDF]]
* @group ftvec.pairing
*/
@scala.annotation.varargs
def powered_features(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.pairing.PoweredFeaturesUDF",
"powered_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.scaling.RescaleUDF]]
* @group ftvec.scaling
*/
def rescale(value: Column, max: Column, min: Column): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.scaling.RescaleUDF",
"rescale",
value.cast(FloatType) :: max :: min :: Nil
)
}
/**
* @see [[hivemall.ftvec.scaling.ZScoreUDF]]
* @group ftvec.scaling
*/
@scala.annotation.varargs
def zscore(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.scaling.ZScoreUDF",
"zscore",
exprs
)
}
/**
* @see [[hivemall.ftvec.scaling.L2NormalizationUDFWrapper]]
* @group ftvec.scaling
*/
def l2_normalize(expr: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.scaling.L2NormalizationUDFWrapper",
"normalize",
expr :: Nil
)
}
/**
* @see [[hivemall.ftvec.selection.ChiSquareUDF]]
* @group ftvec.selection
*/
def chi2(observed: Column, expected: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.selection.ChiSquareUDF",
"chi2",
Seq(observed, expected)
)
}
/**
* @see [[hivemall.ftvec.conv.ToDenseFeaturesUDF]]
* @group ftvec.conv
*/
@scala.annotation.varargs
def to_dense_features(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.conv.ToDenseFeaturesUDF",
"to_dense_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.conv.ToSparseFeaturesUDF]]
* @group ftvec.conv
*/
@scala.annotation.varargs
def to_sparse_features(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.ftvec.conv.ToSparseFeaturesUDF",
"to_sparse_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.binning.FeatureBinningUDF]]
* @group ftvec.conv
*/
@scala.annotation.varargs
def feature_binning(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.binning.FeatureBinningUDF",
"feature_binning",
exprs
)
}
/**
* @see [[hivemall.ftvec.trans.VectorizeFeaturesUDF]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def vectorize_features(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.VectorizeFeaturesUDF",
"vectorize_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.trans.CategoricalFeaturesUDF]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def categorical_features(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.CategoricalFeaturesUDF",
"categorical_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.trans.FFMFeaturesUDF]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def ffm_features(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.FFMFeaturesUDF",
"ffm_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.trans.IndexedFeatures]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def indexed_features(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.IndexedFeatures",
"indexed_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.trans.QuantitativeFeaturesUDF]]
* @group ftvec.trans
*/
@scala.annotation.varargs
def quantitative_features(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.QuantitativeFeaturesUDF",
"quantitative_features",
exprs
)
}
/**
* @see [[hivemall.ftvec.trans.AddFieldIndicesUDF]]
* @group ftvec.trans
*/
def add_field_indices(features: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.ftvec.trans.AddFieldIndicesUDF",
"add_field_indices",
features :: Nil
)
}
/**
* @see [[hivemall.tools.ConvertLabelUDF]]
* @group tools
*/
def convert_label(label: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.ConvertLabelUDF",
"convert_label",
label :: Nil
)
}
/**
* @see [[hivemall.tools.RankSequenceUDF]]
* @group tools
*/
def x_rank(key: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.RankSequenceUDF",
"x_rank",
key :: Nil
)
}
/**
* @see [[hivemall.tools.array.AllocFloatArrayUDF]]
* @group tools.array
*/
def float_array(nDims: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.array.AllocFloatArrayUDF",
"float_array",
nDims :: Nil
)
}
/**
* @see [[hivemall.tools.array.ArrayRemoveUDF]]
* @group tools.array
*/
def array_remove(original: Column, target: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.array.ArrayRemoveUDF",
"array_remove",
original :: target :: Nil
)
}
/**
* @see [[hivemall.tools.array.SortAndUniqArrayUDF]]
* @group tools.array
*/
def sort_and_uniq_array(ar: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.array.SortAndUniqArrayUDF",
"sort_and_uniq_array",
ar :: Nil
)
}
/**
* @see [[hivemall.tools.array.SubarrayEndWithUDF]]
* @group tools.array
*/
def subarray_endwith(original: Column, key: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.array.SubarrayEndWithUDF",
"subarray_endwith",
original :: key :: Nil
)
}
/**
* @see [[hivemall.tools.array.ArrayConcatUDF]]
* @group tools.array
*/
@scala.annotation.varargs
def array_concat(arrays: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.array.ArrayConcatUDF",
"array_concat",
arrays
)
}
/**
* Alias of array_slice for backward compatibility.
*
* @see [[hivemall.tools.array.ArraySliceUDF]]
* @group tools.array
*/
def subarray(original: Column, fromIndex: Column, toIndex: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.array.ArraySliceUDF",
"subarray",
original :: fromIndex :: toIndex :: Nil
)
}
/**
* @see [[hivemall.tools.array.ArraySliceUDF]]
* @group tools.array
*/
def array_slice(original: Column, fromIndex: Column, toIndex: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.array.ArraySliceUDF",
"array_slice",
original :: fromIndex :: toIndex :: Nil
)
}
/**
* @see [[hivemall.tools.array.ToStringArrayUDF]]
* @group tools.array
*/
def to_string_array(ar: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.array.ToStringArrayUDF",
"to_string_array",
ar :: Nil
)
}
/**
* @see [[hivemall.tools.array.ArrayIntersectUDF]]
* @group tools.array
*/
@scala.annotation.varargs
def array_intersect(arrays: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.array.ArrayIntersectUDF",
"array_intersect",
arrays
)
}
/**
* @see [[hivemall.tools.array.SelectKBestUDF]]
* @group tools.array
*/
def select_k_best(X: Column, importanceList: Column, k: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.array.SelectKBestUDF",
"select_k_best",
Seq(X, importanceList, k)
)
}
/**
* @see [[hivemall.tools.bits.ToBitsUDF]]
* @group tools.bits
*/
def to_bits(indexes: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.bits.ToBitsUDF",
"to_bits",
indexes :: Nil
)
}
/**
* @see [[hivemall.tools.bits.UnBitsUDF]]
* @group tools.bits
*/
def unbits(bitset: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.bits.UnBitsUDF",
"unbits",
bitset :: Nil
)
}
/**
* @see [[hivemall.tools.bits.BitsORUDF]]
* @group tools.bits
*/
@scala.annotation.varargs
def bits_or(bits: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.bits.BitsORUDF",
"bits_or",
bits
)
}
/**
* @see [[hivemall.tools.compress.InflateUDF]]
* @group tools.compress
*/
@scala.annotation.varargs
def inflate(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.compress.InflateUDF",
"inflate",
exprs
)
}
/**
* @see [[hivemall.tools.compress.DeflateUDF]]
* @group tools.compress
*/
@scala.annotation.varargs
def deflate(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.compress.DeflateUDF",
"deflate",
exprs
)
}
/**
* @see [[hivemall.tools.map.MapGetSumUDF]]
* @group tools.map
*/
@scala.annotation.varargs
def map_get_sum(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.tools.map.MapGetSumUDF",
"map_get_sum",
exprs
)
}
/**
* @see [[hivemall.tools.map.MapTailNUDF]]
* @group tools.map
*/
@scala.annotation.varargs
def map_tail_n(exprs: Column*): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.map.MapTailNUDF",
"map_tail_n",
exprs
)
}
/**
* @see [[hivemall.tools.text.TokenizeUDF]]
* @group tools.text
*/
@scala.annotation.varargs
def tokenize(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.tools.text.TokenizeUDF",
"tokenize",
exprs
)
}
/**
* @see [[hivemall.tools.text.StopwordUDF]]
* @group tools.text
*/
def is_stopword(word: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.text.StopwordUDF",
"is_stopword",
word :: Nil
)
}
/**
* @see [[hivemall.tools.text.SingularizeUDF]]
* @group tools.text
*/
def singularize(word: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.text.SingularizeUDF",
"singularize",
word :: Nil
)
}
/**
* @see [[hivemall.tools.text.SplitWordsUDF]]
* @group tools.text
*/
@scala.annotation.varargs
def split_words(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.tools.text.SplitWordsUDF",
"split_words",
exprs
)
}
/**
* @see [[hivemall.tools.text.NormalizeUnicodeUDF]]
* @group tools.text
*/
@scala.annotation.varargs
def normalize_unicode(exprs: Column*): Column = withExpr {
planHiveUDF(
"hivemall.tools.text.NormalizeUnicodeUDF",
"normalize_unicode",
exprs
)
}
/**
* @see [[hivemall.tools.text.Base91UDF]]
* @group tools.text
*/
def base91(bin: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.text.Base91UDF",
"base91",
bin :: Nil
)
}
/**
* @see [[hivemall.tools.text.Unbase91UDF]]
* @group tools.text
*/
def unbase91(base91String: Column): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.text.Unbase91UDF",
"unbase91",
base91String :: Nil
)
}
/**
* @see [[hivemall.tools.text.WordNgramsUDF]]
* @group tools.text
*/
def word_ngrams(words: Column, minSize: Column, maxSize: Column): Column = withExpr {
planHiveUDF(
"hivemall.tools.text.WordNgramsUDF",
"word_ngrams",
words :: minSize :: maxSize :: Nil
)
}
/**
* @see [[hivemall.tools.math.SigmoidGenericUDF]]
* @group misc
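*
* Note: this is computed with native Spark expressions as `1.0 / (1.0 + exp(-x))`
* rather than by calling the Hive UDF. A usage sketch:
* {{{
*   df.select(sigmoid($"score"))
* }}}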
*/
def sigmoid(expr: Column): Column = {
val one: () => Literal = () => Literal.create(1.0, DoubleType)
Column(one()) / (Column(one()) + exp(-expr))
}
/**
* @see [[hivemall.tools.mapred.RowIdUDFWrapper]]
* @group misc
*/
def rowid(): Column = withExpr {
planHiveGenericUDF(
"hivemall.tools.mapred.RowIdUDFWrapper",
"rowid",
Nil
)
}.as("rowid")
/**
* Parses a column containing a CSV string into a [[StructType]] with the specified schema.
* Returns `null` in the case of an unparseable string.
* @group misc
*
* @param e a string column containing CSV data.
* @param schema the schema to use when parsing the csv string
* @param options options to control how the CSV is parsed. Accepts the same options as
* the CSV data source.
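*
* A usage sketch:
* {{{
*   // Parse "1,abc" into struct<a: int, b: string>
*   val schema = new StructType().add("a", IntegerType).add("b", StringType)
*   df.select(from_csv($"csv", schema))
* }}}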
*/
def from_csv(e: Column, schema: StructType, options: Map[String, String]): Column = withExpr {
CsvToStruct(schema, options, e.expr)
}
/**
* Parses a column containing a CSV string into a [[StructType]] with the specified schema.
* Returns `null` in the case of an unparseable string.
* @group misc
*
* @param e a string column containing CSV data.
* @param schema the schema to use when parsing the CSV string
*/
def from_csv(e: Column, schema: StructType): Column =
from_csv(e, schema, Map.empty[String, String])
/**
* Converts a column containing a [[StructType]] into a CSV string with the specified schema.
* Throws an exception in the case of an unsupported type.
* @group misc
*
* @param e a struct column.
* @param options options to control how the struct column is converted into a CSV string.
* Accepts the same options as the CSV data source.
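*
* A usage sketch:
* {{{
*   df.select(to_csv(struct($"a", $"b")))
* }}}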
*/
def to_csv(e: Column, options: Map[String, String]): Column = withExpr {
StructToCsv(options, e.expr)
}
/**
* Converts a column containing a [[StructType]] into a CSV string with the specified schema.
* Throws an exception in the case of an unsupported type.
* @group misc
*
* @param e a struct column.
*/
def to_csv(e: Column): Column = to_csv(e, Map.empty[String, String])
/**
* A convenient function to wrap an expression and produce a Column.
*/
@inline private def withExpr(expr: Expression): Column = Column(expr)
}