/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package streaming.dsl.mmlib.algs.feature
import _root_.streaming.dsl.mmlib.algs.MetaConst._
import _root_.streaming.dsl.mmlib.algs._
import _root_.streaming.log.WowLog
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.help.HSQLStringIndex
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, functions => F, _}
import scala.collection.mutable.ArrayBuffer
import scala.util.control.Breaks.{break, breakable}
/**
 * Created by allwefantasy on 14/5/2018.
 */
object StringFeature extends BaseFeatureFunctions with WowLog {
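
  /**
   * Loads a stopword set from the configured dictionary path(s); returns an
   * empty set when nothing is configured. Illustrative call (the path is
   * hypothetical):
   *
   *   val stopwords = StringFeature.loadStopwords(df, "/tmp/dics/stopwords")
   */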
  def loadStopwords(df: DataFrame, stopWordsPaths: String) = {
    val stopwords = if (stopWordsPaths == null || stopWordsPaths.isEmpty) {
      Set[String]()
    } else {
      val dtt = new SQLDicOrTableToArray()
      val stopwordsMapping = dtt.internal_train(df,
        Map(
          "dic.paths" -> stopWordsPaths,
          "dic.names" -> "stopwords"
        )).collect().map(f => (f.getString(0), f.getSeq(1))).toMap
      stopwordsMapping("stopwords").toSet[String]
    }
    stopwords
  }
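
  /**
   * Resolves a priority-word dictionary to term indices via
   * `predictSingleWordFunc` (dropping words that map to -1) and returns the
   * index set together with a function that scales the matching entries of a
   * sparse vector by `priority`.
   */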
  def loadPriorityWords(df: DataFrame, priorityDicPaths: String,
                        priority: Double,
                        predictSingleWordFunc: String => Int) = {
    val priorityWords = (if (priorityDicPaths == null || priorityDicPaths.isEmpty) {
      Set[String]()
    } else {
      val dtt = new SQLDicOrTableToArray()
      val prioritywordsMapping = dtt.internal_train(df,
        Map(
          "dic.paths" -> priorityDicPaths,
          "dic.names" -> "prioritywords"
        )).collect().map(f => (f.getString(0), f.getSeq(1))).toMap
      prioritywordsMapping("prioritywords").toSet[String]
    }).map(f => predictSingleWordFunc(f)).filter(f => f != -1)

    val prioritywordsBr = df.sparkSession.sparkContext.broadcast(priorityWords)

    val priorityFunc = (vec: Vector) => {
      val indices = ArrayBuffer[Int]()
      val values = ArrayBuffer[Double]()
      vec.foreachActive { (index, value) =>
        // boost the weight of terms whose index is in the priority set
        val newValue = if (prioritywordsBr.value.contains(index)) {
          value * priority
        } else value
        indices += index
        values += newValue
      }
      Vectors.sparse(vec.size, indices.toArray, values.toArray)
    }
    (priorityWords, priorityFunc)
  }
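
  /**
   * Tokenizes `inputCol`, removes stopwords, optionally appends n-gram tokens,
   * builds a word -> index mapping (persisted as parquet under `metaPath`),
   * and replaces the column content with the index sequence.
   * Returns (transformed DataFrame, predict-function map, vocabulary size).
   */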
  def analysisWords(df: DataFrame, metaPath: String, dicPaths: String, inputCol: String,
                    stopwordsBr: Broadcast[Set[String]],
                    nGrams: Seq[Int],
                    outputWordAndIndex: Boolean,
                    split: String,
                    wordsArray: Array[String] = Array[String]()
                   ) = {
    var newDF = new SQLTokenAnalysis().internal_train(df,
      Map("dic.paths" -> dicPaths, "inputCol" -> inputCol, "ignoreNature" -> "true",
        "wordsArray" -> wordsArray.mkString(","), "split" -> split))
    val filterStopWordFunc = F.udf((a: Seq[String]) => {
      a.filterNot(stopwordsBr.value.contains(_))
    })
    newDF = replaceColumn(newDF, inputCol, filterStopWordFunc)

    // n-gram support: generate one extra column per requested n
    val ngramfields = nGrams.map { ngram =>
      val newField = inputCol + "_ngrams_" + ngram
      val ngramTF = new NGram().setN(ngram).setInputCol(inputCol).setOutputCol(newField)
      newDF = ngramTF.transform(newDF)
      newField
    }
    val mergeFunc = F.udf((a: Seq[String], b: Seq[String]) => {
      a ++ b
    })
    // fold the n-gram tokens back into the input column
    ngramfields.foreach { newField =>
      newDF = newDF.withColumn(inputCol, mergeFunc(F.col(inputCol), F.col(newField))).drop(newField)
    }

    val inputColIndex = newDF.schema.fieldIndex(inputCol)
    val newRdd = newDF.rdd.flatMap(f =>
      f.getSeq[String](inputColIndex)
    ).distinct().map(f =>
      Row.fromSeq(Seq(f))
    )
    // assign a unique int id to every analyzed token
    val tmpWords = df.sparkSession.createDataFrame(newRdd, StructType(Seq(StructField("words", StringType))))
    val wordCount = tmpWords.count()
    format(s"TFIDF: total words in corpus: ${wordCount}")

    // represent content as a sequence of numbers
    val si = new SQLStringIndex()
    si.train(tmpWords, WORD_INDEX_PATH(metaPath, inputCol), Map("inputCol" -> "words"))
    val siModel = si.load(df.sparkSession, WORD_INDEX_PATH(metaPath, inputCol), Map())

    // persist the word -> index mapping (the in-memory model stays usable below)
    val wordToIndex = HSQLStringIndex.wordToIndex(df.sparkSession, siModel)
    format(s"TFIDF: wordToIndex: ${wordToIndex.size}")
    val spark = df.sparkSession
    spark.createDataFrame(
      spark.sparkContext.parallelize(wordToIndex.toSeq).map(f => Row.fromSeq(Seq(f._1, f._2))),
      StructType(Seq(
        StructField("word", StringType),
        StructField("index", DoubleType)
      ))).write.
      mode(SaveMode.Overwrite).
      parquet(WORD_INDEX_PATH(metaPath, inputCol))

    if (outputWordAndIndex) {
      val res = wordToIndex.toSeq.sortBy(f => f._2).map(f => s"${f._1}:${f._2}").mkString("\n")
      format(res)
    }
    val funcMap = si.internal_predict(df.sparkSession, siModel, "wow")
    val predictFunc = funcMap("wow_array").asInstanceOf[(Seq[String]) => Array[Int]]
    val udfPredictFunc = F.udf(predictFunc)
    newDF = replaceColumn(newDF, inputCol, udfPredictFunc)
    (newDF, funcMap, wordCount)
  }
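
  /**
   * Full TF/IDF pipeline: tokenize, drop stopwords, index words, compute
   * TF/IDF vectors, then boost priority words. Illustrative call (paths and
   * column name are hypothetical):
   *
   *   val vecDF = StringFeature.tfidf(df, metaPath = "/tmp/meta", dicPaths = "",
   *     inputCol = "content", stopWordsPaths = "", priorityDicPaths = "",
   *     priority = 1.0, nGrams = Seq(2), split = null)
   */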
  def tfidf(df: DataFrame,
            metaPath: String,
            dicPaths: String,
            inputCol: String,
            stopWordsPaths: String,
            priorityDicPaths: String,
            priority: Double,
            nGrams: Seq[Int],
            split: String,
            outputWordAndIndex: Boolean = false
           ) = {
    // check whether a stopwords dictionary is configured
    val stopwords = loadStopwords(df, stopWordsPaths)
    val stopwordsBr = df.sparkSession.sparkContext.broadcast(stopwords)

    // analysis
    var (newDF, funcMap, wordCount) = analysisWords(df, metaPath, dicPaths, inputCol, stopwordsBr, nGrams, outputWordAndIndex, split)
    val spark = df.sparkSession

    // tf/idf feature
    val tfidf = new SQLTfIdf()
    tfidf.train(newDF, TF_IDF_PATH(metaPath, inputCol), Map("inputCol" -> inputCol, "numFeatures" -> wordCount.toString, "binary" -> "true"))
    val tfidfModel = tfidf.load(df.sparkSession, TF_IDF_PATH(metaPath, inputCol), Map())
    val tfidfFunc = tfidf.internal_predict(df.sparkSession, tfidfModel, "wow")("wow")
    val tfidfUDFFunc = F.udf(tfidfFunc)
    newDF = replaceColumn(newDF, inputCol, tfidfUDFFunc)

    // enhance: boost the TF/IDF weight of priority words
    val predictSingleWordFunc = funcMap("wow").asInstanceOf[(String) => Int]
    val (priorityWords, priorityFunc) = loadPriorityWords(newDF, priorityDicPaths, priority, predictSingleWordFunc)
    newDF = replaceColumn(newDF, inputCol, F.udf(priorityFunc))
    newDF
  }
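
  /**
   * Word2vec featurization. If pre-trained vectors are supplied via
   * `wordvecPaths` they are used directly (each document padded/truncated to
   * `length` vectors of size `vectorSize`); otherwise a SQLWord2Vec model is
   * trained on the indexed tokens. When resultFeature == "index" the index
   * sequence is returned instead of vectors.
   */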
  def word2vec(df: DataFrame, metaPath: String, dicPaths: String, wordvecPaths: String, inputCol: String,
               stopWordsPaths: String, resultFeature: String, split: String,
               vectorSize: Int = 100, length: Int = 100, minCount: Int = 1) = {
    val stopwords = loadStopwords(df, stopWordsPaths)
    val stopwordsBr = df.sparkSession.sparkContext.broadcast(stopwords)
    val spark = df.sparkSession
    val wordsArray = loadDicsFromWordvec(spark, wordvecPaths)
    val wordvecsMap = loadWordvecs(spark, wordvecPaths)
    if (wordvecsMap.size > 0) {
      // pre-trained vectors supplied: tokenize, drop stopwords, then look vectors up directly
      var newDF = new SQLTokenAnalysis().internal_train(df,
        Map("dic.paths" -> dicPaths, "inputCol" -> inputCol, "ignoreNature" -> "true",
          "wordsArray" -> wordsArray.mkString(",")))
      val filterStopWordFunc = F.udf((a: Seq[String]) => {
        a.filterNot(stopwordsBr.value.contains(_))
      })
      newDF = replaceColumn(newDF, inputCol, filterStopWordFunc)
      // pad/truncate every document to `length` vectors; unknown words become zero vectors
      val toVecFunc = F.udf((wordArray: Seq[String]) => {
        val r = new Array[Seq[Double]](length)
        val wSize = wordArray.size
        for (i <- 0 until length) {
          if (i < wSize && wordvecsMap.contains(wordArray(i))) {
            r(i) = wordvecsMap(wordArray(i)).toSeq
          } else
            r(i) = new Array[Double](vectorSize).toSeq
        }
        r.toSeq
      })
      replaceColumn(newDF, inputCol, toVecFunc)
    } else {
      var (newDF, funcMap, wordCount) = analysisWords(df, metaPath, dicPaths, inputCol, stopwordsBr, Seq(), false, split, wordsArray)
      // word2vec only accepts string sequences, so convert the int indices to strings
      val word2vec = new SQLWord2Vec()
      word2vec.train(replaceColumn(newDF, inputCol, F.udf((a: Seq[Int]) => {
        a.map(f => f.toString)
      })), WORD2VEC_PATH(metaPath, inputCol), Map("inputCol" -> inputCol, "vectorSize" -> (vectorSize + ""), "minCount" -> minCount.toString))
      val model = word2vec.load(df.sparkSession, WORD2VEC_PATH(metaPath, inputCol), Map())
      if (!resultFeature.equals("index")) {
        val predictFunc = word2vec.internal_predict(df.sparkSession, model, "wow")("wow_array").asInstanceOf[(Seq[String]) => Seq[Seq[Double]]]
        val udfPredictFunc = F.udf(predictFunc)
        newDF = replaceColumn(newDF, inputCol, udfPredictFunc)
      }
      newDF
    }
  }
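
  /**
   * Reads the word column (first space-separated field of each line) from the
   * given word2vec files so those words can be fed to the tokenizer as
   * dictionary entries.
   */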
  def loadDicsFromWordvec(spark: SparkSession, wordvecPaths: String) = {
    wordvecPaths.split(",").filter(f => !f.isEmpty).flatMap { f =>
      spark.sparkContext.textFile(f).map(line =>
        line.split(" ")(0)
      ).collect()
    }
  }
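
  /**
   * Loads pre-trained vectors into a Map[String, Array[Double]].
   * Each line is expected to look like "word 0.1,0.2,...".
   */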
  def loadWordvecs(spark: SparkSession, wordvecPaths: String) = {
    // expected line format: "word v1,v2,...,vN"; on duplicate words the last entry wins
    val wordVec = wordvecPaths.split(",").filter(f => !f.isEmpty).flatMap { f =>
      spark.sparkContext.textFile(f).map(line =>
        (line.split(" ")(0), line.split(" ")(1).split(",").map(a => a.toDouble))
      ).collect()
    }
    wordVec.toMap
  }
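
  /**
   * Tokenizes `inputCol` with a dictionary built from the word2vec files and
   * maps every document to a fixed-length (`length` x `vectorSize`) sequence
   * of vectors, zero-padding unknown words and short documents.
   */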
  def wordvec(df: DataFrame, dicPaths: String, inputCol: String, stopWordsPaths: String, vectorSize: Int, length: Int) = {
    val spark = df.sparkSession
    val stopwords = loadStopwords(df, stopWordsPaths)
    val stopwordsBr = df.sparkSession.sparkContext.broadcast(stopwords)

    // load pre-trained vectors; the words double as the tokenizer dictionary
    var wordVecMap = Map[String, Array[Double]]()
    val wordVec = dicPaths.split(",").filter(f => !f.isEmpty).flatMap { f =>
      spark.sparkContext.textFile(f).map(line =>
        (line.split(" ")(0), line.split(" ")(1).split(",").map(a => a.toDouble))
      ).collect()
    }
    val dic = wordVec.map(_._1)
    for (a <- wordVec) {
      wordVecMap += a
    }

    // tokenize inputCol, replacing the original column value with the token sequence
    val rdd = df.rdd.mapPartitions { mp =>
      val parser = SQLTokenAnalysis.createAnalyzer(dic, Map())
      mp.map { f =>
        val content = f.getAs[String](inputCol)
        val res = SQLTokenAnalysis.parseStr(parser, content, Map())
        val index = f.fieldIndex(inputCol)
        val newValue = f.toSeq.zipWithIndex.filterNot(f => f._2 == index).map(f => f._1) ++ Seq(res)
        Row.fromSeq(newValue)
      }
    }
    var newDF = spark.createDataFrame(rdd,
      StructType(df.schema.filterNot(f => f.name == inputCol) ++ Seq(StructField(inputCol, ArrayType(StringType)))))

    val filterStopWordFunc = F.udf((a: Seq[String]) => {
      a.filterNot(stopwordsBr.value.contains(_))
    })
    newDF = replaceColumn(newDF, inputCol, filterStopWordFunc)

    // map to a fixed-length sequence of vectors, zero-padding unknown words
    val toVecFunc = F.udf((wordArray: Seq[String]) => {
      val r = new Array[Seq[Double]](length)
      val wSize = wordArray.size
      for (i <- 0 until length) {
        if (i < wSize && wordVecMap.contains(wordArray(i))) {
          r(i) = wordVecMap(wordArray(i)).toSeq
        } else
          r(i) = new Array[Double](vectorSize).toSeq
      }
      r.toSeq
    })
    newDF = replaceColumn(newDF, inputCol, toVecFunc)
    newDF
  }
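
  /**
   * Fits a string indexer on `inputCol` and replaces each value with its
   * integer index; also returns the predict-function map for later
   * single-word lookups.
   */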
  def strToInt(df: DataFrame, mappingPath: String, inputCol: String, outputWordAndIndex: Boolean) = {
    val wordIndexPath = mappingPath.stripSuffix("/") + s"/wordIndex/$inputCol"
    val si = new SQLStringIndex()
    si.train(df, wordIndexPath, Map("inputCol" -> inputCol))
    val siModel = si.load(df.sparkSession, wordIndexPath, Map())
    if (outputWordAndIndex) {
      val wordToIndex = HSQLStringIndex.wordToIndex(df.sparkSession, siModel)
      val res = wordToIndex.toSeq.sortBy(f => f._2).map(f => s"${f._1}:${f._2}").mkString("\n")
      println(res)
    }
    val funcMap = si.internal_predict(df.sparkSession, siModel, "wow")
    val predictSingleWordFunc = funcMap("wow").asInstanceOf[(String) => Int]
    val newDF = replaceColumn(df, inputCol, F.udf(predictSingleWordFunc))
    (newDF, funcMap)
  }
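
  /**
   * Element-wise sum of a sequence of vectors into a single vector of
   * `vectorSize`; returns an empty Seq for empty input.
   */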
  def mergeFunc(seq: Seq[Seq[Double]], vectorSize: Int) = {
    if (seq.size == 0) {
      Seq[Double]()
    } else {
      // element-wise sum; every input vector is assumed to be at most vectorSize long
      val r = new Array[Double](vectorSize)
      for (a1 <- seq) {
        val b = a1.toList
        for (i <- 0 until b.size) {
          r(i) = b(i) + r(i)
        }
      }
      r.toSeq
    }
  }
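
  /**
   * Splits each document into sentences on the characters in `sentenceSplit`
   * and tokenizes every sentence, either by `modelSplit` or with the analyzer,
   * yielding a Seq[Seq[String]] per row.
   */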
  def analysisRaw(df: DataFrame, inputCol: String, sentenceSplit: String, modelSplit: String, dicPaths: String, wordsArray: Array[String] = Array[String]()) = {
    val spark = df.sparkSession
    val words = SQLTokenAnalysis.loadDics(spark, Map("dic.paths" -> dicPaths)) ++ wordsArray
    val analysisRawFunc = F.udf((raw: String) => {
      // normalize every sentence separator to "。", accumulating the replacements
      var raw1 = raw
      val splitArray = sentenceSplit.split("")
      for (split <- splitArray) {
        raw1 = raw1.replace(split, "。")
      }
      raw1.split("。").map(f => {
        if (modelSplit != null) {
          f.split(modelSplit).toSeq
        } else {
          // the analyzer is created per call, likely because it is not serializable
          val parser = SQLTokenAnalysis.createAnalyzer(words, Map())
          SQLTokenAnalysis.parseStr(parser, f, Map("ignoreNature" -> "true")).toSeq
        }
      }).toSeq
    })
    replaceColumn(df, inputCol, analysisRawFunc)
  }
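
  /**
   * Scores new data against an existing word2vec featurization: reloads the
   * params, word index and word2vec model persisted under `modelPath` at
   * training time, re-runs sentence-level analysis and maps every sentence to
   * a single summed vector.
   */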
  def raw2vec(df: DataFrame, inputCol: String, sentenceSplit: String, modelPath: String) = {
    val spark = df.sparkSession
    import spark.implicits._
    // restore the training-time parameters
    val modelMetaPath = getMetaPath(modelPath)
    val modelParams = spark.read.parquet(PARAMS_PATH(modelMetaPath, "params")).map(f => (f.getString(0), f.getString(1))).collect().toMap
    val modelInputCol = modelParams.getOrElse("inputCol", "")
    val dicPaths = modelParams.getOrElse("dicPaths", "")
    val wordvecPaths = modelParams.getOrElse("wordvecPaths", "")
    val modelSplit = modelParams.getOrElse("split", null)
    val vectorSize = modelParams.getOrElse("vectorSize", "100").toInt

    val wordvecsMapBr = spark.sparkContext.broadcast(loadWordvecs(spark, wordvecPaths))
    val wordsArray = StringFeature.loadDicsFromWordvec(spark, wordvecPaths)
    val word2vec = new SQLWord2Vec()
    val model = word2vec.load(spark, WORD2VEC_PATH(modelMetaPath, modelInputCol), Map())
    val predictFunc = word2vec.internal_predict(df.sparkSession, model, "wow")("wow_array").asInstanceOf[(Seq[String]) => Seq[Seq[Double]]]
    val wordIndexBr = spark.sparkContext.broadcast(spark.read.parquet(WORD_INDEX_PATH(modelMetaPath, modelInputCol)).map(f => ((f.getString(0), f.getDouble(1)))).collect().toMap)

    val newDf = analysisRaw(df, inputCol, sentenceSplit, modelSplit, dicPaths, wordsArray)
    val toVecFunc = F.udf((raw: Seq[Seq[String]]) => {
      raw.map(wordSeq => {
        // look vectors up directly when pre-trained vectors exist; otherwise
        // index the words and run them through the word2vec model
        val vecSeq = if (wordvecsMapBr.value.size > 0) {
          wordSeq.filter(f => wordvecsMapBr.value.contains(f)).map(f => wordvecsMapBr.value(f).toSeq)
        }
        else {
          val wordIntArray = wordSeq.filter(f => wordIndexBr.value.contains(f)).map(f => wordIndexBr.value(f).toInt)
          predictFunc(wordIntArray.map(f => f.toString))
        }
        // merge: element-wise sum of the sentence's word vectors
        if (vecSeq.size == 0) {
          Seq[Double]()
        }
        else {
          val r = new Array[Double](vectorSize)
          for (vec <- vecSeq) {
            for (i <- 0 until vec.size) {
              r(i) = vec(i) + r(i)
            }
          }
          r.toSeq
        }
      })
    })
    replaceColumn(newDf, inputCol, toVecFunc)
  }
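
  /**
   * Cosine similarity of two equal-length vectors; returns 0.0 for empty,
   * mismatched or zero-magnitude inputs. For example,
   * cosineSimilarity(Seq(1.0, 0.0), Seq(1.0, 0.0)) == 1.0.
   */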
  def cosineSimilarity(array1: Seq[Double], array2: Seq[Double]): Double = {
    var dotProduct = 0.0
    var magnitude1 = 0.0
    var magnitude2 = 0.0
    if (array1.length == 0 || array2.length == 0 || array1.length != array2.length) {
      0.0D
    } else {
      for (i <- 0 until array1.length) {
        dotProduct += array1(i) * array2(i) // a.b
        magnitude1 += Math.pow(array1(i), 2) // |a|^2
        magnitude2 += Math.pow(array2(i), 2) // |b|^2
      }
      magnitude1 = Math.sqrt(magnitude1) // |a|
      magnitude2 = Math.sqrt(magnitude2) // |b|
      if ((magnitude1 != 0.0D) && (magnitude2 != 0.0D))
        dotProduct / (magnitude1 * magnitude2)
      else
        0.0
    }
  }
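
  /**
   * Fraction of sentences in the shorter document that have at least one
   * sentence in the longer document with cosine similarity >= threshold.
   */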
  def rawSimilar(array1: Seq[Seq[Double]], array2: Seq[Seq[Double]], threshold: Double = 0.8): Double = {
    var similar: Double = 0
    if (array1.length > 0 && array2.length > 0) {
      val (smallArray, bigArray) = {
        if (array1.length < array2.length)
          (array1, array2)
        else
          (array2, array1)
      }
      val size = smallArray.length.toDouble
      var num = 0
      // count sentences in the shorter document that match at least one
      // sentence in the longer document
      for (r1 <- smallArray) {
        breakable {
          for (r2 <- bigArray) {
            if (cosineSimilarity(r1, r2) >= threshold) {
              num = num + 1
              break
            }
          }
        }
      }
      similar = num / size
    }
    similar
  }
}