// com.github.mrpowers.spark.stringmetric.SimilarityFunctions.scala (Maven / Gradle / Ivy)
// The newest version!
package com.github.mrpowers.spark.stringmetric
import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions._
import java.util.Locale
import org.apache.commons.text.similarity.{
CosineDistance,
JaccardSimilarity,
JaroWinklerDistance,
FuzzyScore
}
/**
 * String-similarity helpers for Spark SQL.
 *
 * Every metric except [[hamming]] is exposed twice: as a plain `...Fun` method over two
 * `String`s, and as a Spark UDF value wrapping that method. The `...Fun` methods return
 * `None` when either argument is `null`, so the underlying Apache Commons Text classes are
 * never invoked with a `null` input.
 */
object SimilarityFunctions {

  /** Wraps a Catalyst expression in a `Column`. */
  private def withExpr(expr: Expression): Column = new Column(expr)

  /**
   * Applies `metric` to the two strings only when both are non-null.
   *
   * Replaces the former `Option(x).getOrElse(return None)` guard, which relied on a
   * non-local `return` from inside a by-name argument.
   *
   * @param s1     first input, possibly `null`
   * @param s2     second input, possibly `null`
   * @param metric function evaluated on the two non-null inputs
   * @return `Some(metric(s1, s2))` when both inputs are non-null, otherwise `None`
   */
  private def whenBothPresent[A](s1: String, s2: String)(
      metric: (String, String) => A
  ): Option[A] =
    for {
      left  <- Option(s1)
      right <- Option(s2)
    } yield metric(left, right)

  /** Spark UDF wrapping [[cosineDistanceFun]]. */
  val cosine_distance = udf[Option[Double], String, String](cosineDistanceFun)

  /**
   * Evaluates Apache Commons Text `CosineDistance` on the two strings.
   *
   * @param s1 first string, possibly `null`
   * @param s2 second string, possibly `null`
   * @return the value produced by `CosineDistance.apply`, or `None` if either input is `null`
   */
  def cosineDistanceFun(s1: String, s2: String): Option[Double] =
    whenBothPresent[Double](s1, s2) { (left, right) =>
      // A fresh instance per call, as in the original implementation.
      new CosineDistance().apply(left, right)
    }

  /** Spark UDF wrapping [[fuzzyScoreFun]]. */
  val fuzzy_score = udf[Option[Integer], String, String](fuzzyScoreFun)

  /**
   * Evaluates Apache Commons Text `FuzzyScore` (constructed with `Locale.ENGLISH`) on the
   * two strings.
   *
   * @param s1 first argument passed to `FuzzyScore.fuzzyScore`, possibly `null`
   * @param s2 second argument passed to `FuzzyScore.fuzzyScore`, possibly `null`
   * @return the value produced by `FuzzyScore.fuzzyScore`, or `None` if either input is `null`
   */
  def fuzzyScoreFun(s1: String, s2: String): Option[Integer] =
    whenBothPresent[Integer](s1, s2) { (left, right) =>
      new FuzzyScore(Locale.ENGLISH).fuzzyScore(left, right)
    }

  /**
   * Builds a `Column` from the `HammingDistance` expression applied to the expressions of
   * the two given columns.
   *
   * @param s1 first column
   * @param s2 second column
   */
  def hamming(s1: Column, s2: Column): Column = withExpr {
    HammingDistance(s1.expr, s2.expr)
  }

  /** Spark UDF wrapping [[jaccardSimilarityFun]]. */
  val jaccard_similarity = udf[Option[Double], String, String](jaccardSimilarityFun)

  /**
   * Evaluates Apache Commons Text `JaccardSimilarity` on the two strings.
   *
   * @param s1 first string, possibly `null`
   * @param s2 second string, possibly `null`
   * @return the value produced by `JaccardSimilarity.apply`, or `None` if either input is `null`
   */
  def jaccardSimilarityFun(s1: String, s2: String): Option[Double] =
    whenBothPresent[Double](s1, s2) { (left, right) =>
      new JaccardSimilarity().apply(left, right)
    }

  /** Spark UDF wrapping [[jaroWinklerFun]]. */
  val jaro_winkler = udf[Option[Double], String, String](jaroWinklerFun)

  /**
   * Evaluates Apache Commons Text `JaroWinklerDistance` on the two strings.
   *
   * @param s1 first string, possibly `null`
   * @param s2 second string, possibly `null`
   * @return the value produced by `JaroWinklerDistance.apply`, or `None` if either input is
   *         `null`
   */
  def jaroWinklerFun(s1: String, s2: String): Option[Double] =
    whenBothPresent[Double](s1, s2) { (left, right) =>
      new JaroWinklerDistance().apply(left, right)
    }

  /**
   * Misspelled original name of [[jaroWinklerFun]], retained so existing callers keep
   * compiling. Behaves identically; prefer [[jaroWinklerFun]] in new code.
   */
  def jaroWinlkerFun(s1: String, s2: String): Option[Double] = jaroWinklerFun(s1, s2)
}