All downloads are free. Search and download functionality uses the official Maven repository.

com.github.mrpowers.spark.stringmetric.SimilarityFunctions.scala Maven / Gradle / Ivy

The newest version!
package com.github.mrpowers.spark.stringmetric

import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions._

import java.util.Locale

import org.apache.commons.text.similarity.{
  CosineDistance,
  JaccardSimilarity,
  JaroWinklerDistance,
  FuzzyScore
}


/**
 * String-similarity helpers, exposed both as plain Scala functions (the `*Fun` methods)
 * and as Spark user-defined functions (the snake_case vals) built from those functions.
 *
 * Every `*Fun` method returns `None` when either argument is `null`; otherwise it wraps
 * the score produced by the corresponding Apache Commons Text class in `Some`.
 */
object SimilarityFunctions {

  /** Wraps a Catalyst expression in a [[Column]]. */
  private def withExpr(expr: Expression): Column = new Column(expr)

  /**
   * Applies `score` to the two strings only when neither of them is `null`.
   *
   * Replaces the former `Option(x).getOrElse(return None)` guard, which relied on a
   * non-local `return` (implemented by throwing a control-flow exception).
   *
   * @param s1    first input, possibly `null`
   * @param s2    second input, possibly `null`
   * @param score function invoked with both inputs once they are known to be non-null
   * @return `None` if either input is `null`, otherwise `Some(score(s1, s2))`
   */
  private def scoreNonNull[A](s1: String, s2: String)(score: (String, String) => A): Option[A] =
    for {
      left  <- Option(s1)
      right <- Option(s2)
    } yield score(left, right)

  /** Spark UDF wrapping [[cosineDistanceFun]]. */
  val cosine_distance = udf[Option[Double], String, String](cosineDistanceFun)

  /**
   * Scores two strings with Commons Text `CosineDistance`.
   *
   * @param s1 first string, possibly `null`
   * @param s2 second string, possibly `null`
   * @return `None` if either argument is `null`, otherwise the value returned by
   *         `CosineDistance.apply`
   */
  def cosineDistanceFun(s1: String, s2: String): Option[Double] =
    scoreNonNull[Double](s1, s2) { (left, right) =>
      new CosineDistance().apply(left, right)
    }

  /** Spark UDF wrapping [[fuzzyScoreFun]]. */
  val fuzzy_score = udf[Option[Integer], String, String](fuzzyScoreFun)

  /**
   * Scores two strings with Commons Text `FuzzyScore`, constructed with `Locale.ENGLISH`.
   *
   * @param s1 first string, passed as the first argument to `FuzzyScore.fuzzyScore`
   * @param s2 second string, passed as the second argument to `FuzzyScore.fuzzyScore`
   * @return `None` if either argument is `null`, otherwise the value returned by
   *         `FuzzyScore.fuzzyScore`
   */
  def fuzzyScoreFun(s1: String, s2: String): Option[Integer] =
    scoreNonNull[Integer](s1, s2) { (left, right) =>
      new FuzzyScore(Locale.ENGLISH).fuzzyScore(left, right)
    }

  /**
   * Builds a [[Column]] from the `HammingDistance` expression applied to the expressions
   * of the two given columns.
   *
   * @param s1 first column
   * @param s2 second column
   */
  def hamming(s1: Column, s2: Column): Column = withExpr {
    HammingDistance(s1.expr, s2.expr)
  }

  /** Spark UDF wrapping [[jaccardSimilarityFun]]. */
  val jaccard_similarity = udf[Option[Double], String, String](jaccardSimilarityFun)

  /**
   * Scores two strings with Commons Text `JaccardSimilarity`.
   *
   * @param s1 first string, possibly `null`
   * @param s2 second string, possibly `null`
   * @return `None` if either argument is `null`, otherwise the value returned by
   *         `JaccardSimilarity.apply`
   */
  def jaccardSimilarityFun(s1: String, s2: String): Option[Double] =
    scoreNonNull[Double](s1, s2) { (left, right) =>
      new JaccardSimilarity().apply(left, right)
    }

  /** Spark UDF wrapping [[jaroWinlkerFun]]. */
  val jaro_winkler = udf[Option[Double], String, String](jaroWinlkerFun)

  /**
   * Scores two strings with Commons Text `JaroWinklerDistance`.
   *
   * The method name is misspelled ("Winlker"); it is kept as-is so existing callers
   * continue to compile.
   *
   * NOTE(review): whether the returned number is a similarity or a distance depends on
   * the commons-text version on the classpath; confirm before relying on its direction.
   *
   * @param s1 first string, possibly `null`
   * @param s2 second string, possibly `null`
   * @return `None` if either argument is `null`, otherwise the value returned by
   *         `JaroWinklerDistance.apply`
   */
  def jaroWinlkerFun(s1: String, s2: String): Option[Double] =
    scoreNonNull[Double](s1, s2) { (left, right) =>
      new JaroWinklerDistance().apply(left, right)
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy