All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.johnsnowlabs.nlp.annotators.spell.util.Utilities.scala Maven / Gradle / Ivy

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.spell.util

import org.slf4j.LoggerFactory

import scala.math.min
import scala.util.Random

/** Helper routines for the spell-checker annotators: string distance measures and generators
  * of candidate spellings (single edits, duplicate-letter reductions and vowel swaps).
  */
object Utilities {

  private val logger = LoggerFactory.getLogger("SpellCheckersUtilities")

  // Lower-case ASCII letters used by `variants` for replacements and insertions.
  private val alphabet = "abcdefghijklmnopqrstuvwxyz".toCharArray
  // Characters that `getVowelSwaps` treats as interchangeable ('y' is included).
  private val vowels = "aeiouy".toCharArray

  /** Position-wise distance measure between two words.
    *
    * Counts the positions (over the shorter word's length) where the characters differ and adds
    * the absolute difference of the two lengths.
    *
    * @param word1
    *   first word
    * @param word2
    *   second word
    * @return
    *   0 when the words are equal, otherwise mismatches plus length difference
    */
  def computeHammingDistance(word1: String, word2: String): Long =
    if (word1 == word2) 0L
    else {
      val mismatches = word1.zip(word2).count { case (c1, c2) => c1 != c2 }
      (mismatches + (word1.length - word2.length).abs).toLong
    }

  /** Retrieves the frequency of a word.
    *
    * @param word
    *   word to look up
    * @param wordCount
    *   frequency table
    * @return
    *   the stored count, or 0 when the word is not a key of `wordCount`
    */
  def getFrequency(word: String, wordCount: Map[String, Long]): Long =
    wordCount.getOrElse(word, 0L)

  /** Computes Levenshtein distance : Metric of measuring difference between two sequences (edit
    * distance) Source: https://rosettacode.org/wiki/Levenshtein_distance
    *
    * @param s1
    *   first sequence
    * @param s2
    *   second sequence
    * @return
    *   minimum number of single-character insertions, deletions or substitutions turning one
    *   string into the other
    */
  def levenshteinDistance(s1: String, s2: String): Int = {
    // dist(j)(i) is the distance between the first j chars of s2 and the first i chars of s1;
    // the first row and column are the distances against the empty prefix.
    val dist = Array.tabulate(s2.length + 1, s1.length + 1) { (j, i) =>
      if (j == 0) i else if (i == 0) j else 0
    }

    for (j <- 1 to s2.length; i <- 1 to s1.length)
      dist(j)(i) =
        if (s2(j - 1) == s1(i - 1)) dist(j - 1)(i - 1)
        else minimum(dist(j - 1)(i) + 1, dist(j)(i - 1) + 1, dist(j - 1)(i - 1) + 1)

    dist(s2.length)(s1.length)
  }

  /** Smallest of three integers. */
  private def minimum(i1: Int, i2: Int, i3: Int) = min(min(i1, i2), i3)

  /** Shortens runs of identical consecutive characters.
    *
    * The first character of a run is always kept; after it at most `limit` further repetitions
    * are kept and the rest of the run is dropped.
    *
    * @param duplicatesLimit
    *   number of extra repetitions allowed per run
    * @param text
    *   input text
    * @param overrideLimit
    *   when defined, used instead of `duplicatesLimit`
    * @return
    *   `text` with over-long runs truncated
    */
  def limitDuplicates(
      duplicatesLimit: Int,
      text: String,
      overrideLimit: Option[Int] = None): String = {
    val limit = overrideLimit.getOrElse(duplicatesLimit)
    val kept = new StringBuilder(text.length)
    var duplicates = 0
    for (i <- text.indices) {
      val current = text(i)
      if (i == 0 || current != text(i - 1)) {
        // A new run starts here: always keep its first character.
        duplicates = 0
        kept.append(current)
      } else if (duplicates < limit) {
        duplicates += 1
        kept.append(current)
      }
      // Otherwise the run already reached the limit and the character is dropped.
    }
    kept.toString
  }

  /** Possibilities analysis: every distinct string obtained from `targetWord` by one deletion,
    * one transposition of adjacent characters, one replacement by a letter of `alphabet`, or one
    * insertion of a letter of `alphabet`.
    *
    * @param targetWord
    *   word to generate candidates for
    * @return
    *   the distinct candidates, in the iteration order of the backing mutable set
    */
  def variants(targetWord: String): List[String] = {
    val vars = scala.collection.mutable.Set.empty[String]
    for (i <- 0 to targetWord.length) {
      val (a, b) = targetWord.splitAt(i)
      if (b.nonEmpty) {
        // deletion of the character at position i
        vars.add(a + b.tail)
      }
      if (b.length > 1) {
        // transposition of the characters at positions i and i + 1
        vars.add(a + b(1) + b(0) + b.drop(2))
      }
      if (b.nonEmpty) {
        // replacement of the character at position i
        alphabet.foreach(c => vars.add(a + c + b.tail))
      }
      // insertion before position i (also covers appending at the end)
      alphabet.foreach(c => vars.add(a + c + b))
    }
    vars.toList
  }

  /** Possible variations of the word by removing duplicate letters.
    *
    * Every character position contributes a list of options: if the character is followed by
    * `n > 0` identical characters, the options are the character repeated 0 to `n` times (only
    * the first `reductionsLimit` of them); otherwise the single option is the character itself.
    * The result is every combination of one option per position, concatenated. Because each
    * position inside a run contributes its own options, the result may contain duplicates.
    *
    * @param word
    *   word to reduce
    * @param reductionsLimit
    *   maximum number of options kept for a position that starts or continues a run
    * @return
    *   all combinations of the per-position options
    */
  def reductions(word: String, reductionsLimit: Int): List[String] = {
    val flatWord: Iterator[Iterator[String]] = word.iterator.zipWithIndex.map { case (c, i) =>
      val n = numberOfDuplicates(word, i)
      if (n > 0) (0 to n).map(r => c.toString * r).take(reductionsLimit).iterator
      else Iterator.single(c.toString)
    }

    val reds = cartesianProduct(flatWord).map(_.mkString(""))
    logger.debug("parsed reductions: " + reds.size)
    reds.toList
  }

  /** Number of characters immediately after position `id` that are equal to `text(id)`. */
  private def numberOfDuplicates(text: String, id: Int): Int = {
    val current = text(id)
    text.iterator.drop(id + 1).takeWhile(_ == current).size
  }

  /** Flattens vowel possibilities: every string obtained by substituting each vowel of `word`
    * with any character of `vowels`, leaving the other characters untouched.
    *
    * @param word
    *   word to expand
    * @param vowelSwapLimit
    *   words longer than this produce no swaps at all
    * @return
    *   all vowel-substituted forms, or an empty list when `word` exceeds `vowelSwapLimit`
    */
  def getVowelSwaps(word: String, vowelSwapLimit: Int): List[String] =
    if (word.length > vowelSwapLimit) List.empty[String]
    else {
      val flatWord: Iterator[Iterator[Char]] = word.iterator.map { c =>
        if (vowels.contains(c)) vowels.iterator else Iterator.single(c)
      }
      val vowelSwaps = cartesianProduct(flatWord).map(_.mkString(""))
      logger.debug("vowel swaps: " + vowelSwaps.size)
      vowelSwaps.toList
    }

  /** Cartesian product of the given option lists: every sequence that picks one element from
    * each inner iterator, in order.
    *
    * Each inner iterator is materialised before being combined. An `Iterator` can be traversed
    * only once, so reusing it for every accumulated prefix would expand the first prefix only
    * and silently drop all the others.
    */
  private def cartesianProduct[T](seqs: Iterator[Iterator[T]]): Seq[Seq[T]] =
    seqs.foldLeft(Seq(Seq.empty[T])) { (prefixes, choices) =>
      val options = choices.toVector
      prefixes.flatMap(prefix => options.map(option => prefix :+ option))
    }

  /** Picks a uniformly random element of the list.
    *
    * @param list
    *   candidates
    * @return
    *   a random element, or `None` when the list is empty
    */
  def getRandomValueFromList[A](list: List[A]): Option[A] =
    // Random.nextInt rejects a non-positive bound, so the empty list is handled up front.
    if (list.isEmpty) None
    else list.lift(Random.nextInt(list.size))

  /** Confidence assigned to one candidate out of `list`: the reciprocal of the list length.
    *
    * @param list
    *   candidates
    * @return
    *   `1 / list.length`; an empty list yields `Double.PositiveInfinity`
    */
  def computeConfidenceValue[A](list: List[A]): Double =
    1 / list.length.toDouble

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy