
.13.e3.source-code.fuzzy.scala Maven / Gradle / Ivy
/*
Copyright 2010 Aaron J. Radke
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cc.drx
object Fuzzy{
implicit object ParsableFuzzy extends Parsable[Fuzzy]{ def apply(pat:String):Fuzzy = Fuzzy(pat) }
}
// TODO implement and remove fuzzy specific logic from the collection additions
case class Fuzzy(pat:String){
// def compare(a:String, b:String) = TODO
// collection operators
private def defaultAsString[A](x:A):String = x.toString
/**filter list where some match exists*/
def filter[A](xs:Seq[A]):Seq[A] = filter(xs, defaultAsString) //show only matches
def filter[A](xs:Seq[A],asString:A=>String):Seq[A] = xs.filter{x => penalty(asString(x)).isDefined} //show only matches
/**sort list from best to worst match (lowest penalty)*/ //
def sort[A](xs:Seq[A]):Seq[A] = sort(xs, defaultAsString)
def sort[A](xs:Seq[A],asString:A=>String):Seq[A] = zipWithPenalty(xs,asString).sortBy(_._2).map(_._1) //sort the matches
/**find the best match out of the bunch (lowest penalty) */
// def find[A](xs:Seq[A]):Option[A] = zipWithPenalty(xs).getNonEmpty.map{_.minBy(_._2)._1} //get the best/first match if found
def find[A](xs:Seq[A]):Option[A] = find(xs, defaultAsString)
def find[A](xs:Seq[A], asString:A=>String):Option[A] =
zipWithPenalty(xs,asString).getNonEmpty.map{_.minBy(_._2)._1} //get the best/first match if found
private def zipWithPenalty[A](xs:Seq[A],asString:A=>String):Seq[(A,Int)] =
for(x <- xs; z <- penalty(asString(x))) yield (x, z)
/** integer penalty, where a zero is a perfect match, None means no match at all*/
private def penalty(str:String):Option[Int] = { //think about a more generic interface than an integer penalty that may change
//TODO move the pat dependent calcs as private lazy val's local the Fuzzy case class to cache calcs for future penalty
lazy val lStr = str.toLowerCase
lazy val lPat = pat.toLowerCase
lazy val pats = Parse.split[String](pat)
lazy val lPats = pats.map{_.toLowerCase}
lazy val terms = pat.splitTerms
lazy val lTerms = terms.map{_.toLowerCase}
lazy val soundexStr = Soundex(str)
lazy val soundexTerms = terms map Soundex.apply
lazy val beacon:String = str.toList.sliding(2,1).filter{
case List(a,b) => (a.isLower && b.isUpper) || (!a.isLetter && b.isLetter) || b.isDigit
case _ => true
}.flatMap(_.lastOption).mkString(str.take(1),"","") //instead of headOption this works: ("".take(1) == "")
lazy val lBeacon = beacon.toLowerCase
// Log(str, beacon)
lazy val N = str.size
lazy val n = pat.size
lazy val Nb = beacon.size
lazy val np = pats.map{_.size}.sum
lazy val deltaNPenalty:Int = (N-n).abs.sat(0,99) //0 same size, 99 lots of extra chars penalty step size
// val useCase = pat exists (_.isUpperCase) //vim smartcase like auto mode setting
// note: case matching is baked in with priority penalty
//-- cost function search //smaller numbers are a better match, zero is perfect, (prime numbers and relative size matching confuse separation, just use simple priority based thinking), this prevents the need to penalty by all and use best
//--empty string
val res = {
if(str == "") Int.MaxValue
//--full match case
else if(str == pat) 0 //zero is perfect match
//--full match case insensitive
else if(lStr == lPat) 100
//--partial match case
else if(str contains pat) 200 //TODO differentiate degree of partial match
//--partial match case insensitive
else if(lStr contains lPat) 300
//--match all list of pats Case
else if(pats forall (str contains _)) 400
//--match all list of pats NoCase
else if(lPats forall (lStr contains _)) 500
//--match all list of terms Case
else if(terms forall (str contains _)) 600
//--match all list of terms NoCase
else if(lTerms forall (lStr contains _)) 700
//--match beacon with Case
else if(beacon contains pat) 800
//--match beacon NoCase
else if(lBeacon contains lPat) 900
//--match beacon elements subsequences
else if(beacon containsSubsequence pat) 1000
//--match beacon elements subsequences
else if(lBeacon containsSubsequence lPat) 1100
//--match with subsequence
else if(str containsSubsequence pat) 1200
//--match with subsequence NoCase
else if(lStr containsSubsequence lPat) 1300
//--match all list of terms by simularity
//--match all terms by a soundex simularity
else if(soundexTerms forall (soundexStr contains _)) 1400
//--match with spaces
//--TODO allow skips in beacon matches
//--TODO allow beacon like matches as part of the pattern lists
else Int.MaxValue
}
if(res == Int.MaxValue) None else Some(res + deltaNPenalty)
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy