fuzzycsv.Fuzzy.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of fuzzy-csv Show documentation
Show all versions of fuzzy-csv Show documentation
A Groovy/Java tabular data (from CSV, SQL, JSON) processing library that supports fuzzy column matching, transformations, merging, querying, etc.
package fuzzycsv
import com.github.kayr.phrasematcher.PhraseMatcher
import groovy.transform.CompileStatic
import groovy.transform.Memoized
import org.slf4j.Logger
import org.slf4j.LoggerFactory
/**
 * Helpers for locating a column name inside a list of header phrases, first by a
 * trimmed, case-insensitive exact comparison and then, optionally, by the closest
 * match reported by {@link PhraseMatcher}.
 *
 * Every lookup returns the zero-based index of the match, or -1 when nothing matches.
 */
@CompileStatic
class Fuzzy {

    private static final Logger log = LoggerFactory.getLogger(Fuzzy)

    /**
     * Finds the position of {@code header} in {@code phrases}.
     *
     * An exact (trimmed, case-insensitive) match is tried first. Only when that fails
     * and {@code minScore} is below 1.0 is the closest-match lookup attempted.
     *
     * @param phrases  the candidate phrases; coerced with {@code as List}
     * @param header   the name to look for
     * @param minScore minimum score handed to the closest-match lookup; a value of 1.0
     *                 or more disables that lookup
     * @return the index of the matching phrase, or -1 if none was found
     */
    static int findBestPosition(def phrases, String header, double minScore) {
        List candidates = phrases as List
        int csvColIdx = findPosition(candidates, header)
        if (csvColIdx == -1 && minScore < 1.0) {
            csvColIdx = findClosestPosition(candidates, header, minScore)
        }
        return csvColIdx
    }

    /**
     * Finds the position of the phrase that {@link PhraseMatcher} reports as the best
     * hit for {@code phrase}.
     *
     * @param phrases  the candidate phrases; coerced with {@code as List}
     * @param phrase   the name to look for
     * @param minScore minimum score passed to {@code PhraseMatcher.bestHit}
     * @return the index of the best hit, or -1 when there are no candidates, the phrase
     *         is null, or the matcher reports an invalid hit
     */
    static int findClosestPosition(def phrases, String phrase, double minScore) {
        List candidates = phrases as List
        // Nothing to compare against: skip training a matcher altogether.
        if (!candidates || phrase == null) {
            return -1
        }

        def matcher = PhraseMatcher.train(candidates)
        def bestHit = matcher.bestHit(phrase, minScore)
        if (bestHit.isInvalid()) {
            if (log.isDebugEnabled()) {
                log.debug "findClosestPosition(): warning: no column match found: [$phrase] = [$bestHit]"
            }
            return -1
        }
        if (log.isDebugEnabled()) {
            log.debug "findClosestPosition(): heuristic: [$phrase] = [$bestHit]"
        }
        // Map the matched phrase back to its index in the candidate list.
        return findPosition(candidates, bestHit.phrase)
    }

    /**
     * Finds the first entry of {@code phrases} whose string form equals {@code name},
     * ignoring case and surrounding whitespace.
     *
     * @param phrases the candidate phrases; null is treated as "no candidates"
     * @param name    the name to look for; null never matches
     * @return the index of the first matching entry, or -1 if there is none.
     *         Null entries are skipped.
     */
    static int findPosition(def phrases, String name) {
        if (phrases == null || name == null) {
            return -1
        }
        // Normalise the target once instead of once per candidate.
        String wanted = name.trim().toLowerCase()
        return phrases.findIndexOf { value ->
            value != null && value.toString().toLowerCase().trim().equalsIgnoreCase(wanted)
        }
    }
}