ua.net.nlp.other.EvaluateText.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
#!/bin/env groovy
// This script checks the text with LanguageTool
// and prints an error rating (along with count and total # of words)
//
// NOTE: it disables some rules, like spelling, double whitespace etc
package ua.net.nlp.other
import groovy.transform.CompileStatic
import org.languagetool.*
import org.languagetool.rules.*
import org.languagetool.tokenizers.*
import org.languagetool.language.*
class EvaluateText {
/**
 * Comma-separated LanguageTool rule IDs that are switched off before evaluation.
 * The string is split on "," and passed to disableRules(), so every fragment
 * except the last one must end with a comma; otherwise two adjacent IDs fuse
 * into a single unknown ID and neither rule is actually disabled.
 */
private static final String RULES_TO_IGNORE="MORFOLOGIK_RULE_UK_UA,COMMA_PARENTHESIS_WHITESPACE,WHITESPACE_RULE," \
+ "EUPHONY_PREP_V_U,EUPHONY_CONJ_I_Y,EUPHONY_PREP_Z_IZ_ZI,EUPHONY_PREP_O_OB," \
+ "DATE_WEEKDAY1,DASH,UK_HIDDEN_CHARS,UPPER_INDEX_FOR_M,DEGREES_FOR_C,DIGITS_AND_LETTERS," \
+ "UK_MIXED_ALPHABETS,UK_SIMPLE_REPLACE_SOFT"
// Not in the list (these rules stay enabled): UK_SIMPLE_REPLACE, INVALID_DATE, YEAR_20001
// LanguageTool engine for Ukrainian; check() runs every text chunk through it.
final JLanguageTool langTool = new MultiThreadedJLanguageTool(new Ukrainian());
// Sentence tokenizer for Ukrainian; not referenced in the visible part of this file.
final SRXSentenceTokenizer stokenizer = new SRXSentenceTokenizer(new Ukrainian());
// Rule list as reported by langTool when this object is constructed.
final List allRules = langTool.getAllRules()
/**
 * Runs LanguageTool over one chunk of text and reports what it finds.
 *
 * @param text       the chunk to check
 * @param force      when true the chunk is checked even if it is blank
 * @param errorLines list that printMatches() appends report lines to
 * @return the matches returned by langTool.check(), or null when the chunk
 *         is blank and force is false
 */
List check(String text, boolean force, List errorLines) {
    // force is tested first so that text is not inspected at all when forced
    boolean shouldCheck = force || ! text.trim().isEmpty()
    if( ! shouldCheck ) {
        return null
    }

    List found = langTool.check(text)
    if( ! found.isEmpty() ) {
        printMatches(found, text, errorLines)
    }
    found
}
// Appends report lines for each match to errorLines: the rule id, the message,
// and (judging by the visible part) a sample of the text around the match.
//
// NOTE(review): this listing is damaged. The line starting with
// "def sample = text[posInSent.." breaks off mid-expression and continues with
// what looks like the body of a separate entry-point routine (it reads args[0]).
// The rest of printMatches and the header of that routine are not visible here;
// word_count(), which is called further down, is not visible either.
// Restore these from the original source before compiling.
@CompileStatic
void printMatches(List matches, String text, List errorLines) {
// i, total and lines are not used in the visible portion of this method
def i = 0
def total = 0
def lines = text.split("\n")
for (RuleMatch match : matches) {
errorLines << "Rule ID: ${match.getRule().getId()}".toString()
// NOTE(review): the search strings "" and " " look like markup that was stripped
// from this listing — confirm against the original source
errorLines << "Message: " + match.getMessage().replace("", "«").replace(" ", "»")
def chunkOffset = 0
// number of characters of context to keep on each side of the match
def leftOff = 40
def rightOff = 40
def posInSent = match.getFromPos() - leftOff
def posToInSent = match.getToPos() + rightOff
def prefix = ""
def suffix = ""
// clamp the window to the start of the text; otherwise mark the cut with an ellipsis
if( posInSent <= 0 ) {
posInSent = 0
}
else {
prefix = "…"
// presumably accounts for the one-character prefix — its use is not visible here
chunkOffset = 1
}
// clamp the window to the end of the text; otherwise mark the cut with an ellipsis
if( posToInSent >= text.length() ) {
posToInSent = text.length()
}
else {
suffix = "…"
}
// NOTE(review): the next line is truncated (see the note at the top of this block)
def sample = text[posInSent.. 0 ? args[0] : "."
// reports and ratings are written to the "err" subdirectory, which must already exist
def outDir = "$dir/err"
def outDirFile = new File(outDir)
if( ! outDirFile.isDirectory() ) {
System.err.println "Output dir $outDir does not exists"
return
}
def nlpUk = new EvaluateText()
// switch off every rule id listed in RULES_TO_IGNORE
nlpUk.langTool.disableRules(Arrays.asList(RULES_TO_IGNORE.split(",")))
// first entry is the header line of ratings.txt
def ratings = ["коеф помил унік слів файл"]
// empty ratings.txt up front; the collected ratings are written after the loop
new File("$outDir/ratings.txt").text = ""
// process every *.txt file found directly in dir
new File(dir).eachFile { file->
if( ! file.name.endsWith(".txt") )
return
def text = file.text
List errorLines = []
println(String.format("checking $file.name, words: %d, size: %d", word_count(text), text.size()))
// the text is checked in chunks separated by blank lines
def paragraphs = text.split("\n\n")
int matchCnt = 0
int uniqueRules = 0
try {
paragraphs.each { String para ->
def matches = nlpUk.check(para, false, errorLines)
if( matches ) {
matchCnt += matches.size()
// per-chunk unique counts are summed, so a rule that fires in two chunks is counted twice
uniqueRules += getUniqueRuleCount(matches)
}
}
// forced check of an empty string after the last chunk
// NOTE(review): presumably lets rules that keep state across calls report — confirm
def matches = nlpUk.check("", true, errorLines)
if( matches ) {
matchCnt += matches.size()
uniqueRules += getUniqueRuleCount(matches)
}
def wc = word_count(text)
// rating = matches per 100 words, rounded to two decimals
// NOTE(review): if wc is 0 the division presumably throws and ends up in the catch below — verify
def rating = Math.round(matchCnt * 10000 / wc)/100
ratings << String.format("%1.2f %4d %4d %6d %s", rating, matchCnt, uniqueRules, wc, file.name)
// per-file report: <name>.err.txt in the output directory
new File(outDir + "/" + file.name.replace(".txt", ".err.txt")).text = errorLines.join("\n")
errorLines.clear() }
catch(Exception e) {
// a failure on one file is printed and the loop moves on to the next file
e.printStackTrace();
}
}
// write the header plus one rating line per successfully processed file
new File("$outDir/ratings.txt").text = ratings.join("\n")
}
/**
 * Counts the distinct problems among the given matches.
 * Matches are keyed by rule id, except for UK_SIMPLE_REPLACE matches,
 * which are keyed by their message text instead.
 *
 * @param matches the matches to count (each exposes rule.id and message)
 * @return the number of distinct keys
 */
static getUniqueRuleCount(matches) {
    def keys = matches.collect { match ->
        def ruleId = match.rule.id
        ruleId == "UK_SIMPLE_REPLACE" ? match.message : ruleId
    }
    def distinctKeys = keys.unique()
    return distinctKeys.size()
}
}