ua.net.nlp.other.CheckText.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
#!/usr/bin/env groovy
package ua.net.nlp.other
// This script checks text with LanguageTool.
// NOTE: it disables some rules (spelling, double whitespace, etc.); see RULES_TO_IGNORE below.
@Grab(group='org.languagetool', module='languagetool-core', version='6.5')
@Grab(group='org.languagetool', module='language-uk', version='6.5')
@Grab(group='ch.qos.logback', module='logback-classic', version='1.4.+')
@Grab(group='info.picocli', module='picocli', version='4.6.+')
import org.languagetool.*
import org.languagetool.rules.*
import org.languagetool.tokenizers.*
import org.languagetool.language.*
import groovy.transform.CompileStatic
import picocli.CommandLine
import picocli.CommandLine.Option
import picocli.CommandLine.ParameterException
import org.languagetool.JLanguageTool.ParagraphHandling
import org.languagetool.markup.*
class CheckText {
// Comma-separated LanguageTool rule IDs; main() splits this string on ","
// and passes the resulting list to langTool.disableRules().
private static final String RULES_TO_IGNORE="MORFOLOGIK_RULE_UK_UA,COMMA_PARENTHESIS_WHITESPACE,WHITESPACE_RULE," \
+ "UK_MIXED_ALPHABETS,UK_SIMPLE_REPLACE,UK_SIMPLE_REPLACE_SOFT,EUPHONY_OTHER,EUPHONY_PREP_V_U,INVALID_DATE,YEAR_20001," \
+ "DATE_WEEKDAY1,DASH,UK_HIDDEN_CHARS,UPPER_INDEX_FOR_M,DEGREES_FOR_C,OVKA_FOR_PROCESS"
// LanguageTool checker instance (multi-threaded implementation) created for the Ukrainian language.
final JLanguageTool langTool = new MultiThreadedJLanguageTool(new Ukrainian());
// SRX-based sentence tokenizer created for the Ukrainian language.
final SRXSentenceTokenizer stokenizer = new SRXSentenceTokenizer(new Ukrainian());
// Result of langTool.getAllRules(), captured once when this object is constructed.
final List allRules = langTool.getAllRules()
/**
 * Checks the given text with LanguageTool and collects a printable report.
 *
 * @param text text to check
 * @param force when false, text that is empty after trim() is skipped without calling LanguageTool
 * @param errorLines list that printMatches() appends report lines to when matches are found
 * @return the matches returned by langTool.check(text); an empty list when the text was skipped
 */
List check(String text, boolean force, List errorLines) {
    if( ! force && text.trim().isEmpty() ) {
        // Nothing to check: return an empty list (not null) so callers can
        // use the result without a null guard.
        return []
    }

    List matches = langTool.check(text)

    if( matches.size() > 0 ) {
        printMatches(matches, text, errorLines)
    }

    return matches
}
@CompileStatic
void printMatches(List matches, String text, List errorLines) {
def i = 0
def total = 0
def lines = text.split("\n")
for (RuleMatch match : matches) {
errorLines << "Rule ID: ${match.getRule().getId()}".toString()
errorLines << "Message: " + match.getMessage().replace("", "«").replace(" ", "»")
def chunkOffset = 0
def leftOff = 40
def rightOff = 40
def posInSent = match.getFromPos() - leftOff
def posToInSent = match.getToPos() + rightOff
def prefix = ""
def suffix = ""
if( posInSent <= 0 ) {
posInSent = 0
}
else {
prefix = "…"
chunkOffset = 1
}
if( posToInSent >= text.length() ) {
posToInSent = text.length()
}
else {
suffix = "…"
}
def sample = text[posInSent.. - .txt + .tagged.txt/.xml)"])
// String output
boolean quiet
@Option(names= ["-h", "--help"], usageHelp= true, description= "Show this help message and exit.")
boolean helpRequested
}
/**
 * Parses the command-line arguments into a CheckOptions instance with picocli.
 *
 * If the arguments cannot be parsed, prints the error message and the usage text
 * and exits with status 1. If help was requested, prints the usage text and exits
 * with status 0.
 *
 * @param argv raw command-line arguments
 * @return the populated options object
 */
@CompileStatic
static CheckOptions parseOptions(String[] argv) {
    CheckOptions parsed = new CheckOptions()
    CommandLine cli = new CommandLine(parsed)

    try {
        cli.parseArgs(argv)
    }
    catch (ParameterException e) {
        println e.message
        cli.usage(System.out)
        System.exit 1
    }

    // Help takes the normal (zero) exit path once parsing has succeeded.
    if (parsed.helpRequested) {
        cli.usage(System.out)
        System.exit 0
    }

    return parsed
}
/**
 * Command-line entry point.
 *
 * Parses the options, disables the rules listed in RULES_TO_IGNORE, reads the
 * input file, splits it into paragraphs on blank lines ("\n\n"), checks each
 * paragraph and prints the collected report lines, then prints timing statistics.
 *
 * @param argv raw command-line arguments
 */
static void main(String[] argv) {
    CheckOptions options = parseOptions(argv)

    def nlpUk = new CheckText()
    nlpUk.langTool.disableRules(Arrays.asList(RULES_TO_IGNORE.split(",")))

    def textToAnalyze = new File(options.input).text
    def paragraphs = textToAnalyze.split("\n\n")

    long tm1 = System.currentTimeMillis()

    paragraphs.each { para ->
        def errors = []
        nlpUk.check(para, false, errors)
        errors.each { println it }
    }

    // Final call with empty text. The list that is printed is now the one passed
    // to check() (previously a throwaway [] was passed, so nothing could ever be printed).
    // With force=false and empty text check() returns immediately, so this is currently a no-op.
    def trailingErrors = []
    nlpUk.check("", false, trailingErrors)
    trailingErrors.each { println it }

    long tm2 = System.currentTimeMillis()
    long elapsedMs = tm2 - tm1

    // Clamp the divisor to 1 ms so very fast runs (e.g. empty input) do not fail with
    // division by zero; multiply by 1000L so the product is computed as a long
    // and cannot overflow int for large inputs.
    long charsPerSec = (textToAnalyze.length() * 1000L).intdiv(Math.max(elapsedMs, 1L))

    println String.format("Check time: %d ms, (%d chars/sec), %d paragraphs",
        elapsedMs, charsPerSec, paragraphs.size())
}
}