ua.net.nlp.other.CleanText.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
#!/bin/env groovy
package ua.net.nlp.other
// This script reads all .txt files in given directory (default is "txt/")
// and tries to clean up the text ot make it more suitable for NLP
// The output files go into -good
// Cleanups:
// fix broken encoding (broken cp1251 etc)
// remove soft hyphen
// replace weird apostrophe characters with correct one (')
// merge some simple word wraps
// remove backslash from escaped quotes
// weird ї and й via combining characters (U+0308)
// і instead of ї: промисловоі, нацполіціі
// clean up latin/cyrillic character mix
// CO/CO2 with cyr/lat mix
// degree Celcius with cyr
// digit 3 instead of letter З
// try to detect and skip two-column texts
// separate leading hyphen (e.g. -Алло! - проричав він в слухавку)
// fix dangling hyphen (at the end of the line)
// check and warn for spaced words (e.g. Н А Т А Л К А)
// mark/rate or remove Russian paragraphs
@GrabConfig(systemClassLoader=true)
@Grab(group='org.languagetool', module='languagetool-core', version='6.5')
@Grab(group='org.languagetool', module='language-uk', version='6.5')
@Grab(group='org.languagetool', module='language-ru', version='6.5')
@Grab(group='org.languagetool', module='language-en', version='6.5')
@Grab(group='ch.qos.logback', module='logback-classic', version='1.4.+')
@Grab(group='info.picocli', module='picocli', version='4.6.+')
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.util.function.Function
import java.util.regex.MatchResult
import java.util.regex.Matcher
import java.util.regex.Pattern
import picocli.CommandLine
import picocli.CommandLine.Option
import picocli.CommandLine.ParameterException
import groovy.io.FileVisitResult
import groovy.transform.CompileStatic
import org.languagetool.tagging.uk.*
import org.languagetool.tokenizers.SRXSentenceTokenizer
import org.languagetool.tokenizers.uk.UkrainianWordTokenizer
import org.languagetool.tagging.Tagger
import org.languagetool.tagging.ru.RussianTagger
import org.languagetool.AnalyzedToken
import org.languagetool.language.Ukrainian
import org.slf4j.Logger
@CompileStatic
class CleanText {
@groovy.transform.SourceURI
static URI SOURCE_URI
static String SCRIPT_DIR=new File(SOURCE_URI).parent
static void main(String[] args) {
// warnForEncoding()
long tm1 = System.currentTimeMillis()
def cl = new GroovyClassLoader()
cl.addClasspath(SCRIPT_DIR + "/../../../../")
def resourceDir = SCRIPT_DIR + "/../../../../../resources"
if( ! new File(resourceDir).isDirectory() ) {
// println "making missing dir: $resourceDir"
new File(resourceDir).mkdirs()
}
cl.addClasspath(resourceDir)
def basePkg = CleanText.class.getPackageName()
def tagTextClass = cl.loadClass("${basePkg}.clean.CleanTextCore")
def m = tagTextClass.getMethod("main", String[].class)
def mArgs = [args].toArray() // new Object[]{args} - Eclips chokes on this
long tm2 = System.currentTimeMillis()
if( "--timing" in args ) {
System.err.println("Loaded classes in ${tm2-tm1} ms")
}
m.invoke(null, mArgs)
}
}