ua.net.nlp.other.clean.MarkLanguageModule.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
package ua.net.nlp.other.clean
import java.nio.charset.StandardCharsets
import java.util.regex.Matcher
import java.util.regex.Pattern
import org.languagetool.AnalyzedToken
import org.languagetool.tagging.ru.RussianTagger
import groovy.transform.CompileDynamic
import groovy.transform.CompileStatic
import groovy.transform.PackageScope
import ua.net.nlp.other.clean.CleanOptions.MarkOption
import ua.net.nlp.other.clean.CleanOptions.ParagraphDelimiter
import ua.net.nlp.other.clean.CleanRequest
@PackageScope
@CompileStatic
class MarkLanguageModule {
CleanOptions options
OutputTrait out
LtModule ltModule
static class ParaIterator implements Iterator {
String text;
String delim;
int from = 0
@Override
public boolean hasNext() {
return from < text.length()
}
@Override
public String next() {
if( from == text.length()) throw new IllegalArgumentException()
int pos = text.indexOf(delim, from)
if( pos == -1 ) pos = text.length()
def ret = pos == from ? delim : text[from..(?!---<\/span>)(.*?)<\/span>/
String markRussian(CleanRequest request, String outDirName) {
// clean previous marks unless they are cut
def text = request.text
if( text.contains('
markRuChunks(sent, ruChunks, outDirName)
}
.join("")
if( options.markLanguages == MarkOption.cut ) {
writeCutTextToFile(ruChunks, request, outDirName)
}
text
}
/**
 * Decides what to emit for one chunk of text, based on the rates returned by evalChunk().
 *
 * @param sent       chunk of text to evaluate
 * @param ruChunks   accumulator; receives the chunk when it is rated Russian and the
 *                   mark option is anything other than MarkOption.mark
 * @param outDirName not used by this method
 * @return the chunk itself when it has no Cyrillic letters or is not rated more Russian
 *         than Ukrainian; the (re-arranged) chunk when marking; '---' otherwise
 */
private markRuChunks(String sent, List ruChunks, String outDirName) {
    // nothing to rate without at least one Cyrillic letter
    if( ! (sent =~ /[а-яіїєґёА-ЯІЇЄҐЁ]/) )
        return sent

    List rates = evalChunk(sent)
    Double ukRate = rates[0]
    Double ruRate = rates[1]

    if( ! (ukRate < ruRate) )
        return sent

    // NOTE(review): the rounded rate is not referenced again in the code visible here
    ruRate = Math.round(ruRate * 100d)/100d

    // if whitespace precedes a closing </span> at the very end, move it after the tag
    String marked = "$sent".replaceFirst(/(?s)([\h\v]+)(<\/span>)$/, '$2$1')

    if( options.markLanguages == MarkOption.mark )
        return marked

    // not marking in place: remember the chunk and leave a placeholder in the text
    ruChunks << marked
    return '---'
}
/**
 * Writes the collected chunks into a companion ".ru.txt" file, encoded as UTF-8.
 * Does nothing when there are no chunks or the request has no source file.
 *
 * With an output directory name the file goes into a "ru" sub-directory next to
 * request.outFile and is named after request.file; otherwise it is placed next to
 * request.outFile and named after it.
 *
 * @param ruChunks   chunks to write, joined with a blank line between them
 * @param request    request providing the source file and the output file
 * @param outDirName when set (non-empty), selects the "ru" sub-directory layout
 */
private writeCutTextToFile(List ruChunks, CleanRequest request, String outDirName) {
    if( ! ruChunks || ! request.file )
        return

    String ruText = ruChunks.join("\n\n")

    def ruFile
    if( outDirName ) {
        def ruDir = new File(request.outFile.getParentFile(), "ru")
        ruDir.mkdirs()
        ruFile = new File(ruDir, toRuFileName(request.file.name))
    }
    else {
        // rename only the last path component, so a ".txt" inside a directory name
        // is never rewritten
        def outFile = new File(request.outFile.absolutePath)
        ruFile = new File(outFile.getParentFile(), toRuFileName(outFile.getName()))
    }

    ruFile.setText(ruText, StandardCharsets.UTF_8.name())
}

/**
 * Derives the companion file name: "x.txt" becomes "x.ru.txt". Names without a
 * ".txt" suffix get ".ru.txt" appended, so the result always differs from the input
 * and the companion file can never overwrite the original.
 */
private static String toRuFileName(String fileName) {
    String suffix = '.txt'
    if( fileName.endsWith(suffix) ) {
        return fileName.substring(0, fileName.length() - suffix.length()) + '.ru.txt'
    }
    return fileName + '.ru.txt'
}
/**
 * Rates how Ukrainian and how Russian a piece of text looks.
 *
 * The text is tokenized with ltModule.ukWordTokenizer; only tokens consisting of
 * Cyrillic letters, apostrophes, U+0301 and hyphens are kept, with leading and
 * trailing apostrophes stripped. Each remaining word gets a Ukrainian weight from
 * getUkWordRating() and, when that weight is below 10, a Russian weight:
 * 10 if it contains one of "ыэёъ", 8 if ltModule.knownWordRu() accepts it, else 0.
 *
 * @param text text to rate
 * @return two-element list [ukRate, ruRate] of doubles; each rate is the weight sum
 *         divided by the word count and by 10, except for the fixed results and the
 *         forced ruRate of 1 described in the body
 */
@CompileStatic
List evalChunk(String text) {
    // compiled once per call rather than once per token
    Pattern wordPattern = ~/(?ius)[а-яіїєґё'\u2019\u02BC\u0301-]+/
    Pattern ruOnlyLetters = ~/(?iu)[ыэёъ]/
    Pattern singleConsonant = ~/(?iu)[бвгдежзклмнпрстуфхцчшщю]/

    def chunks = ltModule.ukWordTokenizer.tokenize(text)
        .findAll { it && it ==~ wordPattern }
        .collect { it.replaceAll(/^['\u2019\u02BC]+|['\u2019\u02BC]+$/, '') }
        .findAll { it ==~ wordPattern }

    // no words at all
    if( chunks.isEmpty() )
        return [(double)1.0, (double)0.0]

    // short fragments (original example: "Лариса ГУТОРОVA".replace('V', 'В')) get a
    // fixed result that favours Ukrainian, unless they contain a lowercase
    // Russian-only letter and no lowercase Ukrainian-only letter
    if( chunks.size() < 10 ) {
        if( ! (text =~ /[ыэъё]/) || text =~ /[ієїґ]/ ) {
            return [(double)0.5, (double)0.1]
        }
    }

    int ukSum = 0
    int ruSum = 0
    boolean sawCertainUk = false
    boolean sawCertainRu = false

    for(String w: chunks) {
        int ukWeight = getUkWordRating(w)
        ukSum += ukWeight

        if( ukWeight == 10 ) {
            sawCertainUk = true
            continue
        }

        int ruWeight = 0
        if( w =~ ruOnlyLetters ) {
            ruWeight = 10
        }
        else if( ltModule.knownWordRu(w) ) {
            ruWeight = 8
        }

        // workaround for 1-letter abbreviations that Russian tagger claims to know
        if( w ==~ singleConsonant ) {
            ruWeight = 0
        }

        ruSum += ruWeight
        if( ruWeight == 10 ) {
            sawCertainRu = true
        }
    }

    double ukRate = (double)ukSum / chunks.size() / 10
    double ruRate = (double)ruSum / chunks.size() / 10

    // a top-weight Russian word with no top-weight Ukrainian word settles it
    if( sawCertainRu && ! sawCertainUk ) {
        ruRate = 1
    }

    // no Ukrainian evidence at all plus some Russian evidence settles it too
    if( ukRate == 0 && ruRate > 0.1 ) {
        ruRate = 1
    }

    [ukRate, ruRate]
}
/**
 * Splits a string on a delimiter pattern and keeps the delimiters.
 *
 * @param str          string to split
 * @param delimPattern pattern that matches a delimiter
 * @return the pieces in order of appearance: each non-empty stretch between matches
 *         and each matched delimiter as separate elements
 */
@CompileStatic
static List splitWithDelimiters(String str, Pattern delimPattern) {
    List pieces = new ArrayList()
    int cursor = 0

    Matcher m = delimPattern.matcher(str)
    while( m.find() ) {
        // text between the previous delimiter and this one, if any
        if( m.start() > cursor ) {
            pieces.add(str.substring(cursor, m.start()))
        }
        pieces.add(m.group())
        cursor = m.end()
    }

    // whatever follows the last delimiter
    if( cursor < str.length() ) {
        pieces.add(str.substring(cursor))
    }

    return pieces
}
/**
 * Rates how likely a word is Ukrainian, on a 0..10 scale.
 *
 * @param word word to rate
 * @return 10 when the word contains one of "іїєґ" or an apostrophe, or is exactly "й";
 *         0 when the first reading from ltModule.ukTagger has no tag;
 *         2 when some reading is tagged :bad or :subst (other than the exceptions
 *           in the body) and every reading is tagged :bad or :subst;
 *         8 otherwise
 * @throws Exception whatever the lookup throws, after the word is logged to stderr
 */
@CompileDynamic
int getUkWordRating(String word) {
    if( word =~ /(?iu)[іїєґ'\u2019\u02BC]|^й$/ )
        return 10

    try {
        List tokenReadings = ltModule.ukTagger.getAnalyzedTokens(word)

        if( tokenReadings[0].hasNoTag() )
            return 0

        // a :bad/:subst reading counts, unless it is tagged &adjp:actv or its lemma
        // ends in "ння"/"ий"
        def suspectReading = tokenReadings.find { AnalyzedToken t ->
            t.getPOSTag() =~ /:(bad|subst)/ &&
                ! t.getPOSTag().contains("&adjp:actv") &&
                ! (t.getLemma() =~ /(ння|ий)$/)
        }

        if( suspectReading ) {
            // only rate low when no reading at all is free of :bad/:subst
            boolean hasCleanReading = tokenReadings.any { AnalyzedToken t ->
                ! (t.getPOSTag() =~ /:(bad|subst)/)
            }
            if( ! hasCleanReading )
                return 2
        }

// if( token.find { AnalyzedToken t -> t.getPOSTag() =~ /:prop:geo|noun:inanim:.:v_kly/ } )
// return 5

        return 8
    }
    catch (Exception e) {
        System.err.println("Failed on word: " + word)
        throw e
    }
}
}