ua.net.nlp.other.clean.SpacingModule.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
package ua.net.nlp.other.clean
import java.util.regex.Pattern
import java.util.stream.Collectors
import org.languagetool.AnalyzedToken
import org.languagetool.AnalyzedTokenReadings
import groovy.transform.CompileStatic
import groovy.transform.PackageScope
import groovy.transform.ToString
import ua.net.nlp.bruk.WordReading
import ua.net.nlp.tools.tag.DisambigStats
import ua.net.nlp.tools.tag.DisambigStats.Stat
/**
 * Detects and repairs "letter-spaced" text (words typed with a space between
 * every letter, e.g. "У к р а ї н и") in Ukrainian text.
 *
 * cleanupSpacing() fixes a list of known phrases and warns about remaining
 * suspicious lines; when fullSpacing is set it additionally runs the
 * dictionary-driven re-joining implemented by removeSpacing().
 */
@PackageScope
@CompileStatic
class SpacingModule {
// Alternation of Ukrainian month word forms, used as a regex source.
private static final String MONTHS = /січня|лютого|березня|квітня|травня|червня|липня|серпня|вересня|жовтня|листопада|грудня/
// Same alternation with a single space inserted after every letter that is followed
// by another letter, i.e. it matches the letter-spaced spelling ("с і ч н я").
private static final Pattern SPACED_MONTHS_REGEX = Pattern.compile(MONTHS.replaceAll(/([а-яіїє])(?=[а-яіїє])/, '$1 '))
// Five or more consecutive "single Cyrillic letter + space" pairs: the heuristic for spaced-out text.
private static final Pattern SPACING_PATTERN = ~ /([а-яіїєґА-ЯІЇЄҐ] ){5,}/
// Sink for warnings and failure messages (used via println).
OutputTrait out
// Dictionary/tagger access: knownWord(), tagWord() and tagSent() are called on it.
LtModule ltModule
// Only read by rate(); note that loading of the stats is commented out in the constructor.
DisambigStats disambigStats = new DisambigStats()
// When true, cleanupSpacing() calls removeSpacing() on texts where SPACING_PATTERN matched.
boolean fullSpacing = false
// Intentionally empty: the disambiguation statistics are not loaded here.
SpacingModule() {
// disambigStats.loadDisambigStats()
}
/**
 * Repairs a fixed list of known letter-spaced words and phrases, re-attaches a
 * detached leading "г"/"Г" when the dictionary supports it, and warns when the
 * text still looks letter-spaced.
 *
 * If SPACING_PATTERN still matches after the fixed replacements, a warning with
 * a sample context and the number of affected lines is printed to {@code out};
 * when {@code fullSpacing} is set the text is additionally passed through
 * removeSpacing().
 *
 * @param text the text to clean
 * @return the cleaned text
 */
String cleanupSpacing(String text) {
    text = text.replace("У к р а ї н и", "України")

    // (?U) = UNICODE_CHARACTER_CLASS. Since JDK 19 a bare \b only recognises ASCII
    // word characters, so without the flag \b never matches next to a Cyrillic
    // letter and these rules silently stop working. With the flag the boundary is
    // Unicode-aware on every JDK version.
    text = text.replaceAll(/(?U)([0-9])\h+р о к у\b/, '$1 року')
    text = text.replaceAll(/(?U)([0-9])\h+г о д и н а\b/, '$1 година')

    text = text.replaceAll(SPACED_MONTHS_REGEX, { String w1 ->
        w1.replace(' ', '')
    })

    // stenograms from Rada
    text = text.replaceAll(/[СсCc] е с і й н и й\h+з а л\h+В е р х о в н о ї\h+Р а д и/, "Сесійний зал Верховної Ради")
    text = text.replace("О п л е с к и", "Оплески")
    text = text.replaceAll(/П і с л я\h+п е р е р в и/, "Після перерви")
    text = text.replaceAll(/Л у н а є\h+Г і м н/, "Лунає Гімн")
    text = text.replaceAll(/Л у н а є\h+Д е р ж а в н и й\h+Г і м н/, "Лунає Державний Гімн")
    text = text.replace("п о с т а н о в л я є", "постановляє")
    text = text.replaceAll(/Х в и л и н а\h+м о в ч а н н я/, "Хвилина мовчання")
    text = text.replaceAll(/Й д е\h+р е є с т р а ц і я/, "Йде реєстрація")
    text = text.replaceAll(/Ш у м\h+у\h+з а л і/, "Шум у залі")

    // A lone "г"/"Г" followed by a word: glue them together only when the second
    // part is not a known word by itself but the glued form is.
    text = text.replaceAll(/(?U)\b([гГ])\h+([а-яіїєґ'\u2019\u02bc-]{3,})/, { all, g, rest ->
        if( ! ltModule.knownWord(rest) ) {
            def newWord = "$g$rest"
            if( ltModule.knownWord(newWord) ) {
                return newWord
            }
        }
        all
    })

    def m = SPACING_PATTERN.matcher(text)
    if( m.find() ) {
        def lines = text.lines()
        long cnt = lines.filter{l -> SPACING_PATTERN.matcher(l).find()}.count()

        out.println "\tWARNING: Possible spacing in words, e.g \"${CleanTextCore2.getContext(m, text)}\": ${cnt} of ${text.lines().count()} lines"

        if( fullSpacing ) {
            text = removeSpacing(text)
        }
    }

    text
}
/**
 * Applies removeSpacingLine() to every line of the text.
 *
 * Spaced ellipses (". . .") are collapsed first. Lines are re-joined with '\n'
 * (so other line terminators are normalised by readLines()), and a trailing
 * newline is present in the result only if the input ended with "\n".
 *
 * @param text the text to process; may be empty
 * @return the text with each line processed
 */
String removeSpacing(String text) {
    text = text.replaceAll(/\. (?=\.)/, '.')

    def t = new StringBuilder(text.length())
    text.readLines().eachWithIndex { l, idx ->
        debug "line $idx :: $l"
        t.append( removeSpacingLine(l) ).append('\n')
    }

    // Every line was appended with '\n'; drop the last one if the input did not
    // end with a newline. The length check protects empty input, where nothing
    // was appended and deleteCharAt(-1) would throw.
    if( ! text.endsWith("\n") && t.length() > 0 ) {
        t.deleteCharAt(t.length()-1)
    }

    t.toString()
}
/**
 * Tries to re-join letter-spaced words within a single line.
 *
 * Steps, in order:
 *  1. cut the line into chunks (delimiter runs, then runs of spaces);
 *  2. cut each chunk again after every '.' and between a lowercase letter that is
 *     followed by an uppercase letter;
 *  3. for each chunk that contains a letter or digit and does not end with '-',
 *     build candidate segmentations with getDag() and pick one with pickTheDag();
 *     chunks that cannot be segmented are reported via out and kept unchanged;
 *  4. glue the resulting pieces back together, inserting a single space between
 *     pieces unless the next piece starts with a character accepted by noSpace().
 *
 * @param text one line of text
 * @return the rebuilt line
 */
String removeSpacingLine(String text) {
// Delimiters are runs of characters that are NOT space, '.', Cyrillic/Latin letters, digits,
// U+0301, apostrophes (', U+2019, U+02BC) or dashes (U+2013, U+2011, '-').
// Presumably splitWithDelimiters returns both the pieces and the delimiters, in order - confirm in MarkLanguageModule.
List chunks = MarkLanguageModule.splitWithDelimiters(text, ~ /(?iu)[^ .а-яіїєґa-z0-9\u0301'\u2019\u02BC\u2013\u2011-]+/)
// Split every piece on runs of spaces; after this no chunk contains a space.
// NOTE(review): that makes the later `c.replace(' ', '')` and the ` ?` parts of the regexes below no-ops,
// and puts every letter of a spaced word into its own chunk. Whitespace inside this literal may have been
// collapsed in this copy of the file - compare with upstream before relying on it.
// NOTE(review): the lists here are raw although the class is @CompileStatic and the closures call String
// methods on the elements; element types were presumably declared upstream - confirm.
chunks = (List)chunks.collect{ it.split(/ +/) as List }.flatten()
debug "chunks size: ${chunks.size()}"
// Second cut: walk each chunk character by character and close the current sub-chunk
// after a '.' or after a lowercase Cyrillic letter that is followed by an uppercase one.
List chunks2 = (List)chunks.collect { String cnk ->
List newChunks = []
def sb = new StringBuilder(512)
cnk.eachWithIndex { String ss, int idx ->
char ch = ss.charAt(0)
sb.append(ch)
// cnk[idx..-1] is the rest of the chunk starting at the current character
if( (ch == '.') // && ! (cnk[idx..-1] =~ /^ ?\./))
|| cnk[idx..-1] =~ /^[а-яіїєґ] ?[А-ЯІЇЄҐ]/ ) {
newChunks << sb.toString()
sb = new StringBuilder(512)
}
}
// flush whatever is left after the last cut; an empty chunk yields no sub-chunks at all
if( sb ) {
newChunks << sb.toString()
}
newChunks
}
.flatten()
debug "chunks2 size: ${chunks2.size()}"
// Segment the chunks in parallel.
// NOTE(review): out.println and (via pickTheDag -> getText) the `nested` field are touched from
// the stream's worker threads - confirm that OutputTrait tolerates concurrent use.
def dags = chunks2.parallelStream().map { c ->
// only chunks that are non-blank, contain a letter or digit, and do not end with a hyphen are segmented
if( c.trim() && c =~ /(?iu)[а-яіїєґa-z0-9]/ && ! (c =~ /- ?$/)) {
debug "chunk: $c"
def noSpaces = c.replace(' ', '')
def dags = getDag(noSpaces, '', [])
// debug "Got dags: ${dags}"
// no segmentation found: report it and keep the chunk as it was
if( dags.size() == 0 ) {
out.println "\tFailed to merge: $c"
return c
}
return pickTheDag(dags)
}
else {
// everything else passes through unchanged
if( c == '\n' ) {
debug "-- NL --"
}
else {
debug c
}
c
}
}
.toList()
debug "-- $dags"
// Re-assemble: add a separating space unless the buffer is empty, already ends with
// whitespace, or the next piece starts with a character accepted by noSpace().
def sb = new StringBuilder(1024)
dags.each {
if( sb && ! Character.isWhitespace(sb.charAt(sb.length()-1)) && ! noSpace(it.charAt(0)) ) {
sb.append(" ")
}
sb.append(it)
}
// sb is a StringBuilder while the method is declared to return String; this relies on
// Groovy converting the value on return (presumably via toString() - confirm).
return sb
}
/**
 * Tells whether a piece starting with this character should be attached to the
 * preceding text without a separating space (closing punctuation and whitespace).
 *
 * @param ch first character of the next piece
 * @return true when no space should be inserted before it
 */
static boolean noSpace(char ch) {
    String attachedChars = ".,:;»)] \n\t"
    return attachedChars.contains(String.valueOf(ch))
}
/**
 * Recursively enumerates ways to split a space-free string into dictionary words.
 *
 * Prefixes of {@code text} are tried from the longest to the shortest. Every
 * prefix accepted by goodWord() whose remainder can itself be segmented becomes
 * a Node whose children are the segmentations of the remainder. If the whole
 * text is a good word it is returned as the only result and shorter prefixes
 * are not tried.
 *
 * Element types are declared explicitly so that {@code prevs[-1].length()}
 * type-checks under the class-level @CompileStatic.
 *
 * @param text   non-empty remainder to segment
 * @param indent prefix used only for debug output
 * @param prevs  words already chosen on the path leading to this call
 * @return alternative first-word nodes; empty when no segmentation was found
 */
List<Node> getDag(String text, String indent, List<String> prevs) {
    debug "$indent got: $text"

    assert text

    List<Node> nodes = []

    for(int ii=text.length()-1; ii>=0; ii--) {
        String curr = text[0..ii]

        // a period is always a token of its own and ends the search at this level
        if( curr == "." ) {
            nodes << new Node(word: curr, children: [])
            return nodes
        }

        // 4 1-letter words is not good
        // NOTE(review): the size check requires more than three previous words although
        // only the last three are inspected - confirm whether ">= 3" was intended.
        if( curr.length() == 1
                && prevs.size() > 3
                && prevs[-1].length() == 1
                && prevs[-2].length() == 1
                && prevs[-3].length() == 1 )
            return nodes

        if( ! goodWord(curr) ) {
            // even the shortest prefix is not a word: nothing more to try
            if( curr.length() == 1 )
                return nodes
            continue
        }

        // the whole remaining text is a word: this DAG is done
        if( ii == text.length()-1 ) {
            nodes << new Node(word: curr, children: [])
            debug "$indent +++"
            return nodes
        }

        // keep this prefix only if the rest of the text can be segmented too
        List<Node> others = getDag(text[ii+1..-1], "$indent ", prevs + curr)
        if( others ) {
            nodes << new Node(word: curr, children: others)
        }
    }

    nodes
}
/**
 * Decides whether a candidate string is acceptable as a word in a segmentation.
 *
 * Candidates of 30 or more characters (45 or more when hyphenated) are rejected
 * outright. Otherwise the word is accepted as soon as the tagger returns a
 * tagged reading that is neither an "abbr" reading of a one-letter word nor
 * matches /^noun:inanim:.:v_kly/.
 *
 * @param word candidate word
 * @return true if at least one reading qualifies
 */
boolean goodWord(String word) {
    int lengthLimit = 30
    if( word.contains("-") ) {
        lengthLimit = 45
    }
    if( word.length() >= lengthLimit ) {
        return false
    }

    List tokens = ltModule.tagWord(word)
    for(AnalyzedToken token: tokens) {
        if( token.hasNoTag() ) {
            continue
        }

        // one-letter abbreviations do not count as words
        boolean oneLetterAbbr = word.length() <= 1 && token.getPOSTag().contains("abbr")
        if( oneLetterAbbr ) {
            continue
        }

        boolean excludedNounForm = (token.getPOSTag() =~ /^noun:inanim:.:v_kly/).find()
        if( ! excludedNounForm ) {
            return true
        }
    }

    return false
}
/** A candidate text together with the score assigned to it by pickTheDag(). */
static class St {
    String dag
    double rate

    /** Renders "<rate> <text>", with every run of line breaks in the text squeezed to a single '\n'. */
    @Override
    String toString() {
        String squeezed = dag.replaceAll(/[\n\r]+/, '\n')
        return new StringBuilder().append(rate).append(' ').append(squeezed).toString()
    }
}
// Longest prefix length seen by getText() so far; only used for debug output.
// NOTE(review): updated without synchronisation while removeSpacingLine() runs a parallel stream.
int nested = 0

/**
 * Expands a segmentation node into every complete text it represents.
 *
 * Each root-to-leaf path becomes one string: {@code parentBase} followed by the
 * words on the path separated by single spaces. Spaces directly in front of
 * closing punctuation, '\n' or '\t' are removed afterwards.
 *
 * The paths are collected first and normalised once, instead of re-running the
 * regex over the accumulated strings at every recursion level; the result is
 * the same because the replacement is idempotent.
 *
 * @param node       root of the (sub)tree to expand
 * @param parentBase text already built for the ancestors of {@code node}
 * @return one string per leaf, in depth-first order
 */
List<String> getText(Node node, String parentBase) {
    List<String> texts = []
    collectTexts(node, parentBase, texts)
    return texts.collect { String t -> t.replaceAll(/ +([,.:;»“)\]\n\t])/, '$1') }
}

/** Depth-first walk that appends the raw text of every leaf path below {@code node} to {@code acc}. */
private void collectTexts(Node node, String parentBase, List<String> acc) {
    if( parentBase.length() > nested ) {
        nested = parentBase.length()
        debug "Nested: $nested"
    }

    String currBase = "$parentBase${node.word}"

    if( node.children ) {
        for(Object child: node.children) {
            collectTexts((Node) child, "${currBase} ", acc)
        }
    }
    else {
        acc << currBase
    }
}
/**
 * Chooses the best candidate text among all segmentations.
 *
 * Every candidate produced by getText() is tagged with ltModule.tagSent() and
 * scored as (sum of cubed token lengths) / (number of tokens cubed), which
 * favours fewer, longer tokens. The candidate with the highest score wins.
 *
 * The unused StringBuilder of the previous version was removed, and the list
 * and closure parameter types are declared so that the getText() call
 * type-checks under @CompileStatic.
 *
 * @param dags non-empty list of root nodes as returned by getDag()
 * @return the text of the highest-scoring candidate
 */
String pickTheDag(List<Node> dags) {
    List<String> dagTexts = (List<String>) dags.collect { Node dag -> getText(dag, "") }.flatten()

    def rated = dagTexts.collect { String txt ->
        // presumably one entry per token of the candidate text - confirm what tagSent() includes
        def tokens = ltModule.tagSent(txt)
        def sum = tokens.sum{ t -> Math.pow(t.cleanToken.length(), 3) } as Double
        def rate = sum / Math.pow(tokens.size(), 3)
        new St(rate: rate, dag: txt)
    }

    rated.max { it.rate }.dag
}
/**
 * Looks up the highest recorded rate for a token in the disambiguation statistics.
 *
 * When the token has no statistics, matches UPPERCASED_PATTERN, and has at least
 * one reading that is tagged and not ":prop", the lowercased form is looked up
 * instead.
 *
 * @param readings the token readings to rate
 * @return the maximum rate found for the token, or 0 when there are no statistics
 */
double rate(AnalyzedTokenReadings readings) {
    def lookupKey = readings.getCleanToken()

    boolean hasStats = disambigStats.statsByWord.containsKey(lookupKey)
    if( ! hasStats && disambigStats.UPPERCASED_PATTERN.matcher(lookupKey).matches() ) {
        // if no stats and there are non-prop readings, then try lowercase
        def propOrUntagged = readings.count{ r -> r.getPOSTag() == null || r.getPOSTag().contains(":prop") }
        if( readings.size() > propOrUntagged ) {
            lookupKey = lookupKey.toLowerCase()
        }
    }

    Map statsForWord = disambigStats.statsByWord[lookupKey]
    if( ! statsForWord ) {
        return 0
    }
    return statsForWord.max { it.value.rate }.value.rate
}
/**
 * One word of a candidate segmentation plus the alternative continuations
 * (child nodes) that may follow it.
 */
static class Node {
    String word
    List children

    /** Debug rendering: the word followed by the child list. */
    @Override
    String toString() {
        return word + ' ' + children
    }

    /** Text of the path that always follows the first child: words joined by single spaces. */
    String toText() {
        def path = new StringBuilder().append(word)
        if( children ) {
            path.append(' ').append(children[0].toText())
        }
        return path.toString()
    }
}
/**
 * Debug hook. Currently a no-op: the println below is commented out, so nothing
 * is printed. Callers still build their message strings before calling.
 *
 * @param text the debug message
 */
private static void debug(String text) {
//println "DBG: $text"
}
}