All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ua.net.nlp.other.clean.SpacingModule.groovy Maven / Gradle / Ivy

The newest version!
package ua.net.nlp.other.clean

import java.util.regex.Pattern
import java.util.stream.Collectors

import org.languagetool.AnalyzedToken
import org.languagetool.AnalyzedTokenReadings
import groovy.transform.CompileStatic
import groovy.transform.PackageScope
import groovy.transform.ToString
import ua.net.nlp.bruk.WordReading
import ua.net.nlp.tools.tag.DisambigStats
import ua.net.nlp.tools.tag.DisambigStats.Stat

@PackageScope
@CompileStatic
class SpacingModule {

    private static final String MONTHS = /січня|лютого|березня|квітня|травня|червня|липня|серпня|вересня|жовтня|листопада|грудня/
    private static final Pattern SPACED_MONTHS_REGEX = Pattern.compile(MONTHS.replaceAll(/([а-яіїє])(?=[а-яіїє])/, '$1 '))
    private static final Pattern SPACING_PATTERN = ~ /([а-яіїєґА-ЯІЇЄҐ] ){5,}/

    OutputTrait out
    LtModule ltModule
    DisambigStats disambigStats = new DisambigStats()
    boolean fullSpacing = false
    
    SpacingModule() {
//        disambigStats.loadDisambigStats()
    }

    String cleanupSpacing(String text) {
        text = text.replace("У к р а ї н и", "України")
        text = text.replaceAll(/([0-9])\h+р о к у\b/, '$1 року')
        text = text.replaceAll(/([0-9])\h+г о д и н а\b/, '$1 година')
        text = text.replaceAll(SPACED_MONTHS_REGEX, { String w1 ->
            w1.replace(' ', '')
        })
        
        // stenograms from Rada
        text = text.replaceAll(/[СсCc] е с і й н и й\h+з а л\h+В е р х о в н о ї\h+Р а д и/, "Сесійний зал Верховної Ради")
        text = text.replace("О п л е с к и", "Оплески")
        text = text.replaceAll(/П і с л я\h+п е р е р в и/, "Після перерви")
        text = text.replaceAll(/Л у н а є\h+Г і м н/, "Лунає Гімн")
        text = text.replaceAll(/Л у н а є\h+Д е р ж а в н и й\h+Г і м н/, "Лунає Державний Гімн")
        text = text.replace("п о с т а н о в л я є", "постановляє")
        text = text.replaceAll(/Х в и л и н а\h+м о в ч а н н я/, "Хвилина мовчання")
        text = text.replaceAll(/Й д е\h+р е є с т р а ц і я/, "Йде реєстрація")
        text = text.replaceAll(/Ш у м\h+у\h+з а л і/, "Шум у залі")


        text = text.replaceAll(/\b([гГ])\h+([а-яіїєґ'\u2019\u02bc-]{3,})/, { all, g, rest ->
            if( ! ltModule.knownWord(rest) ) {
                def newWord = "$g$rest"
                if( ltModule.knownWord(newWord) ) {
                    return newWord
                }
            }
            all
        })
        
        def m = SPACING_PATTERN.matcher(text)
        if( m.find() ) {
            
            def lines = text.lines()
            long cnt = lines.filter{l -> SPACING_PATTERN.matcher(l).find()}.count()
            
            out.println "\tWARNING: Possible spacing in words, e.g \"${CleanTextCore2.getContext(m, text)}\": ${cnt} of ${text.lines().count()} lines"

            if( fullSpacing ) {
                text = removeSpacing(text)
            }
        }
        text
    }
    
    String removeSpacing(String text) {
        text = text.replaceAll(/\. (?=\.)/, '.')
        
//        def t = text.readLines()
//            .parallelStream()
//            .map{ removeSpacingLine(it) }
//            .collect(Collectors.joining("\n"))

        def t = new StringBuilder(text.length())
        text.readLines().eachWithIndex { l, idx ->
            debug "line $idx :: $l"
            t.append( removeSpacingLine(l) ).append('\n')
        }
                
        if( ! text.endsWith("\n") ) {
            t.deleteCharAt(t.length()-1)
//            t = "$t\n"
        }
        
        t.toString()
    }
    
    String removeSpacingLine(String text) { 
        
        List chunks = MarkLanguageModule.splitWithDelimiters(text, ~ /(?iu)[^ .а-яіїєґa-z0-9\u0301'\u2019\u02BC\u2013\u2011-]+/)

        chunks = (List)chunks.collect{ it.split(/  +/) as List }.flatten()
        
        debug "chunks size: ${chunks.size()}"
        
        List chunks2 = (List)chunks.collect { String cnk ->
            List newChunks = []
            def sb = new StringBuilder(512)
            cnk.eachWithIndex { String ss, int idx ->
                char ch = ss.charAt(0)
                sb.append(ch)
                if( (ch == '.') // && ! (cnk[idx..-1] =~ /^ ?\./)) 
                        || cnk[idx..-1] =~ /^[а-яіїєґ] ?[А-ЯІЇЄҐ]/ ) {
                    newChunks << sb.toString()
                    sb = new StringBuilder(512)
                }
            }
            if( sb ) {
                newChunks << sb.toString()
            }
            newChunks
        }
        .flatten()

        debug "chunks2 size: ${chunks2.size()}"
        
        def dags = chunks2.parallelStream().map { c ->
            
            if( c.trim() && c =~ /(?iu)[а-яіїєґa-z0-9]/ && ! (c =~ /- ?$/)) {
                debug "chunk: $c"
                def noSpaces = c.replace(' ', '')
                def dags = getDag(noSpaces, '', [])
//                debug "Got dags: ${dags}"
                
                if( dags.size() == 0 ) {
                    out.println "\tFailed to merge: $c"
                    return c
                }
                
                return pickTheDag(dags)
            }
            else {
                if( c == '\n' ) {
                    debug "-- NL --"
                }
                else {
                    debug c
                }
                c
            }
        }
        .toList()
        debug "-- $dags"
     
        def sb = new StringBuilder(1024)
        dags.each { 
            if( sb && ! Character.isWhitespace(sb.charAt(sb.length()-1)) && ! noSpace(it.charAt(0)) ) {
                sb.append(" ")
            }
            sb.append(it)
        }
        
        return sb
    }
    
    static boolean noSpace(char ch) {
        ".,:;»)] \n\t".indexOf((int)ch) >= 0
    } 
    
    List getDag(String text, String indent, List prevs) {
        debug "$indent got: $text"
        
        List nodes = []
        
        assert text
        
        for(int ii=text.length()-1; ii>=0; ii--) {
            String curr = text[0..ii]
            
            if( curr == "." ) {
                nodes << new Node(word: curr, children: [])
                return nodes
            }
            
            // 4 1-letter words is not good
            if( curr.length() == 1 
                    && prevs.size() > 3
                    && prevs[-1].length() == 1
                    && prevs[-2].length() == 1
                    && prevs[-3].length() == 1 )
                return nodes
            
            if( goodWord(curr) ) {
//                println "$indent$curr"
                if( ii < text.length()-1 ) {
                    def others = getDag(text[ii+1..-1], "$indent  ", prevs + curr)
//                    def status = others == null ? "" : ""
//                    debug "$indent  :${ others == null || others.size() == 0 ? '<--' : others}"
                    if( ! others /*== null*/ )
                        continue
                        
                    nodes << new Node(word: curr, children: others)
                }
                else { // end of the chunk
                    nodes << new Node(word: curr, children: [])
                    debug "$indent  +++"
                    return nodes
                    // this DAG is done
                }
            }
            else {
                if( curr.length() == 1 )
                    return nodes
            }
            
        }
        
        nodes
    }

    boolean goodWord(String word) {
        
        int maxLen = word.contains("-") ? 45 : 30
        if( word.length() >= maxLen )
            return false
        
        List tokens = ltModule.tagWord(word)
        for(AnalyzedToken token: tokens) {
            if( ! token.hasNoTag() ) {
                if( (word.length() > 1 || ! token.getPOSTag().contains("abbr")) 
                        && ! (token.getPOSTag() =~ /^noun:inanim:.:v_kly/) )
                    return true
            }
        }
        return false
    }
    
    static class St {
        String dag
        double rate
        String toString() {
            "$rate ${dag.replaceAll(/[\n\r]+/, '\n')}"
        }
    }

    int nested = 0
    
    List getText(Node node, String parentBase) {
        if( parentBase.length() > nested ) {
            nested = parentBase.length()
            debug "Nested: $nested"
        }
        
        List out = []
        String currBase = "$parentBase${node.word}"
        if( node.children ) {
            node.children.each { ch ->
                out += getText(ch, "${currBase} ")
            }
        }
        else {
            out << currBase
        }
        
        return out.collect{ it.replaceAll(/ +([,.:;»“)\]\n\t])/, '$1') }
    }
    
    String pickTheDag(List dags) {
//        if( dags.size() == 1 && ! dags[0].children )
//            return getText(dags[0].word, "")
        
        List dagTexts = (List)dags.collect{ getText(it, "") }.flatten()
        def sb = new StringBuilder(256)
        
        def rated = dagTexts.collect { txt ->
            def tokens = ltModule.tagSent(txt)
            def sum = tokens.sum{ t -> /*rate(t)*/ Math.pow(t.cleanToken.length(), 3) } as Double
            def rate = sum / Math.pow(tokens.size(), 3)
            new St(rate: rate, dag: txt)
        }
//        println "rated: $rated"
        rated.max { it.rate }.dag
    }
        
    double rate(AnalyzedTokenReadings readings) {
        def cleanToken = readings.getCleanToken()
        
        if( ! disambigStats.statsByWord.containsKey(cleanToken) ) {
            // if no stats and there are non-prop readings, then try lowercase
            if( disambigStats.UPPERCASED_PATTERN.matcher(cleanToken).matches()
                    && readings.size() > readings.count{ r -> r.getPOSTag() == null || r.getPOSTag().contains(":prop") }  ) {
                cleanToken = cleanToken.toLowerCase()
            }
        }
        
        Map statsForWord = disambigStats.statsByWord[cleanToken]
        
//        assert statsForWord, "no status for $cleanToken"
        
        return statsForWord ? statsForWord.max { it.value.rate }.value.rate : 0
    }
    
    static class Node {
        String word
        List children
        
        String toString() {
            "$word $children"
        }
        
        String toText() {
            def next = children ? " ${children[0].toText()}"  : ""
            "$word$next"
        }
    }

    
    private static void debug(String text) {
        //println "DBG: $text"
    }    
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy