All downloads are free. Search and download functionality uses the official Maven repository.

ua.net.nlp.other.clean.MarkLanguageModule.groovy Maven / Gradle / Ivy

The newest version!
package ua.net.nlp.other.clean

import java.nio.charset.StandardCharsets
import java.util.regex.Matcher
import java.util.regex.Pattern

import org.languagetool.AnalyzedToken
import org.languagetool.tagging.ru.RussianTagger

import groovy.transform.CompileDynamic
import groovy.transform.CompileStatic
import groovy.transform.PackageScope
import ua.net.nlp.other.clean.CleanOptions.MarkOption
import ua.net.nlp.other.clean.CleanOptions.ParagraphDelimiter
import ua.net.nlp.other.clean.CleanRequest

@PackageScope
@CompileStatic
class MarkLanguageModule {
    
    CleanOptions options
    OutputTrait out
    LtModule ltModule
    
    static class ParaIterator implements Iterator {
        String text;
        String delim;
        int from = 0
        
        @Override
        public boolean hasNext() {
            return from < text.length()
        }
        
        @Override
        public String next() {
            if( from == text.length()) throw new IllegalArgumentException()
            
            int pos = text.indexOf(delim, from)
            if( pos == -1 ) pos = text.length()
            def ret = pos == from ? delim : text[from..(?!---<\/span>)(.*?)<\/span>/
    
    String markRussian(CleanRequest request, String outDirName) {
        // clean previous marks unless they are cut
        def text = request.text
        
        if( text.contains('
                markRuChunks(sent, ruChunks, outDirName)
            }
            .join("")
        
        if( options.markLanguages == MarkOption.cut ) {
            writeCutTextToFile(ruChunks, request, outDirName)
        }

        text
    }

    /**
     * Decides whether a single chunk of text should be treated as Russian and
     * returns the text that replaces it in the output.
     *
     * @param sent       the chunk to evaluate
     * @param ruChunks   collector that receives chunks judged Russian when the
     *                   mark option is anything other than {@code MarkOption.mark}
     * @param outDirName not used by this method
     * @return {@code sent} unchanged when it has no Cyrillic letters or its
     *         Ukrainian rate is not lower than its Russian rate; otherwise the
     *         marked chunk (mark mode) or the placeholder {@code '---'}
     */
    private markRuChunks(String sent, List ruChunks, String outDirName) {
        // nothing to classify if the chunk has no Cyrillic letters at all
        if( ! (sent =~ /[а-яіїєґёА-ЯІЇЄҐЁ]/) ) {
            return sent
        }

        // evalChunk returns a two-element list: [ukRate, ruRate]
        List vals = evalChunk(sent)
        Double ukRate = vals[0], ruRate = vals[1]

        if( ukRate < ruRate ) {
            // round the Russian rate to two decimal places
            // NOTE(review): the rounded value is not used anywhere below in the visible code - confirm against upstream
            ruRate = Math.round(ruRate * 100d)/100d
            // moves whitespace that directly precedes a trailing </span> to after that tag
            // NOTE(review): nothing visible here appends </span> to sent, so this only matches
            // if sent already ends with one - the wrapping markup may have been lost; confirm
            String marked = "$sent".replaceFirst(/(?s)([\h\v]+)(<\/span>)$/, '$2$1')
            if( options.markLanguages == MarkOption.mark ) {
                // mark mode: keep the chunk in the text
                marked
            }
            else {
                // otherwise: collect the chunk and leave a placeholder in its place
                ruChunks << marked
                '---'
            }
        }
        else {
            sent
        }
    }

    /**
     * Writes the collected Russian chunks to a separate {@code .ru.txt} file (UTF-8),
     * with the chunks separated by an empty line.
     *
     * Does nothing when there are no chunks or the request has no input file.
     *
     * When {@code outDirName} is set, the file is placed in a {@code ru} subdirectory
     * of the output file's parent directory and named after the input file.
     * Otherwise it is placed next to the output file.
     *
     * Only a trailing {@code .txt} extension is rewritten to {@code .ru.txt}; a
     * ".txt" occurring earlier in the name or in a directory of the path is left alone.
     *
     * @param ruChunks   chunks to write
     * @param request    request providing {@code file} and {@code outFile}
     * @param outDirName output directory name; only checked for being set
     */
    private writeCutTextToFile(List ruChunks, CleanRequest request, String outDirName) {
        if( ! ruChunks || ! request.file )
            return

        String ruText = ruChunks.join("\n\n")
        File ruFile
        if( outDirName ) {
            // anchored at the end so only the extension is rewritten
            String ruFilename = request.file.name.replaceFirst(/\.txt$/, '.ru.txt')

            File ruDir = new File(request.outFile.getParentFile(), "ru")
            ruDir.mkdirs()

            ruFile = new File(ruDir, ruFilename)
        }
        else {
            String outPath = request.outFile.absolutePath
            String ruPath = outPath.replaceFirst(/\.txt$/, '.ru.txt')
            // without a .txt extension the replace is a no-op; never write the
            // Russian text over the regular output file
            if( ruPath == outPath ) {
                ruPath = outPath + '.ru.txt'
            }
            ruFile = new File(ruPath)
        }
        ruFile.setText(ruText, StandardCharsets.UTF_8.name())
    }

    /**
     * Rates how Ukrainian and how Russian a piece of text looks.
     *
     * The text is tokenized with the Ukrainian word tokenizer; only tokens made of
     * Cyrillic letters, apostrophes, the combining acute accent and hyphens are
     * kept (leading/trailing apostrophes are stripped first). Each kept word gets a
     * Ukrainian weight from {@link #getUkWordRating(String)}; words whose Ukrainian
     * weight is below 10 also get a Russian weight (10 if they contain one of
     * ы/э/ё/ъ, 8 if the Russian tagger knows them, otherwise 0). The rates are the
     * weight sums divided by the word count and by 10.
     *
     * @param text the chunk to evaluate
     * @return a two-element list {@code [ukRate, ruRate]}
     */
    @CompileStatic
    List<Double> evalChunk(String text) {
        String wordPattern = /(?ius)[а-яіїєґё'\u2019\u02BC\u0301-]+/

        def chunks = ltModule.ukWordTokenizer.tokenize(text)
            .findAll { it && it ==~ wordPattern }
            .collect { it.replaceAll(/^['\u2019\u02BC]+|['\u2019\u02BC]+$/, '') }
            .findAll { it ==~ wordPattern }

        // no words to judge: report as fully Ukrainian
        if( chunks.isEmpty() )
            return [(double)1.0, (double)0.0]

        // Short chunks (fewer than 10 words) get a fixed Ukrainian-leaning score unless
        // they contain one of the lowercase letters ы/э/ъ/ё and none of і/є/ї/ґ.
        // The original comment here only gave an example, the name "Larysa HUTOROVA" -
        // presumably a short line that must not be classified as Russian.
        if( chunks.size() < 10 ) {
            if( ! (text =~ /[ыэъё]/) || text =~ /[ієїґ]/ ) {
                return [(double)0.5, (double)0.1]
            }
        }

        int ukSum = 0
        int ruSum = 0
        // sums of the "certain" (weight 10) votes only
        int ukSum10 = 0
        int ruSum10 = 0
        for(String w: chunks) {
            int ukWeight = getUkWordRating(w)
            ukSum += ukWeight
            if( ukWeight == 10 ) {
                ukSum10 += ukWeight
            }

            // the Russian weight is only computed for words that are not certainly Ukrainian
            if( ukWeight < 10 ) {
                int ruWeight = w =~ /(?iu)[ыэёъ]/ ? 10 : ltModule.knownWordRu(w) ? 8 : 0

                // workaround for 1-letter abbreviations that Russian tagger claims to know
                if( w ==~ /(?iu)[бвгдежзклмнпрстуфхцчшщю]/ ) {
                    ruWeight = 0
                }

                ruSum += ruWeight
                if( ruWeight == 10 ) {
                    ruSum10 += ruWeight
                }
            }
        }

        double ukRate = (double)ukSum / chunks.size() / 10
        double ruRate = (double)ruSum / chunks.size() / 10

        // certain Russian words and no certain Ukrainian ones: force the Russian rate to 1
        if( ruSum10 > 0 && ukSum10 == 0 ) {
            ruRate = 1
        }

        // no Ukrainian evidence at all and more than marginal Russian evidence
        if( ukRate == 0 && ruRate > 0.1 ) {
            ruRate = 1
        }

        [ukRate, ruRate]
    }

    /**
     * Splits a string on a delimiter pattern and keeps the delimiters.
     *
     * The result lists, in order, every non-empty stretch of text between matches
     * and every match of {@code delimPattern} itself, so concatenating the parts
     * gives back {@code str}.
     *
     * @param str          the text to split
     * @param delimPattern pattern whose matches are treated as delimiters
     * @return the alternating text and delimiter parts; empty for an empty string
     *         with no matches
     */
    @CompileStatic
    static List<String> splitWithDelimiters(String str, Pattern delimPattern) {
        List<String> parts = new ArrayList<String>()
        Matcher matcher = delimPattern.matcher(str)

        int lastEnd = 0
        while( matcher.find() ) {
            int start = matcher.start()

            // text between the previous delimiter and this one; skipped when empty
            if( lastEnd != start ) {
                parts.add(str.substring(lastEnd, start))
            }

            parts.add(matcher.group())
            lastEnd = matcher.end()
        }

        // trailing text after the last delimiter
        if( lastEnd != str.length() ) {
            parts.add(str.substring(lastEnd))
        }

        return parts
    }

    
    /**
     * Rates how confidently a word can be considered Ukrainian.
     *
     * <ul>
     * <li>10 - the word contains one of і/ї/є/ґ or an apostrophe, or is exactly "й"</li>
     * <li>0 - the Ukrainian tagger's first reading has no tag</li>
     * <li>2 - every reading is tagged {@code :bad} or {@code :subst}, and at least one
     *     of them is neither {@code &adjp:actv} nor has a lemma ending in "ння"/"ий"</li>
     * <li>8 - any other word with a tag</li>
     * </ul>
     *
     * @param word the word to rate
     * @return the rating: 0, 2, 8 or 10
     * @throws Exception any exception from the tagger is rethrown after the failing
     *         word has been printed to stderr
     */
    @CompileDynamic
    int getUkWordRating(String word) {
        if( word =~ /(?iu)[іїєґ'\u2019\u02BC]|^й$/ )
            return 10

        try {
            List<AnalyzedToken> tokenReadings = ltModule.ukTagger.getAnalyzedTokens(word)
            if( tokenReadings[0].hasNoTag() )
                return 0

            // a reading counts against the word if it is tagged :bad/:subst, except for
            // active participles and lemmas ending in "ння"/"ий"
            boolean hasBadReading = tokenReadings.any { AnalyzedToken t ->
                    t.getPOSTag() =~ /:(bad|subst)/ && ! t.getPOSTag().contains("&adjp:actv") && ! (t.getLemma() =~ /(ння|ий)$/) }

            if( hasBadReading ) {
                // only downgrade when there is no regular reading at all
                boolean hasRegularReading = tokenReadings.any { AnalyzedToken t -> ! (t.getPOSTag() =~ /:(bad|subst)/) }
                if( ! hasRegularReading )
                    return 2
            }

            return 8
        }
        catch (Exception e) {
            System.err.println("Failed on word: " + word)
            throw e
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy