All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ua.net.nlp.other.EvaluateText.groovy Maven / Gradle / Ivy

The newest version!
#!/bin/env groovy

// This script checks the text with LanguageTool 
// and prints an error rating (along with count and total # of words)
//
// NOTE: it disables some rules, like spelling, double whitespace etc

package ua.net.nlp.other

import groovy.transform.CompileStatic
import org.languagetool.*
import org.languagetool.rules.*
import org.languagetool.tokenizers.*
import org.languagetool.language.*


class EvaluateText {
    // Comma-separated LanguageTool rule IDs that are switched off before checking;
    // main() splits this string on "," and hands the IDs to langTool.disableRules().
    // Every fragment except the last must end with a comma, otherwise two IDs fuse
    // into one unknown ID and neither rule is actually disabled.
    private static final String RULES_TO_IGNORE =
        "MORFOLOGIK_RULE_UK_UA,COMMA_PARENTHESIS_WHITESPACE,WHITESPACE_RULE," +
        "EUPHONY_PREP_V_U,EUPHONY_CONJ_I_Y,EUPHONY_PREP_Z_IZ_ZI,EUPHONY_PREP_O_OB," +
        "DATE_WEEKDAY1,DASH,UK_HIDDEN_CHARS,UPPER_INDEX_FOR_M,DEGREES_FOR_C,DIGITS_AND_LETTERS," +
        "UK_MIXED_ALPHABETS,UK_SIMPLE_REPLACE_SOFT"
    // NOTE(review): the IDs below are commented out of the list, i.e. these rules stay enabled — confirm this is intended.
    //,UK_SIMPLE_REPLACE,INVALID_DATE,YEAR_20001,"


    // LanguageTool checker for Ukrainian, backed by the multi-threaded implementation.
    final JLanguageTool langTool = new MultiThreadedJLanguageTool(new Ukrainian());
    // SRX sentence tokenizer for Ukrainian.
    // NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
    final SRXSentenceTokenizer stokenizer = new SRXSentenceTokenizer(new Ukrainian());
    // Result of langTool.getAllRules(), captured when this instance is constructed.
    final List allRules = langTool.getAllRules()
    
    
    /**
     * Runs LanguageTool over the given text and records any matches.
     *
     * @param text       text to check
     * @param force      when true the text is checked even if it is blank
     * @param errorLines list passed on to printMatches when at least one match is found
     * @return the matches returned by langTool.check(); an empty list (never null)
     *         when blank text is skipped
     */
    List check(String text, boolean force, List errorLines) {
        if( ! force && text.trim().isEmpty() ) {
            // Return an empty list rather than null so callers can safely call
            // size()/iterate; Groovy truth treats both as false.
            return []
        }

        List matches = langTool.check(text)

        if( ! matches.isEmpty() ) {
            printMatches(matches, text, errorLines)
        }

        return matches
    }


    @CompileStatic    
    void printMatches(List matches, String text, List errorLines) {

        def i = 0
        def total = 0
        
        def lines = text.split("\n")
        
        for (RuleMatch match : matches) {
            errorLines << "Rule ID:  ${match.getRule().getId()}".toString()
            errorLines << "Message:  " + match.getMessage().replace("", "«").replace("", "»")

            def chunkOffset = 0
            def leftOff = 40
            def rightOff = 40
            def posInSent = match.getFromPos() - leftOff
            def posToInSent = match.getToPos() + rightOff

            def prefix = ""
            def suffix = ""
            if( posInSent <= 0 ) {
                posInSent = 0
            }
            else {
                prefix = "…"
                chunkOffset = 1
            }
            if( posToInSent >= text.length() ) {
                posToInSent = text.length()
            }
            else {
                suffix = "…"
            }

            def sample = text[posInSent.. 0 ? args[0] : "."
        def outDir = "$dir/err"

        def outDirFile = new File(outDir)
        if( ! outDirFile.isDirectory() ) {
            System.err.println "Output dir $outDir does not exists"
            return
        }


        def nlpUk = new EvaluateText()
        nlpUk.langTool.disableRules(Arrays.asList(RULES_TO_IGNORE.split(",")))


        def ratings = ["коеф помил унік  слів файл"]
        new File("$outDir/ratings.txt").text = ""

        new File(dir).eachFile { file->
            if( ! file.name.endsWith(".txt") )
                return


            def text = file.text
            List errorLines = []


            println(String.format("checking $file.name, words: %d, size: %d", word_count(text), text.size()))

            def paragraphs = text.split("\n\n")

            int matchCnt = 0
            int uniqueRules = 0

            try {
                paragraphs.each { String para ->
                    def matches = nlpUk.check(para, false, errorLines)
                    if( matches ) {
                        matchCnt += matches.size()
                        uniqueRules += getUniqueRuleCount(matches)
                    }
                }

                def matches = nlpUk.check("", true, errorLines)
                if( matches ) {
                    matchCnt += matches.size()
                    uniqueRules += getUniqueRuleCount(matches)
                }

                def wc = word_count(text)
                def rating = Math.round(matchCnt * 10000 / wc)/100
                ratings << String.format("%1.2f %4d %4d %6d %s", rating, matchCnt, uniqueRules, wc, file.name)

                new File(outDir + "/" + file.name.replace(".txt", ".err.txt")).text = errorLines.join("\n")
                errorLines.clear()            }
            catch(Exception e) {
                e.printStackTrace();
            }
        }

        new File("$outDir/ratings.txt").text = ratings.join("\n")

    }


    // Counts distinct rules among the given matches. Matches of the
    // UK_SIMPLE_REPLACE rule are told apart by their message, all other
    // matches by their rule ID.
    static getUniqueRuleCount(matches) {
        def keys = matches.collect { match ->
            if( match.rule.id == "UK_SIMPLE_REPLACE" ) {
                return match.message
            }
            return match.rule.id
        }
        return keys.unique().size()
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy