ua.net.nlp.other.clean.HyphenModule.groovy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
package ua.net.nlp.other.clean

import java.util.function.Function
import java.util.regex.MatchResult
import java.util.regex.Matcher
import java.util.regex.Pattern

import groovy.transform.CompileDynamic
import groovy.transform.CompileStatic
import groovy.transform.PackageScope

@PackageScope
@CompileStatic
class HyphenModule {
    
    // Soft hyphen (U+00AD) at a line break. Groups: (1) a letter/apostrophe optionally followed by a real
    // hyphen or dash, then one or more soft hyphens, (2) the newline plus the next line's indentation,
    // (3) the continuation of the word, (4) optional trailing punctuation.
    private final Pattern SOFT_HYPHEN_PATTERN1 = Pattern.compile(/([а-яіїєґА-ЯІЇЄҐa-zA-Z'ʼ’][-\u2013\u2014\u2011]?)\u00AD+(\n[ \t]*)([-\u2013\u2014\u2011]?[а-яіїєґА-ЯІЇЄҐa-zA-Z'ʼ’-]+)([,;.!?])?/)
    // Soft hyphen(s) within a line: the character before (letter, apostrophe, colon, dot or space, optionally
    // followed by a hyphen/dash) and the character after (optionally preceded by a hyphen/dash).
    private final Pattern SOFT_HYPHEN_PATTERN3 = Pattern.compile(/([а-яіїєґА-ЯІЇЄҐa-zA-Z'ʼ’:. ][-\u2013\u2014\u2011]?)\u00AD+([-\u2013\u2014\u2011]?[а-яіїєґА-ЯІЇЄҐa-zA-Z'ʼ’ -])/)
    // Soft hyphen(s) between a digit and a letter.
    private final Pattern SOFT_HYPHEN_PATTERN2 = Pattern.compile(/([0-9])\u00AD+([а-яіїєґА-ЯІЇЄҐa-zA-Z])/)
    
    // Receiver of the progress and diagnostic messages this class emits via println.
    OutputTrait out
    // Dictionary helper; this class calls knownWord() and getLemmas() on it.
    LtModule ltModule
    
    /**
     * Runs the three hyphen clean-up passes in a fixed order: U+00AD soft hyphens,
     * then U+00AC signs used as hyphens, then tildes used as hyphens.
     *
     * @param text the text to clean
     * @return the text after all three passes
     */
    @CompileStatic
    String removeSoftHyphens(String text) {
        String afterSoftHyphens = remove00ADHyphens(text)
        String afterNotSigns = remove00ACHyphens(afterSoftHyphens)
        return removeTildaAsHyphen(afterNotSigns)
    }

    /**
     * Removes U+00AD (soft hyphen) characters from the text.
     * Text that contains no soft hyphen is returned unchanged and nothing is printed.
     *
     * @param text the text to clean
     * @return the text after the three soft-hyphen replacements
     */
    private String remove00ADHyphens(String text) {
        if( ! text.contains("\u00AD") )
            return text

        out.println "\tremoving soft hyphens: "

        // word split across a line break: pull the continuation (and its punctuation) up, keep the newline after it
        String rejoined = SOFT_HYPHEN_PATTERN1.matcher(text).replaceAll('$1$3$4$2')
        // digit + soft hyphen + letter: the soft hyphen becomes a regular hyphen
        String withDigitHyphens = SOFT_HYPHEN_PATTERN2.matcher(rejoined).replaceAll('$1-$2')
        // soft hyphens left inside a line are simply dropped
        String cleaned = SOFT_HYPHEN_PATTERN3.matcher(withDigitHyphens).replaceAll('$1$2')

        // report anything none of the patterns matched
        if( cleaned.contains("\u00AD") ) {
            def ctx = CleanUtils.getContext(cleaned, "\u00AD")
            out.println "\t\tNOTE: still contains U+00AD hyphens: $ctx"
        }

        return cleaned
    }

    // Two Cyrillic word parts separated by U+00AC (the "not" sign) and an optional single space.
    private final Pattern AC_HYPHEN_PATTERN1 = Pattern.compile(/([а-яіїєґА-ЯІЇЄҐ'ʼ’-]*[а-яіїєґА-ЯІЇЄҐ])\u00AC ?([а-яіїєґА-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'ʼ’-]*)/)
    
    /**
     * Replaces U+00AC characters that stand in for hyphens.
     * <p>
     * Between a digit and a letter/digit the sign always becomes a regular hyphen.
     * Between two Cyrillic word parts the hyphenated form is tried first, then the
     * concatenated form; the first one {@code ltModule.knownWord} accepts wins, and the
     * match is left untouched when neither is known.
     * <p>
     * Text without U+00AC is returned unchanged and nothing is printed.
     *
     * @param text the text to clean
     * @return the cleaned text
     */
    @CompileStatic
    String remove00ACHyphens(String text) {
        if( ! text.contains("\u00AC") ) // ¬
            return text

        out.println "\tremoving U+00AC hyphens: "

        // 10¬ий
        String digitsFixed = text.replaceAll(/([0-9])\u00AC([а-яіїєґА-ЯІЇЄҐ0-9])/, '$1-$2')

        String result = AC_HYPHEN_PATTERN1.matcher(digitsFixed).replaceAll( new Function<MatchResult, String>() {
            @Override
            String apply(MatchResult mr) {
                String w1 = mr.group(1)
                String w2 = mr.group(2)

                String hyphenated = w1 + "-" + w2
                if( ltModule.knownWord(hyphenated) ) return hyphenated

                String joined = w1 + w2
                if( ltModule.knownWord(joined) ) return joined

                // neither form is a known word: keep the original match as is
                return mr.group(0)
            }
        } )

        // Check the processed result, not the input: the input always contains U+00AC on this path,
        // so testing it would print the note on every call and show already-fixed context.
        if( result.contains("\u00AC") ) {
            def ctx = CleanUtils.getContext(result, "\u00AC")
            out.println "\t\tNOTE: still contains U+00AC hyphens: $ctx"
        }

        return result
    }

    // Two Cyrillic word parts with a tilde directly between them.
    private final Pattern AC_HYPHEN_PATTERN_TILDA = Pattern.compile(/([а-яіїєґА-ЯІЇЄҐ'ʼ’-]*[а-яіїєґА-ЯІЇЄҐ])~([а-яіїєґА-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'ʼ’-]*)/)
    
    /**
     * Replaces a tilde that sits between two Cyrillic word parts.
     * The tilde becomes a hyphen when the hyphenated form is a known word, a space when
     * both parts are known words on their own, and is left as is otherwise.
     * Text without a tilde is returned unchanged and nothing is printed.
     *
     * @param text the text to clean
     * @return the cleaned text
     */
    @CompileStatic
    String removeTildaAsHyphen(String text) {
        if( ! text.contains("~") )
            return text

        out.println "\tremoving ~ as hyphen: "

        def tildeMatcher = AC_HYPHEN_PATTERN_TILDA.matcher(text)
        return tildeMatcher.replaceAll{ mr ->
            def left = mr.group(1)
            def right = mr.group(2)

            def hyphenated = "$left-$right"
            if( ltModule.knownWord(hyphenated) ) return hyphenated

            // two independent words that were glued together with a tilde
            if( ltModule.knownWord(left) && ltModule.knownWord(right) ) return "$left $right"

            return mr.group(0)
        }
    }

    /**
     * Repairs hyphens that got separated from their word by whitespace or a line break.
     * <p>
     * Steps, in order:
     * <ol>
     * <li>glue a number, a hyphen and an ending such as "річчя"/"ий"/"го" back together when
     *     whitespace crept in around the hyphen; likewise "д - р" and the "будь - ..." compounds;</li>
     * <li>for a word that ends with a hyphen at the end of a line, join it with the first word of
     *     the next line, with or without the hyphen depending on which form {@code ltModule} knows;</li>
     * <li>join words wrapped with the U+00AC sign at the end of a line;</li>
     * <li>print a note if hyphen-at-end-of-line cases are still present.</li>
     * </ol>
     * Progress and diagnostics are printed to {@code out}.
     *
     * @param text the text to fix
     * @return the fixed text
     */
    @CompileStatic
    String fixDanglingHyphens(String text) {
        // digit, hyphen (U+2013, U+2011 or '-'), then whitespace before the ending; (?ui) = Unicode-aware case-insensitive
        text = text.replaceAll(/(?ui)([0-9])\s*([\u2013\u2011-])\s+(річчя|ліття|ий|го|ому|им|ої|ій|ою|ї)\b/, '$1$2$3')
        // same endings, but with the whitespace required before the hyphen instead of after it
        text = text.replaceAll(/(?ui)([0-9])\s+([\u2013\u2011-])\s*(річчя|ліття|ий|го|ому|им|ої|ій|ою|ї)\b/, '$1$2$3')
        // "д - р" with whitespace on both sides of the hyphen
        text = text.replaceAll(/\b([дД])\s+([\u2013\u2011-])\s+(р)\b/, '$1$2$3')
        // "будь - що - будь": whitespace is required only after the first "будь"
        text = text.replaceAll(/(?ui)\b(будь)\s+([\u2013\u2011-])\h*(що)\h*([\u2013\u2011-])\h*(будь)\b/, '$1$2$3$4$5')
        // "будь -який", "будь - що", ...: whitespace between "будь" and the hyphen is removed
        text = text.replaceAll(/(?ui)\b(будь)\s+([\u2013\u2011-])\h*(як(ий|им|ого|ому|ім|а|ої|ій|у|е|і|их|им|ими)?|що|чого|чому|чим|чім)\b/, '$1$2$3')

        // a Cyrillic letter followed by a hyphen at the very end of a line
        def m = text =~ /[а-яіїєґА-ЯІЇЄҐ][-\u2013\u2011][ \t]*\n/
        if( m ) {
            out.println "\tsuspect word wraps"
            // cnt: joins made without a hyphen; cntWithHyphen: joins made in the quoted-word case below
            def cnt = 0
            int cntWithHyphen = 0

            // e.g.: депутат-\n«мажоритарник»
            // the hyphen is kept, the quoted word (and its punctuation) moves up, the newline and indent follow it
            text = text.replaceAll(/([а-яіїєґА-ЯІЇЄҐ-]+)[ \t]*-\n([ \t]*)([«„"][а-яіїєґ'ʼ’-]+[»“"])([,;.!?])?/, { List it ->
                cntWithHyphen += 1
                it[1] + "-" + it[3] + (it[4] ?: "") + "\n" + it[2]
            })

            // text of the first candidate seen, newlines escaped, used for the diagnostic below
            def first = null
            // word part + hyphen + line break (optionally followed by one blank line) + indent + word part + optional punctuation
            text = text.replaceAll(/([а-яіїєґА-ЯІЇЄҐ'ʼ’-]+)[-\u2013\u2011][ \t]*\n(?:[ \t]*\n)?([ \t]*)([а-яіїєґА-ЯІЇЄҐ'ʼ’-]+)([,;.!?])?/, { List it ->
                if( ! first ) {
                    first = it[0] ? it[0].replace('\n', "\\n") : it[0]
                    // println "== " + (it[1] + "-" + it[3]) + ", known: " + knownWord(it[1] + "-" + it[3])
                }
                // consider words with two or more hyphens with one of them before end of line to be rare
                boolean knownWithHyphen = ltModule.knownWord(it[1] + "-" + it[3]) && ! isHyphenBadLemma(it[3])
                // NOTE(review): this branch joins the word but increments neither counter, so it is not
                // reflected in the summary line printed below - confirm that is intended
                if( knownWithHyphen )
                    return it[1] + "-" + it[3] + (it[4] ?: "") + "\n" + it[2]
                
                // otherwise try the two parts glued together without the hyphen
                if( ltModule.knownWord(it[1] + it[3]) ) {
                    cnt += 1
                    //                print "."
                    it[1] + it[3] + (it[4] ?: "") + "\n" + it[2]
                }
                else {
                    // neither form is known: leave the text exactly as it was
                    it[0]
                }
            })

            out.println "\t\t$cnt word wraps removed, $cntWithHyphen newlines after hyphen removed"
            // nothing was counted: show one example so the case can be inspected
            if( cnt == 0 && cntWithHyphen == 0 ) {
                if( first == null ) {
                    // NOTE(review): m was created on the text as it was before the replacements above - confirm getContext copes with that
                    first = CleanTextCore2.getContext(m, text)
                }
                out.println "\t\tfirst match: \"$first\""
            }
        }

        // U+00AC used as a wrap marker at the end of a line: always joined without a hyphen, no dictionary check
        if( text =~ /¬[ \t]*\n/ ) {
            out.println "\tsuspect word wraps with ¬:"
            text = text.replaceAll(/([а-яіїєґА-ЯІЇЄҐ'ʼ’-]+)¬ *\n([ \t]*)([а-яіїєґ'ʼ’-]+)/, '$1$3\n$2')
            out.println "\t\t¬ word wraps removed"
        }

        // final check: anything that still looks like a wrapped word is reported, not changed
        def m2 = text =~ /[а-яіїєґА-ЯІЇЄҐ][-–\u2011¬][ \t]*\n/
        if( m2 ) {
            def ctx = CleanTextCore2.getContext(m2, text)
            out.println "\t\tNOTE: still contains word wraps: $ctx"
        }
        
        return text
    }

    
    // Lemmas that veto the hyphenated form of a word when isHyphenBadLemma() finds them.
    static final List ignoreHypLemmas = ["дійний", "ленський"]

    /**
     * Tells whether any lemma that {@code ltModule} returns for the word is listed in
     * {@link #ignoreHypLemmas}.
     *
     * @param word the word to look up
     * @return true when at least one lemma of the word is in the ignore list
     * @throws Exception whatever the lookup throws; the word is printed to stderr before rethrowing
     */
    @CompileStatic
    boolean isHyphenBadLemma(String word) {
        try {
            List wordLemmas = ltModule.getLemmas(word)
            List ignoredHits = wordLemmas.intersect(ignoreHypLemmas)
            return ! ignoredHits.isEmpty()
        }
        catch (Exception e) {
            // name the offending word, then let the caller see the original failure
            System.err.println("Failed on word: " + word)
            throw e
        }
    }

    
    private final Pattern LEADING_HYPHEN_PATTERN1 = Pattern.compile(/(\.\h+[-\u2013\u2011\u2014])([А-ЯІЇЄҐ][^.-])/)
    private final Pattern LEADING_HYPHEN_PATTERN2 = Pattern.compile(/(?m)(^|(? //all, space, hyph, word ->
            def space = mr.group(1)
            def hyph = mr.group(2)
            def word = mr.group(3)
            if( ltModule.knownWord(word) ) {
                converted += 1
                "$space$hyph $word"
            }
            else {
                mr.group(0)
            }
        }
        
        if( converted ) {
            out.println "\tConverted leading hyphens: ${converted}"
        }

        // skip: на -овець
        def regex2 = ~/(?
                    def matcher = regex2.matcher(line)
                    while( matcher.find() ) {
                        cnt += 1
                        if( ! first )
                            first = matcher[0]
                    }
            }
            if( cnt ) {
                out.println "\tWARNING: found $cnt suspicious hypens after space, e.g. \"$first\""
            }
        }
        
        t1
    }
}