ua.net.nlp.bruk.ContextToken.groovy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
package ua.net.nlp.bruk

import java.util.regex.Matcher
import java.util.regex.Pattern
import groovy.transform.Canonical
import groovy.transform.CompileStatic
import org.languagetool.rules.uk.LemmaHelper

@CompileStatic
@Canonical
class ContextToken {
//    static final Pattern POSTAG_KEY_PATTERN = Pattern.compile("^(noun:(anim|[iu]nanim)|verb(:rev)?:(perf|imperf)|adj|adv(p:(imperf:perf))?|part|prep|numr|conj:(coord|subord)|intj|onomat|punct|symb|noninfl|unclass|number|unknown|time|date|hashtag|BEG|END)")
    static final Pattern POSTAG_CORE_REMOVE_PATTERN = Pattern.compile(/:(arch|coll|slang|bad|vulg|ua_[0-9]{4}|ns)/)
    static final ContextToken BEG = new ContextToken('__BEG', '', 'BEG')
    static final ContextToken END = new ContextToken('__END', '', 'END')
    static final String[] IGNORE_TOKENS = [] //['б', 'би', 'ж', 'же', 'бодай'] 
    
    String word
    String lemma
    String postag
    
    @CompileStatic
    ContextToken(String word, String lemma, String postag) {
        this.word = word
        this.lemma = lemma
//        assert postag, "Empty postag for $word/$lemma"
        this.postag = getPostagCore(postag)
    }
    
    @CompileStatic
    static ContextToken normalized(String word, String lemma, String postag) {
        new ContextToken(normalizeContextString(word, lemma, postag),
            normalizeContextString(lemma, '', postag),
            postag)
    }

    @CompileStatic
    String toString() {
        def w = safeguard(word)
        def l = safeguard(lemma)
        "$w\t$l\t$postag"
    }
    
    @CompileStatic
    static String getPostagCore(String postag) {
        postag != null ? POSTAG_CORE_REMOVE_PATTERN.matcher(postag).replaceAll('') : postag
    }

    @CompileStatic
    static String safeguard(String w) {
        if( w == '' ) return '^'

        w //w.indexOf(' ') >= 0 ? w.replace(' ', '\u2009') : w
    }

    @CompileStatic
    static String unsafeguard(String w) {
        w //w = w.indexOf('\u2009') >= 0 ? w.replace('\u2009', ' ') : w
    }

    @CompileStatic
    static String normalizeContextString(String w, String lemma, String postag) {
        if( ! w ) // possible for lemmas from AnalyzedToken
            return w
        
        if( postag == "number" ) {
            def m0 = Pattern.compile(/((1[6789]|20)[0-9]{2}[-–—])?(1[6789]|20)([0-9]{2})/).matcher(w) // preserve a year - often works as adj
            if( m0.matches() )
                return "YY" + m0.group(4)

            // normalize 10 000
            w = w.replace(" ", "")
                
            def m1 = Pattern.compile(/([0-9]+[-—–])?([0-9]+)/).matcher(w) // we only care about last two digits
            if( m1.matches() ) {
                if( w =~ /[05-9]$/ || w =~ /1[0-9]$/ )
                    return 0
                if( w =~ /[234]$/ )
                    return 2
                if( w =~ /1$/ )
                    return 1
                // should not happen
                return m1.group(2)
            }

            def m2 = Pattern.compile(/([0-9,]+[–—-])?[0-9]+([,.])[0-9]+/).matcher(w) // we only care that it's decimal
            if( m2.matches() )
                return '0,0'
        }

        String w1 = normalizeWord(w, lemma, postag)
        if( w1 != w )
            return w1
        
        if( postag == "punct" ) {
            if( w == "..." )
                return '…'

            if( w.length() == 1 )
                return w.replaceAll(/^[\u2013\u2014]$/, '-')
                        .replace('„', '«')
                        .replace('“', '»')

            if( w.indexOf(".") > 0 )
                return w.replaceAll(/^([?!])\.+$/, '$1')
        }

        boolean hasLowerCaseLemma = lemma && lemma =~ /^[а-яіїєґ]/
        w = hasLowerCaseLemma ? w.toLowerCase() : w
        
//        if( postag == "prep" ) {
//            if( w=="із" || w=="зо" )
//                return "з"
//            if( w=="у" )
//                return "в"
//        }
//        else if( postag == "conj:coord" ) {
//            if( w=="й" )
//                return "і"
//        }
            
        return w
    }
    
    @CompileStatic
    static String normalizeWord(String w, String lemma, String postag) {
        w = w.replace('\u2013', '-')
        // 2000-го -> 0-го
        // 101-річчя -> 101-річчя
        if( w.indexOf('-') > 0 && postag =~ /^(adj|noun)/ ) {
            def m1 = Pattern.compile(/[0-9-]*([0-9])-([а-яіїєґ]+)/).matcher(w)
            if( m1.matches() )
                return m1.replaceFirst('$1-$2')
        }
        return w
    }

    // його|що    
    private static final USE_RIGHT_CTX_PATTERN = ~/є|її|це|саме|[ву]с[еі]|всередині|перед|протягом|брати|(українськ|англійськ)(а|у|ою|ій)|рівні|доросл.*|майбутн(є|ього|ім|ому)|більше/
    
    static boolean useRightContext(String token) {
//        token.toLowerCase() ==~ /це|його|її|їх|як|є|саме|все/
        token.toLowerCase() ==~ USE_RIGHT_CTX_PATTERN
    }
    
}