ua.net.nlp.other.StressText.groovy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
#!/usr/bin/env groovy

package ua.net.nlp.other

@GrabConfig(systemClassLoader=true)
@Grab(group='org.languagetool', module='languagetool-core', version='6.5')
@Grab(group='org.languagetool', module='language-uk', version='6.5')
@Grab(group='ch.qos.logback', module='logback-classic', version='1.4.+')
@Grab(group='info.picocli', module='picocli', version='4.6.+')

import org.languagetool.*
import org.languagetool.language.*

import groovy.transform.Canonical
import groovy.transform.CompileStatic
import picocli.CommandLine
import picocli.CommandLine.Option
import picocli.CommandLine.ParameterException
import ua.net.nlp.tools.TextUtils


class StressText {
    // URI of this source file, injected at compile time by the @SourceURI AST transformation
    @groovy.transform.SourceURI
    static SOURCE_URI
    // if this script is called from GroovyScriptEngine SourceURI is data: and does not work for File()
    // NOTE(review): the fallback path points at ua/net/nlp/tools while this class lives in package ua.net.nlp.other,
    // and SCRIPT_DIR is only referenced by the commented-out code below - confirm it is still needed
    static SCRIPT_DIR = SOURCE_URI.scheme == "data" 
		? new File("src/main/groovy/ua/net/nlp/tools")
		: new File(SOURCE_URI).parent

    // helper whose processByParagraph() is called from process()
    def textUtils = new TextUtils() 
//    def textUtils = System.hasProperty("groovy.grape.enable") 
//		? new TextUtils() 
//		: Eval.me(new File("$SCRIPT_DIR/TextUtils.groovy").text + "\n new TextUtils()")

    // LanguageTool instance for Ukrainian; stressText() calls analyzeText() on it
    JLanguageTool langTool = new MultiThreadedJLanguageTool(new Ukrainian())

    // options object handed to textUtils.processByParagraph(); assigned through setOptions()
    def options
	/**
	 * One entry of the stress dictionary built by loadStressInfo().
	 *
	 * An entry is either a stressed form ({@code word} plus its {@code tags}) or an
	 * offset entry ({@code base} and {@code offset}, with {@code word} left null).
	 *
	 * Declared static: it is a plain data holder and needs no reference to the
	 * enclosing StressText instance.
	 */
	@Canonical
	static class StressInfo {
		String word
		String tags
		int base
		int offset

		/** Renders "word tags" for form entries and "base offset" for offset entries. */
		@Override
		String toString() { word ? String.format("%s %s", word, tags) : String.format("%d %d", base, offset) }
	}
	
	// plain lemma -> lemma key tag -> list of forms
	// (lemma with the accent marks stripped -> tag key produced by getTagKey() -> entries added by loadStressInfo())
	Map<String, Map<String, List<StressInfo>>> stresses = new HashMap<>() //.withDefault { new HashMap<>().withDefault{ new ArrayList<>() } }

	/**
	 * Counters collected while stressing text: tokens left without a stress mark
	 * and tokens for which more than one stressed variant was produced.
	 */
	static class Stats {
		int unknownCnt
		int homonymCnt

		/** Adds the counters of {@code stats} to this instance. */
		void add(Stats stats) {
			unknownCnt = unknownCnt + stats.unknownCnt
			homonymCnt = homonymCnt + stats.homonymCnt
		}
	}
	
	/**
	 * Result of stressText(): the text with stress marks applied and the counters gathered for it.
	 * Field order matters: stressText() uses the positional constructor generated by @Canonical.
	 */
	@Canonical
	static class StressResult {
		// text with stress marks inserted
		String tagged
		// unknown/homonym counters for this piece of text
		Stats stats
	}
	
	// running totals; process() adds the Stats of every StressResult to it
	Stats stats = new Stats()
	
    // NOTE(review): not referenced anywhere in the visible code - confirm it is still needed
    StringWriter writer

	// Loads the stress dictionary eagerly; stressText() asserts that it is non-empty
	StressText() {
		loadStressInfo()
	}
	

    // Stores the options object that process() later passes to textUtils.processByParagraph()
    void setOptions(options) {
        this.options = options
    }


	/**
	 * Adds stress marks to the given text.
	 *
	 * The text is analyzed with LanguageTool, every sentence is passed through
	 * outputStressed() and the results are concatenated without separators.
	 *
	 * @param text text to process
	 * @return the stressed text together with the counters collected for it
	 */
	@CompileStatic
	StressResult stressText(String text) {
		// the dictionary must have been loaded (done by the constructor)
		assert stresses

		Stats stats = new Stats()

		// element type restored so the loop below type-checks under @CompileStatic
		List<AnalyzedSentence> analyzedSentences = langTool.analyzeText(text)

		def sb = new StringBuilder()
		for (AnalyzedSentence analyzedSentence : analyzedSentences) {
			AnalyzedTokenReadings[] tokens = analyzedSentence.getTokens()

			String sentenceLine = outputStressed(tokens, stats)

			sb.append(sentenceLine) //.append("\n");
		}

		return new StressResult(sb.toString(), stats)
	}

	/**
	 * Tells whether a dictionary entry is the same word form as the token, ignoring
	 * stress marks in the entry and letter case on both sides.
	 *
	 * Previously only the token was lower-cased, so capitalised dictionary entries
	 * (e.g. proper names) could never match. Offset-only entries (null word) never match.
	 *
	 * @param it dictionary entry
	 * @param theToken token text as it appears in the input
	 * @param anToken reading of the token; currently unused (tag comparison is disabled)
	 */
	@CompileStatic
	boolean isMatch(StressInfo it, String theToken, AnalyzedToken anToken) {
		it.word != null && stripAccent(it.word).equalsIgnoreCase(theToken) //&& it.tags in anToken.getPOSTag()
	}
	

	/**
	 * Produces the stressed variant(s) of a token.
	 *
	 * For every reading the lemma and the tag key (see getTagKey()) are looked up in
	 * {@code stresses}; matching dictionary forms are collected, with several fallbacks
	 * (generic verb aspect tag, generic noun gender tag, singular forms for plural nouns,
	 * accent pattern of the first entry, first-vowel accent for one-syllable lemmas).
	 * Distinct variants are joined with "/".
	 *
	 * Side effects: increments {@code stats.homonymCnt} when more than one variant carries
	 * a stress mark and {@code stats.unknownCnt} when at least one variant carries none;
	 * prints debug lines to stdout.
	 *
	 * @param theToken token text as it appears in the input
	 * @param analyzedTokens readings of the token
	 * @param stats counters to update
	 * @return the variants joined with "/"
	 */
	@CompileStatic
	private String getStressed(String theToken, List<AnalyzedToken> analyzedTokens, Stats stats) {
			
		def words = analyzedTokens.collect { AnalyzedToken anToken -> 
				if( anToken.getPOSTag() == null )
					return	
			
				String keyTag = getTagKey(anToken.getPOSTag())
				def tokenLemma = anToken.lemma
				println "key: $tokenLemma $keyTag"
				int stressOffset = 0

				// superlative prefixes are not in the dictionary: look up the bare lemma
				// and shift the accent by the number of syllables in the prefix
				if( tokenLemma =~ /^((що)?якнай|щонай).*(ий|е)$/ ) {
					// must be checked BEFORE the prefix is stripped, otherwise it is always false
					boolean longPrefix = tokenLemma.startsWith("щоякнай")
					tokenLemma = tokenLemma.replaceFirst(/^((що)?якнай|щонай)/, '')
					stressOffset += 2
					if( longPrefix ) {
						stressOffset += 1
					}
				}
					
				List<StressInfo> infos = []
				
				if( tokenLemma in stresses ) {
					 infos = stresses[tokenLemma][keyTag] ?: infos

					if( ! infos ) {
						if( keyTag.startsWith("verb") ) {
							// fall back to entries registered for both aspects
							String genericTag = keyTag.replaceFirst(/:(im)?perf/, ':imperf:perf')
							if( genericTag in stresses[tokenLemma] ) {
								infos = stresses[tokenLemma][genericTag]
							}
						}
						else if( keyTag.startsWith("noun") && keyTag.contains(":+") ) {
							// TODO: other genders
							String genericTag = keyTag.replaceFirst(/:[mfn]/, ':m:+n')
							if( genericTag in stresses[tokenLemma] ) {
								infos = stresses[tokenLemma][genericTag]
							}
						}

					}

					// get noun lemma from singular
					if( keyTag.startsWith("noun") && keyTag.endsWith(":p") ) {
						for(String s: [":m", ":f", ":n"]) {
							String genderTag = keyTag.replaceFirst(/:p$/, s)
							if( genderTag in stresses[tokenLemma] ) {
								// "+" builds a new list, the dictionary list is not modified
								infos += stresses[tokenLemma][genderTag]
							}
						}
					}
				}
				else if( anToken.getPOSTag().startsWith("adv:comp") ) {
					// if we have докладніше adj:n:comp skip unknown adv:comp
					if( analyzedTokens.any { it.getPOSTag() && it.getPOSTag().startsWith("adj:n:v_naz:comp") } )
						return
				}

				println "info: $infos"
				if( infos ) {
					// handle /1/ - simple offset
					if( infos.size() == 2 && infos[1].offset ) {
						return applyAccents(theToken, [infos[1].base + infos[1].offset])
					}
					
					def foundForms = infos.findAll { StressInfo it -> 
												isMatch(it, theToken, anToken) 
											}
											.collect{ 
												def x = stripAccent(it.word) == theToken
													? it.word
													: restoreAccent(it.word, theToken, 0)  // casing is off - need to apply accent
												x
											}

					if( foundForms.size() > 1 ) {
//						System.err.println "Multiple forms found for $theToken"
					}
											
					foundForms = foundForms.unique()

					if( foundForms ) {
						foundForms
					}
					else {
						// no exact form: reuse the accent position of the first entry
						restoreAccent(infos[0].word, theToken, stressOffset)
					}
				}
				else {
					if( getSyllCount(tokenLemma) == 1 ) {
						println "single syll lemma: $tokenLemma"
						applyAccents(theToken, [1])
					}
					else {
//						stats.unknownCnt++
						theToken
					}
				}
			}
			.flatten()
			.findAll{ it }
			.unique()

		println "words: $words"

		int stressCount = words.count { ((String)it).indexOf('\u0301') >= 0 } as int
		
		if( stressCount > 1 ) {
			stats.homonymCnt++
		}
		if( stressCount < words.size() ) {
			stats.unknownCnt++
		}
		words.join("/")
	}
	
	
	/**
	 * Builds the stressed text of one sentence.
	 *
	 * Tokens that are shorter than two characters, already contain a stress mark after
	 * their first character, or have fewer than two vowels are copied unchanged. Tokens
	 * without a usable reading are copied unchanged and counted as unknown. Everything
	 * else goes through getStressed().
	 *
	 * @param tokens tokens of the sentence; the first one is skipped
	 *               (presumably LanguageTool's sentence-start marker - confirm)
	 * @param stats counters to update
	 */
	private String outputStressed(AnalyzedTokenReadings[] tokens, Stats stats) {
		StringBuilder out = new StringBuilder()

		for (AnalyzedTokenReadings readings : tokens[1..-1]) {
			String surface = readings.token

			boolean leaveAsIs = surface.length() < 2 \
					|| surface.indexOf('\u0301') >= 1 \
					|| getSyllCount(surface) < 2
			if( leaveAsIs ) {
				out.append(surface)
				continue
			}

			// keep readings that have a tag (other than the *_END markers) and a lemma
			List candidates = readings.getReadings().findAll { AnalyzedToken reading ->
				reading.getPOSTag() != null && ! reading.getPOSTag().endsWith("_END") && reading.getLemma()
			}

			if( ! candidates ) {
				stats.unknownCnt++
				out.append(surface)
				continue
			}

			println "lemmas: $candidates"
			out.append(getStressed(surface, candidates, stats))
		}

		out.toString()
	}
	    

    /**
     * Runs the stresser through textUtils.processByParagraph(): the first closure
     * stresses a chunk of text, the second one adds the chunk's counters to {@code stats}.
     */
    def process() {
        def stressChunk = { buffer ->
            return stressText(buffer)
        }
        def collectStats = { StressResult result ->
            stats.add(result.stats)
        }

        def outputFile = textUtils.processByParagraph(options, stressChunk, collectStats)
    }

	
	/**
	 * Reduces a full POS tag to the key used in the stress dictionary.
	 *
	 * ":inanim" is dropped, the tag is then cut down to its leading part
	 * (e.g. "noun:m", "noun:anim:p", "verb:imperf", "adj", "adv") and any
	 * adjective tag that contains ":&adjp" collapses to "adj:&adjp".
	 *
	 * @param tag full POS tag
	 * @return the shortened tag key
	 */
	@CompileStatic
	static String getTagKey(String tag) {
		String withoutInanim = tag.replace(':inanim', '')
		String leadingPart = withoutInanim.replaceFirst(/(noun(:(un)?anim)?:[mnfps]|(noun(:(un)?anim)?).*&pron|verb(:perf|:imperf)+|adj(.*?:&adjp)?|[a-z]+).*/, '$1')
		return leadingPart.replaceFirst(/adj.*?:&adjp/, 'adj:&adjp')
	}
	
	/**
	 * Counts the syllables of a word as the number of its vowels (see isWovel()).
	 *
	 * @param word word to inspect
	 * @return number of vowel characters in the word
	 */
	@CompileStatic
	static int getSyllCount(String word) {
		int vowels = 0
		for (char letter : word.toCharArray()) {
			if( isWovel(letter) ) {
				vowels++
			}
		}
		return vowels
	}
	
	/**
	 * Tells whether the character is a Ukrainian vowel letter, lower or upper case.
	 *
	 * @param ch character to test
	 */
	@CompileStatic
	static boolean isWovel(char ch) {
		final String vowels = "аеєиіїоуюяАЕЄИІЇОУЮЯ"
		return vowels.indexOf((int) ch) != -1
	}
	
	/**
	 * Removes every combining acute accent (U+0301) from the word.
	 *
	 * @param word word that may contain stress marks
	 * @return the word without stress marks
	 */
	@CompileStatic
	static String stripAccent(String word) {
		final String stressMark = "\u0301"
		return word.replace(stressMark, "")
	}
	
	/**
	 * Returns the 1-based syllable numbers that carry a stress mark (U+0301) in the word.
	 *
	 * Syllables are counted by vowels using isWovel(), i.e. both lower- and upper-case
	 * vowels, so the numbers agree with the counting done in applyAccents(). Counting
	 * only lower-case vowels (as before) shifted the result for words with an
	 * upper-case vowel at or before the stress, e.g. capitalised proper names.
	 *
	 * @param word word with stress marks
	 * @return syllable numbers in order of appearance; empty if the word has no stress mark
	 */
	@CompileStatic
	static List<Integer> getAccentSyllIdxs(String word) {
		int syllIdx = 0
		List<Integer> idxs = []
		word.getChars().each { char it ->
			if( it == '\u0301' ) {
				// the mark follows its vowel, so syllIdx already points at the stressed syllable
				idxs << syllIdx
			}
			else if( isWovel(it) ) {
				syllIdx += 1
			}
		}
		idxs
	}
	
	/**
	 * Copies the stress position(s) of a stressed form onto another spelling of the word.
	 *
	 * @param lemma stressed form whose accent positions are read
	 * @param word unstressed text that receives the accents
	 * @param offset number of syllables to shift every accent by (0 for none)
	 * @return {@code word} with stress marks inserted
	 */
	@CompileStatic
	static String restoreAccent(String lemma, String word, int offset) {
		// element type restored: with a raw List the arithmetic below does not type-check
		List<Integer> accents = (List<Integer>) getAccentSyllIdxs(lemma)
		if( offset ) {
			// build a shifted copy instead of rewriting the list while iterating over it
			accents = accents.collect { Integer syll -> syll + offset }
		}
		println "restore for: $lemma: $accents"
		applyAccents(word, accents)
	}

	/**
	 * Inserts a stress mark (U+0301) after the vowel of every listed syllable.
	 *
	 * @param word text to mark
	 * @param accents 1-based syllable numbers, counted by vowels
	 * @return the text with stress marks inserted
	 */
	@CompileStatic
	static String applyAccents(String word, List accents) {
		StringBuilder marked = new StringBuilder()
		int vowelNo = 0
		for (char letter : word.toCharArray()) {
			marked.append(letter)
			if( ! isWovel(letter) ) {
				continue
			}
			vowelNo += 1
			if( vowelNo in accents ) {
				marked.append('\u0301')
			}
		}
		return marked.toString()
	}

	/**
	 * Fills {@code stresses} from the files all_stress.txt, all_stress_prop.txt and add.txt.
	 *
	 * The files are read from the directory "stress" under the current working directory
	 * when it exists, otherwise from the classpath resources "/stress/<name>.txt".
	 *
	 * Line format as handled below:
	 *  - everything from '#' to the end of the line is dropped;
	 *  - a line starting with "/" and containing no space (e.g. "/1/") adds an offset entry
	 *    to the most recent lemma;
	 *  - any other line is "word tags" separated by a space; a line that does not start
	 *    with a space begins a new lemma, all lines are stored under the current lemma
	 *    and its tag key.
	 */
	def loadStressInfo() {
		long tm1 = System.currentTimeMillis()
		
		// def base = System.getProperty("user.home") + "/work/ukr/spelling/dict_uk/data/sem"
//		def base = "https://raw.githubusercontent.com/brown-uk/dict_uk/master/data/stress"
		File base
		def stressDir = new File("stress")
		if( stressDir.isDirectory() ) {
			base = stressDir
		}
		else {
			// base stays null: files are taken from the classpath below
			System.err.println("Loading stress info from resource")
//			base = getClass().getResource("/stress")
		}

		// NOTE(review): prints "null" when loading from resources
		System.err.println("Loading stress info from $base")
		["all_stress", "all_stress_prop", "add"].each { file ->
//			def lines = base.startsWith("http")
//				? "$base/${cat}.csv".toURL().getText("UTF-8")
			
			// NOTE(review): getResourceAsStream() returns null for a missing resource, which would fail on getText() below
			def src = base ? new File(base, file+".txt") : getClass().getResourceAsStream("/stress/${file}.txt")
			def lines = src.getText("UTF-8")

//			println "File: ${file}.txt, lines: ${lines.size()}"
			
			// parser state carried from line to line: the current lemma (with accents),
			// the same lemma without accents, and its tag key
			def lastLemmaFull
			def lastLemma
			def lastLemmaTags
			lines.eachLine { line ->
				if( line.indexOf('#') >= 0 )
					line = line.replaceFirst(/\s*#.*/, '')

				// /1/
				String trimmed = line.trim()
				if( trimmed.indexOf(' ') <= 0 && trimmed.startsWith("/") ) {
//					println "x: " + trimmed + " "  + trimmed.charAt(1) + " " + lastLemmaFull
					// only the single character after "/" is read
					// NOTE(review): confirm "as int" on a one-character string yields the digit value, not the char code
					int offset = trimmed[1] as int
					// first stressed syllable of the current lemma, or 1 when it has no stress mark
					int[] lemmaAccents = getAccentSyllIdxs(lastLemmaFull) ?: [1]
					// relies on the lemma line having already created the map entry and the list
					stresses[lastLemma][lastLemmaTags] << new StressInfo(base: lemmaAccents[0], offset: offset)
					// leaves the closure for this line only; eachLine continues with the next line
					return
				}
					
				assert trimmed.indexOf(' ') > 0, "Failed at $line" 
					
				def (word, tags) = trimmed.split(' ')
				// an unindented line starts a new lemma
				if( ! line.startsWith(' ') ) {
					lastLemmaFull = word
					lastLemma = stripAccent(word)
					lastLemmaTags = getTagKey(tags)
				}
				
				if( ! (lastLemma in stresses) ) {
					stresses.put(lastLemma, new HashMap<>())
				}
				if( ! (lastLemmaTags in stresses[lastLemma]) ) {
					stresses[lastLemma].put(lastLemmaTags, [])
				}

//				if( lastLemma == "аналізувати" ) println "$lastLemmaTags / $word + $tags" 
				stresses[lastLemma][lastLemmaTags] << new StressInfo(word, tags)
			}
		}

		long tm2 = System.currentTimeMillis()
		// NOTE(review): stresses.size() is the number of lemma keys, although the message says "stress forms"
		System.err.println("Loaded ${stresses.size()} stress forms, ${tm2-tm1}ms")
	}
	

    /**
     * Command line options of the stresser, populated by picocli in parseOptions().
     */
    static class StressOptions {
        @Option(names = ["-i", "--input"], arity="1", description = ["Input file"])
        String input
        @Option(names = ["-o", "--output"], arity="1", description = ["Output file (default: <input file> - .txt + .stressed.txt)"])
        String output
        @Option(names = ["--singleThread"], description = ["Always use single thread (default is to use multithreading if > 2 cpus are found)"])
        boolean singleThread
        // not a help option: the usageHelp flag copied from --help was removed
        @Option(names= ["-q", "--quiet"], description= "Less messages.")
        boolean quiet
        @Option(names= ["-h", "--help"], usageHelp= true, description= "Show this help message and exit.")
        boolean helpRequested
    }
    
    /**
     * Parses the command line into a StressOptions object.
     *
     * Prints the usage and exits the JVM with status 0 when help is requested, and with
     * status 1 on invalid arguments or when the output name has to be derived but no
     * input was given. When no output is given it defaults to "-" for input "-",
     * otherwise to the input name with a trailing ".txt" replaced by ".stressed.txt".
     *
     * @param argv command line arguments
     * @return the populated options
     */
    @CompileStatic
    static StressOptions parseOptions(String[] argv) {
        StressOptions options = new StressOptions()
        CommandLine commandLine = new CommandLine(options)
        try {
            commandLine.parseArgs(argv)
            if (options.helpRequested) {
                commandLine.usage(System.out)
                System.exit 0
            }
        } catch (ParameterException ex) {
            println ex.message
            commandLine.usage(System.out)
            System.exit 1
        }

        if( ! options.output ) {
            if( options.input == null ) {
                // used to end in a NullPointerException while building the output name
                println "Cannot derive the output file name: no input file given (use -i or -o)"
                commandLine.usage(System.out)
                System.exit 1
            }

            def fileExt = ".txt"
            def outfile = options.input == '-' ? '-' : options.input.replaceFirst(/\.txt$/, '') + ".stressed" + fileExt
            options.output = outfile
        }

        options
    }


    /**
     * Command line entry point: parses the arguments, builds the stresser and runs it.
     */
    static void main(String[] argv) {
        // parse first: parseOptions() may exit before the dictionary gets loaded
        StressOptions parsedOptions = parseOptions(argv)

        StressText stresser = new StressText()
        stresser.setOptions(parsedOptions)
        stresser.process()
    }

}