ua.net.nlp.tools.TextUtils.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp_uk Show documentation
Show all versions of nlp_uk Show documentation
NLP tools for Ukrainian language
The newest version!
package ua.net.nlp.tools
import java.nio.charset.StandardCharsets
import java.util.concurrent.ArrayBlockingQueue
import java.util.concurrent.BlockingQueue
import java.util.concurrent.Callable
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.concurrent.Future
import java.util.concurrent.TimeUnit
import java.util.function.Consumer
import java.util.function.Function
import java.util.regex.Pattern
import groovy.transform.Canonical
import groovy.transform.CompileStatic
import picocli.CommandLine.Option
@CompileStatic
public class TextUtils {
// Initial capacity (in chars) of the StringBuilder that accumulates the lines of one paragraph.
static int BUFFER_SIZE = 4*1024
// Once the accumulated text is longer than this many chars it is processed
// even if no blank-line paragraph break has been seen yet.
static int MAX_PARAGRAPH_SIZE = 200*1024
/**
 * Holder for the streams opened by prepareInputOutput, plus the input file name when there is one.
 */
static class IOFiles {
// System.in, or a stream over the input file
InputStream inputFile
// System.out, a UTF-8 PrintStream over the output file, or null when the output
// is a file and the noTag option is set (no file is opened in that case)
PrintStream outputFile
// set by prepareInputOutput only when the input is a named file rather than "-"
String filename // only for file-by-file case
}
/**
 * Opens the input and output streams selected by the given options.
 *
 * Output: "-" selects System.out; any other value names a file that is emptied and
 * wrapped in a UTF-8 PrintStream, unless options.isNoTag() is set, in which case the
 * output stream stays null.
 * Input: "-" selects System.in; any other value names a file that is opened for reading.
 * Progress messages are written to System.err unless options.quiet is set.
 *
 * @param options settings supplying input, output, noTag and quiet
 * @return the opened streams; filename is filled in only when the input is a named file
 */
static IOFiles prepareInputOutput(OptionsBase options) {
    // if( options.output == "-" || options.input == "-" ) {
    //     warnOnWindows();
    // }

    PrintStream outputFile = null
    if( options.output == "-" ) {
        outputFile = System.out
    }
    else if( ! options.isNoTag() ) {
        File target = new File(options.output)
        target.setText('') // to clear out output file
        outputFile = new PrintStream(target, "UTF-8")
    }

    boolean fromStdin = options.input == "-"
    if( fromStdin && ! options.quiet ) {
        System.err.println("Reading from stdin...")
    }

    InputStream inputFile
    if( fromStdin ) {
        inputFile = System.in
    }
    else {
        inputFile = new File(options.input).newInputStream()
    }

    boolean verbose = ! options.quiet
    if( verbose && options.noTag ) {
        System.err.println("Collecting stats only...")
    }
    else if( verbose && options.output != "-" ) {
        System.err.println ("writing into ${options.output}")
    }

    IOFiles result = new IOFiles(inputFile: inputFile, outputFile: outputFile)
    if( options.input && ! fromStdin ) {
        result.filename = options.input
    }
    return result
}
/**
 * Opens the streams described by the options and runs paragraph-by-paragraph processing on them.
 *
 * @param options       settings used both to open the streams and to drive processing
 * @param closure       applied to the text of each paragraph
 * @param resultClosure invoked with each paragraph's result
 * @return whatever processByParagraphInternal returns (the output stream)
 */
static def processByParagraph(OptionsBase options, Closure closure, Closure resultClosure) {
    IOFiles io = prepareInputOutput(options)
    return processByParagraphInternal(options, io.inputFile, io.outputFile, closure, resultClosure)
}
/**
 * Processes the input paragraph by paragraph, surrounding the produced output with a
 * format-specific opening and closing part.
 *
 * The parallel path is taken when more than two processors are available and
 * options.singleThread is not set; otherwise everything runs through processFile.
 * Unless options.quiet is set, the elapsed time is reported on System.err.
 * Nothing is written to outputFile by this method when options.noTag is set.
 *
 * NOTE(review): the XML opening/closing literals below are empty or whitespace-only as
 * written, and both xmlSchema branches print the same text; presumably markup belongs
 * there. Confirm against the intended output before relying on the XML format.
 *
 * @param options       processing settings (noTag, quiet, singleThread, outputFormat, xmlSchema)
 * @param inputFile     stream to read the text from
 * @param outputFile    stream to write results to
 * @param closure       applied to the text of each paragraph
 * @param resultClosure invoked with each paragraph's result
 * @return the outputFile that was passed in
 */
static def processByParagraphInternal(OptionsBase options, InputStream inputFile, PrintStream outputFile, Closure closure, Closure resultClosure) {
    int cores = Runtime.getRuntime().availableProcessors()
    boolean parallel = cores > 2 && ! options.singleThread
    if( parallel && ! options.quiet ) {
        System.err.println ("Found ${cores} cores, using parallel threads")
    }

    // opening part of the output document
    if( ! options.noTag ) {
        if( options.outputFormat == OutputFormat.xml ) {
            outputFile.println('')
            outputFile.println(options.xmlSchema ? "\n" : '\n')
        }
        else if( options.outputFormat == OutputFormat.json ) {
            outputFile.println('{')
            outputFile.println(' "sentences": [')
        }
    }

    long startedAt = System.currentTimeMillis()

    if( ! parallel ) {
        processFile(inputFile, outputFile, options, closure, resultClosure)
    }
    else {
        processFileParallel(inputFile, outputFile, options, closure, cores, resultClosure)
    }

    if( ! options.quiet ) {
        long elapsed = System.currentTimeMillis() - startedAt
        System.err.println "Time: ${elapsed} ms"
    }

    // closing part of the output document
    if( ! options.noTag ) {
        if( options.outputFormat == OutputFormat.xml ) {
            outputFile.println('\n ')
        }
        else if( options.outputFormat == OutputFormat.json ) {
            outputFile.println('\n ]')
            outputFile.println('}')
        }
    }

    return outputFile
}
/**
 * Writes processing results to the output stream, inserting a separator between
 * consecutive results.
 */
@CompileStatic
static class OutputHandler {
    PrintStream outputFile
    OptionsBase options
    // becomes true once a non-empty result has been written
    boolean outStarted = false
    // becomes true once a non-empty JSON result ending in '}' or '[' has been written
    boolean jsonStarted = false

    /**
     * Prints the tagged text of one result, preceded by ",\n" once JSON output has
     * started, or by "\n" once any other output has started. Does nothing when
     * options.noTag is set.
     *
     * @param analyzed result whose tagged text is written
     */
    void print(ResultBase analyzed) {
        if( options.noTag ) {
            return
        }

        String separator = jsonStarted ? ",\n" : (outStarted ? "\n" : null)
        if( separator != null ) {
            outputFile.print(separator)
        }

        outputFile.print(analyzed.tagged)

        if( ! analyzed.tagged ) {
            return
        }
        outStarted = true

        if( options.outputFormat.name() != 'json' ) {
            return
        }
        if( jsonStarted ) {
            return
        }
        boolean closesObject = analyzed.tagged.endsWith('}')
        if( closesObject || analyzed.tagged.endsWith('[') ) {
            jsonStarted = true
        }
    }
}
/**
 * Reads the input as UTF-8 lines, groups them into paragraphs and processes the
 * paragraphs one at a time on the calling thread.
 *
 * A paragraph is handed over when the accumulated text ends with an empty line and at
 * least one non-blank line has been seen, or when the accumulated text grows beyond
 * MAX_PARAGRAPH_SIZE. For each paragraph the result of closure is printed through an
 * OutputHandler and then passed to postProcessClosure.
 *
 * Inside the read loop any Throwable is printed and the paragraph is skipped. For the
 * text left over after the last line, only postProcessClosure is guarded; a failure in
 * closure propagates to the caller.
 *
 * @param inputFile          stream to read the text from
 * @param outputFile         stream results are printed to; may be null, in which case the
 *                           trailing newline is not written
 * @param options            settings consulted by the OutputHandler
 * @param closure            turns the text of one paragraph into a result
 * @param postProcessClosure invoked with each result after it has been printed
 */
@CompileStatic
static void processFile(InputStream inputFile, PrintStream outputFile, OptionsBase options, Function<String, ? extends ResultBase> closure, Consumer<? extends ResultBase> postProcessClosure) {
    StringBuilder buffer = new StringBuilder(BUFFER_SIZE)
    boolean notEmpty = false
    OutputHandler outputHandler = new OutputHandler(outputFile: outputFile, options: options)

    inputFile.eachLine('UTF-8', 0, { String line ->
        buffer.append(line).append("\n")
        notEmpty |= line.trim().length() > 0

        // flush on a blank line following real text, or when the buffer gets too large
        if( (notEmpty
                && buffer.lastIndexOf("\n\n") == buffer.length() - 2 )
                || buffer.length() > MAX_PARAGRAPH_SIZE ) {
            def str = buffer.toString()

            try {
                ResultBase analyzed = closure.apply(str)
                outputHandler.print(analyzed)
                postProcessClosure(analyzed)
            }
            catch(Throwable e) {
                // best effort: report the failure and carry on with the next paragraph
                e.printStackTrace()
            }

            buffer = new StringBuilder(BUFFER_SIZE)
            notEmpty = false
        }
    })

    // whatever is left after the last line did not end with a paragraph break
    if( buffer ) {
        def analyzed = closure(buffer.toString())
        outputHandler.print(analyzed)

        if( outputHandler.outputFile ) {
            outputHandler.outputFile.println()
        }

        try {
            postProcessClosure(analyzed)
        }
        catch(e) {
            e.printStackTrace()
        }
    }
}
@CompileStatic
static void processFileParallel(InputStream inputFile, PrintStream outputFile, OptionsBase options, Function processClosure, int cores, Consumer postProcessClosure) {
ExecutorService executor = Executors.newFixedThreadPool(cores + 1) // +1 for consumer
BlockingQueue futures = new ArrayBlockingQueue<>(cores*2) // we need to poll for futures in order to keep the queue busy
OutputHandler outputHandler = new OutputHandler(outputFile: outputFile, options: options)
executor.submit(new Callable() {
def call() {
for(Future f = futures.poll(5, TimeUnit.MINUTES); ; f = futures.poll(5, TimeUnit.MINUTES)) {
if( f == null ) {
continue
}
// println "queue size: " + futures.size()
try {
ResultBase analyzed = f.get()
if( analyzed == null ) break;
outputHandler.print(analyzed)
postProcessClosure.accept(analyzed)
}
catch(e) {
e.printStackTrace()
System.exit(1)
}
}
// println "done polling"
}
})
StringBuilder buffer = new StringBuilder(BUFFER_SIZE)
boolean notEmpty = false
inputFile.eachLine('UTF-8', 0, { String line ->
buffer.append(line).append("\n")
notEmpty |= line.trim().length() > 0
if( (notEmpty
// && buffer.length() > 1000
&& buffer.lastIndexOf("\n\n") == buffer.length() - 2 )
|| buffer.length() > MAX_PARAGRAPH_SIZE ) {
def str = buffer.toString()
futures << executor.submit(new Callable