All downloads are free. Search and download functionality uses the official Maven repository.

com.mayabot.nlp.fasttext.FasttextTranUtils.kt Maven / Gradle / Ivy

package com.mayabot.nlp.fasttext

import com.mayabot.nlp.segment.LexerReader
import com.mayabot.nlp.segment.Lexers
import java.io.File

class FasttextTranUtils {

    companion object {

        /**
         * Prepares an unsegmented corpus for training by word-segmenting every line.
         *
         * Each input line is expected to look like
         * `__label__xxxx corpus text,corpus text,corpus text`.
         *
         * A line is split on single space characters. Parts that start with [label]
         * are copied through unchanged; every other part is passed to [lexer] and
         * replaced by the words it produces. The resulting tokens are joined with a
         * single space and written to [to], one output line per input line, each
         * terminated by `\n`.
         *
         * @param from source corpus file, read line by line as UTF-8
         * @param to destination file, written as UTF-8
         * @param label prefix that identifies a label part (default `__label__`)
         * @param lexer reader used to segment the non-label parts of each line
         */
        @JvmOverloads
        @JvmStatic
        fun prepareBySegment(from: File,
                             to: File,
                             label: String = "__label__",
                             lexer: LexerReader = Lexers.coreBuilder().build().filterReader(true, true)) {

            // Turns one raw corpus line into its space-separated, segmented form.
            fun processLine(line: String): String {
                // The element type must be spelled out: a bare ArrayList() cannot be inferred.
                val tokens = ArrayList<String>()
                line.split(" ").forEach { part ->
                    if (part.startsWith(label)) {
                        // Label parts are kept verbatim so the classifier can still see them.
                        tokens.add(part)
                    } else {
                        lexer.scan(part).toWordSequence().forEach { word ->
                            tokens.add(word)
                        }
                    }
                }
                return tokens.joinToString(" ")
            }

            // Stream the input so large corpora are never fully loaded into memory.
            from.useLines(Charsets.UTF_8) { lines ->
                to.bufferedWriter(Charsets.UTF_8).use { writer ->
                    lines.forEach { line ->
                        writer.write(processLine(line))
                        writer.write("\n")
                    }
                }
            }
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy