org.ansj.app.crf.MakeTrainFile Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of ansj_seg Show documentation
Show all versions of ansj_seg Show documentation
Best Java Chinese word segmenter!
package org.ansj.app.crf;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import org.ansj.app.crf.pojo.Element;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
/**
 * Command-line tool that converts a corpus file into a training file for CRF or Wapiti.
 *
 * <p>Usage: {@code java org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]}
 *
 * @author Ansj
 */
public class MakeTrainFile {

    private static final Log logger = MyStaticValue.getLog();

    /**
     * Reads the corpus line by line and writes one training block per non-blank line.
     *
     * <p>Each block is an empty line followed by one {@code "<name> <tag>"} line per
     * element produced by {@code Config.makeToElementList}. Blank input lines are skipped.
     * After each block the running count of processed lines is printed to standard output.
     *
     * <p>A missing file or any other I/O failure is logged as a warning and not rethrown.
     *
     * @param args either exactly two entries, {@code [inputPath, outputPath]}, or anything
     *             else, in which case the defaults {@code corpus.txt} and {@code train.txt}
     *             are used
     */
    public static void main(String[] args) {
        String inputPath = "corpus.txt";
        String outputPath = "train.txt";

        // Only an exact two-argument invocation overrides the defaults.
        if (args != null && args.length == 2) {
            inputPath = args[0];
            outputPath = args[1];
        }

        if (StringUtil.isBlank(inputPath) || StringUtil.isBlank(outputPath)) {
            logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]");
            return;
        }

        try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8");
                FileOutputStream fos = new FileOutputStream(outputPath)) {
            String temp;
            int i = 0;
            while ((temp = reader.readLine()) != null) {
                if (StringUtil.isBlank(temp)) {
                    continue;
                }

                // Only the first non-blank line is trimmed.
                // NOTE(review): presumably this strips a leading BOM or stray whitespace at
                // the start of the file - confirm against StringUtil.trim.
                if (i == 0) {
                    temp = StringUtil.trim(temp);
                }

                // NOTE(review): assumes Config.makeToElementList returns List<Element>;
                // the raw List used previously cannot be iterated as Element.
                List<Element> list = Config.makeToElementList(temp, "\\s+");

                // Every block starts with an empty line that separates it from the previous one.
                StringBuilder sb = new StringBuilder("\n");
                for (Element element : list) {
                    sb.append(element.nameStr())
                            .append(' ')
                            .append(Config.getTagName(element.getTag()))
                            .append('\n');
                }

                fos.write(sb.toString().getBytes(IOUtil.UTF8));
                // Progress indicator: number of corpus lines converted so far.
                System.out.println(++i);
            }
        } catch (FileNotFoundException e) {
            logger.warn("文件没有找到", e);
        } catch (IOException e) {
            logger.warn("IO异常", e);
        }
    }
}