All downloads are free. The search and download functionality uses the official Maven repository.

org.ansj.app.crf.MakeTrainFile Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.app.crf;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

import org.ansj.app.crf.pojo.Element;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;

/**
 * Command-line tool that generates a training corpus file for CRF or Wapiti.
 *
 * <p>Every non-blank line of the input file is split on whitespace (regex {@code \s+}) by
 * {@code Config.makeToElementList} and written to the output file as a block of
 * {@code <name> <tag>} lines, one line per element. Each block starts with a newline, so
 * consecutive blocks are separated by an empty line.
 *
 * <p>Usage: {@code java org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]}
 *
 * @author Ansj
 */
public class MakeTrainFile {

	private static final Log logger = MyStaticValue.getLog();

	/**
	 * Reads the corpus file and writes the training file.
	 *
	 * <p>I/O failures are not propagated: a missing file or any other {@link IOException} is
	 * logged as a warning and the method returns normally.
	 *
	 * @param args
	 *            optional; when exactly two arguments are given they are used as the input path
	 *            and the output path. With any other argument count the defaults
	 *            {@code corpus.txt} and {@code train.txt} are used.
	 */
	public static void main(String[] args) {

		String inputPath = "corpus.txt";

		String outputPath = "train.txt";

		// The defaults are only overridden when both paths are supplied.
		if (args != null && args.length == 2) {
			inputPath = args[0];
			outputPath = args[1];
		}

		// Blank paths (e.g. an empty-string argument) cannot be used: print the usage and stop.
		if (StringUtil.isBlank(inputPath) || StringUtil.isBlank(outputPath)) {
			logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]");
			return;
		}
		try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8");
				FileOutputStream fos = new FileOutputStream(outputPath)) {
			String temp = null;
			// Number of non-blank lines converted so far.
			int i = 0;
			while ((temp = reader.readLine()) != null) {
				if (StringUtil.isBlank(temp)) {
					continue;
				}
				// Only the first converted line is trimmed.
				// NOTE(review): presumably this strips a leading BOM - confirm against StringUtil.trim.
				if (i == 0) {
					temp = StringUtil.trim(temp);
				}
				// The element type must be declared: iterating a raw List as Element does not compile.
				List<Element> list = Config.makeToElementList(temp, "\\s+");
				// Leading newline separates this block from the previous one.
				StringBuilder sb = new StringBuilder("\n");
				for (Element element : list) {
					sb.append(element.nameStr()).append(" ").append(Config.getTagName(element.getTag()));
					sb.append("\n");
				}
				fos.write(sb.toString().getBytes(IOUtil.UTF8));
				// Progress output: running count of converted lines.
				System.out.println(++i);
			}
		} catch (FileNotFoundException e) {
			logger.warn("文件没有找到", e);
		} catch (IOException e) {
			logger.warn("IO异常", e);
		}
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy