All downloads are free. The search and download features use the official Maven repository.

net.paoding.analysis.analyzer.estimate.TryPaodingAnalyzer Maven / Gradle / Ivy

package net.paoding.analysis.analyzer.estimate;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;

import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Analyzer;

/**
 * Command-line and interactive console tool for trying out an analyzer on a
 * piece of text.
 * <p>
 * The text to analyse comes from the command line, from a file
 * (<code>--file</code>) or, when neither is given, from an interactive
 * <code>paoding&gt;</code> prompt. Inside the prompt a line starting with
 * <code>:</code> is interpreted as a new set of command-line arguments.
 * <p>
 * All state is kept in static fields, so the class is not safe for concurrent
 * use.
 */
public class TryPaodingAnalyzer {
	/** Prefix that marks a console line as a command instead of text. */
	private static final String ARGS_TIP = ":";
	/** Terminator that ends a (possibly multi-line) console input. */
	private static final String INPUT_END = ";";

	static String input = null;
	static String file = null;
	static Reader reader = null;
	static String charset = null;
	static String mode = null;
	static String analyzerName = null;
	static String print = null;
	static String properties = PaodingMaker.DEFAULT_PROPERTIES_PATH;

	private static boolean titlePrinted = false;
	private static boolean welcomePrinted = false;
	/**
	 * Single shared reader over System.in. Creating a new BufferedReader per
	 * prompt would throw away whatever the previous one had already buffered
	 * (e.g. when the input is piped in).
	 */
	private static BufferedReader console = null;

	/**
	 * Parses the arguments, stores them in the static fields and runs the
	 * analysis. Any exception is caught here: the parsed state is reset and the
	 * stack trace is printed.
	 *
	 * @param args command-line options followed by optional text; see
	 *             {@link #printHelp()} for the supported options
	 */
	public static void main(String[] args) {
		try {
			resetArgs();

			for (int i = 0; i < args.length; i++) {
				if (args[i] == null || (args[i] = args[i].trim()).length() == 0) {
					continue;
				}
				String arg = args[i];
				if (arg.equals("--file") || arg.equals("-f")) {
					file = optionValue(args, ++i);
				} else if (arg.equals("--charset") || arg.equals("-c")) {
					charset = optionValue(args, ++i);
				} else if (arg.equals("--mode") || arg.equals("-m")) {
					mode = optionValue(args, ++i);
				} else if (arg.equals("--properties") || arg.equals("-p")) {
					properties = optionValue(args, ++i);
				} else if (arg.equals("--analyzer") || arg.equals("-a")) {
					analyzerName = optionValue(args, ++i);
				} else if (arg.equals("--print") || arg.equals("-P")) {
					print = optionValue(args, ++i);
				} else if (arg.equals("--input") || arg.equals("-i")) {
					// Marker only: the text itself is picked up by the last branch.
				} else if (arg.equals("--help") || arg.equals("-h")
						|| arg.equals("?")) {
					printHelp();
					return;
				} else if (!arg.startsWith("-") && continuesInput(args, i)) {
					// Non-option arguments are treated as input text. The index is
					// deliberately not advanced here: the argument is the text itself.
					input = (input == null) ? arg : input + ' ' + arg;
				}
			}
			if (file != null) {
				// A file takes precedence over text given on the command line.
				input = null;
				reader = getReader(file, charset);
			}
			analysing();
		} catch (Exception e1) {
			resetArgs();
			e1.printStackTrace();
		}
	}

	/**
	 * Returns the value that follows an option.
	 *
	 * @param args  the full argument array
	 * @param index position where the value is expected (option index + 1)
	 * @return the argument at <code>index</code>, unmodified
	 * @throws IllegalArgumentException if the option is the last argument and
	 *                                  therefore has no value
	 */
	private static String optionValue(String[] args, int index) {
		if (index >= args.length) {
			throw new IllegalArgumentException("Missing value for option "
					+ args[index - 1]);
		}
		return args[index];
	}

	/**
	 * Tells whether the non-option argument at <code>index</code> belongs to the
	 * input text: it is the first argument, or it follows <code>-i</code> /
	 * <code>--input</code>, or it follows something that is not an option.
	 */
	private static boolean continuesInput(String[] args, int index) {
		if (index == 0) {
			return true;
		}
		String previous = args[index - 1];
		return previous == null || previous.equals("-i")
				|| previous.equals("--input") || !previous.startsWith("-");
	}

	/** Restores every parsed option to its initial value. */
	private static void resetArgs() {
		input = null;
		file = null;
		reader = null;
		charset = null;
		mode = null;
		print = null;
		analyzerName = null;
		properties = PaodingMaker.DEFAULT_PROPERTIES_PATH;
	}

	/**
	 * Builds the analyzer selected by the static options and feeds it the file
	 * reader, the command-line text or, failing both, text read from the
	 * console. Once the console has been used the method keeps prompting and
	 * only leaves through an exception or <code>System.exit</code>.
	 *
	 * @throws Exception whatever analyzer creation, input reading or the
	 *                   estimate itself throws
	 */
	private static void analysing() throws Exception {
		Analyzer analyzer = createAnalyzer();
		boolean readInputFromConsole = false;
		Estimate estimate = new Estimate(analyzer);
		if (print != null) {
			estimate.setPrint(print);
		}
		while (true) {
			if (reader == null) {
				if (input == null || input.length() == 0 || readInputFromConsole) {
					input = getInputFromConsole();
					readInputFromConsole = true;
				}
				if (input == null || input.length() == 0) {
					System.out.println("Warn: none charactors you input!!");
					continue;
				}
				if (input.startsWith(ARGS_TIP)) {
					// A console command: re-enter main() with the text after the
					// prefix split into arguments.
					String argsStr = input.substring(ARGS_TIP.length());
					main(argsStr.split(" "));
					continue;
				}
			}
			if (reader != null) {
				try {
					estimate.test(System.out, reader);
				} finally {
					// The reader was opened by getReader(); release it here so the
					// underlying file handle does not leak.
					closeQuietly(reader);
					reader = null;
				}
			} else {
				estimate.test(System.out, input);
				input = null;
			}
			System.out.println("--------------------------------------------------");
			if (!readInputFromConsole) {
				return;
			}
		}
	}

	/**
	 * Creates the analyzer named by {@link #analyzerName}. No name or
	 * <code>paoding</code> yields a {@link PaodingAnalyzer}; the short names
	 * <code>st</code>/<code>standard</code>, <code>cjk</code> and
	 * <code>cn</code>/<code>chinese</code> are expanded to Lucene class names;
	 * anything else is taken as a fully qualified class name. The expanded name
	 * is written back to {@link #analyzerName}.
	 *
	 * @throws Exception if the class cannot be loaded or instantiated
	 */
	private static Analyzer createAnalyzer() throws Exception {
		if (analyzerName == null || analyzerName.length() == 0
				|| analyzerName.equalsIgnoreCase("paoding")) {
			// Original author's note: properties == null is equivalent to
			// new PaodingAnalyzer().
			PaodingAnalyzer paoding = new PaodingAnalyzer(properties);
			if (mode != null) {
				paoding.setMode(mode);
			}
			return paoding;
		}
		if (analyzerName.equalsIgnoreCase("st")
				|| analyzerName.equalsIgnoreCase("standard")) {
			analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
		} else if (analyzerName.equalsIgnoreCase("cjk")) {
			analyzerName = "org.apache.lucene.analysis.cjk.CJKAnalyzer";
		} else if (analyzerName.equalsIgnoreCase("cn")
				|| analyzerName.equalsIgnoreCase("chinese")) {
			analyzerName = "org.apache.lucene.analysis.cn.ChineseAnalyzer";
		}
		Class clz = Class.forName(analyzerName);
		return (Analyzer) clz.newInstance();
	}

	/** Closes a reader, ignoring failures so they cannot mask an earlier error. */
	private static void closeQuietly(Reader r) {
		try {
			r.close();
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close of an input source.
		}
	}

	/**
	 * Reads one input from the console prompt.
	 * <ul>
	 * <li>Text may span several lines and ends with a line terminated by
	 * <code>;</code>; empty lines are skipped.</li>
	 * <li><code>:clear</code> / <code>:c</code> discards what was typed so far.</li>
	 * <li><code>:exit</code>, <code>:quit</code>, <code>:e</code>, <code>:q</code>
	 * terminate the JVM.</li>
	 * <li>Any other line starting with <code>:</code>, when nothing has been
	 * typed yet, is returned as is so the caller can treat it as a command.</li>
	 * <li>At end of input the pending text is returned; if there is none the
	 * JVM is terminated, exactly as for <code>:quit</code>.</li>
	 * </ul>
	 *
	 * @return the trimmed input, or <code>null</code> if only the terminator
	 *         was entered
	 * @throws IOException if reading from the console fails
	 */
	private static String getInputFromConsole() throws IOException {
		printTitleIfNotPrinted("");
		if (console == null) {
			console = new BufferedReader(new InputStreamReader(System.in));
		}
		String input = null;
		while (true) {
			System.out.print("paoding> ");
			String line = console.readLine();
			if (line == null) {
				// End of input: without this check the loop would spin forever.
				if (input != null) {
					break;
				}
				System.out.println();
				System.out.println("Bye!");
				System.exit(0);
				return null; // not reached; keeps the null case out of the code below
			}
			if (line.length() == 0) {
				continue;
			}
			if (line.equals(ARGS_TIP + "clear") || line.equals(ARGS_TIP + "c")) {
				input = null;
				System.out.println("paoding> Cleared");
				continue;
			}
			if (line.equals(ARGS_TIP + "exit") || line.equals(ARGS_TIP + "quit")
					|| line.equals(ARGS_TIP + "e") || line.equals(ARGS_TIP + "q")) {
				System.out.println("Bye!");
				System.exit(0);
			} else if (input == null && line.startsWith(ARGS_TIP)) {
				input = line;
				break;
			} else if (line.endsWith(INPUT_END)) {
				// Append the last line (without the terminator) to what was typed
				// before instead of replacing it.
				String last = line.substring(0, line.length() - INPUT_END.length());
				if (last.length() > 0) {
					input = (input == null) ? last : input + "\n" + last;
				}
				break;
			} else {
				input = (input == null) ? line : input + "\n" + line;
			}
		}
		return input == null ? null : input.trim();
	}

	/**
	 * Prints the usage text: options, examples and the console instructions.
	 * The application name and command shown can be overridden through the
	 * system properties <code>paoding.try.app</code> and
	 * <code>paoding.try.cmd</code>.
	 */
	private static void printHelp() {
		String app = System.getProperty("paoding.try.app",
				"TryPaodingAnalyzer");
		String cmd = System.getProperty("paoding.try.cmd", "java "
				+ TryPaodingAnalyzer.class.getName());
		System.out.println(app + "的用法:");
		System.out.println("\t" + cmd + " [OPTIONS] [text_content]");
		System.out.println("\nOPTIONS:");
		System.out.println("\t--file, -f:\n\t\t文章以文件的形式输入,在前缀加上\"classpath:\"表示从类路径中寻找该文件。");
		System.out.println("\t--charset, -c:\n\t\t文章的字符集编码,比如gbk,utf-8等。如果没有设置该选项,则使用Java环境默认的字符集编码。");
		System.out.println("\t--properties, -p:\n\t\t不读取默认的类路径下的庖丁分词属性文件,而使用指定的文件,在前缀加上\"classpath:\"表示从类路径中寻找该文件。");
		System.out.println("\t--mode, -m:\n\t\t强制使用给定的mode的分词器;可以设定为default,most-words,max-word-length或指定类名的其他mode(指定类名的,需要加前缀\"class:\")。");
		System.out.println("\t--input, -i:\n\t\t要被分词的文章内容;当没有通过-f或--file指定文章输入文件时可选择这个选项指定要被分词的内容。");
		System.out.println("\t--analyzer, -a:\n\t\t测试其他分词器,通过--analyzer或-a指定其完整类名。特别地,paoding、cjk、chinese、st分别对应PaodingAnalyzer、CJKAnalyzer、ChineseAnalyzer、StandardAnalyzer");
		System.out.println("\t--print, -P:\n\t\t 是否打印分词结果。默认打印前50行。规则:no表示不打印;50等价于1-50行;1-50表示打印1至50行;可以以逗号组合使用,如20,40-50表示打印1-20以及40-50行");
		System.out.println("\n示例:");
		System.out.println("\t" + cmd);
		System.out.println("\t" + cmd + " ?");
		System.out.println("\t" + cmd + " 中华人民共和国");
		System.out.println("\t" + cmd + " -m max 中华人民共和国");
		System.out.println("\t" + cmd + " -f e:/content.txt -c utf8");
		System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -m max-word-length");
		System.out.println("\t" + cmd + " -f e:/content.txt -c utf8 -a cjk");
		System.out.println("\n若是控制台进入\"paoding>\"后:");
		// Force the console instructions to be shown again as part of the help.
		titlePrinted = false;
		printTitleIfNotPrinted("\t");
	}

	/**
	 * Prints the console instructions unless they have already been shown. The
	 * welcome banner is printed at most once per JVM.
	 *
	 * @param prefix text put in front of every instruction line
	 */
	private static void printTitleIfNotPrinted(String prefix) {
		if (titlePrinted) {
			return;
		}
		System.out.println();
		if (!welcomePrinted) {
			System.out.println("Welcome to Paoding Analyser(2.0.4-alpha2)");
			System.out.println();
			welcomePrinted = true;
		}
		System.out.println(prefix + "直接输入或粘贴要被分词的内容,以分号;结束,回车后开始分词。");
		System.out.println(prefix + "另起一行输入:clear或:c,使此次输入无效,用以重新输入。");
		System.out.println(prefix + "要使用命令行参数读入文件内容或其他参数请以冒号:开始,然后输入参数选项。");
		System.out.println(prefix + "退出,请输入:quit或:q、:exit、:e");
		System.out.println(prefix + "需要帮助,请输入:?");
		System.out.println(prefix + "注意:指定对文件分词之前要了解该文件的编码,如果系统编码和文件编码不一致,要通过-c指定文件的编码。");
		System.out.println();
		titlePrinted = true;
	}

	/**
	 * Reads a whole file or classpath resource into a string.
	 *
	 * @param path     file path, or <code>classpath:</code> followed by a
	 *                 resource name
	 * @param encoding charset name, or <code>null</code> for the platform default
	 * @return the complete content
	 * @throws IOException              if reading fails or the charset is not
	 *                                  supported
	 * @throws IllegalArgumentException if the file or resource does not exist
	 */
	static String getContent(String path, String encoding) throws IOException {
		return (String) read(path, encoding, true);
	}

	/**
	 * Opens a file or classpath resource as a reader. The caller is responsible
	 * for closing the returned reader.
	 *
	 * @param path     file path, or <code>classpath:</code> followed by a
	 *                 resource name
	 * @param encoding charset name, or <code>null</code> for the platform default
	 * @return an open reader over the content
	 * @throws IOException              if opening fails or the charset is not
	 *                                  supported
	 * @throws IllegalArgumentException if the file or resource does not exist
	 */
	static Reader getReader(String path, String encoding) throws IOException {
		return (Reader) read(path, encoding, false);
	}

	/**
	 * Shared implementation of {@link #getContent(String, String)} and
	 * {@link #getReader(String, String)}. Prints the location being read to
	 * standard output.
	 *
	 * @param path          file path, or <code>classpath:</code> followed by a
	 *                      resource name looked up through the class loader of
	 *                      <code>Estimate</code>
	 * @param encoding      charset name, or <code>null</code> for the platform
	 *                      default
	 * @param return_string <code>true</code> to return the content as a
	 *                      <code>String</code> (the stream is closed),
	 *                      <code>false</code> to return an open
	 *                      <code>Reader</code>
	 * @return a <code>String</code> or a <code>Reader</code>, depending on
	 *         <code>return_string</code>
	 * @throws IOException              if opening or reading fails, or the
	 *                                  charset is not supported
	 * @throws IllegalArgumentException if the file or resource does not exist
	 */
	static Object read(String path, String encoding, boolean return_string) throws IOException {
		InputStream in;
		if (path.startsWith("classpath:")) {
			path = path.substring("classpath:".length());
			URL url = Estimate.class.getClassLoader().getResource(path);
			if (url == null) {
				throw new IllegalArgumentException("Not found " + path
						+ " in classpath.");
			}
			System.out.println("read content from:" + url.getFile());
			in = url.openStream();
		} else {
			File f = new File(path);
			if (!f.exists()) {
				throw new IllegalArgumentException("Not found " + path
						+ " in system.");
			}
			System.out.println("read content from:" + f.getAbsolutePath());
			in = new FileInputStream(f);
		}
		Reader re = null;
		try {
			if (encoding != null) {
				re = new InputStreamReader(in, encoding);
			} else {
				re = new InputStreamReader(in);
			}
		} finally {
			if (re == null) {
				// The charset was rejected: do not leave the raw stream open.
				try {
					in.close();
				} catch (IOException ignored) {
					// The charset failure is the error worth reporting.
				}
			}
		}
		if (!return_string) {
			return re;
		}
		try {
			char[] chs = new char[1024];
			int count;
			// StringBuffer rather than StringBuilder, for compatibility with old JDKs.
			StringBuffer content = new StringBuffer();
			while ((count = re.read(chs)) != -1) {
				content.append(chs, 0, count);
			}
			return content.toString();
		} finally {
			closeQuietly(re);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy