package com.khala.extractor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * QA Extractor: extracts question/answer pairs from a local HTML article file.
 */
public class QAExtractor {

	//	public static void main(String[] args) throws IOException {
	//		QAExtractor extractor = new QAExtractor();
	//		List<Map<String, String>> list = extractor
	//				.extract(new File("C:\\Users\\84958\\Desktop\\智能问答文档1012\\article\\0a6a38e1974d.html"));
	//		for (Map<String, String> map : list) {
	//			System.out.println("Q:" + map.get("Q"));
	//			System.out.println("A:" + map.get("A"));
	//		}
	//	}

	public List<Map<String, String>> extract(File file) {
		List<Map<String, String>> list = new ArrayList<Map<String, String>>();
		// Read the whole HTML file into one string, trimming each line as it is appended.
		StringBuilder buffer = new StringBuilder();
		try (BufferedReader bf = new BufferedReader(new FileReader(file))) {
			String s;
			while ((s = bf.readLine()) != null)
				buffer.append(s.trim());
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		String html = buffer.toString();
		Document doc = Jsoup.parse(html);
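		// BasicParser, HTMLUtil, ArticleUtil, RegulationMatcher and Util are helpers resolved
		// from the same package (no explicit imports): they extract the title and body text.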
		BasicParser parser = new BasicParser();
		String title = parser.getTitle(doc);
		String content = parser.getContent(doc);
		String _content = HTMLUtil.getTextFromHtml(content).replaceAll(" ", "");
		//		System.out.println(_content);
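		// Split the plain text into sentences; every sentence accepted by RegulationMatcher
		// becomes a question ("Q"), paired with the full article content as its answer ("A").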
		List<String> sentences = ArticleUtil.getSentences(_content);
		for (String s : sentences) {
			if (RegulationMatcher.match(s.trim())) {
				System.out.println(s);
				Map<String, String> map = new HashMap<String, String>();
				map.put("Q", s);
				map.put("A", content);
				list.add(map);
			}
		}
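		// Always append one extra pair built from the title; it is blanked out when either
		// the title or the content contains no Chinese text.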
		Map<String, String> map = new HashMap<String, String>();
		if (!Util.isContainedChinese(title) || !Util.isContainedChinese(content)) {
			map.put("Q", "");
			map.put("A", "");
		} else {
			map.put("Q", title);
			map.put("A", content);
		}
		list.add(map);

		//		try (QaExtractor extactor = new QaExtractor(new MinioDataService("http://192.168.199.130:9000",
		//				"3VHZ1DH1C21KNKJ6ZI66", "WnrZQ4aZtJQW1nGHfvMlPyUTnCuIpdnloeEE++iU", "extract"))) {
	//			// This turns off the summary feature
		//			extactor.getComponentFactory().get("QaExtractionFromSummary", QaExtractionFromSummary.class).setSkip(true);
	//			// extract also accepts a File argument; note that it runs asynchronously by default
		//			extactor.extract(file, (qas) -> {
		//				qas.forEach((a) -> {
		//					System.out.println(a);
		//				});
		//			});
	//			// To wait for the results synchronously, call this right after extract
		//			extactor.getComponentFactory().get(Pipeline.class).run();
		//		}

		return list;
	}
}



