All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.ansj.app.extracting.Extracting Maven / Gradle / Ivy

The newest version!
package org.ansj.app.extracting;

import org.ansj.app.extracting.domain.ExtractingResult;
import org.ansj.app.extracting.domain.Rule;
import org.ansj.app.extracting.domain.RuleIndex;
import org.ansj.app.extracting.exception.RuleFormatException;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.library.DicLibrary;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.util.Graph;
import org.ansj.util.TermUtil;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * 抽取引擎
 * 

*

* Created by Ansj on 28/08/2017. */ public class Extracting { private RuleIndex ruleIndex = new RuleIndex(); public Extracting() { } public Extracting(List lines) throws RuleFormatException { addRules(lines); } public Extracting(InputStream is, String encoding) throws RuleFormatException, IOException { try { List lines = IOUtil.readFile2List(IOUtil.getReader(is, encoding)); addRules(lines); } finally { is.close(); } } public Extracting(Reader reader) throws RuleFormatException, IOException { try { List lines = IOUtil.readFile2List(new BufferedReader(reader)); addRules(lines); } finally { reader.close(); } } public void addRules(List lines) throws RuleFormatException { for (String line : lines) { addRuleStr(line); } } public void addRule(Rule rule) { ruleIndex.add(rule); } public void addRuleStr(String line) throws RuleFormatException { if (StringUtil.isNotBlank(line)) { addRule(Lexical.parse(line)); } } /** * 传入文本分词并抽取 * @param content 需要分析的文本 * @param forests 对文本分词加载的词典 * @return 抽取结果集 */ public ExtractingResult parse(String content, Forest... forests) { Forest[] myForests = null; if (forests == null) { myForests = new Forest[]{ruleIndex.getForest()}; } else if (forests.length == 0) { myForests = new Forest[]{ruleIndex.getForest(), DicLibrary.get()}; } else { myForests = new Forest[forests.length + 1]; myForests[0] = ruleIndex.getForest(); for (int i = 0; i < forests.length; i++) { myForests[i + 1] = forests[i]; } } Result terms = DicAnalysis.parse(content, myForests); return parse(terms, false); } /** * 对分词后的结果进行抽取 * @param terms 分词后的结果 * @return 抽取结果集 */ public ExtractingResult parse(Result terms){ return parse(terms,true) ; } /** * 对分词后的结果进行抽取 * @param terms 分词后的结果 * @param useForest 是否使用词典 * @return 抽取结果集 */ private ExtractingResult parse(Result terms, boolean useForest) { if(terms==null || terms.size()==0){ return new ExtractingResult() ; } if (useForest) { Graph graph = new Graph(terms); new UserDefineRecognition(TermUtil.InsertTermType.REPLACE,ruleIndex.getForest()).recognition(graph); graph.rmLittlePath(); List result = new ArrayList(); int length = graph.terms.length - 1; for (int i = 0; i < length; i++) { if (graph.terms[i] != null) { result.add(graph.terms[i]); } } terms = new Result(result) ; } List tasks = new ArrayList<>(); ExtractingResult result = new ExtractingResult(); for (int i = 0; i < terms.size(); i++) { Term term = terms.get(i); Set sets = new HashSet<>(); Set rules = ruleIndex.getRules(term.getName()); if (rules != null) { sets.addAll(rules); } rules = ruleIndex.getRules(":" + term.getNatureStr()); if (rules != null) { sets.addAll(rules); } rules = ruleIndex.getRules(":*"); if (rules != null) { sets.addAll(rules); } for (Rule rule : sets) { tasks.add(new ExtractingTask(result, rule, i, terms)); } } for (ExtractingTask task : tasks) { task.run(); } return result; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy