All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.ansj.app.summary.SummaryComputer Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
package org.ansj.app.summary;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.ansj.app.keyword.KeyWordComputer;
import org.ansj.app.keyword.Keyword;
import org.ansj.app.summary.pojo.Summary;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.nlpcn.commons.lang.tire.SmartGetWord;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.MapCount;

/**
 * Automatic summarizer; the returned {@link Summary} also carries the keywords
 * that were used for scoring.
 *
 * <p>
 * The content is split into short sentence fragments, every fragment is scored
 * by the keywords it contains, and the run of consecutive fragments (about
 * {@code len} characters long) with the best score is returned as the summary.
 *
 * @author ansj
 */
public class SummaryComputer {

	/** Summary length used when the caller does not supply one. */
	private static final int DEFAULT_LENGTH = 300;

	/**
	 * Nature (part-of-speech) strings that are skipped when a query string is
	 * turned into keywords. NOTE(review): "w" is presumably the punctuation
	 * tag and "null" the tag of unrecognised terms - confirm against the
	 * nature table of the ansj version in use.
	 */
	private static final Set<String> FILTER_SET = new HashSet<String>();

	static {
		FILTER_SET.add("w");
		FILTER_SET.add("null");
	}

	/** Target summary length, in characters. */
	private final int len;

	/** Whether the assembled summary is cut down to the length budget. */
	private final boolean isSplitSummary;

	String title, content;

	/**
	 * Creates a summarizer with the default length (300) and truncation
	 * enabled.
	 *
	 * @param title   article title, passed to the keyword extractor
	 * @param content article body to summarize
	 */
	public SummaryComputer(String title, String content) {
		this(DEFAULT_LENGTH, true, title, content);
	}

	/**
	 * Creates a summarizer with a custom length and truncation enabled.
	 *
	 * @param len     target summary length
	 * @param title   article title, passed to the keyword extractor
	 * @param content article body to summarize
	 */
	public SummaryComputer(int len, String title, String content) {
		this(len, true, title, content);
	}

	/**
	 * Creates a fully configured summarizer.
	 *
	 * @param len            target summary length
	 * @param isSplitSummary {@code true} to cut the summary down to the length
	 *                       budget, {@code false} to keep whole sentences
	 * @param title          article title, passed to the keyword extractor
	 * @param content        article body to summarize
	 */
	public SummaryComputer(int len, boolean isSplitSummary, String title, String content) {
		this.len = len;
		this.isSplitSummary = isSplitSummary;
		this.title = title;
		this.content = content;
	}

	/**
	 * Computes the summary using keywords extracted from the article itself.
	 *
	 * @return the summary together with the keywords used
	 */
	public Summary toSummary() {
		return toSummary(new ArrayList<Keyword>());
	}

	/**
	 * Computes the summary for a user query: the query is segmented and every
	 * term whose nature is not in {@link #FILTER_SET} becomes a keyword.
	 *
	 * @param query user query string
	 * @return the summary together with the keywords used
	 */
	public Summary toSummary(String query) {

		List<Term> terms = NlpAnalysis.parse(query).getTerms();

		List<Keyword> keywords = new ArrayList<Keyword>();
		for (Term term : terms) {
			if (FILTER_SET.contains(term.natrue().natureStr)) {
				continue;
			}
			keywords.add(new Keyword(term.getName(), term.termNatures().allFreq, 1));
		}

		return toSummary(keywords);
	}

	/**
	 * Computes the summary from keywords supplied by the caller. When the list
	 * is {@code null} or empty, ten keywords are extracted from the title and
	 * content instead.
	 *
	 * @param keywords caller-computed keywords, may be {@code null} or empty
	 * @return the summary together with the keywords used
	 */
	public Summary toSummary(List<Keyword> keywords) {

		if (keywords == null || keywords.isEmpty()) {
			KeyWordComputer kc = new KeyWordComputer(10);
			keywords = kc.computeArticleTfidf(title, content);
		}
		return explan(keywords, content);
	}

	/**
	 * Builds the summary: scores every sentence against the keywords, picks
	 * the best starting sentence, concatenates sentences from there and, if
	 * requested, truncates the result.
	 *
	 * @param keywords keywords with their scores
	 * @param content  text to summarize; {@code null} is treated as empty
	 * @return the summary together with the keywords used
	 */
	private Summary explan(List<Keyword> keywords, String content) {

		SmartForest<Double> forest = new SmartForest<Double>();
		for (Keyword keyword : keywords) {
			forest.add(keyword.getName(), keyword.getScore());
		}

		// Split into sentences first, then score each one.
		char[] chars = content == null ? new char[0] : content.toCharArray();
		List<Sentence> sentences = toSentenceList(chars);
		for (Sentence sentence : sentences) {
			computeScore(sentence, forest);
		}

		int start = findBestStart(sentences);

		StringBuilder sb = new StringBuilder();
		for (int i = start; i < sentences.size(); i++) {
			sb.append(sentences.get(i).value);
			if (sb.length() > len) {
				break;
			}
		}

		String summaryStr = sb.toString();
		if (isSplitSummary && sb.length() > len) {
			summaryStr = truncateToBudget(sb);
		}

		return new Summary(keywords, summaryStr);
	}

	/**
	 * Finds the index of the sentence at which the best-scoring window starts.
	 * A window begins at sentence {@code i} and grows until its total length
	 * reaches {@code len} (or the text ends). Its score is the sum of the
	 * sentence scores multiplied by the number of distinct keywords it hits.
	 *
	 * @param sentences scored sentences, in document order
	 * @return start index of the best window, {@code 0} if no window scores
	 *         above zero
	 */
	private int findBestStart(List<Sentence> sentences) {
		double maxScore = 0;
		int maxIndex = 0;

		for (int i = 0; i < sentences.size(); i++) {
			// A fresh counter per window: keywords of earlier windows must not
			// inflate the distinct-keyword factor of this one.
			MapCount<String> hits = new MapCount<>();
			double windowScore = 0;
			int windowLength = 0;

			for (int j = i; j < sentences.size(); j++) {
				Sentence sentence = sentences.get(j);
				windowScore += sentence.score;
				windowLength += sentence.value.length();
				hits.addAll(sentence.mc.get());
				if (windowLength >= len) {
					break;
				}
			}

			windowScore *= hits.get().size();
			if (maxScore < windowScore) {
				maxScore = windowScore;
				maxIndex = i;
				if (windowLength < len) {
					// The window ran to the end of the text without reaching
					// len; later starts would only give shorter tails.
					break;
				}
			}
		}
		return maxIndex;
	}

	/**
	 * Cuts the text down to a budget of {@code len} units, where a character
	 * below code 256 (e.g. "abc") costs half a unit and any other character
	 * costs one.
	 *
	 * @param text assembled summary text
	 * @return the longest prefix that fits the budget
	 */
	private String truncateToBudget(CharSequence text) {
		double budget = len;
		StringBuilder out = new StringBuilder();

		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			budget -= (c < 256) ? 0.5 : 1;
			if (budget < 0) {
				break;
			}
			out.append(c);
		}

		// Do not leave half of a surrogate pair at the cut point.
		int kept = out.length();
		if (kept > 0 && kept < text.length() && Character.isHighSurrogate(out.charAt(kept - 1))) {
			out.setLength(kept - 1);
		}
		return out.toString();
	}

	/**
	 * Computes the score of one sentence. Every keyword occurrence found in
	 * the sentence contributes through {@link Sentence#updateScore}. A sentence
	 * whose score stays at zero gets a small penalty proportional to its
	 * length; any other score is divided by {@code ln(length + 3)} so that long
	 * sentences are not favoured merely for being long.
	 *
	 * @param sentence sentence to score (updated in place)
	 * @param forest   keyword trie mapping keyword to score
	 */
	private void computeScore(Sentence sentence, SmartForest<Double> forest) {
		SmartGetWord<Double> sgw = new SmartGetWord<Double>(forest, sentence.value);
		String name;
		while ((name = sgw.getFrontWords()) != null) {
			sentence.updateScore(name, sgw.getParam());
		}

		int length = sentence.value.length();
		if (sentence.score == 0) {
			sentence.score = length * -0.005;
		} else {
			sentence.score /= Math.log(length + 3);
		}
	}

	/**
	 * Splits text into sentence fragments. A fragment ends at a tab, line
	 * break, ideographic or no-break space, or at ASCII / full-width comma,
	 * semicolon, exclamation mark or question mark, or at the ideographic full
	 * stop. An ASCII '.' ends a fragment only when the next character is above
	 * code 128, so decimals and Latin abbreviations are not split. The
	 * delimiter stays at the end of its fragment; empty fragments are dropped.
	 *
	 * @param chars text to split; {@code null} yields an empty list
	 * @return the fragments in document order
	 */
	public List<Sentence> toSentenceList(char[] chars) {

		List<Sentence> sentences = new ArrayList<Sentence>();
		if (chars == null) {
			return sentences;
		}

		StringBuilder sb = new StringBuilder();

		for (int i = 0; i < chars.length; i++) {
			char c = chars[i];

			// Skip leading blanks of a fragment. NOTE(review): the explicit
			// character compared here was indistinguishable from an ASCII
			// space in the source; restored as no-break space (U+00A0), which
			// Character.isWhitespace does not cover - confirm against upstream.
			if (sb.length() == 0 && (Character.isWhitespace(c) || c == '\u00A0')) {
				continue;
			}

			sb.append(c);
			switch (c) {
			case '.':
				if (i < chars.length - 1 && chars[i + 1] > 128) {
					insertIntoList(sb, sentences);
					sb = new StringBuilder();
				}
				break;
			// The plain ASCII space is deliberately not a delimiter.
			// NOTE(review): the two space-like labels below were both rendered
			// as an ASCII space in the source (a duplicate label); restored as
			// ideographic space (U+3000) and no-break space (U+00A0) - confirm
			// against upstream.
			case '\t':
			case '\u3000':
			case '\u00A0':
			case ',':
			case '\uFF0C': // full-width comma
			case '\u3002': // ideographic full stop
			case ';':
			case '\uFF1B': // full-width semicolon
			case '!':
			case '\uFF01': // full-width exclamation mark
			case '?':
			case '\uFF1F': // full-width question mark
			case '\n':
			case '\r':
				insertIntoList(sb, sentences);
				sb = new StringBuilder();
				break;
			default:
				break;
			}
		}

		if (sb.length() > 0) {
			insertIntoList(sb, sentences);
		}

		return sentences;
	}

	/**
	 * Appends the buffered text as a new sentence unless it is blank after
	 * trimming.
	 *
	 * @param sb        buffer holding one fragment
	 * @param sentences list receiving the sentence
	 */
	private void insertIntoList(StringBuilder sb, List<Sentence> sentences) {
		String text = sb.toString().trim();
		if (text.length() > 0) {
			sentences.add(new Sentence(text));
		}
	}

	/**
	 * One sentence fragment together with its score and the keywords found in
	 * it.
	 */
	public class Sentence {
		String value;
		private double score;

		/** Occurrence count per keyword found in this sentence. */
		private MapCount<String> mc = new MapCount<>();

		public Sentence(String value) {
			this.value = value.trim();
		}

		/**
		 * Registers one occurrence of a keyword. The n-th occurrence of the
		 * same keyword adds only {@code score / n}, so repeating a keyword
		 * gives diminishing returns.
		 *
		 * @param name  the keyword that was found
		 * @param score the keyword's score
		 */
		public void updateScore(String name, double score) {
			mc.add(name);
			Double size = mc.get().get(name);
			this.score += score / size;
		}

		@Override
		public String toString() {
			return value;
		}
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy