All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.mayabot.nlp.module.summary.KeywordSummary Maven / Gradle / Ivy

package com.mayabot.nlp.module.summary;

import com.mayabot.nlp.algorithm.TopMaxK;
import com.mayabot.nlp.common.Pair;
import com.mayabot.nlp.segment.LexerReader;

import java.io.Reader;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;

/**
 * 基于TextRank算法的关键字提取,适用于单文档
 * 

* 该对象可复用,多线程安全。 * * @author hankcs * @author jimichan */ public class KeywordSummary { /** * 阻尼系数一般取值为0.85 */ float d = 0.85f; /** * 最大迭代次数 */ private int maxIter = 200; private float minDiff = 0.001f; private LexerReader lexerReader; public KeywordSummary(LexerReader lexerReader) { this.lexerReader = lexerReader; } public List keyword(String text, int top) { return keywordWithScore(new StringReader(text), top).stream().map(it -> it.first).collect(Collectors.toList()); } public List keyword(Reader text, int top) { return keywordWithScore(text, top).stream().map(it -> it.first).collect(Collectors.toList()); } public List> keywordWithScore(String text, int top) { return keywordWithScore(new StringReader(text), top); } public List> keywordWithScore(Reader text, int top) { TopMaxK topMaxK = new TopMaxK(top, String.class); Map rank = getRank(text); rank.forEach((k, v) -> topMaxK.push(k, v)); return topMaxK.result(); } /** * 使用已经分好的词来计算rank * 该方法复制来自https://github.com/hankcs/HanLP/blob/master/src/main/java/com/hankcs/hanlp/summary/KeywordSummary.java * * @return */ private Map getRank(Reader reader) { Map> words = new TreeMap<>(); Queue que = new LinkedList<>(); lexerReader.scan(reader).stream().map(it -> it.word).forEach(w -> { if (!words.containsKey(w)) { words.put(w, new TreeSet<>()); } // 复杂度O(n-1) if (que.size() >= 5) { que.poll(); } for (String qWord : que) { if (w.equals(qWord)) { continue; } //既然是邻居,那么关系是相互的,遍历一遍即可 words.get(w).add(qWord); words.get(qWord).add(w); } que.offer(w); }); Map score = new HashMap<>(64); //依据TF来设置初值 for (Map.Entry> entry : words.entrySet()) { score.put(entry.getKey(), sigmoid(entry.getValue().size())); } for (int i = 0; i < maxIter; ++i) { Map m = new HashMap<>(); float max_diff = 0; for (Map.Entry> entry : words.entrySet()) { String key = entry.getKey(); Set value = entry.getValue(); m.put(key, 1 - d); for (String element : value) { int size = words.get(element).size(); if (key.equals(element) || size == 0) { continue; } m.put(key, m.get(key) + d / size * (score.get(element) == null ? 0 : score.get(element))); } max_diff = Math.max(max_diff, Math.abs(m.get(key) - (score.get(key) == null ? 0 : score.get(key)))); } score = m; if (max_diff <= minDiff) { break; } } return score; } private float sigmoid(float value) { return (float) (1d / (1d + Math.exp(-value))); } public LexerReader getLexerReader() { return lexerReader; } /** * 设置新的分词器。默认是去除停用词和标点符号的 * * @param lexerReader * @return KeywordSummary */ public KeywordSummary setLexerReader(LexerReader lexerReader) { this.lexerReader = lexerReader; return this; } public float getD() { return d; } public KeywordSummary setD(float d) { this.d = d; return this; } public int getMaxIter() { return maxIter; } public KeywordSummary setMaxIter(int maxIter) { this.maxIter = maxIter; return this; } public float getMinDiff() { return minDiff; } public KeywordSummary setMinDiff(float minDiff) { this.minDiff = minDiff; return this; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy