All downloads are FREE. Search and download functionality uses the official Maven repository.

com.mayabot.nlp.module.summary.SentenceSummary Maven / Gradle / Ivy

package com.mayabot.nlp.module.summary;

import com.mayabot.nlp.common.Guava;
import com.mayabot.nlp.common.Lists;
import com.mayabot.nlp.segment.LexerReader;

import java.util.*;
import java.util.regex.Pattern;

/**
 * 文本摘要
 *
 * @author jimichan
 */
public class SentenceSummary {

    private static Pattern lineSplitter = Pattern.compile("[\r\n]");
    private static Pattern sentenceSplitter = Pattern.compile("[,,。::“”??!!;;]");

    private LexerReader lexerReader;

    public SentenceSummary(LexerReader lexerReader) {
        this.lexerReader = lexerReader;
    }

    /**
     * 对文章进行摘要
     *
     * @param document 目标文档
     * @param max      需要摘要的长度
     * @return 摘要文本
     */
    public String summary(String document, int max) {
        List sentenceList = splitSentence(document);
        if (sentenceList.isEmpty()) {
            return "";
        }

        int sentence_count = sentenceList.size();
        int document_length = document.length();
        int sentence_length_avg = document_length / sentence_count;
        int size = max / sentence_length_avg + 1;

        List> docs = toDocument(sentenceList);
        TextRankSentence textRank = new TextRankSentence(docs);
        int[] topSentence = textRank.getTopSentence(size);
        List resultList = new LinkedList();
        for (int i : topSentence) {
            resultList.add(sentenceList.get(i));
        }

        resultList = permutation(resultList, sentenceList);
        resultList = pickSentences(resultList, max);

        return Guava.join(resultList, "。");
    }

    /**
     * 对文章进行摘要
     *
     * @param document 文档
     * @param top      需要的关键句的个数
     * @return 关键句列表
     */
    public List summarySentences(String document, int top) {

        List sentences = splitSentence(document);
        List> docs = toDocument(sentences);

        TextRankSentence textRank = new TextRankSentence(docs);
        int[] topSentence = textRank.getTopSentence(top);

        List resultList = new LinkedList();
        for (int i : topSentence) {
            resultList.add(sentences.get(i));
        }

        return resultList;
    }


    private List permutation(List resultList, final List sentenceList) {
        Collections.sort(resultList, new Comparator() {
            @Override
            public int compare(String o1, String o2) {
                Integer num1 = sentenceList.indexOf(o1);
                Integer num2 = sentenceList.indexOf(o2);
                return num1.compareTo(num2);
            }
        });
        return resultList;
    }

    private List pickSentences(List resultList, int max_length) {
        List summary = new ArrayList();
        int count = 0;
        for (String result : resultList) {
            if (count + result.length() <= max_length) {
                summary.add(result);
                count += result.length();
            }
        }
        return summary;
    }


    private List splitSentence(String document) {
        List sentences = Lists.newArrayList();

        Guava.split(document, lineSplitter).forEach(line ->
                sentences.addAll(Guava.split(line, sentenceSplitter))
        );

        return sentences;
    }

    private List> toDocument(List setences) {
        List> sentences = Lists.newArrayList();

        setences.forEach(sentence -> {
            sentences.add(Lists.newArrayList(lexerReader.scan(sentence).toWordSequence()));
        });

        return sentences;
    }

    public LexerReader getLexerReader() {
        return lexerReader;
    }

    public SentenceSummary setLexerReader(LexerReader lexerReader) {
        this.lexerReader = lexerReader;
        return this;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy