org.apdplat.word.analysis.SimilarityRanker Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of word Show documentation

word分词是一个Java实现的中文分词组件，提供了多种基于词典的分词算法，并利用ngram模型来消除歧义。能准确识别英文、数字，以及日期、时间等数量词，能识别人名、地名、组织机构名等未登录词。同时提供了Lucene、Solr、ElasticSearch插件。

There is a newer version: 1.3.1

Show newest version

/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, [email protected]
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.word.analysis;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Similarity ranking: scores a list of target texts against a source text
 * and returns them ordered by that score.
 * @author 杨尚川
 */
public interface SimilarityRanker extends Similarity{
    /**
     * Computes the similarity between the source text and every target text
     * and ranks the target texts by their similarity score.
     * Delegates to {@link #rank(String, List, int)} with
     * {@code Integer.MAX_VALUE}, so no result is cut off.
     * @param source source text
     * @param targets target texts
     * @return ranked similarity results
     */
    default Hits rank(String source, List<String> targets){
        return rank(source, targets, Integer.MAX_VALUE);
    }

    /**
     * Computes the similarity between the source text and every target text,
     * ranks the target texts by their similarity score and keeps only the
     * first topN entries of that ranking.
     * The ordering is the natural ordering of {@code Hit}, which is defined
     * outside this file (presumably highest score first - confirm against Hit).
     * @param source source text
     * @param targets target texts
     * @param topN maximum number of ranked entries to keep
     * @return ranked similarity results, at most topN entries
     */
    default Hits rank(String source, List<String> targets, int topN){
        // The result can never hold more entries than there are targets.
        Hits hits = new Hits(Math.min(topN, targets.size()));
        List<Hit> ranked = targets
            .parallelStream()
            .map(target -> {
                double score = similarScore(source, target);
                Hit hit = new Hit();
                hit.setText(target);
                hit.setScore(score);
                return hit;
            })
            .sorted()
            .limit(topN)
            .collect(Collectors.toList());
        // Fill the result from the collected list on the calling thread,
        // in ranking order, rather than from inside the parallel pipeline.
        ranked.forEach(hits::addHit);
        return hits;
    }

    /**
     * Demo entry point: reads src/test/resources/dic.txt, trims every line,
     * keeps the lines longer than one character, and for each remaining line
     * prints its 15 top-ranked lines from the same list, using
     * {@code EditDistanceTextSimilarity} for scoring.
     * @param args unused
     * @throws Exception if the file cannot be read
     */
    static void main(String[] args) throws Exception{
        TextSimilarity textSimilarity = new EditDistanceTextSimilarity();
        List<String> sentences = Files.readAllLines(Paths.get("src/test/resources/dic.txt"))
                .stream()
                .map(String::trim)
                .filter(line -> line.length() > 1)
                .collect(Collectors.toList());
        System.out.println("开始计算 "+sentences.size()+" 句话的文本相似度");
        String separator = "****************************************************************************************************";
        // Running number of the sentence currently being reported.
        AtomicInteger i = new AtomicInteger();
        sentences.forEach(sentence -> {
            // Running number of the hit within the current sentence's report.
            AtomicInteger j = new AtomicInteger();
            System.out.println(separator);
            System.out.println(i.incrementAndGet() + "、文本 "+sentence+" 的相似度文本：");
            textSimilarity.rank(sentence, sentences, 15)
                    .getHits()
                    .forEach(hit -> System.out.println("\t" + j.incrementAndGet() + "、" + hit.getScore() + " " + hit.getText()));
            System.out.println(separator);
        });
    }
}