All downloads are free. The search and download functionality uses the official Maven repository.

com.hankcs.hanlp.mining.word2vec.DocVectorModel Maven / Gradle / Ivy

There is a newer version: portable-1.8.5
Show newest version
/*
 * Hankcs
 * [email protected]
 * 2017-06-20 PM1:38
 *
 * 
 * Copyright (c) 2017, 码农场. All Rights Reserved, http://www.hankcs.com/
 * This source is subject to Hankcs. Please contact Hankcs to get more information.
 * 
 */
package com.hankcs.hanlp.mining.word2vec;


import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;

import java.util.*;

/**
 * 文档向量模型
 *
 * @author hankcs
 */
public class DocVectorModel extends AbstractVectorModel
{
    private final WordVectorModel wordVectorModel;

    public DocVectorModel(WordVectorModel wordVectorModel)
    {
        super();
        this.wordVectorModel = wordVectorModel;
    }

    /**
     * 添加文档
     *
     * @param id      文档id
     * @param content 文档内容
     * @return 文档向量
     */
    public Vector addDocument(int id, String content)
    {
        Vector result = query(content);
        storage.put(id, result);
        return result;
    }


    /**
     * 查询最相似的前10个文档
     *
     * @param query 查询语句(或者说一个文档的内容)
     * @return
     */
    public List> nearest(String query)
    {
        return queryNearest(query, 10);
    }


    /**
     * 将一个文档转为向量
     *
     * @param content 文档
     * @return 向量
     */
    public Vector query(String content)
    {
        if (content == null || content.length() == 0) return null;
        List termList = NotionalTokenizer.segment(content);
        Vector result = new Vector(dimension());
        int n = 0;
        for (Term term : termList)
        {
            Vector vector = wordVectorModel.vector(term.word);
            if (vector == null)
            {
                continue;
            }
            ++n;
            result.addToSelf(vector);
        }
        if (n == 0)
        {
            return null;
        }
        result.normalize();
        return result;
    }

    @Override
    public int dimension()
    {
        return wordVectorModel.dimension();
    }

    /**
     * 文档相似度计算
     * @param what
     * @param with
     * @return
     */
    public float similarity(String what, String with)
    {
        Vector A = query(what);
        if (A == null) return -1f;
        Vector B = query(with);
        if (B == null) return -1f;
        return A.cosineForUnitVector(B);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy