All downloads are free. The search and download functionality uses the official Maven repository.

com.github.dennisit.vplus.data.utils.NlpUtils Maven / Gradle / Ivy

/*--------------------------------------------------------------------------
 *  Copyright (c) 2010-2020, Elon.su All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * Neither the name of the elon developer nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * Author: Elon.su, you can also mail [email protected]
 *--------------------------------------------------------------------------
 */
package com.github.dennisit.vplus.data.utils;

import com.google.common.collect.Lists;
import com.hankcs.hanlp.HanLP;
import org.apache.commons.lang3.StringUtils;
import org.nlpcn.commons.lang.finger.FingerprintService;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.WordAlert;

import java.util.List;
import java.util.Optional;

/**
 * Static helpers for common Chinese text-processing tasks, implemented as thin
 * wrappers around HanLP and the nlpcn commons-lang library.
 *
 * @author Elon.su
 */
public class NlpUtils {

    /**
     * Converts text to Traditional Chinese.
     *
     * @param text text to convert; may be {@code null}
     * @return the converted text, or {@code text} itself when it is {@code null} or empty
     */
    public static String toTraditionalCN(String text) {
        // The earlier Optional.ofNullable(text).orElse(convert(text)) form handed back the
        // input unconverted whenever it was non-null, so the conversion never took effect.
        if (StringUtils.isEmpty(text)) {
            return text;
        }
        return HanLP.convertToTraditionalChinese(text);
    }

    /**
     * Converts text to Simplified Chinese.
     *
     * @param text text to convert; may be {@code null}
     * @return the converted text, or {@code text} itself when it is {@code null} or empty
     */
    public static String toSimplifiedCN(String text) {
        // Same fix as toTraditionalCN: actually return the converted value.
        if (StringUtils.isEmpty(text)) {
            return text;
        }
        return HanLP.convertToSimplifiedChinese(text);
    }

    /**
     * Converts text to a single pinyin string.
     *
     * <p>Delegates to {@code HanLP.convertToPinyinString(text, "", false)}: the syllables are
     * joined with an empty separator. NOTE(review): per HanLP the third argument is
     * {@code remainNone}; with {@code false}, characters without a pinyin reading are presumably
     * copied through unchanged and the output carries no tone marks - confirm against the
     * HanLP version in use.
     *
     * @param text text to convert
     * @return the pinyin of {@code text} as one concatenated string
     */
    public static String toPinyinText(String text) {
        return HanLP.convertToPinyinString(text, "", false);
    }

    /**
     * Computes a fingerprint for a piece of text, intended for de-duplication.
     *
     * <p>Any piece of text can be mapped to a short, effectively random value that
     * distinguishes it from other texts. According to the original notes, the approach here is
     * to extract feature words from the text and compress them into an MD5 fingerprint, which
     * can then be used for hash-based de-duplication (e.g. of search or recommendation results).
     * The actual algorithm lives in {@code FingerprintService}.
     *
     * @param str text to fingerprint
     * @return the fingerprint produced by {@code FingerprintService#fingerprint}
     */
    public static String fingerprint(String str) {
        return new FingerprintService().fingerprint(str);
    }

    /**
     * Removes HTML tags from text by delegating to {@code StringUtil.rmHtmlTag}.
     *
     * <p>Illustrative example (exact behaviour is defined by the library):
     * <pre>
     *     params: {@code "<b>hello</b> world"}
     *     output: {@code "hello world"}
     * </pre>
     *
     * @param str text that may contain HTML markup
     * @return the text with HTML tags removed
     */
    public static String removeHtmlTag(String str) {
        return StringUtil.rmHtmlTag(str);
    }

    /**
     * Normalizes characters in the text by delegating to {@code WordAlert.alertStr}.
     *
     * <p>Example carried over from the original documentation:
     * <pre>
     *     params: az AZ AZ az 09&#8226;
     *     output: az az az az 09&#183;
     * </pre>
     * NOTE(review): the example suggests letters are lower-cased and some symbols are
     * normalized; the original input presumably also contained full-width characters that
     * were lost when this file was extracted - confirm against {@code WordAlert}.
     *
     * @param str text to normalize
     * @return the normalized text
     */
    public static String correction(String str) {
        char[] result = WordAlert.alertStr(str);
        return new String(result);
    }

    /**
     * Extracts a summary from an article.
     *
     * @param text   the article
     * @param topNum maximum number of summary sentences requested
     * @return the summary sentences; an empty list when {@code text} is blank
     */
    public static List<String> extractTabloid(String text, int topNum) {
        if (StringUtils.isBlank(text)) {
            return Lists.newArrayList();
        }
        return HanLP.extractSummary(text, topNum);
    }

    /**
     * Extracts keywords from an article.
     *
     * @param text   the article
     * @param topNum number of keywords requested
     * @return the keywords; an empty list when {@code text} is blank
     */
    public static List<String> extractTags(String text, int topNum) {
        // Guard blank input the same way extractTabloid does.
        if (StringUtils.isBlank(text)) {
            return Lists.newArrayList();
        }
        return HanLP.extractKeyword(text, topNum);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy