// com.github.dennisit.vplus.data.utils.NlpUtils Maven / Gradle / Ivy
/*--------------------------------------------------------------------------
* Copyright (c) 2010-2020, Elon.su All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* Neither the name of the elon developer nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
* Author: Elon.su, you can also mail [email protected]
*--------------------------------------------------------------------------
*/
package com.github.dennisit.vplus.data.utils;
import com.google.common.collect.Lists;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.py.Pinyin;
import org.apache.commons.lang3.StringUtils;
import org.nlpcn.commons.lang.finger.FingerprintService;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.WordAlert;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* @author Elon.su
*/
public class NlpUtils {
/**
* 文本转换成繁体字
*
* @param text 待处理字符串
* @return 执行结果
*/
public static String toTraditionalCN(String text) {
return Optional.ofNullable(text).orElse(HanLP.convertToTraditionalChinese(text));
}
/**
* 文件转换成中文简体字
*
* @param text 待处理字符串
* @return 执行结果
*/
public static String toSimplifiedCN(String text) {
return Optional.ofNullable(text).orElse(HanLP.convertToSimplifiedChinese(text));
}
/**
* 转换字符成拼音
*
* @param text 待处理字符串
* @return [cháng, jiāng, chéng, zhăng]
*/
public static String toPinyinText(String text) {
return HanLP.convertToPinyinString(text, "", false);
}
public static String toPinyinAddr(String text) {
List pinyinList = HanLP.convertToPinyinList(text);
return StringUtils.join(Optional.ofNullable(pinyinList).orElse(Lists.newArrayList())
.stream()
.map(x -> x.getFirstChar())
.collect(Collectors.toList()), "");
}
/**
* 指纹去重
* 任何一段信息文字,都可以对应一个不太长的随机数,作为区别它和其它信息的指纹(Fingerprint)。只要算法设计的好,任何两段信息的指纹都很难重复,就如同人类的指纹一样。
* 信息指纹在加密、信息压缩和处理中有着广泛的应用。
* 我们这里的做法是文章抽取特征词,压缩为md5指纹。利用这些指纹进行hash去重。广泛应用在。搜索结果推荐结果去重。
*
* @param str 待处理字符串
* @return 执行结果
*/
public static String fingerprint(String str) {
return new FingerprintService().fingerprint(str);
}
/**
* 移除文本中的html标签
*
* params: "hello ansjBaiDu, suggest booke 《java pro》",
* output: "hello ansjBaiDu, suggest booke 《java pro》"
*
*
* @param str 待处理字符串
* @return 执行结果
*/
public static String removeHtmlTag(String str) {
return StringUtil.rmHtmlTag(str);
}
/**
* 纠正字符
*
* params: az AZ AZ az 09•
* output: az az az az 09·
*
*
* @param str 待处理字符串
* @return az az az az 09
*/
public static String correction(String str) {
char[] result = WordAlert.alertStr(str);
return new String(result);
}
/**
* 根据文章分析文章摘要
*
* @param text 文章
* @param topNum 摘要条数
* @return 摘要条数
*/
public static List extractTabloid(String text, int topNum) {
if (StringUtils.isBlank(text)) {
return Lists.newArrayList();
}
return HanLP.extractSummary(text, topNum);
}
/**
* 抽取文章关键词
*
* @param text 目标文本
* @param topNum 关键词数目
* @return 关键词集合
*/
public static List extractTags(String text, int topNum) {
return HanLP.extractKeyword(text, topNum);
}
}