All downloads are free. Search and download functionality uses the official Maven repository.

com.mayabot.nlp.segment.wordnet.Wordnet Maven / Gradle / Ivy

/*
 * Copyright 2018 mayabot.com authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mayabot.nlp.segment.wordnet;

import com.mayabot.nlp.common.utils.CustomCharSequence;
import com.mayabot.nlp.segment.common.VertexHelper;
import org.jetbrains.annotations.NotNull;

import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;

/**
 * 

* #S# [0] #Start 品 [1] 品质 质 [2] 质 和 [3] {和、和服} 服 [4] 服务 务 [5] 务 #E# [6] #End *

*

* sindex表示智能下标索引,用 -1 表示Start length表示End *

* WordNodeLinkedList 是一个优化过的linkedlist,去除了中间node节点,并增量排序和特殊性质 *

* 实现CharSequence接口,可以进行当做字符串类型处理 * O * * @author jimichan */ public final class Wordnet implements CharSequence { /** * 节点,每一行都是前缀词,和 char数字一一对应 */ private final VertexRow[] slotList; /** * 启始行 下标 -1 */ private final VertexRow begin; /** * 结尾行 下标 charSize */ private final VertexRow end; /** * 原始句子对应的数组 */ final char[] charArray; /** * 字符的数量 */ private final int charSize; /** * 3.1.0 新增,存储上下文 */ private Map context = null; private Wordnet(Wordnet parent, int from, int length) { this.charArray = Arrays.copyOfRange(parent.charArray, from, from + length); this.charSize = length; this.begin = new VertexRow(-1, this); this.end = new VertexRow(charSize, this); slotList = new VertexRow[charSize]; for (int i = 0; i < length; i++) { VertexRow row = new VertexRow(i, this); for (Vertex v : parent.slotList[from + i]) { row.getOrCrete(v.length); } slotList[i] = row; } getBeginRow().put(VertexHelper.newBegin()); getEndRow().put(VertexHelper.newEnd()); } /** * 构建一个空的网,槽的数量是charArray.length * * @param charArray 字符数组 */ public Wordnet(char[] charArray) { this.charArray = charArray; this.charSize = charArray.length; this.begin = new VertexRow(-1, this); this.end = new VertexRow(charSize, this); // 创建一个空的数组 slotList = new VertexRow[charSize]; //初始化数组里面的对象,提前初始化好 for (int i = 0; i < charSize; i++) { slotList[i] = new VertexRow(i, this); } getBeginRow().put(VertexHelper.newBegin()); getEndRow().put(VertexHelper.newEnd()); } public Wordnet subWordnet(int from, int length) { return new Wordnet(this, from, length); } /** * 补齐空洞,这样viterbi总是能走到底 */ public void fillNill() { for (VertexRow row : slotList) { if (row.first == null) { Vertex vertex = row.getOrCrete(1); vertex.freq = 1; } } } /** * 寻找没有被所有路径覆盖的位置 * * @return 返回 false的位置是没有覆盖掉的 */ public BitSet findNoOverWords() { BitSet noOverWords = new BitSet(); for (int i = 0; i < charSize; i++) { VertexRow row = slotList[i]; if (row != null) { Vertex p = row.first(); while (p != null) { noOverWords.set(row.rowNum, row.rowNum + p.length); p = 
p.next; } } } return noOverWords; } /** * 寻找 存在跳转到当前row但是当前row没有跳出节点。 * 或者 根本不存在跳转到当前行的路径(前置条件是没有被路径覆盖) * 寻找 悬空行,会导致路径中断 * * @return bitset 孤悬节点 */ public BitSet findDangling() { BitSet bitSet = new BitSet(charSize); BitSet noOverWords = findNoOverWords(); //第一行肯定是跳入的 bitSet.set(0); for (int i = 0; i < charSize; i++) { VertexRow row = slotList[i]; if (row != null) { Vertex p = row.first(); while (p != null) { noOverWords.set(row.rowNum + p.length); p = p.next; } } } for (int i = 0; i < charSize; i++) { //如果没人跳入,而且也没被覆盖 if (!(bitSet.get(i) || noOverWords.get(i))) { bitSet.set(i); } } //bit set 里面为 false的,那么肯定是孤力 return null; } /** * 如果对应行不存在,那么会自动创建 * * @param sindex index * @return VertexRow */ public final VertexRow getRow(int sindex) { return indexAt(sindex); } public final VertexRow row(int sindex) { return indexAt(sindex); } /** * 节点数字,包含了Start,end两个标记节点 动态统计,调用的时候请注意 * * @return 包含了Start,end两个标记节点的数量 */ public int size() { int count = 0; for (int i = charSize - 1; i >= 0; i--) { VertexRow r = slotList[i]; if (r != null) { count += r.size(); } } count += this.begin.size(); count += this.end.size(); return count; } /** * 返回第多少行的链表 *

*

     * 	length = 5
     *  sloat length = 7
     *
     * -1 0 1 2 3 4 5	优化过的智能下标
     *    0 1 2 3 4		char array 下标
     *  0 1 2 3 4 5 6	sloat数组下标
     * 
* * @param sindex SmartIndex -1表示Start * @return VertexRow */ private VertexRow indexAt(int sindex) { if (sindex == -1) { return begin; } if (sindex == charSize) { return end; } return slotList[sindex]; } /** * 非空的行数 * * @return 非空的行数 */ public int notNullRowNums() { int count = 0; for (int i = charSize - 1; i >= 0; i--) { VertexRow r = slotList[i]; if (r != null && !r.isEmpty()) { count++; } } if (!this.begin.isEmpty()) { count++; } if (!this.end.isEmpty()) { count++; } return count; } ////////////// 顶点操作//////////////// /** * 添加顶点,重复添加就忽略 返回被替换的节点 * * @param charOffset sindex 下标和char对应 * @param vertex 顶点 * @return 返回被替换的节点, 新增节点返回null */ public Vertex put(int charOffset, Vertex vertex) { return getRow(charOffset).put(vertex); } /** * put一个 ,但是返回的是一个最新的Vertext对象,然后可以继续设置属性 * * @param offset 偏移量 * @param length 长度 * @return 返回的是一个最新的Vertex对象 */ public Vertex put(int offset, int length) { Vertex vertex = new Vertex(length); getRow(offset).put(vertex); return vertex; } /** * 行首节点 * * @param sindex * @return Vertex */ public Vertex getRowFirst(int sindex) { VertexRow row = row(sindex); return row.getFirst(); } /** * 获取某一行长度为length的节点 没有就返回null * * @param sindex * @param length * @return Vertex */ public Vertex getVertex(int sindex, int length) { VertexRow row = row(sindex); return row.get(length); } /** * 检查是否包含 offset-length的vertext * * @param sindex * @param length * @return boolean true not contain */ public boolean isNotContains(int sindex, int length) { VertexRow row = row(sindex); return !row.contains(length); } /** * 返回一行里面有几个节点 * * @param sindex * @return int size */ public int sizeInRow(int sindex) { VertexRow row = row(sindex); if (row == null) { return 0; } return row.size(); } /** * 访问网络里面所有的Vertex节点. 
从后向前了 * * @param consumer */ public final void accessAllVertex(Consumer consumer) { for (int i = slotList.length - 1; i >= 0; i--) { VertexRow row = slotList[i]; if (row != null) { for (Vertex v = row.first(); v != null; v = v.next()) { consumer.accept(v); } } } } /** * 根据当前的最优路径,设定已经选择的最优路径中的Vertex的最优网络标记为true */ @Override public String toString() { return new WordNetToStringBuilder(this, false).toString(); } public String toMoreString() { return new WordNetToStringBuilder(this, true).toString(); } public VertexRow getBeginRow() { return begin; } public VertexRow getEndRow() { return end; } /** * 比如字符串长度为5,那么这个length返回5 * * @return char size */ public int getCharSizeLength() { return charSize; } /** * 原始的 * * @return chars */ public @NotNull char[] getCharArray() { return charArray; } @Override public int length() { return charArray.length; } @Override public char charAt(int index) { if (index < 0) { return ' '; } if (index >= charSize) { return ' '; } return charArray[index]; } @Override public CharSequence subSequence(int start, int end) { return new CustomCharSequence(charArray, start, end - start); } public VertexRow[] getSlotList() { return slotList; } public T get(String key){ if (context == null) { return null; }else{ return (T) context.get(key); } } public void set(String key, Object value) { if(context == null){ context = new HashMap<>(); } context.put(key, value); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy