com.mayabot.nlp.segment.wordnet.Wordpath Maven / Gradle / Ivy
Show all versions of mynlp Show documentation
/*
* Copyright 2018 mayabot.com authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mayabot.nlp.segment.wordnet;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.mayabot.nlp.logging.InternalLogger;
import com.mayabot.nlp.logging.InternalLoggerFactory;
import com.mayabot.nlp.segment.Nature;
import java.util.BitSet;
import java.util.Iterator;
import java.util.function.Consumer;
/**
* WordPath表示对文本划分词的描述。
*
* bitset 下标[i]
* true 表示在1 后面插入分割,0表示不分割
*
* 使用切分符号来表达单词路径的切分,而不是采用List。
* 这个数据结构具有非常好的性质,随时合并、分裂、拆分,而且性能和空间优越。
*
* @author jimichan [email protected]
*/
public class Wordpath {
private static InternalLogger logger = InternalLoggerFactory.getInstance(Wordpath.class);
private BitSet bitSet;
private int length;
private Wordnet wordnet;
public Wordpath(Wordnet wordnet) {
this.wordnet = wordnet;
this.length = wordnet.length();
this.bitSet = new BitSet(length);
//每个字后面都有切分符号,默认切成单子
bitSet.set(0, length);
}
public Wordnet getWordnet() {
return wordnet;
}
/**
* @return 一个包含头和尾的选择路径
*/
public Iterable getBestPathWithBE() {
//FIXME 此处可以优化
Iterable b = ImmutableList.of(wordnet.getBeginRow().getFirst());
Iterable m = this::iteratorVertex;
Iterable e = ImmutableList.of(wordnet.getEndRow().getFirst());
return Iterables.concat(b, m, e);
}
/**
* 返回一个完整最优路径的迭代器。如果最后一个next不完整,那么抛出异常. 这样选择路径的时候,就不需要构建一个PATH list的数据结构了
*
* @return 不包含开始和结束的点路径
*/
public Iterator iteratorVertex() {
return new AbstractIterator() {
private WordPointer pointer = wordPointer();
@Override
protected Vertex computeNext() {
boolean hasNext = pointer.next();
if (!hasNext) {
return endOfData();
}
final int from = pointer.getFrom();
final int len = pointer.getLen();
Vertex theVertex = wordnet.getVertex(from, len);
if (theVertex == null) {
// System.out.println(wordnet.toMoreString());
//@ RepairWordnetProcessor 这里去修复了这个错误,到时要在之前去调用
// logger.error("row: " + from + " len " + len + " select is null"
// +"\nInput ");
Vertex tmp = wordnet.put(from,len);
tmp.nature = Nature.x;
return tmp;
// throw new IllegalStateException("row: " + from + " len " + len + " select is null");
}
return theVertex;
}
};
}
/**
* 直接连接划定的长度的词汇。至于是否打破,可以自行弥补
*
* @param from
* @param length
* @return 合并后对应的Vertex
*/
public Vertex combine(int from, int length) {
this.connect(from, length);
if (wordnet.isNotContains(from, length)) {
return wordnet.put(from, length);
} else {
return wordnet.getVertex(from, length);
}
}
/**
* 最优路径算法一般会调用这个方法,划定一个词语
*
* @param vertex
*/
public void combine(Vertex vertex) {
this.connect(vertex.getRowNum(), vertex.length());
}
/**
* 寻找path里面的词语,但是词图中却不存在.自动创建Vertex。consumer可以去设置Vertex的属性内容
*/
public void findunloadVertext(Consumer consumer) {
final int lastIndex = length - 1;
// 0 1 2 3 4 5 6
// 0 1 1 0 1 1 1
//bitSet.nextSetBit()
// printlnBitSet(bitSet);
for (int i = 0; i < length; ) {
int nextSplitIndex = bitSet.nextSetBit(i);
if (i == lastIndex) {
if (wordnet.isNotContains(i, 1)) {
// 一个点
consumer.accept(wordnet.put(i, 1));
}
i++;
} else {
int len = nextSplitIndex - i + 1;
if (wordnet.isNotContains(i, len)) {
// 一个点
consumer.accept(wordnet.put(i, len));
}
i += len;
}
}
}
/**
* 计算切分出,多个个词片断出来
*
* @return word count
*/
public int wordCount() {
return bitSet.cardinality();
}
public int wordCountInPath() {
return this.wordCount();
}
public class WordPointer {
private int from;
private int len;
private final int lastIndex = length - 1;
private int i = 0;
public WordPointer() {
}
public int getFrom() {
return from;
}
public int getLen() {
return len;
}
public boolean next() {
int nextSplitIndex = Wordpath.this.bitSet.nextSetBit(i);
if (i >= length) {
return false;
}
if (i == lastIndex) {
this.from = i;
this.len = 1;
i++;
} else {
int le = nextSplitIndex - i + 1;
this.from = i;
this.len = le;
i += le;
}
return true;
}
}
public WordPointer wordPointer() {
return new WordPointer();
}
/**
* 划定一个词语,并保持状态
* 从from位置开始,将长度为length的连城一片
*
* @param from
* @param length
*/
private void connect(int from, int length) {
if (length <= 0 || from < 0) {
return;
}
//设置前面的插板
if (from > 0) {
bitSet.set(from - 1);
}
//设置最后的插板
bitSet.set(from + length - 1);
//消除之间所有吃插板
bitSet.set(from, from + length - 1, false);
}
/**
* 是否可以联合多个单词片断,但是有没有打断别的分词
* 比如 AA B CC D E , 此时可以联合BCC
* 但是不可以联合ABCC 因为打断了AA
*
* @param from
* @param len
* @return false表示没有破坏前后词, true是破坏了
*/
public boolean willCutOtherWords(int from, int len) {
//TODO 检查这里的实现是否完美正确
// from 前面是否切分,前面如果有插板,那么返回false
if (from != 0 && !bitSet.get(from - 1)) {
return true;
}
//词尾原来是不是又插板,如果true,返回false
int to = from + len - 1;
//to 0 1 2 3
return !bitSet.get(to);
}
@Override
public String toString() {
int last = bitSet.length() - 1;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < wordnet.length(); i++) {
sb.append(wordnet.charAt(i));
if (bitSet.get(i) && i != last) {
sb.append(" | ");
}
}
return sb.toString();
}
public BitSet getBitSet() {
return bitSet;
}
public void reset() {
this.bitSet.set(0, length);
}
}