// com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap (Maven / Gradle / Ivy)
/*
* Copyright 2018 mayabot.com authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* DoubleArrayTrieMap: Java implementation of Darts (Double-ARray Trie System)
*
*
* Copyright(C) 2001-2007 Taku Kudo <[email protected]>
* Copyright(C) 2009 MURAWAKI Yugo <[email protected]>
* Copyright(C) 2012 KOMIYA Atsushi <[email protected]>
*
*
*
* The contents of this file may be used under the terms of either of the GNU
* Lesser General Public License Version 2.1 or later (the "LGPL"), or the BSD
* License (the "BSD").
*
*/
/*
* Source code is based on, and partly borrowed from, https://github.com/hankcs/HanLP and https://github.com/NLPchina/ansj_seg
*/
package com.mayabot.nlp.algorithm.collection.dat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.TreeMap;
import static com.mayabot.nlp.common.utils.DataInOutputUtils.readIntArray;
import static com.mayabot.nlp.common.utils.DataInOutputUtils.writeIntArray;
/**
 * A String-to-int map backed by a double-array trie (DAT).
 *
 * <p>Values are held in a primitive {@code int[]} rather than boxed {@code Integer}
 * objects, which lowers the memory footprint. The value of a key lives at the array
 * position equal to the index the trie reports for that key.
 *
 * <p>Lookups that miss return {@code -1}; a stored value of {@code -1} therefore cannot
 * be told apart from "absent" through the {@code get} methods. Use {@code indexOf} or
 * {@code containsKey} when that distinction matters.
 *
 * @author jimichan
 */
public class DoubleArrayTrieStringIntMap {

    /** Result returned by value lookups when the key is not present. */
    private static final int NOT_FOUND = -1;

    /** Values, addressed by the index the trie assigns to each key. */
    private final int[] values;

    /** Trie that maps keys to indexes into {@link #values}. */
    private final DoubleArrayTrie dat;

    /**
     * Restores a map previously written with {@link #save(DataOutput)}.
     *
     * @param in input positioned at the start of the serialized map
     * @throws IOException if reading from {@code in} fails
     */
    public DoubleArrayTrieStringIntMap(DataInput in) throws IOException {
        // Read order must mirror save(): trie first, then the value array.
        this.dat = new DoubleArrayTrie(in);
        this.values = readIntArray(in);
    }

    /**
     * Wraps an existing trie and value array. The array is used as-is (not copied).
     *
     * @param dat    trie mapping keys to indexes
     * @param values values addressed by trie index
     */
    public DoubleArrayTrieStringIntMap(DoubleArrayTrie dat, int[] values) {
        this.values = values;
        this.dat = dat;
    }

    /**
     * Builds the trie from {@code keys} and pairs it with {@code values}.
     *
     * @param keys   keys, which must already be in lexicographic order
     * @param values values addressed by trie index; presumably {@code values[i]} belongs
     *               to {@code keys.get(i)} - confirm against {@code DoubleArrayTrie}
     */
    public DoubleArrayTrieStringIntMap(ArrayList<String> keys, int[] values) {
        this(new DoubleArrayTrie(keys), values);
    }

    /**
     * Builds the map from a sorted map; keys and values are taken in the map's
     * iteration order.
     *
     * @param map source entries; values must not be {@code null}
     */
    public DoubleArrayTrieStringIntMap(TreeMap<String, Integer> map) {
        ArrayList<String> keys = new ArrayList<>(map.keySet());
        int[] unboxed = new int[map.size()];
        int next = 0;
        // values() iterates in the same order as keySet(), so positions line up with keys.
        for (Integer value : map.values()) {
            unboxed[next++] = value;
        }
        this.dat = new DoubleArrayTrie(keys);
        this.values = unboxed;
    }

    /**
     * Writes the trie followed by the value array.
     *
     * @param out destination
     * @throws IOException if writing to {@code out} fails
     */
    public void save(DataOutput out) throws IOException {
        dat.write(out);
        writeIntArray(values, out);
    }

    /**
     * Creates a matcher over {@code text} using the underlying trie's matcher.
     *
     * @param text   text to scan
     * @param offset offset into the text, passed through to the trie matcher
     * @return a matcher that also exposes the mapped int value of each match
     */
    public DATMapMatcherInt match(String text, int offset) {
        return new DATMapMatcherInt(dat.matcher(text, offset));
    }

    /**
     * Creates a matcher over {@code text} using the underlying trie's "long" matcher.
     *
     * @param text   text to scan
     * @param offset offset into the text, passed through to the trie matcher
     * @return a matcher that also exposes the mapped int value of each match
     */
    public DATMapLongMatcherInt matchLong(char[] text, int offset) {
        return new DATMapLongMatcherInt(dat.matcherLong(text, offset));
    }

    /**
     * Creates a matcher over {@code text} using the underlying trie's "long" matcher.
     *
     * @param text   text to scan
     * @param offset offset into the text, passed through to the trie matcher
     * @return a matcher that also exposes the mapped int value of each match
     */
    public DATMapLongMatcherInt matchLong(String text, int offset) {
        return new DATMapLongMatcherInt(dat.matcherLong(text, offset));
    }

    /**
     * Creates a matcher over {@code text} using the underlying trie's matcher.
     *
     * @param text   text to scan
     * @param offset offset into the text, passed through to the trie matcher
     * @return a matcher that also exposes the mapped int value of each match
     */
    public DATMapMatcherInt match(char[] text, int offset) {
        return new DATMapMatcherInt(dat.matcher(text, offset));
    }

    /**
     * Adapter around a {@link DATMatcher} that resolves the current match's index to
     * the int value stored in the enclosing map.
     */
    public class DATMapMatcherInt {

        DATMatcher datMater;

        public DATMapMatcherInt(DATMatcher datMater) {
            this.datMater = datMater;
        }

        /** Delegates to the wrapped matcher's {@code next()}. */
        public boolean next() {
            return datMater.next();
        }

        /** Delegates to the wrapped matcher's {@code getBegin()}. */
        public int getBegin() {
            return datMater.getBegin();
        }

        /** Delegates to the wrapped matcher's {@code getLength()}. */
        public int getLength() {
            return datMater.getLength();
        }

        /**
         * Returns the value mapped to the current match.
         *
         * @return the stored value, or {@code -1} when the matcher reports a negative index
         */
        public int getValue() {
            int index = datMater.getIndex();
            // Any negative index means "no entry", consistent with the get(...) lookups.
            return index < 0 ? NOT_FOUND : values[index];
        }

        /** Delegates to the wrapped matcher's {@code getIndex()}. */
        public int getIndex() {
            return datMater.getIndex();
        }
    }

    /**
     * Adapter around a {@link DATLongMatcher} that resolves the current match's index to
     * the int value stored in the enclosing map.
     */
    public class DATMapLongMatcherInt {

        DATLongMatcher datMater;

        public DATMapLongMatcherInt(DATLongMatcher datMater) {
            this.datMater = datMater;
        }

        /** Delegates to the wrapped matcher's {@code next()}. */
        public boolean next() {
            return datMater.next();
        }

        /** Delegates to the wrapped matcher's {@code getBegin()}. */
        public int getBegin() {
            return datMater.getBegin();
        }

        /** Delegates to the wrapped matcher's {@code getLength()}. */
        public int getLength() {
            return datMater.getLength();
        }

        /**
         * Returns the value mapped to the current match.
         *
         * @return the stored value, or {@code -1} when the matcher reports a negative index
         */
        public int getValue() {
            int index = datMater.getIndex();
            // Any negative index means "no entry", consistent with the get(...) lookups.
            return index < 0 ? NOT_FOUND : values[index];
        }

        /** Delegates to the wrapped matcher's {@code getIndex()}. */
        public int getIndex() {
            return datMater.getIndex();
        }
    }

    /**
     * Returns the number of stored values (the length of the value array).
     *
     * @return number of values
     */
    public int size() {
        return values.length;
    }

    /**
     * Exact-match lookup of a key's index.
     *
     * @param key key to look up
     * @return the index of the key; negative when the key is absent
     */
    public int indexOf(CharSequence key) {
        // Zeros are forwarded as-is; presumably len 0 means "whole key" and nodePos 0
        // means "start at the root" - confirm against DoubleArrayTrie.indexOf.
        return dat.indexOf(key, 0, 0, 0);
    }

    /**
     * Exact-match lookup of a key's index, forwarded unchanged to the trie.
     *
     * @param key     text containing the key
     * @param pos     start position of the key within {@code key}
     * @param len     length of the key
     * @param nodePos trie position to start searching from (allows starting below the root)
     * @return the index of the key; negative when the key is absent
     */
    public int indexOf(CharSequence key, int pos, int len, int nodePos) {
        return dat.indexOf(key, pos, len, nodePos);
    }

    /**
     * Exact-match lookup of a key's index, starting from trie position 0.
     *
     * @param chars char array containing the key
     * @param pos   start position of the key within {@code chars}
     * @param len   length of the key
     * @return the index of the key; negative when the key is absent
     */
    public int indexOf(char[] chars, int pos, int len) {
        return dat.indexOf(chars, pos, len, 0);
    }

    /**
     * Exact-match lookup of a key's index, forwarded unchanged to the trie.
     *
     * @param keyChars char array containing the key
     * @param pos      start position of the key within {@code keyChars}
     * @param len      length of the key
     * @param nodePos  trie position to start searching from (allows starting below the root)
     * @return the index of the key; negative when the key is absent
     */
    public int indexOf(char[] keyChars, int pos, int len, int nodePos) {
        return dat.indexOf(keyChars, pos, len, nodePos);
    }

    /**
     * Looks up the index of a single-character key.
     *
     * @param ch the character key
     * @return the index reported by the trie; negative when absent
     */
    public int indexOf(char ch) {
        return dat.indexOf(ch);
    }

    /**
     * Exact-match lookup of a key's value.
     *
     * @param key key to look up
     * @return the stored value, or {@code -1} when the key is absent
     */
    public int get(CharSequence key) {
        return valueForIndex(indexOf(key));
    }

    /**
     * Exact-match lookup of the value for a key given as a slice of {@code key}.
     *
     * @param key    text containing the key
     * @param offset start position of the key within {@code key}
     * @param length length of the key
     * @return the stored value, or {@code -1} when the key is absent
     */
    public int get(CharSequence key, int offset, int length) {
        return valueForIndex(indexOf(key, offset, length, 0));
    }

    /**
     * Exact-match lookup of the value for a key given as a whole char array.
     *
     * @param key the key characters
     * @return the stored value, or {@code -1} when the key is absent
     */
    public int get(char[] key) {
        return valueForIndex(indexOf(key, 0, key.length, 0));
    }

    /**
     * Exact-match lookup of the value for a key given as a slice of a char array.
     *
     * @param key    char array containing the key
     * @param offset start position of the key within {@code key}
     * @param len    length of the key
     * @return the stored value, or {@code -1} when the key is absent
     */
    public int get(char[] key, int offset, int len) {
        return valueForIndex(indexOf(key, offset, len, 0));
    }

    /**
     * Returns the value stored at {@code index}. No bounds check is done beyond the
     * array access itself.
     *
     * @param index index as returned by one of the {@code indexOf} methods
     * @return the value at that index
     */
    public int getValueAt(int index) {
        return values[index];
    }

    /**
     * @param key key to test
     * @return {@code true} if the key is present
     */
    public boolean containsKey(String key) {
        return indexOf(key) >= 0;
    }

    /**
     * @param key single-character key to test
     * @return {@code true} if the key is present
     */
    public boolean containsKey(char key) {
        return indexOf(key) >= 0;
    }

    /**
     * Overwrites the value of an existing key. Keys cannot be added this way.
     *
     * @param key   key whose value should change
     * @param value new value
     * @return {@code true} if the key exists and was updated, {@code false} otherwise
     */
    public boolean set(String key, int value) {
        int index = indexOf(key);
        if (index < 0) {
            return false;
        }
        values[index] = value;
        return true;
    }

    /**
     * Reads the value at {@code index} from the value array.
     * For efficiency the argument is not validated.
     *
     * @param index array index
     * @return the value at that index
     */
    public int get(int index) {
        return values[index];
    }

    /** Maps a trie index to its value, translating any negative index to {@code -1}. */
    private int valueForIndex(int index) {
        return index >= 0 ? getValueAt(index) : NOT_FOUND;
    }
}