com.mayabot.nlp.segment.lexer.perceptron.PerceptronSegmentAlgorithm Maven / Gradle / Ivy
package com.mayabot.nlp.segment.lexer.perceptron;
import com.mayabot.nlp.segment.Nature;
import com.mayabot.nlp.segment.WordSplitAlgorithm;
import com.mayabot.nlp.segment.common.BaseSegmentComponent;
import com.mayabot.nlp.segment.lexer.bigram.CoreDictionary;
import com.mayabot.nlp.segment.wordnet.Vertex;
import com.mayabot.nlp.segment.wordnet.Wordnet;
/**
* 基于核心词典的基础切词器
*
* @author jimichan
*/
public class PerceptronSegmentAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm {
private final PerceptronSegment perceptron;
private PerceptronsSegmentService service;
private CoreDictionary coreDictionary;
public PerceptronSegmentAlgorithm(PerceptronsSegmentService service, CoreDictionary coreDictionary) {
super(LEVEL1);
this.service = service;
this.coreDictionary = coreDictionary;
perceptron = service.getPerceptron();
}
@Override
public void fill(Wordnet wordnet) {
char[] text = wordnet.getCharArray();
int[] decode = perceptron.decode(text, false);
int p = 0;
for (int i = 0; i < decode.length; i++) {
int x = decode[i];
if (x == PerceptronSegment.S || x == PerceptronSegment.E) {
combine(wordnet, text, p, i - p + 1);
p = i + 1;
}
}
if (p < text.length) {
combine(wordnet, text, p, text.length - p);
}
}
private void combine(Wordnet wordnet, char[] text, int offset, int length) {
Vertex vertex = wordnet.put(offset, length);
int wordId = coreDictionary.wordId(text, offset, length);
if (wordId >= 0) {
int freq = coreDictionary.wordFreq(wordId);
vertex.wordID = wordId;
vertex.freq = freq;
} else {
vertex.setAbsWordNatureAndFreq(Nature.newWord);
}
}
}