// com.atilika.kuromoji.dict.UserDictionary (Maven / Gradle / Ivy)
// The newest version!
/*-*
* Copyright © 2010-2015 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. A copy of the
* License is distributed with this work in the LICENSE.md file. You may
* also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.dict;
import com.atilika.kuromoji.trie.PatriciaTrie;
import com.atilika.kuromoji.util.DictionaryEntryLineParser;
import com.atilika.kuromoji.util.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * A {@link Dictionary} backed by user-supplied entries.
 *
 * <p>Each entry is a line parsed by {@link DictionaryEntryLineParser} into at least four
 * values: surface form, segmentation, readings and part-of-speech. When the segmentation
 * differs from the surface form, segmentation and readings are split on whitespace and must
 * contain the same number of items; every resulting segment is assigned its own consecutive
 * word id, starting at {@code CUSTOM_DICTIONARY_WORD_ID_OFFSET}.
 *
 * <p>Only the reading and part-of-speech features carry entry-specific values; every other
 * feature index yields {@code DEFAULT_FEATURE}.
 */
public class UserDictionary implements Dictionary {

    private static final String DEFAULT_FEATURE = "*";

    private static final String FEATURE_SEPARATOR = ",";

    private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;

    private static final int WORD_COST = -100000;

    private static final int LEFT_ID = 5;

    private static final int RIGHT_ID = 5;

    // Next word id to hand out; advanced once per segment added by addEntry
    private int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;

    // The word id below is the word id of the first segment of the surface string
    // surface string => [ word id, 1st token length, 2nd token length, ... , nth token length ]
    private final PatriciaTrie<int[]> entries = new PatriciaTrie<>();

    // Maps wordId to reading
    private final Map<Integer, String> readings = new HashMap<>();

    // Maps wordId to part-of-speech
    private final Map<Integer, String> partOfSpeech = new HashMap<>();

    private final int readingFeature;

    private final int partOfSpeechFeature;

    private final int totalFeatures;

    /**
     * Creates a user dictionary and populates it from the given stream.
     *
     * @param inputStream  UTF-8 encoded entry lines; read to the end but not closed here
     * @param totalFeatures  number of features reported by {@link #getAllFeaturesArray(int)}
     * @param readingFeature  feature index that yields the reading of a word
     * @param partOfSpeechFeature  feature index that yields the part-of-speech of a word
     * @throws IOException if reading the stream fails
     * @throws RuntimeException if an entry line is malformed
     */
    public UserDictionary(InputStream inputStream, int totalFeatures, int readingFeature, int partOfSpeechFeature)
        throws IOException {
        this.totalFeatures = totalFeatures;
        this.readingFeature = readingFeature;
        this.partOfSpeechFeature = partOfSpeechFeature;
        read(inputStream);
    }

    /**
     * Lookup words in text
     *
     * <p>For every start position the candidate is extended one character at a time for as
     * long as it remains a prefix of some stored surface form; the longest such candidate is
     * then looked up as an exact key. A hit contributes one match per segment of the entry.
     *
     * @param text  text to look up user dictionary matches for
     * @return list of UserDictionaryMatch, not null
     */
    public List<UserDictionaryMatch> findUserDictionaryMatches(String text) {
        List<UserDictionaryMatch> matchInfos = new ArrayList<>();
        int textLength = text.length();

        for (int startIndex = 0; startIndex < textLength; startIndex++) {
            int matchLength = 0;

            while (startIndex + matchLength < textLength
                && entries.containsKeyPrefix(text.substring(startIndex, startIndex + matchLength + 1))) {
                matchLength++;
            }

            if (matchLength == 0) {
                continue;
            }

            // The longest prefix found is not necessarily a complete key, hence the null check
            int[] details = entries.get(text.substring(startIndex, startIndex + matchLength));
            if (details != null) {
                matchInfos.addAll(makeMatchDetails(startIndex, details));
            }
        }
        return matchInfos;
    }

    /**
     * Expands a stored entry into one match per segment.
     *
     * @param matchStartIndex  index in the text where the surface form starts
     * @param details  { first word id, 1st token length, ..., nth token length }
     * @return matches with consecutive word ids and contiguous start offsets
     */
    private List<UserDictionaryMatch> makeMatchDetails(int matchStartIndex, int[] details) {
        List<UserDictionaryMatch> matchDetails = new ArrayList<>(details.length - 1);

        int segmentWordId = details[0];
        int offset = 0;

        for (int i = 1; i < details.length; i++) {
            int segmentLength = details[i];
            matchDetails.add(new UserDictionaryMatch(segmentWordId, matchStartIndex + offset, segmentLength));
            offset += segmentLength;
            segmentWordId++;
        }
        return matchDetails;
    }

    /**
     * Immutable description of one matched segment: its word id, where it starts in the
     * searched text, and how many characters it covers.
     */
    public static class UserDictionaryMatch {

        private final int wordId;

        private final int matchStartIndex;

        private final int matchLength;

        public UserDictionaryMatch(int wordId, int matchStartIndex, int matchLength) {
            this.wordId = wordId;
            this.matchStartIndex = matchStartIndex;
            this.matchLength = matchLength;
        }

        public int getWordId() {
            return wordId;
        }

        public int getMatchStartIndex() {
            return matchStartIndex;
        }

        public int getMatchLength() {
            return matchLength;
        }
    }

    /** Returns the same fixed left id for every word. */
    @Override
    public int getLeftId(int wordId) {
        return LEFT_ID;
    }

    /** Returns the same fixed right id for every word. */
    @Override
    public int getRightId(int wordId) {
        return RIGHT_ID;
    }

    /** Returns the same fixed word cost for every word. */
    @Override
    public int getWordCost(int wordId) {
        return WORD_COST;
    }

    /**
     * Returns the value of every feature index from {@code 0} up to the configured total.
     *
     * @param wordId  word id to look up
     * @return array of length {@code totalFeatures}
     */
    @Override
    public String[] getAllFeaturesArray(int wordId) {
        String[] features = new String[totalFeatures];
        for (int i = 0; i < totalFeatures; i++) {
            // Goes through the private helper rather than getFeature so that this method can
            // never re-enter itself via getAllFeatures
            features[i] = joinSelectedFeatures(wordId, i);
        }
        return features;
    }

    /**
     * Returns all features of a word joined by {@code FEATURE_SEPARATOR}.
     *
     * @param wordId  word id to look up
     * @return joined feature string
     */
    @Override
    public String getAllFeatures(int wordId) {
        return StringUtils.join(getAllFeaturesArray(wordId), FEATURE_SEPARATOR);
    }

    /**
     * Returns the requested features of a word joined by {@code FEATURE_SEPARATOR}.
     *
     * <p>An empty field list means "all features". Any non-empty list is honoured exactly as
     * given, including duplicates and arbitrary order.
     *
     * @param wordId  word id to look up
     * @param fields  feature indices to return, in the order wanted
     * @return joined feature string
     */
    @Override
    public String getFeature(int wordId, int... fields) {
        if (fields.length == 0) {
            return getAllFeatures(wordId);
        }
        return joinSelectedFeatures(wordId, fields);
    }

    /**
     * Resolves each requested feature index and joins the values.
     *
     * @param wordId  word id to look up
     * @param fields  feature indices to resolve
     * @return joined feature string
     */
    private String joinSelectedFeatures(int wordId, int... fields) {
        String[] features = new String[fields.length];

        for (int i = 0; i < fields.length; i++) {
            int featureNumber = fields[i];

            if (featureNumber == readingFeature) {
                features[i] = readings.get(wordId);
            } else if (featureNumber == partOfSpeechFeature) {
                features[i] = partOfSpeech.get(wordId);
            } else {
                features[i] = DEFAULT_FEATURE;
            }
        }
        return StringUtils.join(features, FEATURE_SEPARATOR);
    }

    /**
     * Reads entries line by line and adds each non-empty one to this dictionary.
     *
     * <p>Everything from a {@code #} to the end of the line is dropped, then the line is
     * trimmed. The stream is decoded as UTF-8 and is left open for the caller to close.
     *
     * @param input  stream of entry lines
     * @throws IOException if reading the stream fails
     * @throws RuntimeException if an entry line is malformed
     */
    public void read(InputStream input) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8));
        String line;

        while ((line = reader.readLine()) != null) {
            // Remove comments and trim leading and trailing whitespace
            line = line.replaceAll("#.*$", "").trim();

            // Skip empty lines or comment lines
            if (!line.isEmpty()) {
                addEntry(line);
            }
        }
    }

    /**
     * Parses one entry line and registers it.
     *
     * @param entry  entry line with surface, segmentation, readings and part-of-speech values
     * @throws RuntimeException if the line has fewer than four values, or if the number of
     *     segments differs from the number of readings
     */
    public void addEntry(String entry) {
        String[] values = DictionaryEntryLineParser.parseLine(entry);

        if (values.length < 4) {
            throw new RuntimeException("User dictionary entry not properly formatted: " + entry);
        }

        String surface = values[0];
        String segmentationValue = values[1];
        String readingsValue = values[2];
        String partOfSpeechValue = values[3];

        String[] segmentation;
        String[] segmentReadings;

        if (isCustomSegmentation(surface, segmentationValue)) {
            segmentation = split(segmentationValue);
            segmentReadings = split(readingsValue);
        } else {
            segmentation = new String[] {segmentationValue};
            segmentReadings = new String[] {readingsValue};
        }

        if (segmentation.length != segmentReadings.length) {
            throw new RuntimeException("User dictionary entry not properly formatted: " + entry);
        }

        // { wordId, 1st token length, 2nd token length, ... , nth token length }
        int[] wordIdAndLengths = new int[segmentation.length + 1];
        wordIdAndLengths[0] = wordId;

        for (int i = 0; i < segmentation.length; i++) {
            wordIdAndLengths[i + 1] = segmentation[i].length();

            // Every segment gets its own word id; all segments share the entry's part-of-speech
            this.readings.put(wordId, segmentReadings[i]);
            this.partOfSpeech.put(wordId, partOfSpeechValue);
            wordId++;
        }
        entries.put(surface, wordIdAndLengths);
    }

    /** An entry is custom-segmented when its segmentation value differs from its surface. */
    private boolean isCustomSegmentation(String surface, String segmentation) {
        return !surface.equals(segmentation);
    }

    /** Splits on runs of whitespace. */
    private String[] split(String input) {
        return input.split("\\s+");
    }
}