org.apache.lucene.analysis.ja.dict.UserDictionary
Apache Lucene (module: kuromoji, artifact: lucene-analysis-kuromoji)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.dict;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.analysis.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
/** Class for building a User Dictionary. This class allows for custom segmentation of phrases. */
public final class UserDictionary implements Dictionary {
public static final String INTERNAL_SEPARATOR = "\u0000";
private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$");
private static final Pattern WHITESPACE = Pattern.compile("\\s");
private static final Pattern SPACES = Pattern.compile(" +");
// phrase text -> phrase ID
private final TokenInfoFST fst;
// holds wordid, length, length... indexed by phrase ID
private final int[][] segmentations;
// holds readings and POS, indexed by wordid
private final UserMorphData morphAtts;
static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
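// Each user-dictionary entry is a single CSV line of the form:
//   <surface>,<space-separated segmentation>,<space-separated readings>,<part-of-speech>
// For example (a commonly cited Kuromoji sample entry, shown here for illustration):
//   関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞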
public static UserDictionary open(Reader reader) throws IOException {
BufferedReader br = new BufferedReader(reader);
String line;
List<String[]> featureEntries = new ArrayList<>();
// text, segmentation, readings, POS
while ((line = br.readLine()) != null) {
// Remove comments
line = LINE_COMMENT.matcher(line).replaceAll("");
// Skip empty lines or comment lines
if (line.trim().isEmpty()) {
continue;
}
String[] values = CSVUtil.parse(line);
featureEntries.add(values);
}
if (featureEntries.isEmpty()) {
return null;
} else {
return new UserDictionary(featureEntries);
}
}
private UserDictionary(List<String[]> featureEntries) throws IOException {
int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and i'm not sure if it's needed/useful?
featureEntries.sort((left, right) -> left[0].compareTo(right[0]));
List<String> data = new ArrayList<>(featureEntries.size());
List<int[]> segmentations = new ArrayList<>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput).build();
IntsRefBuilder scratch = new IntsRefBuilder();
long ord = 0;
for (String[] values : featureEntries) {
String surface = WHITESPACE.matcher(values[0]).replaceAll("");
String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll("");
String[] segmentation = SPACES.split(values[1]);
String[] readings = SPACES.split(values[2]);
String pos = values[3];
if (segmentation.length != readings.length) {
throw new RuntimeException(
"Illegal user dictionary entry "
+ values[0]
+ " - the number of segmentations ("
+ segmentation.length
+ ")"
+ " does not the match number of readings ("
+ readings.length
+ ")");
}
if (!surface.equals(concatenatedSegment)) {
throw new RuntimeException(
"Illegal user dictionary entry "
+ values[0]
+ " - the concatenated segmentation ("
+ concatenatedSegment
+ ")"
+ " does not match the surface form ("
+ surface
+ ")");
}
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;
for (int i = 0; i < segmentation.length; i++) {
wordIdAndLength[i + 1] = segmentation[i].length();
data.add(readings[i] + INTERNAL_SEPARATOR + pos);
wordId++;
}
// add mapping to FST
String token = values[0];
scratch.growNoCopy(token.length());
scratch.setLength(token.length());
for (int i = 0; i < token.length(); i++) {
scratch.setIntAt(i, token.charAt(i));
}
fstCompiler.add(scratch.get(), ord);
segmentations.add(wordIdAndLength);
ord++;
}
this.fst =
new TokenInfoFST(
FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()), false);
this.morphAtts = new UserMorphData(data.toArray(new String[0]));
this.segmentations = segmentations.toArray(new int[0][]);
}
@Override
public UserMorphData getMorphAttributes() {
return morphAtts;
}
/**
* Looks up user-dictionary entries that match the given text.
*
* @param chars text buffer
* @param off offset into the buffer
* @param len length of text to match
* @return array of {wordId, position, length} triples, one per matched segment
*/
public int[][] lookup(char[] chars, int off, int len) throws IOException {
List<Match> matches = null;
int numResults = 0;
final FST.BytesReader fstReader = fst.getBytesReader();
final int end = off + len;
FST.Arc<Long> arc = new FST.Arc<>();
for (int startOffset = off; startOffset < end; startOffset++) {
int[] wordIdAndLength = null;
arc = fst.getFirstArc(arc);
int output = 0;
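// walk the FST one character at a time; arc.isFinal() can fire at several
// prefixes, and each hit overwrites the previous one, so the longest match
// starting at startOffset is the one recorded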
for (int i = 0, remaining = end - startOffset; i < remaining; i++) {
int ch = chars[startOffset + i];
if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
break; // continue to next position
}
output += arc.output().intValue();
if (arc.isFinal()) {
int finalOutput = output + arc.nextFinalOutput().intValue();
wordIdAndLength = segmentations[finalOutput];
}
}
if (wordIdAndLength != null) {
if (matches == null) {
matches = new ArrayList<>();
}
matches.add(new Match(startOffset - off, wordIdAndLength));
numResults += wordIdAndLength.length - 1;
}
}
if (numResults == 0) {
return EMPTY_RESULT;
}
int[][] result = new int[numResults][];
int index = 0;
for (int i = 0; i < matches.size(); i++) {
Match match = matches.get(i);
int[] wordIdAndLength = match.wordIdAndLength;
int wordId = wordIdAndLength[0];
// convert length to index
int position = match.position;
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
// add a {wordId, index, length} token to the results
int[] token = {wordId + j - 1, position, wordIdAndLength[j]};
result[index++] = token;
position += wordIdAndLength[j];
}
}
return result;
}
public TokenInfoFST getFST() {
return fst;
}
private static final int[][] EMPTY_RESULT = new int[0][];
public int[] lookupSegmentation(int phraseID) {
return segmentations[phraseID];
}
private record Match(int position, int[] wordIdAndLength) {}
}
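For reference, a minimal usage sketch (not part of the original file): it builds a dictionary from one in-memory CSV entry and prints the {wordId, position, length} triples that lookup returns. The demo class name and the sample entry are illustrative only, and this assumes lucene-analysis-kuromoji is on the classpath.

import java.io.StringReader;
import org.apache.lucene.analysis.ja.dict.UserDictionary;

public class UserDictionaryDemo {
  public static void main(String[] args) throws Exception {
    // one entry: surface, segmentation, readings, part-of-speech
    String entries = "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n";
    UserDictionary dict = UserDictionary.open(new StringReader(entries));
    char[] text = "関西国際空港".toCharArray();
    for (int[] token : dict.lookup(text, 0, text.length)) {
      // each result is {wordId, position, length}
      System.out.println("wordId=" + token[0] + " pos=" + token[1] + " len=" + token[2]);
    }
  }
}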