org.apache.lucene.analysis.ja.completion.KatakanaRomanizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-kuromoji Show documentation
Show all versions of lucene-analysis-kuromoji Show documentation
Apache Lucene (module: kuromoji)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.completion;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
* Converts a Katakana string to Romaji using the pre-defined
* Katakana-Romaji mapping rules. Internally, this repeatedly performs prefix match on the given
* char sequence to the pre-built keystroke array until it reaches the end of the sequence, or there
* are no matched keystrokes.
*/
public class KatakanaRomanizer {
private static final String ROMAJI_MAP_FILE = "romaji_map.txt";
private static KatakanaRomanizer INSTANCE;
static {
// Build romaji-map and keystroke arrays from the pre-defined Katakana-Romaji mapping file.
try (InputStreamReader is =
new InputStreamReader(
KatakanaRomanizer.class.getResourceAsStream(ROMAJI_MAP_FILE),
Charset.forName("UTF-8"));
BufferedReader ir = new BufferedReader(is)) {
Map> romajiMap = new HashMap<>();
String line;
while ((line = ir.readLine()) != null) {
if (line.startsWith("#")) {
continue;
}
String[] cols = line.trim().split(",");
if (cols.length < 2) {
continue;
}
CharsRef prefix = new CharsRef(cols[0]);
romajiMap.put(prefix, new ArrayList<>());
for (int i = 1; i < cols.length; i++) {
romajiMap.get(prefix).add(new CharsRef(cols[i]));
}
}
Set keystrokeSet = romajiMap.keySet();
int maxKeystrokeLength = keystrokeSet.stream().mapToInt(CharsRef::length).max().getAsInt();
CharsRef[][] keystrokes = new CharsRef[maxKeystrokeLength][];
for (int len = 0; len < maxKeystrokeLength; len++) {
final int l = len;
keystrokes[l] =
keystrokeSet.stream().filter(k -> k.length - 1 == l).toArray(CharsRef[]::new);
}
for (CharsRef[] ks : keystrokes) {
// keystroke array must be sorted in ascending order for binary search.
Arrays.sort(ks);
}
INSTANCE = new KatakanaRomanizer(keystrokes, romajiMap);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private final CharsRef[][] keystrokes;
private final Map> romajiMap;
/** Returns the singleton instance of {@code KatakanaRomenizer} */
public static KatakanaRomanizer getInstance() {
return INSTANCE;
}
private KatakanaRomanizer(CharsRef[][] keystrokes, Map> romajiMap) {
this.keystrokes = keystrokes;
this.romajiMap = romajiMap;
}
/**
* Translates a sequence of katakana to romaji. An input can produce multiple outputs because a
* keystroke can be mapped to multiple romajis.
*/
public List romanize(CharsRef input) {
assert CharSequenceUtils.isKatakanaOrHWAlphabets(input);
List pendingOutputs = new ArrayList<>();
int pos = 0;
while (pos < input.length) {
// Greedily looks up the longest matched keystroke.
// e.g.: Consider input="キョウ", then there are two matched keystrokes (romaji mapping rules)
// "キ" -> "ki" and "キョ" -> "kyo". Only the longest one "キョ" will be selected.
MatchedKeystroke matched = longestKeystrokeMatch(input, pos);
if (matched == null) {
break;
}
List candidates =
romajiMap.get(keystrokes[matched.keystrokeLen - 1][matched.keystrokeIndex]);
if (pendingOutputs.size() == 0) {
// There is no pending output.
// Add the matched keystrokes to pending outputs list.
for (CharsRef cref : candidates) {
CharsRefBuilder output = new CharsRefBuilder();
output.copyChars(cref);
pendingOutputs.add(output);
}
} else if (candidates.size() == 1) {
// There are one or more pending output(s) and one matched keystroke.
// Append the matched keystroke to all pending outputs.
// e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
// keystroke "ka";
// then results are "shika" and "sika".
CharsRef cref = candidates.get(0);
for (CharsRefBuilder pdgOutput : pendingOutputs) {
pdgOutput.append(cref.chars, 0, cref.length);
}
} else {
// There are one or more pending output(s) and multiple matched keystrokes.
// Combine the matched keystrokes to all pending outputs.
// e.g.: Consider we already have two pending outputs "shi" and "si" and the matched
// keystroke "n" and "nn".
// To produce all possible keystroke patterns, result outputs should be "shin", "shinn",
// "sin" and "sinn".
List outputs = new ArrayList<>();
for (CharsRef cref : candidates) {
for (CharsRefBuilder pdgOutput : pendingOutputs) {
CharsRefBuilder buffer = new CharsRefBuilder();
buffer.copyChars(pdgOutput.chars(), 0, pdgOutput.length());
buffer.append(cref.chars, cref.offset, cref.length);
outputs.add(buffer);
}
}
// update the pending outputs
pendingOutputs = outputs;
}
// proceed to the next input position
pos += matched.keystrokeLen;
}
if (pos < input.length) {
// add the remnants (that cannot be mapped to any romaji) as suffix
for (CharsRefBuilder output : pendingOutputs) {
output.append(input.chars, pos, input.length - pos);
}
}
return pendingOutputs.stream().map(CharsRefBuilder::get).collect(Collectors.toList());
}
private MatchedKeystroke longestKeystrokeMatch(CharsRef input, int inputOffset) {
for (int len = Math.min(input.length - inputOffset, keystrokes.length); len > 0; len--) {
CharsRef ref = new CharsRef(input.chars, inputOffset, len);
int index = Arrays.binarySearch(keystrokes[len - 1], ref);
if (index >= 0) {
return new MatchedKeystroke(len, index);
}
}
// there's no matched keystroke
return null;
}
private static class MatchedKeystroke {
final int keystrokeLen;
final int keystrokeIndex;
MatchedKeystroke(int keystrokeLen, int keystrokeIndex) {
this.keystrokeLen = keystrokeLen;
this.keystrokeIndex = keystrokeIndex;
}
}
}