org.apache.lucene.analysis.ja.dict.BinaryDictionary Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-kuromoji Show documentation
Show all versions of lucene-analyzers-kuromoji Show documentation
Lucene Kuromoji Japanese Morphological Analyzer
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.dict;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IOUtils;
/**
* Base class for a binary-encoded in-memory dictionary.
*/
public abstract class BinaryDictionary implements Dictionary {
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
public static final String DICT_HEADER = "kuromoji_dict";
public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
public static final int VERSION = 1;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final String[] posDict;
private final String[] inflTypeDict;
private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
InputStream mapIS = null, dictIS = null, posIS = null;
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
String[] inflFormDict = null;
String[] inflTypeDict = null;
ByteBuffer buffer = null;
boolean success = false;
try {
mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
mapIS = new BufferedInputStream(mapIS);
DataInput in = new InputStreamDataInput(mapIS);
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
targetMap = new int[in.readVInt()];
targetMapOffsets = new int[in.readVInt()];
int accum = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMap.length; ofs++) {
final int val = in.readVInt();
if ((val & 0x01) != 0) {
targetMapOffsets[sourceId] = ofs;
sourceId++;
}
accum += val >>> 1;
targetMap[ofs] = accum;
}
if (sourceId + 1 != targetMapOffsets.length)
throw new IOException("targetMap file format broken");
targetMapOffsets[sourceId] = targetMap.length;
mapIS.close(); mapIS = null;
posIS = getResource(POSDICT_FILENAME_SUFFIX);
posIS = new BufferedInputStream(posIS);
in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
int posSize = in.readVInt();
posDict = new String[posSize];
inflTypeDict = new String[posSize];
inflFormDict = new String[posSize];
for (int j = 0; j < posSize; j++) {
posDict[j] = in.readString();
inflTypeDict[j] = in.readString();
inflFormDict[j] = in.readString();
// this is how we encode null inflections
if (inflTypeDict[j].length() == 0) {
inflTypeDict[j] = null;
}
if (inflFormDict[j].length() == 0) {
inflFormDict[j] = null;
}
}
posIS.close(); posIS = null;
dictIS = getResource(DICT_FILENAME_SUFFIX);
// no buffering here, as we load in one large buffer
in = new InputStreamDataInput(dictIS);
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
final int size = in.readVInt();
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
final ReadableByteChannel channel = Channels.newChannel(dictIS);
final int read = channel.read(tmpBuffer);
if (read != size) {
throw new EOFException("Cannot read whole dictionary");
}
dictIS.close(); dictIS = null;
buffer = tmpBuffer.asReadOnlyBuffer();
success = true;
} finally {
if (success) {
IOUtils.close(mapIS, posIS, dictIS);
} else {
IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
}
}
this.targetMap = targetMap;
this.targetMapOffsets = targetMapOffsets;
this.posDict = posDict;
this.inflTypeDict = inflTypeDict;
this.inflFormDict = inflFormDict;
this.buffer = buffer;
}
protected final InputStream getResource(String suffix) throws IOException {
return getClassResource(getClass(), suffix);
}
// util, reused by ConnectionCosts and CharacterDefinition
public static final InputStream getClassResource(Class> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
if (is == null)
throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
return is;
}
public void lookupWordIds(int sourceId, IntsRef ref) {
ref.ints = targetMap;
ref.offset = targetMapOffsets[sourceId];
// targetMapOffsets always has one more entry pointing behind last:
ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
}
@Override
public int getLeftId(int wordId) {
return buffer.getShort(wordId) >>> 3;
}
@Override
public int getRightId(int wordId) {
return buffer.getShort(wordId) >>> 3;
}
@Override
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 2); // Skip id
}
@Override
public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
if (hasBaseFormData(wordId)) {
int offset = baseFormOffset(wordId);
int data = buffer.get(offset++) & 0xff;
int prefix = data >>> 4;
int suffix = data & 0xF;
char text[] = new char[prefix+suffix];
System.arraycopy(surfaceForm, off, text, 0, prefix);
for (int i = 0; i < suffix; i++) {
text[prefix+i] = buffer.getChar(offset + (i << 1));
}
return new String(text);
} else {
return null;
}
}
@Override
public String getReading(int wordId, char surface[], int off, int len) {
if (hasReadingData(wordId)) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
} else {
// the reading is the surface form, with hiragana shifted to katakana
char text[] = new char[len];
for (int i = 0; i < len; i++) {
char ch = surface[off+i];
if (ch > 0x3040 && ch < 0x3097) {
text[i] = (char)(ch + 0x60);
} else {
text[i] = ch;
}
}
return new String(text);
}
}
@Override
public String getPartOfSpeech(int wordId) {
return posDict[getLeftId(wordId)];
}
@Override
public String getPronunciation(int wordId, char surface[], int off, int len) {
if (hasPronunciationData(wordId)) {
int offset = pronunciationOffset(wordId);
int pronunciationData = buffer.get(offset++) & 0xff;
return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
} else {
return getReading(wordId, surface, off, len); // same as the reading
}
}
@Override
public String getInflectionType(int wordId) {
return inflTypeDict[getLeftId(wordId)];
}
@Override
public String getInflectionForm(int wordId) {
return inflFormDict[getLeftId(wordId)];
}
private static int baseFormOffset(int wordId) {
return wordId + 4;
}
private int readingOffset(int wordId) {
int offset = baseFormOffset(wordId);
if (hasBaseFormData(wordId)) {
int baseFormLength = buffer.get(offset++) & 0xf;
return offset + (baseFormLength << 1);
} else {
return offset;
}
}
private int pronunciationOffset(int wordId) {
if (hasReadingData(wordId)) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
final int readingLength;
if ((readingData & 1) == 0) {
readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
} else {
readingLength = readingData >>> 1;
}
return offset + readingLength;
} else {
return readingOffset(wordId);
}
}
private boolean hasBaseFormData(int wordId) {
return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
}
private boolean hasReadingData(int wordId) {
return (buffer.getShort(wordId) & HAS_READING) != 0;
}
private boolean hasPronunciationData(int wordId) {
return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
}
private String readString(int offset, int length, boolean kana) {
char text[] = new char[length];
if (kana) {
for (int i = 0; i < length; i++) {
text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
}
} else {
for (int i = 0; i < length; i++) {
text[i] = buffer.getChar(offset + (i << 1));
}
}
return new String(text);
}
/** flag that the entry has baseform data. otherwise it's not inflected (same as surface form) */
public static final int HAS_BASEFORM = 1;
/** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
public static final int HAS_READING = 2;
/** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
public static final int HAS_PRONUNCIATION = 4;
}