org.apache.lucene.analysis.ja.dict.BinaryDictionary Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-kuromoji Show documentation
Lucene Kuromoji Japanese Morphological Analyzer
There is a newer version: 8.11.4
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.dict;


import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IOUtils;

/**
 * Base class for a binary-encoded in-memory dictionary.
 *
 * <p>Three resources are loaded by the constructor, each located at
 * {@code resourcePath + suffix}:
 * <ul>
 *   <li>{@link #TARGETMAP_FILENAME_SUFFIX}: maps a source id to a run of word ids,</li>
 *   <li>{@link #POSDICT_FILENAME_SUFFIX}: part-of-speech, inflection type and inflection form
 *       strings,</li>
 *   <li>{@link #DICT_FILENAME_SUFFIX}: the word entries themselves, kept in a read-only
 *       {@link ByteBuffer}.</li>
 * </ul>
 *
 * <p>A word id is a byte offset into the entry buffer. As read by the accessors of this class,
 * an entry is laid out as follows:
 * <ul>
 *   <li>a 16-bit value whose upper 13 bits are the left/right id and whose lower 3 bits are the
 *       {@link #HAS_BASEFORM}, {@link #HAS_READING} and {@link #HAS_PRONUNCIATION} flags,</li>
 *   <li>a signed 16-bit word cost,</li>
 *   <li>optional base form data: one byte (high nibble = number of chars shared with the start
 *       of the surface form, low nibble = number of suffix chars) followed by the suffix chars,
 *       two bytes each,</li>
 *   <li>optional reading data: one byte (bit 0 = kana flag, remaining bits = length in chars)
 *       followed by the chars, one byte each when the kana flag is set, two bytes each
 *       otherwise,</li>
 *   <li>optional pronunciation data, encoded like the reading.</li>
 * </ul>
 */
public abstract class BinaryDictionary implements Dictionary {

  /**
   * Used to specify where (dictionary) resources get loaded from.
   */
  public enum ResourceScheme {
    CLASSPATH, FILE
  }

  /** Suffix appended to the resource path to locate the word entry data. */
  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
  /** Suffix appended to the resource path to locate the target map. */
  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
  /** Suffix appended to the resource path to locate the part-of-speech dictionary. */
  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";

  /** Codec header name expected at the start of the word entry data. */
  public static final String DICT_HEADER = "kuromoji_dict";
  /** Codec header name expected at the start of the target map. */
  public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
  /** Codec header name expected at the start of the part-of-speech dictionary. */
  public static final String POSDICT_HEADER = "kuromoji_dict_pos";
  /** The only format version accepted for all three resources. */
  public static final int VERSION = 1;

  private final ResourceScheme resourceScheme;
  private final String resourcePath;
  private final ByteBuffer buffer;
  private final int[] targetMapOffsets, targetMap;
  private final String[] posDict;
  private final String[] inflTypeDict;
  private final String[] inflFormDict;

  /**
   * Loads the dictionary from the classpath, using this (concrete) class's name as the
   * resource path.
   *
   * @throws IOException if a resource is missing, truncated or malformed
   */
  protected BinaryDictionary() throws IOException {
    this(ResourceScheme.CLASSPATH, null);
  }

  /**
   * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
   * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
   * this class's name as the path.
   * @throws IllegalArgumentException if {@code resourcePath} is null and the scheme is not CLASSPATH
   * @throws IOException if a resource is missing, truncated or malformed
   */
  protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
    this.resourceScheme = resourceScheme;
    if (resourcePath == null) {
      if (resourceScheme != ResourceScheme.CLASSPATH) {
        throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme");
      }
      this.resourcePath = getClass().getName().replace('.', '/');
    } else {
      this.resourcePath = resourcePath;
    }
    InputStream mapIS = null, dictIS = null, posIS = null;
    int[] targetMapOffsets = null, targetMap = null;
    String[] posDict = null;
    String[] inflFormDict = null;
    String[] inflTypeDict = null;
    ByteBuffer buffer = null;
    boolean success = false;
    try {
      // --- target map: source id -> run of word ids ---
      mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
      mapIS = new BufferedInputStream(mapIS);
      DataInput in = new InputStreamDataInput(mapIS);
      CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
      targetMap = new int[in.readVInt()];
      targetMapOffsets = new int[in.readVInt()];
      int accum = 0, sourceId = 0;
      for (int ofs = 0; ofs < targetMap.length; ofs++) {
        final int val = in.readVInt();
        // bit 0 marks the first word id of a new source id
        if ((val & 0x01) != 0) {
          if (sourceId >= targetMapOffsets.length) {
            // corrupt file: report it as a format error instead of an array index failure
            throw brokenTargetMap(targetMap.length, targetMapOffsets.length, sourceId);
          }
          targetMapOffsets[sourceId] = ofs;
          sourceId++;
        }
        // the remaining bits are a delta relative to the previous word id
        accum += val >>> 1;
        targetMap[ofs] = accum;
      }
      if (sourceId + 1 != targetMapOffsets.length) {
        throw brokenTargetMap(targetMap.length, targetMapOffsets.length, sourceId);
      }
      // sentinel entry, so the length of the last run can be computed like all others
      targetMapOffsets[sourceId] = targetMap.length;
      mapIS.close(); mapIS = null;

      // --- part-of-speech / inflection strings ---
      posIS = getResource(POSDICT_FILENAME_SUFFIX);
      posIS = new BufferedInputStream(posIS);
      in = new InputStreamDataInput(posIS);
      CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
      int posSize = in.readVInt();
      posDict = new String[posSize];
      inflTypeDict = new String[posSize];
      inflFormDict = new String[posSize];
      for (int j = 0; j < posSize; j++) {
        posDict[j] = in.readString();
        inflTypeDict[j] = in.readString();
        inflFormDict[j] = in.readString();
        // this is how we encode null inflections
        if (inflTypeDict[j].length() == 0) {
          inflTypeDict[j] = null;
        }
        if (inflFormDict[j].length() == 0) {
          inflFormDict[j] = null;
        }
      }
      posIS.close(); posIS = null;

      // --- word entries ---
      dictIS = getResource(DICT_FILENAME_SUFFIX);
      // no buffering here, as we load in one large buffer
      in = new InputStreamDataInput(dictIS);
      CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
      final int size = in.readVInt();
      final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
      final ReadableByteChannel channel = Channels.newChannel(dictIS);
      // A single read() is allowed to return fewer bytes than the buffer has room for, so keep
      // reading until the buffer is full; only a real end-of-stream means the data is truncated.
      while (tmpBuffer.hasRemaining()) {
        if (channel.read(tmpBuffer) < 0) {
          throw new EOFException("Cannot read whole dictionary");
        }
      }
      dictIS.close(); dictIS = null;
      // all later accesses use absolute indices, so the buffer position is irrelevant
      buffer = tmpBuffer.asReadOnlyBuffer();
      success = true;
    } finally {
      if (success) {
        IOUtils.close(mapIS, posIS, dictIS);
      } else {
        IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
      }
    }

    this.targetMap = targetMap;
    this.targetMapOffsets = targetMapOffsets;
    this.posDict = posDict;
    this.inflTypeDict = inflTypeDict;
    this.inflFormDict = inflFormDict;
    this.buffer = buffer;
  }

  /** Builds the exception reported when the target map contents are inconsistent. */
  private static IOException brokenTargetMap(int targetMapLength, int targetMapOffsetsLength, int sourceId) {
    return new IOException("targetMap file format broken; targetMap.length=" + targetMapLength
                           + ", targetMapOffsets.length=" + targetMapOffsetsLength
                           + ", sourceId=" + sourceId);
  }

  /**
   * Opens the resource at {@code resourcePath + suffix} using this dictionary's resource scheme.
   *
   * @param suffix suffix appended to this dictionary's resource path
   * @return an open stream; the caller is responsible for closing it
   * @throws IOException if the resource cannot be opened
   */
  protected final InputStream getResource(String suffix) throws IOException {
    return getResource(resourceScheme, resourcePath + suffix);
  }

  /**
   * Opens the resource at {@code path} using the given scheme.
   *
   * @param scheme where to look the path up (classpath or file system)
   * @param path classpath resource name or file system path
   * @return an open stream; the caller is responsible for closing it
   * @throws IOException if the resource cannot be opened
   * @throws IllegalStateException if the scheme is not handled
   */
  public static final InputStream getResource(ResourceScheme scheme, String path) throws IOException {
    switch(scheme) {
      case CLASSPATH:
        return getClassResource(path);
      case FILE:
        return Files.newInputStream(Paths.get(path));
      default:
        throw new IllegalStateException("unknown resource scheme " + scheme);
    }
  }

  // util, reused by ConnectionCosts and CharacterDefinition
  /**
   * Opens the resource named {@code clazz.getSimpleName() + suffix}, resolved relative to
   * {@code clazz}.
   *
   * @param clazz class used both to name and to look up the resource
   * @param suffix suffix appended to the simple class name
   * @return an open stream; the caller is responsible for closing it
   * @throws FileNotFoundException if no such resource exists
   */
  public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
    final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
    if (is == null) {
      throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
    }
    return is;
  }

  /** Opens {@code path} through the class loader of {@code BinaryDictionary}. */
  private static InputStream getClassResource(String path) throws IOException {
    final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
    if (is == null) {
      throw new FileNotFoundException("Not in classpath: " + path);
    }
    return is;
  }

  /**
   * Points {@code ref} at the word ids registered for {@code sourceId}.
   *
   * <p>No copy is made: {@code ref.ints} is set to the internal target map array, with
   * {@code ref.offset} and {@code ref.length} delimiting the run for {@code sourceId}.
   *
   * @param sourceId index into the target map offsets
   * @param ref receives the array, offset and length of the run
   */
  public void lookupWordIds(int sourceId, IntsRef ref) {
    ref.ints = targetMap;
    ref.offset = targetMapOffsets[sourceId];
    // targetMapOffsets always has one more entry pointing behind last:
    ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
  }

  /** Returns the upper 13 bits of the entry's first 16-bit value. */
  @Override
  public int getLeftId(int wordId) {
    return (buffer.getShort(wordId) & 0xffff) >>> 3;
  }

  /** Returns the same 13-bit id as {@link #getLeftId(int)}; entries store a single id. */
  @Override
  public int getRightId(int wordId) {
    return (buffer.getShort(wordId) & 0xffff) >>> 3;
  }

  /** Returns the signed 16-bit cost stored right after the id/flags value. */
  @Override
  public int getWordCost(int wordId) {
    return buffer.getShort(wordId + 2);  // Skip id
  }

  /**
   * Returns the base form of the entry, or {@code null} when the entry carries no base form
   * data.
   *
   * <p>The base form is rebuilt from a prefix of the surface form plus the suffix chars stored
   * in the entry.
   */
  @Override
  public String getBaseForm(int wordId, char[] surfaceForm, int off, int len) {
    if (!hasBaseFormData(wordId)) {
      return null;
    }
    int offset = baseFormOffset(wordId);
    final int data = buffer.get(offset++) & 0xff;
    final int prefix = data >>> 4;  // chars taken from the start of the surface form
    final int suffix = data & 0xF;  // chars stored in the entry
    final char[] text = new char[prefix + suffix];
    System.arraycopy(surfaceForm, off, text, 0, prefix);
    for (int i = 0; i < suffix; i++) {
      text[prefix + i] = buffer.getChar(offset + (i << 1));
    }
    return new String(text);
  }

  /**
   * Returns the reading stored in the entry; when there is none, returns the surface form with
   * chars in the range U+3041..U+3096 shifted up by 0x60.
   */
  @Override
  public String getReading(int wordId, char[] surface, int off, int len) {
    if (hasReadingData(wordId)) {
      int offset = readingOffset(wordId);
      final int readingData = buffer.get(offset++) & 0xff;
      return readString(offset, readingData >>> 1, (readingData & 1) == 1);
    }
    // the reading is the surface form, with hiragana shifted to katakana
    final char[] text = new char[len];
    for (int i = 0; i < len; i++) {
      final char ch = surface[off + i];
      if (ch > 0x3040 && ch < 0x3097) {
        text[i] = (char) (ch + 0x60);
      } else {
        text[i] = ch;
      }
    }
    return new String(text);
  }

  /** Returns the part-of-speech string selected by the entry's left id. */
  @Override
  public String getPartOfSpeech(int wordId) {
    return posDict[getLeftId(wordId)];
  }

  /**
   * Returns the pronunciation stored in the entry; when there is none, returns the result of
   * {@link #getReading(int, char[], int, int)}.
   */
  @Override
  public String getPronunciation(int wordId, char[] surface, int off, int len) {
    if (hasPronunciationData(wordId)) {
      int offset = pronunciationOffset(wordId);
      final int pronunciationData = buffer.get(offset++) & 0xff;
      return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
    }
    return getReading(wordId, surface, off, len); // same as the reading
  }

  /** Returns the inflection type selected by the entry's left id; may be {@code null}. */
  @Override
  public String getInflectionType(int wordId) {
    return inflTypeDict[getLeftId(wordId)];
  }

  /** Returns the inflection form selected by the entry's left id; may be {@code null}. */
  @Override
  public String getInflectionForm(int wordId) {
    return inflFormDict[getLeftId(wordId)];
  }

  /** Offset of the first optional field: past the 2-byte id/flags and the 2-byte cost. */
  private static int baseFormOffset(int wordId) {
    return wordId + 4;
  }

  /** Offset of the reading data, i.e. just past the base form data when that is present. */
  private int readingOffset(int wordId) {
    int offset = baseFormOffset(wordId);
    if (hasBaseFormData(wordId)) {
      // only the suffix chars (low nibble) are stored, two bytes each
      final int baseFormLength = buffer.get(offset++) & 0xf;
      return offset + (baseFormLength << 1);
    }
    return offset;
  }

  /** Offset of the pronunciation data, i.e. just past the reading data when that is present. */
  private int pronunciationOffset(int wordId) {
    if (!hasReadingData(wordId)) {
      return readingOffset(wordId);
    }
    int offset = readingOffset(wordId);
    final int readingData = buffer.get(offset++) & 0xff;
    final int readingLength;
    if ((readingData & 1) == 0) {
      // UTF-16: (chars << 1) with the kana bit masked off equals chars * 2 bytes
      readingLength = readingData & 0xfe;
    } else {
      // kana: one byte per char
      readingLength = readingData >>> 1;
    }
    return offset + readingLength;
  }

  private boolean hasBaseFormData(int wordId) {
    return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
  }

  private boolean hasReadingData(int wordId) {
    return (buffer.getShort(wordId) & HAS_READING) != 0;
  }

  private boolean hasPronunciationData(int wordId) {
    return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
  }

  /**
   * Decodes {@code length} chars starting at {@code offset}.
   *
   * @param kana if true each char is stored as one byte relative to U+30A0, otherwise as a
   *             two-byte char
   */
  private String readString(int offset, int length, boolean kana) {
    final char[] text = new char[length];
    if (kana) {
      for (int i = 0; i < length; i++) {
        text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
      }
    } else {
      for (int i = 0; i < length; i++) {
        text[i] = buffer.getChar(offset + (i << 1));
      }
    }
    return new String(text);
  }

  /** flag that the entry has baseform data. otherwise it's not inflected (same as surface form) */
  public static final int HAS_BASEFORM = 1;
  /** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
  public static final int HAS_READING = 2;
  /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
  public static final int HAS_PRONUNCIATION = 4;
}