org.apache.lucene.analysis.ja.util.UnknownDictionaryBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-kuromoji Show documentation
Lucene Kuromoji Japanese Morphological Analyzer
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.util;


import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.analysis.ja.dict.CharacterDefinition;

class UnknownDictionaryBuilder {
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

  private final String encoding;

  UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }

  public UnknownDictionaryWriter build(Path dir) throws IOException {
    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def"));  //Should be only one file
    readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
    return unkDictionary;
  }

  private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
    return readDictionaryFile(path, encoding);
  }

  private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

    List lines = new ArrayList<>();
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
         LineNumberReader lineReader = new LineNumberReader(reader)) {

      dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));

      String line;
      while ((line = lineReader.readLine()) != null) {
        // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
        // even though the unknown dictionary returns hardcoded null here.
        final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
        lines.add(parsed);
      }
    }

    lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));

    for (String[] entry : lines) {
      dictionary.put(entry);
    }

    return dictionary;
  }

  private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException {
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
         LineNumberReader lineReader = new LineNumberReader(reader)) {

      String line;
      while ((line = lineReader.readLine()) != null) {
        line = line.replaceAll("^\\s", "");
        line = line.replaceAll("\\s*#.*", "");
        line = line.replaceAll("\\s+", " ");

        // Skip empty line or comment line
        if (line.length() == 0) {
          continue;
        }

        if (line.startsWith("0x")) {  // Category mapping
          String[] values = line.split(" ", 2);  // Split only first space

          if (!values[0].contains("..")) {
            int cp = Integer.decode(values[0]);
            dictionary.putCharacterCategory(cp, values[1]);
          } else {
            String[] codePoints = values[0].split("\\.\\.");
            int cpFrom = Integer.decode(codePoints[0]);
            int cpTo = Integer.decode(codePoints[1]);

            for (int i = cpFrom; i <= cpTo; i++) {
              dictionary.putCharacterCategory(i, values[1]);
            }
          }
        } else {  // Invoke definition
          String[] values = line.split(" "); // Consecutive space is merged above
          String characterClassName = values[0];
          int invoke = Integer.parseInt(values[1]);
          int group = Integer.parseInt(values[2]);
          int length = Integer.parseInt(values[3]);
          dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
        }
      }
    }
  }
}