org.apache.lucene.analysis.ja.util.UnknownDictionaryBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-kuromoji Show documentation
Apache Lucene (module: kuromoji)
There is a newer version: 9.12.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.util;

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;

class UnknownDictionaryBuilder {
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

  private final String encoding;

  UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }

  public UnknownDictionaryWriter build(Path dir) throws IOException {
    UnknownDictionaryWriter unkDictionary =
        readDictionaryFile(dir.resolve("unk.def")); // Should be only one file
    readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
    return unkDictionary;
  }

  private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
    return readDictionaryFile(path, encoding);
  }

  private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
      throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

    List lines = new ArrayList<>();
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
        LineNumberReader lineReader = new LineNumberReader(reader)) {

      dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));

      String line;
      while ((line = lineReader.readLine()) != null) {
        // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading
        // and pronunciation,
        // even though the unknown dictionary returns hardcoded null here.
        final String[] parsed =
            CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
        lines.add(parsed);
      }
    }

    lines.sort(
        Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));

    for (String[] entry : lines) {
      dictionary.put(entry);
    }

    return dictionary;
  }

  private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary)
      throws IOException {
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
        LineNumberReader lineReader = new LineNumberReader(reader)) {

      String line;
      while ((line = lineReader.readLine()) != null) {
        line = line.replaceAll("^\\s", "");
        line = line.replaceAll("\\s*#.*", "");
        line = line.replaceAll("\\s+", " ");

        // Skip empty line or comment line
        if (line.length() == 0) {
          continue;
        }

        if (line.startsWith("0x")) { // Category mapping
          String[] values = line.split(" ", 2); // Split only first space

          if (!values[0].contains("..")) {
            int cp = Integer.decode(values[0]);
            dictionary.putCharacterCategory(cp, values[1]);
          } else {
            String[] codePoints = values[0].split("\\.\\.");
            int cpFrom = Integer.decode(codePoints[0]);
            int cpTo = Integer.decode(codePoints[1]);

            for (int i = cpFrom; i <= cpTo; i++) {
              dictionary.putCharacterCategory(i, values[1]);
            }
          }
        } else { // Invoke definition
          String[] values = line.split(" "); // Consecutive space is merged above
          String characterClassName = values[0];
          int invoke = Integer.parseInt(values[1]);
          int group = Integer.parseInt(values[2]);
          int length = Integer.parseInt(values[3]);
          dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
        }
      }
    }
  }
}