org.opensearch.indices.analysis.HunspellService Maven / Gradle / Ivy
Show all versions of opensearch Show documentation
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.indices.analysis;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.opensearch.OpenSearchException;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Setting.Property;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.util.io.IOUtils;
import org.opensearch.core.util.FileSystemUtils;
import org.opensearch.env.Environment;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
/**
* Serves as a node level registry for hunspell dictionaries. This services expects all dictionaries to be located under
* the {@code /hunspell} directory, where each locale has its dedicated sub-directory which holds the dictionary
* files. For example, the dictionary files for {@code en_US} locale must be placed under {@code /hunspell/en_US}
* directory.
*
* The following settings can be set for each dictionary:
*
* - {@code ignore_case} - If true, dictionary matching will be case insensitive (defaults to {@code false})
* - {@code strict_affix_parsing} - Determines whether errors while reading a affix rules file will cause exception or simple be ignored
* (defaults to {@code true})
*
*
* These settings can either be configured as node level configuration, such as:
*
*
* indices.analysis.hunspell.dictionary.en_US.ignore_case: true
* indices.analysis.hunspell.dictionary.en_US.strict_affix_parsing: false
*
*
* or, as dedicated configuration per dictionary, placed in a {@code settings.yml} file under the dictionary directory. For
* example, the following can be the content of the {@code /hunspell/en_US/settings.yml} file:
*
*
* ignore_case: true
* strict_affix_parsing: false
*
*
* @see org.opensearch.index.analysis.HunspellTokenFilterFactory
*
* @opensearch.internal
*/
public class HunspellService {
private static final Logger logger = LogManager.getLogger(HunspellService.class);
public static final Setting HUNSPELL_LAZY_LOAD = Setting.boolSetting(
"indices.analysis.hunspell.dictionary.lazy",
Boolean.FALSE,
Property.NodeScope
);
public static final Setting HUNSPELL_IGNORE_CASE = Setting.boolSetting(
"indices.analysis.hunspell.dictionary.ignore_case",
Boolean.FALSE,
Property.NodeScope
);
public static final Setting HUNSPELL_DICTIONARY_OPTIONS = Setting.groupSetting(
"indices.analysis.hunspell.dictionary.",
Property.NodeScope
);
private final ConcurrentHashMap dictionaries = new ConcurrentHashMap<>();
private final Map knownDictionaries;
private final boolean defaultIgnoreCase;
private final Path hunspellDir;
private final Function loadingFunction;
public HunspellService(final Settings settings, final Environment env, final Map knownDictionaries)
throws IOException {
this.knownDictionaries = Collections.unmodifiableMap(knownDictionaries);
this.hunspellDir = resolveHunspellDirectory(env);
this.defaultIgnoreCase = HUNSPELL_IGNORE_CASE.get(settings);
this.loadingFunction = (locale) -> {
try {
return loadDictionary(locale, settings, env);
} catch (Exception e) {
logger.error("Failed to load hunspell dictionary for locale: " + locale, e);
throw new IllegalStateException("Failed to load hunspell dictionary for locale: " + locale);
}
};
if (!HUNSPELL_LAZY_LOAD.get(settings)) {
scanAndLoadDictionaries();
}
}
/**
* Returns the hunspell dictionary for the given locale.
*
* @param locale The name of the locale
*/
public Dictionary getDictionary(String locale) {
Dictionary dictionary = knownDictionaries.get(locale);
if (dictionary == null) {
dictionary = dictionaries.computeIfAbsent(locale, loadingFunction);
}
return dictionary;
}
private Path resolveHunspellDirectory(Environment env) {
return env.configFile().resolve("hunspell");
}
/**
* Scans the hunspell directory and loads all found dictionaries
*/
private void scanAndLoadDictionaries() throws IOException {
if (Files.isDirectory(hunspellDir)) {
try (DirectoryStream stream = Files.newDirectoryStream(hunspellDir)) {
for (Path file : stream) {
if (Files.isDirectory(file)) {
try (DirectoryStream inner = Files.newDirectoryStream(hunspellDir.resolve(file), "*.dic")) {
if (inner.iterator().hasNext()) { // just making sure it's indeed a dictionary dir
try {
getDictionary(file.getFileName().toString());
} catch (Exception e) {
// The cache loader throws unchecked exception (see #loadDictionary()),
// here we simply report the exception and continue loading the dictionaries
logger.error(
() -> new ParameterizedMessage("exception while loading dictionary {}", file.getFileName()),
e
);
}
}
}
}
}
}
}
}
/**
* Loads the hunspell dictionary for the given local.
*
* @param locale The locale of the hunspell dictionary to be loaded.
* @param nodeSettings The node level settings
* @param env The node environment (from which the conf path will be resolved)
* @return The loaded Hunspell dictionary
* @throws Exception when loading fails (due to IO errors or malformed dictionary files)
*/
private Dictionary loadDictionary(String locale, Settings nodeSettings, Environment env) throws Exception {
if (logger.isDebugEnabled()) {
logger.debug("Loading hunspell dictionary [{}]...", locale);
}
Path dicDir = hunspellDir.resolve(locale);
if (FileSystemUtils.isAccessibleDirectory(dicDir, logger) == false) {
throw new OpenSearchException(String.format(Locale.ROOT, "Could not find hunspell dictionary [%s]", locale));
}
// merging node settings with hunspell dictionary specific settings
Settings dictSettings = HUNSPELL_DICTIONARY_OPTIONS.get(nodeSettings);
nodeSettings = loadDictionarySettings(dicDir, dictSettings.getByPrefix(locale + "."));
boolean ignoreCase = nodeSettings.getAsBoolean("ignore_case", defaultIgnoreCase);
Path[] affixFiles = FileSystemUtils.files(dicDir, "*.aff");
if (affixFiles.length == 0) {
throw new OpenSearchException(String.format(Locale.ROOT, "Missing affix file for hunspell dictionary [%s]", locale));
}
if (affixFiles.length != 1) {
throw new OpenSearchException(String.format(Locale.ROOT, "Too many affix files exist for hunspell dictionary [%s]", locale));
}
InputStream affixStream = null;
Path[] dicFiles = FileSystemUtils.files(dicDir, "*.dic");
List dicStreams = new ArrayList<>(dicFiles.length);
try {
for (int i = 0; i < dicFiles.length; i++) {
dicStreams.add(Files.newInputStream(dicFiles[i]));
}
affixStream = Files.newInputStream(affixFiles[0]);
try (Directory tmp = new NIOFSDirectory(env.tmpFile())) {
return new Dictionary(tmp, "hunspell", affixStream, dicStreams, ignoreCase);
}
} catch (Exception e) {
logger.error(() -> new ParameterizedMessage("Could not load hunspell dictionary [{}]", locale), e);
throw e;
} finally {
IOUtils.close(affixStream);
IOUtils.close(dicStreams);
}
}
/**
* Each hunspell dictionary directory may contain a {@code settings.yml} which holds dictionary specific settings. Default
* values for these settings are defined in the given default settings.
*
* @param dir The directory of the dictionary
* @param defaults The default settings for this dictionary
* @return The resolved settings.
*/
private static Settings loadDictionarySettings(Path dir, Settings defaults) throws IOException {
Path file = dir.resolve("settings.yml");
if (Files.exists(file)) {
return Settings.builder().loadFromPath(file).put(defaults).build();
}
file = dir.resolve("settings.json");
if (Files.exists(file)) {
return Settings.builder().loadFromPath(file).put(defaults).build();
}
return defaults;
}
}