org.opensearch.index.analysis.Analysis
OpenSearch subproject :server
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.index.analysis;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.opensearch.common.settings.Settings;
import org.opensearch.core.common.Strings;
import org.opensearch.env.Environment;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import static java.util.Collections.unmodifiableMap;
/**
* Core analysis class
*
* @opensearch.internal
*/
public class Analysis {
private static final Logger LOGGER = LogManager.getLogger(Analysis.class);
// Regular expression used to recognize rules that begin with '#' followed by "=>" (e.g. "# => HASH"),
// so that hashtag mapping rules are not skipped as comments
private static final Pattern HASH_TAG_RULE_PATTERN = Pattern.compile("^\\s*#\\s*=>");
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
String value = settings.get("stem_exclusion");
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
}
List<String> stemExclusion = settings.getAsList("stem_exclusion", null);
if (stemExclusion != null) {
// LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
return new CharArraySet(stemExclusion, false);
} else {
return defaultStemExclusion;
}
}
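// Illustration (hypothetical settings values, not from this file): with
//   stem_exclusion: ["organization", "organizations"]
// this returns a CharArraySet containing those two terms; with stem_exclusion: "_none_" it returns
// CharArraySet.EMPTY_SET, and when the setting is absent the provided default set is used.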
public static final Map<String, Set<?>> NAMED_STOP_WORDS;
static {
Map<String, Set<?>> namedStopWords = new HashMap<>();
namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
namedStopWords.put("_bengali_", BengaliAnalyzer.getDefaultStopSet());
namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());
namedStopWords.put("_czech_", CzechAnalyzer.getDefaultStopSet());
namedStopWords.put("_danish_", DanishAnalyzer.getDefaultStopSet());
namedStopWords.put("_dutch_", DutchAnalyzer.getDefaultStopSet());
namedStopWords.put("_english_", EnglishAnalyzer.getDefaultStopSet());
namedStopWords.put("_estonian_", EstonianAnalyzer.getDefaultStopSet());
namedStopWords.put("_finnish_", FinnishAnalyzer.getDefaultStopSet());
namedStopWords.put("_french_", FrenchAnalyzer.getDefaultStopSet());
namedStopWords.put("_galician_", GalicianAnalyzer.getDefaultStopSet());
namedStopWords.put("_german_", GermanAnalyzer.getDefaultStopSet());
namedStopWords.put("_greek_", GreekAnalyzer.getDefaultStopSet());
namedStopWords.put("_hindi_", HindiAnalyzer.getDefaultStopSet());
namedStopWords.put("_hungarian_", HungarianAnalyzer.getDefaultStopSet());
namedStopWords.put("_indonesian_", IndonesianAnalyzer.getDefaultStopSet());
namedStopWords.put("_irish_", IrishAnalyzer.getDefaultStopSet());
namedStopWords.put("_italian_", ItalianAnalyzer.getDefaultStopSet());
namedStopWords.put("_latvian_", LatvianAnalyzer.getDefaultStopSet());
namedStopWords.put("_lithuanian_", LithuanianAnalyzer.getDefaultStopSet());
namedStopWords.put("_norwegian_", NorwegianAnalyzer.getDefaultStopSet());
namedStopWords.put("_persian_", PersianAnalyzer.getDefaultStopSet());
namedStopWords.put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet());
namedStopWords.put("_romanian_", RomanianAnalyzer.getDefaultStopSet());
namedStopWords.put("_russian_", RussianAnalyzer.getDefaultStopSet());
namedStopWords.put("_sorani_", SoraniAnalyzer.getDefaultStopSet());
namedStopWords.put("_spanish_", SpanishAnalyzer.getDefaultStopSet());
namedStopWords.put("_swedish_", SwedishAnalyzer.getDefaultStopSet());
namedStopWords.put("_thai_", ThaiAnalyzer.getDefaultStopSet());
namedStopWords.put("_turkish_", TurkishAnalyzer.getDefaultStopSet());
NAMED_STOP_WORDS = unmodifiableMap(namedStopWords);
}
public static CharArraySet parseWords(
Environment env,
Settings settings,
String name,
CharArraySet defaultWords,
Map<String, Set<?>> namedWords,
boolean ignoreCase
) {
String value = settings.get(name);
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return resolveNamedWords(settings.getAsList(name), namedWords, ignoreCase);
}
}
List<String> pathLoadedWords = parseWordList(env, settings, name, s -> s);
if (pathLoadedWords != null) {
return resolveNamedWords(pathLoadedWords, namedWords, ignoreCase);
}
return defaultWords;
}
public static CharArraySet parseCommonWords(Environment env, Settings settings, CharArraySet defaultCommonWords, boolean ignoreCase) {
return parseWords(env, settings, "common_words", defaultCommonWords, NAMED_STOP_WORDS, ignoreCase);
}
public static CharArraySet parseArticles(Environment env, Settings settings) {
boolean articlesCase = settings.getAsBoolean("articles_case", false);
return parseWords(env, settings, "articles", null, null, articlesCase);
}
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords) {
boolean stopwordsCase = settings.getAsBoolean("stopwords_case", false);
return parseStopWords(env, settings, defaultStopWords, stopwordsCase);
}
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, boolean ignoreCase) {
return parseWords(env, settings, "stopwords", defaultStopWords, NAMED_STOP_WORDS, ignoreCase);
}
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
if (namedWords == null) {
return new CharArraySet(words, ignoreCase);
}
CharArraySet setWords = new CharArraySet(words.size(), ignoreCase);
for (String word : words) {
if (namedWords.containsKey(word)) {
setWords.addAll(namedWords.get(word));
} else {
setWords.add(word);
}
}
return setWords;
}
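// Illustration (hypothetical settings values): parseStopWords with
//   stopwords: ["_english_", "foo"]
// expands "_english_" into EnglishAnalyzer.getDefaultStopSet() via NAMED_STOP_WORDS and keeps the
// literal term "foo"; stopwords: "_none_" yields CharArraySet.EMPTY_SET, and an unset key falls
// back to the supplied default stop words.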
public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix) {
List<String> wordList = parseWordList(env, settings, settingsPrefix, s -> s);
if (wordList == null) {
return null;
}
boolean ignoreCase = settings.getAsBoolean(settingsPrefix + "_case", false);
return new CharArraySet(wordList, ignoreCase);
}
public static <T> List<T> parseWordList(Environment env, Settings settings, String settingPrefix, CustomMappingRuleParser<T> parser) {
return parseWordList(env, settings, settingPrefix + "_path", settingPrefix, parser);
}
/**
* Parses a list of words from the specified settings or from a file, with the given parser.
*
* @throws IllegalArgumentException
* If the word list cannot be found at either key.
* @throws RuntimeException
* If there is an error while parsing the words.
*/
public static <T> List<T> parseWordList(
Environment env,
Settings settings,
String settingPath,
String settingList,
CustomMappingRuleParser<T> parser
) {
List<String> words = getWordList(env, settings, settingPath, settingList);
if (words == null) {
return null;
}
List<T> rules = new ArrayList<>();
int lineNum = 0;
for (String word : words) {
lineNum++;
if (word.startsWith("#") == false || HASH_TAG_RULE_PATTERN.matcher(word).find() == true) {
try {
rules.add(parser.apply(word));
} catch (RuntimeException ex) {
String wordListPath = settings.get(settingPath, null);
if (wordListPath == null || isUnderConfig(env, wordListPath)) {
throw new RuntimeException("Line [" + lineNum + "]: " + ex.getMessage());
} else {
LOGGER.error("Line [{}]: {}", lineNum, ex);
throw new RuntimeException("Line [" + lineNum + "]: " + "Invalid rule");
}
}
}
}
return rules;
}
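// Illustration (hypothetical file contents for a *_path word list):
//   # a plain comment line, skipped
//   # => HASH
//   i_pod => i pod
// The "# => HASH" line is kept because it matches HASH_TAG_RULE_PATTERN, the plain comment is
// skipped, and every retained line is passed through the supplied CustomMappingRuleParser.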
/**
* Fetches a list of words from the specified settings file. The list should either be available at the key
* specified by settingList or in a file specified by settingPath.
*
* @throws IllegalArgumentException
* If the word list cannot be found at either key.
*/
private static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
String wordListPath = settings.get(settingPath, null);
if (wordListPath == null) {
return settings.getAsList(settingList, null);
}
final Path path = resolveAnalyzerPath(env, wordListPath);
try {
return loadWordList(path);
} catch (CharacterCodingException ex) {
String message = String.format(
Locale.ROOT,
"Unsupported character encoding detected while reading %s: files must be UTF-8 encoded",
settingPath
);
LOGGER.error("{}: from file: {}, exception is: {}", message, path.toString(), ex);
throw new IllegalArgumentException(message);
} catch (IOException ioe) {
String message = String.format(Locale.ROOT, "IOException while reading %s: file not readable", settingPath);
LOGGER.error("{}, from file: {}, exception is: {}", message, path.toString(), ioe);
throw new IllegalArgumentException(message);
}
}
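// Note: the path-based setting takes precedence; the inline list is only consulted when no
// *_path setting is present (for example, a hypothetical stopwords_path would win over stopwords).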
private static List<String> loadWordList(Path path) throws IOException {
final List<String> result = new ArrayList<>();
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
String word;
while ((word = br.readLine()) != null) {
if (Strings.hasText(word) == false) {
continue;
}
result.add(word.trim());
}
}
return result;
}
/**
* @return null if no setting is configured for "settingPrefix".
* @throws IllegalArgumentException
* If the Reader cannot be instantiated.
*/
public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) {
String filePath = settings.get(settingPrefix, null);
if (filePath == null) {
return null;
}
final Path path = resolveAnalyzerPath(env, filePath);
try {
return Files.newBufferedReader(path, StandardCharsets.UTF_8);
} catch (CharacterCodingException ex) {
String message = String.format(
Locale.ROOT,
"Unsupported character encoding detected while reading %s_path: files must be UTF-8 encoded",
settingPrefix
);
LOGGER.error("{}: from file: {}, exception is: {}", message, path.toString(), ex);
throw new IllegalArgumentException(message);
} catch (IOException ioe) {
String message = String.format(Locale.ROOT, "IOException while reading %s_path: file not readable", settingPrefix);
LOGGER.error("{}, from file: {}, exception is: {}", message, path.toString(), ioe);
throw new IllegalArgumentException(message);
}
}
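// Illustration (hypothetical setting): with synonyms_path: "analysis/synonyms.txt" this returns a
// UTF-8 reader over the file resolved against the node's config directory; an absent setting
// returns null.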
public static Path resolveAnalyzerPath(Environment env, String wordListPath) {
return env.configFile().resolve(wordListPath).normalize();
}
private static boolean isUnderConfig(Environment env, String wordListPath) {
try {
final Path path = env.configFile().resolve(wordListPath).normalize();
return path.startsWith(env.configFile().toAbsolutePath());
} catch (Exception ex) {
return false;
}
}
}