
// org.opensearch.index.similarity.SimilarityProviders Maven / Gradle / Ivy
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.index.similarity;
import org.apache.lucene.search.similarities.AfterEffect;
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModel;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelIn;
import org.apache.lucene.search.similarities.BasicModelIne;
import org.apache.lucene.search.similarities.BooleanSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.Distribution;
import org.apache.lucene.search.similarities.DistributionLL;
import org.apache.lucene.search.similarities.DistributionSPL;
import org.apache.lucene.search.similarities.IBSimilarity;
import org.apache.lucene.search.similarities.Independence;
import org.apache.lucene.search.similarities.IndependenceChiSquared;
import org.apache.lucene.search.similarities.IndependenceSaturated;
import org.apache.lucene.search.similarities.IndependenceStandardized;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.Lambda;
import org.apache.lucene.search.similarities.LambdaDF;
import org.apache.lucene.search.similarities.LambdaTTF;
import org.apache.lucene.search.similarities.Normalization;
import org.apache.lucene.search.similarities.NormalizationH1;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3;
import org.apache.lucene.search.similarities.NormalizationZ;
import org.apache.lucene.search.similarity.LegacyBM25Similarity;
import org.opensearch.LegacyESVersion;
import org.opensearch.Version;
import org.opensearch.common.logging.DeprecationLogger;
import org.opensearch.common.settings.Settings;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import static java.util.Collections.unmodifiableMap;
/**
 * Static factory methods that turn a similarity's {@link Settings} into the matching Lucene
 * similarity instance.
 *
 * <p>Each {@code create*} method first checks that the settings contain only keys the similarity
 * understands (see {@link #assertSettingsIsSubsetOf}) and then builds the similarity from the
 * recognised keys. Component names (basic model, after effect, independence measure, distribution,
 * lambda) are resolved through the lookup tables declared below.
 */
final class SimilarityProviders {

    private SimilarityProviders() {} // no instantiation

    private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(SimilarityProviders.class);

    /** Setting key read by the BM25 and classic factories and forwarded to {@code setDiscountOverlaps}. */
    static final String DISCOUNT_OVERLAPS = "discount_overlaps";

    /** Supported DFR basic models, keyed by their setting value. */
    private static final Map<String, BasicModel> BASIC_MODELS;
    /** Removed DFR basic model names mapped to the key in {@link #BASIC_MODELS} that replaces them. */
    private static final Map<String, String> LEGACY_BASIC_MODELS;
    /** Supported DFR after effects, keyed by their setting value. */
    private static final Map<String, AfterEffect> AFTER_EFFECTS;
    /** Removed DFR after effect names mapped to the key in {@link #AFTER_EFFECTS} that replaces them. */
    private static final Map<String, String> LEGACY_AFTER_EFFECTS;

    static {
        Map<String, BasicModel> models = new HashMap<>();
        models.put("g", new BasicModelG());
        models.put("if", new BasicModelIF());
        models.put("in", new BasicModelIn());
        models.put("ine", new BasicModelIne());
        BASIC_MODELS = unmodifiableMap(models);

        Map<String, String> legacyModels = new HashMap<>();
        // TODO: be and g are both based on the bose-einstein model.
        // Is there a better replacement for d and p which use the binomial model?
        legacyModels.put("be", "g");
        legacyModels.put("d", "ine");
        legacyModels.put("p", "ine");
        LEGACY_BASIC_MODELS = unmodifiableMap(legacyModels);

        Map<String, AfterEffect> effects = new HashMap<>();
        effects.put("b", new AfterEffectB());
        effects.put("l", new AfterEffectL());
        AFTER_EFFECTS = unmodifiableMap(effects);

        Map<String, String> legacyEffects = new HashMap<>();
        // l is simpler than b, so this should be a better replacement for "no"
        legacyEffects.put("no", "l");
        LEGACY_AFTER_EFFECTS = unmodifiableMap(legacyEffects);
    }

    /** Supported DFI independence measures, keyed by their setting value. */
    private static final Map<String, Independence> INDEPENDENCE_MEASURES;

    static {
        Map<String, Independence> measures = new HashMap<>();
        measures.put("standardized", new IndependenceStandardized());
        measures.put("saturated", new IndependenceSaturated());
        measures.put("chisquared", new IndependenceChiSquared());
        INDEPENDENCE_MEASURES = unmodifiableMap(measures);
    }

    /** Supported IB distributions, keyed by their setting value. */
    private static final Map<String, Distribution> DISTRIBUTIONS;
    /** Supported IB lambdas, keyed by their setting value. */
    private static final Map<String, Lambda> LAMBDAS;

    static {
        Map<String, Distribution> distributions = new HashMap<>();
        distributions.put("ll", new DistributionLL());
        distributions.put("spl", new DistributionSPL());
        DISTRIBUTIONS = unmodifiableMap(distributions);

        Map<String, Lambda> lambdas = new HashMap<>();
        lambdas.put("df", new LambdaDF());
        lambdas.put("ttf", new LambdaTTF());
        LAMBDAS = unmodifiableMap(lambdas);
    }

    /**
     * Parses the given Settings and creates the appropriate {@link BasicModel}.
     *
     * <p>A name found only in the legacy table is rejected for indices created on or after
     * {@code LegacyESVersion.V_7_0_0}; for older indices a deprecation is logged and the
     * replacement model is returned instead.
     *
     * @param indexCreatedVersion version the index was created with; selects between rejecting
     *                            and replacing a legacy model name
     * @param settings            Settings to parse; the {@code basic_model} key is read
     * @return {@link BasicModel} referred to in the Settings
     * @throws IllegalArgumentException if the model name is unknown, or is a legacy name on an
     *                                  index created on or after 7.0.0
     */
    private static BasicModel parseBasicModel(Version indexCreatedVersion, Settings settings) {
        String basicModel = settings.get("basic_model");
        BasicModel model = BASIC_MODELS.get(basicModel);
        if (model == null) {
            String replacement = LEGACY_BASIC_MODELS.get(basicModel);
            if (replacement != null) {
                if (indexCreatedVersion.onOrAfter(LegacyESVersion.V_7_0_0)) {
                    throw new IllegalArgumentException(
                        "Basic model [" + basicModel + "] isn't supported anymore, please use another model."
                    );
                } else {
                    deprecationLogger.deprecate(
                        basicModel + "_similarity_model_replaced",
                        "Basic model ["
                            + basicModel
                            + "] isn't supported anymore and has arbitrarily been replaced with ["
                            + replacement
                            + "]."
                    );
                    model = BASIC_MODELS.get(replacement);
                    // every legacy entry must point at a key that exists in BASIC_MODELS
                    assert model != null;
                }
            }
        }
        if (model == null) {
            throw new IllegalArgumentException("Unsupported BasicModel [" + basicModel + "], expected one of " + BASIC_MODELS.keySet());
        }
        return model;
    }

    /**
     * Parses the given Settings and creates the appropriate {@link AfterEffect}.
     *
     * <p>A name found only in the legacy table is rejected for indices created on or after
     * {@code LegacyESVersion.V_7_0_0}; for older indices a deprecation is logged and the
     * replacement effect is returned instead.
     *
     * @param indexCreatedVersion version the index was created with; selects between rejecting
     *                            and replacing a legacy effect name
     * @param settings            Settings to parse; the {@code after_effect} key is read
     * @return {@link AfterEffect} referred to in the Settings
     * @throws IllegalArgumentException if the effect name is unknown, or is a legacy name on an
     *                                  index created on or after 7.0.0
     */
    private static AfterEffect parseAfterEffect(Version indexCreatedVersion, Settings settings) {
        String afterEffect = settings.get("after_effect");
        AfterEffect effect = AFTER_EFFECTS.get(afterEffect);
        if (effect == null) {
            String replacement = LEGACY_AFTER_EFFECTS.get(afterEffect);
            if (replacement != null) {
                if (indexCreatedVersion.onOrAfter(LegacyESVersion.V_7_0_0)) {
                    throw new IllegalArgumentException(
                        "After effect [" + afterEffect + "] isn't supported anymore, please use another effect."
                    );
                } else {
                    deprecationLogger.deprecate(
                        afterEffect + "_after_effect_replaced",
                        "After effect ["
                            + afterEffect
                            + "] isn't supported anymore and has arbitrarily been replaced with ["
                            + replacement
                            + "]."
                    );
                    effect = AFTER_EFFECTS.get(replacement);
                    // every legacy entry must point at a key that exists in AFTER_EFFECTS
                    assert effect != null;
                }
            }
        }
        if (effect == null) {
            throw new IllegalArgumentException("Unsupported AfterEffect [" + afterEffect + "], expected one of " + AFTER_EFFECTS.keySet());
        }
        return effect;
    }

    /**
     * Parses the given Settings and creates the appropriate {@link Normalization}.
     *
     * <p>The {@code normalization} key selects the implementation ({@code no}, {@code h1},
     * {@code h2}, {@code h3} or {@code z}); the parameterised variants read their parameter from
     * {@code normalization.<name>.c} (or {@code normalization.z.z}) with the fallback value
     * passed to {@code getAsFloat} below.
     *
     * @param settings Settings to parse
     * @return {@link Normalization} referred to in the Settings
     * @throws IllegalArgumentException if the normalization name is missing or not recognised
     */
    private static Normalization parseNormalization(Settings settings) {
        String normalization = settings.get("normalization");
        // Constant-first equals keeps a missing (null) value on the "unsupported" path below.
        if ("no".equals(normalization)) {
            return new Normalization.NoNormalization();
        } else if ("h1".equals(normalization)) {
            float c = settings.getAsFloat("normalization.h1.c", 1f);
            return new NormalizationH1(c);
        } else if ("h2".equals(normalization)) {
            float c = settings.getAsFloat("normalization.h2.c", 1f);
            return new NormalizationH2(c);
        } else if ("h3".equals(normalization)) {
            float c = settings.getAsFloat("normalization.h3.c", 800f);
            return new NormalizationH3(c);
        } else if ("z".equals(normalization)) {
            float z = settings.getAsFloat("normalization.z.z", 0.30f);
            return new NormalizationZ(z);
        } else {
            throw new IllegalArgumentException("Unsupported Normalization [" + normalization + "]");
        }
    }

    /**
     * Parses the given Settings and looks up the appropriate {@link Independence} measure.
     *
     * @param settings Settings to parse; the {@code independence_measure} key is read
     * @return {@link Independence} referred to in the Settings
     * @throws IllegalArgumentException if the measure name is missing or not recognised
     */
    private static Independence parseIndependence(Settings settings) {
        String name = settings.get("independence_measure");
        Independence measure = INDEPENDENCE_MEASURES.get(name);
        if (measure == null) {
            throw new IllegalArgumentException(
                "Unsupported IndependenceMeasure [" + name + "], expected one of " + INDEPENDENCE_MEASURES.keySet()
            );
        }
        return measure;
    }

    /**
     * Parses the given Settings and looks up the appropriate {@link Distribution}.
     *
     * @param settings Settings to parse; the {@code distribution} key is read
     * @return {@link Distribution} referred to in the Settings
     * @throws IllegalArgumentException if the distribution name is missing or not recognised
     */
    private static Distribution parseDistribution(Settings settings) {
        String rawDistribution = settings.get("distribution");
        Distribution distribution = DISTRIBUTIONS.get(rawDistribution);
        if (distribution == null) {
            throw new IllegalArgumentException("Unsupported Distribution [" + rawDistribution + "]");
        }
        return distribution;
    }

    /**
     * Parses the given Settings and looks up the appropriate {@link Lambda}.
     *
     * @param settings Settings to parse; the {@code lambda} key is read
     * @return {@link Lambda} referred to in the Settings
     * @throws IllegalArgumentException if the lambda name is missing or not recognised
     */
    private static Lambda parseLambda(Settings settings) {
        String rawLambda = settings.get("lambda");
        Lambda lambda = LAMBDAS.get(rawLambda);
        if (lambda == null) {
            throw new IllegalArgumentException("Unsupported Lambda [" + rawLambda + "]");
        }
        return lambda;
    }

    /**
     * Verifies that {@code settings} contains no keys other than {@code supportedSettings} and
     * {@code type}.
     *
     * <p>Unknown keys cause an {@link IllegalArgumentException} when {@code version} is on or
     * after {@code LegacyESVersion.V_7_0_0}; for earlier versions they are only reported through
     * the deprecation logger.
     *
     * @param type              similarity type name, used in the message only
     * @param version           version that selects between throwing and logging a deprecation
     * @param settings          settings of the similarity being built
     * @param supportedSettings keys the similarity accepts
     * @throws IllegalArgumentException if unknown keys are present and {@code version} is on or
     *                                  after 7.0.0
     */
    static void assertSettingsIsSubsetOf(String type, Version version, Settings settings, String... supportedSettings) {
        // Work on a copy so the key set of the Settings object itself is never modified.
        Set<String> unknownSettings = new HashSet<>(settings.keySet());
        unknownSettings.removeAll(Arrays.asList(supportedSettings));
        unknownSettings.remove("type"); // used to figure out which sim this is
        if (unknownSettings.isEmpty() == false) {
            if (version.onOrAfter(LegacyESVersion.V_7_0_0)) {
                throw new IllegalArgumentException("Unknown settings for similarity of type [" + type + "]: " + unknownSettings);
            } else {
                deprecationLogger.deprecate(
                    "unknown_similarity_setting",
                    "Unknown settings for similarity of type [" + type + "]: " + unknownSettings
                );
            }
        }
    }

    /**
     * Creates a {@link LegacyBM25Similarity} from the {@code k1}, {@code b} and
     * {@code discount_overlaps} settings.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return the configured similarity
     * @throws IllegalArgumentException if unknown settings are present on an index created on or
     *                                  after 7.0.0
     */
    public static LegacyBM25Similarity createBM25Similarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf("BM25", indexCreatedVersion, settings, "k1", "b", DISCOUNT_OVERLAPS);

        float k1 = settings.getAsFloat("k1", 1.2f);
        float b = settings.getAsFloat("b", 0.75f);
        boolean discountOverlaps = settings.getAsBoolean(DISCOUNT_OVERLAPS, true);

        LegacyBM25Similarity similarity = new LegacyBM25Similarity(k1, b);
        similarity.setDiscountOverlaps(discountOverlaps);
        return similarity;
    }

    /**
     * Creates a {@link BooleanSimilarity}; this similarity accepts no settings besides
     * {@code type}.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return a new {@link BooleanSimilarity}
     * @throws IllegalArgumentException if unknown settings are present on an index created on or
     *                                  after 7.0.0
     */
    public static BooleanSimilarity createBooleanSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf("boolean", indexCreatedVersion, settings);
        return new BooleanSimilarity();
    }

    /**
     * Creates a {@link ClassicSimilarity} from the {@code discount_overlaps} setting.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return the configured similarity
     * @throws IllegalArgumentException if unknown settings are present on an index created on or
     *                                  after 7.0.0
     */
    public static ClassicSimilarity createClassicSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf("classic", indexCreatedVersion, settings, DISCOUNT_OVERLAPS);

        boolean discountOverlaps = settings.getAsBoolean(DISCOUNT_OVERLAPS, true);

        ClassicSimilarity similarity = new ClassicSimilarity();
        similarity.setDiscountOverlaps(discountOverlaps);
        return similarity;
    }

    /**
     * Creates a {@link DFRSimilarity} from the {@code basic_model}, {@code after_effect} and
     * {@code normalization} settings.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with; used for settings validation
     *                            and for the handling of legacy model/effect names
     * @return the configured similarity
     * @throws IllegalArgumentException if a component name is unsupported, or unknown settings are
     *                                  present on an index created on or after 7.0.0
     */
    public static DFRSimilarity createDfrSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf(
            "DFR",
            indexCreatedVersion,
            settings,
            "basic_model",
            "after_effect",
            "normalization",
            "normalization.h1.c",
            "normalization.h2.c",
            "normalization.h3.c",
            "normalization.z.z"
        );

        return new DFRSimilarity(
            parseBasicModel(indexCreatedVersion, settings),
            parseAfterEffect(indexCreatedVersion, settings),
            parseNormalization(settings)
        );
    }

    /**
     * Creates a {@link DFISimilarity} from the {@code independence_measure} setting.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return the configured similarity
     * @throws IllegalArgumentException if the independence measure is unsupported, or unknown
     *                                  settings are present on an index created on or after 7.0.0
     */
    public static DFISimilarity createDfiSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf("DFI", indexCreatedVersion, settings, "independence_measure");

        return new DFISimilarity(parseIndependence(settings));
    }

    /**
     * Creates an {@link IBSimilarity} from the {@code distribution}, {@code lambda} and
     * {@code normalization} settings.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return the configured similarity
     * @throws IllegalArgumentException if a component name is unsupported, or unknown settings are
     *                                  present on an index created on or after 7.0.0
     */
    public static IBSimilarity createIBSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf(
            "IB",
            indexCreatedVersion,
            settings,
            "distribution",
            "lambda",
            "normalization",
            "normalization.h1.c",
            "normalization.h2.c",
            "normalization.h3.c",
            "normalization.z.z"
        );

        return new IBSimilarity(parseDistribution(settings), parseLambda(settings), parseNormalization(settings));
    }

    /**
     * Creates an {@link LMDirichletSimilarity} from the {@code mu} setting.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return the configured similarity
     * @throws IllegalArgumentException if unknown settings are present on an index created on or
     *                                  after 7.0.0
     */
    public static LMDirichletSimilarity createLMDirichletSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf("LMDirichlet", indexCreatedVersion, settings, "mu");

        float mu = settings.getAsFloat("mu", 2000f);
        return new LMDirichletSimilarity(mu);
    }

    /**
     * Creates an {@link LMJelinekMercerSimilarity} from the {@code lambda} setting.
     *
     * @param settings            similarity settings
     * @param indexCreatedVersion version the index was created with, used for settings validation
     * @return the configured similarity
     * @throws IllegalArgumentException if unknown settings are present on an index created on or
     *                                  after 7.0.0
     */
    public static LMJelinekMercerSimilarity createLMJelinekMercerSimilarity(Settings settings, Version indexCreatedVersion) {
        assertSettingsIsSubsetOf("LMJelinekMercer", indexCreatedVersion, settings, "lambda");

        float lambda = settings.getAsFloat("lambda", 0.1f);
        return new LMJelinekMercerSimilarity(lambda);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy