All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.indices.analysis.AnalysisFactoryTestCase Maven / Gradle / Ivy

There is a newer version: 8.16.0
Show newest version
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.indices.analysis;

import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;

import static java.util.Collections.emptyMap;
import static java.util.Map.entry;

/**
 * Alerts us if new analysis components are added to Lucene, so we don't miss them.
 * 

* If we don't want to expose one for a specific reason, just map it to Void. * The deprecated ones can be mapped to Deprecated.class. */ public abstract class AnalysisFactoryTestCase extends ESTestCase { private static final Map> KNOWN_TOKENIZERS = Map.ofEntries( // exposed in ES entry("classic", MovedToAnalysisCommon.class), entry("edgengram", MovedToAnalysisCommon.class), entry("keyword", MovedToAnalysisCommon.class), entry("letter", MovedToAnalysisCommon.class), entry("ngram", MovedToAnalysisCommon.class), entry("pathhierarchy", MovedToAnalysisCommon.class), entry("pattern", MovedToAnalysisCommon.class), entry("simplepattern", MovedToAnalysisCommon.class), entry("simplepatternsplit", MovedToAnalysisCommon.class), entry("standard", StandardTokenizerFactory.class), entry("thai", MovedToAnalysisCommon.class), entry("uax29urlemail", MovedToAnalysisCommon.class), entry("whitespace", MovedToAnalysisCommon.class), // this one "seems to mess up offsets". probably shouldn't be a tokenizer... entry("wikipedia", Void.class) ); static final Map> KNOWN_TOKENFILTERS = Map.ofEntries( // exposed in ES entry("apostrophe", MovedToAnalysisCommon.class), entry("arabicnormalization", MovedToAnalysisCommon.class), entry("arabicstem", MovedToAnalysisCommon.class), entry("asciifolding", MovedToAnalysisCommon.class), entry("bengalinormalization", MovedToAnalysisCommon.class), entry("bengalistem", MovedToAnalysisCommon.class), entry("brazilianstem", MovedToAnalysisCommon.class), entry("bulgarianstem", MovedToAnalysisCommon.class), entry("cjkbigram", MovedToAnalysisCommon.class), entry("cjkwidth", MovedToAnalysisCommon.class), entry("classic", MovedToAnalysisCommon.class), entry("commongrams", MovedToAnalysisCommon.class), entry("commongramsquery", MovedToAnalysisCommon.class), entry("czechstem", MovedToAnalysisCommon.class), entry("decimaldigit", MovedToAnalysisCommon.class), entry("delimitedpayload", MovedToAnalysisCommon.class), entry("dictionarycompoundword", MovedToAnalysisCommon.class), entry("edgengram", MovedToAnalysisCommon.class), entry("elision", MovedToAnalysisCommon.class), entry("englishminimalstem", MovedToAnalysisCommon.class), entry("englishpossessive", MovedToAnalysisCommon.class), entry("finnishlightstem", MovedToAnalysisCommon.class), entry("fixedshingle", MovedToAnalysisCommon.class), entry("frenchlightstem", MovedToAnalysisCommon.class), entry("frenchminimalstem", MovedToAnalysisCommon.class), entry("galicianminimalstem", MovedToAnalysisCommon.class), entry("galicianstem", MovedToAnalysisCommon.class), entry("germanstem", MovedToAnalysisCommon.class), entry("germanlightstem", MovedToAnalysisCommon.class), entry("germanminimalstem", MovedToAnalysisCommon.class), entry("germannormalization", MovedToAnalysisCommon.class), entry("greeklowercase", MovedToAnalysisCommon.class), entry("greekstem", MovedToAnalysisCommon.class), entry("hindinormalization", MovedToAnalysisCommon.class), entry("hindistem", MovedToAnalysisCommon.class), entry("hungarianlightstem", MovedToAnalysisCommon.class), entry("hunspellstem", HunspellTokenFilterFactory.class), entry("hyphenationcompoundword", MovedToAnalysisCommon.class), entry("indicnormalization", MovedToAnalysisCommon.class), entry("irishlowercase", MovedToAnalysisCommon.class), entry("indonesianstem", MovedToAnalysisCommon.class), entry("italianlightstem", MovedToAnalysisCommon.class), entry("keepword", MovedToAnalysisCommon.class), entry("keywordmarker", MovedToAnalysisCommon.class), entry("kstem", MovedToAnalysisCommon.class), entry("latvianstem", MovedToAnalysisCommon.class), entry("length", MovedToAnalysisCommon.class), entry("limittokencount", MovedToAnalysisCommon.class), entry("lowercase", MovedToAnalysisCommon.class), entry("ngram", MovedToAnalysisCommon.class), entry("norwegianlightstem", MovedToAnalysisCommon.class), entry("norwegianminimalstem", MovedToAnalysisCommon.class), entry("norwegiannormalization", MovedToAnalysisCommon.class), entry("patterncapturegroup", MovedToAnalysisCommon.class), entry("patternreplace", MovedToAnalysisCommon.class), entry("persiannormalization", MovedToAnalysisCommon.class), entry("porterstem", MovedToAnalysisCommon.class), entry("portuguesestem", MovedToAnalysisCommon.class), entry("portugueselightstem", MovedToAnalysisCommon.class), entry("portugueseminimalstem", MovedToAnalysisCommon.class), entry("reversestring", MovedToAnalysisCommon.class), entry("russianlightstem", MovedToAnalysisCommon.class), entry("scandinavianfolding", MovedToAnalysisCommon.class), entry("scandinaviannormalization", MovedToAnalysisCommon.class), entry("serbiannormalization", MovedToAnalysisCommon.class), entry("shingle", ShingleTokenFilterFactory.class), entry("minhash", MovedToAnalysisCommon.class), entry("snowballporter", MovedToAnalysisCommon.class), entry("soraninormalization", MovedToAnalysisCommon.class), entry("soranistem", MovedToAnalysisCommon.class), entry("spanishlightstem", MovedToAnalysisCommon.class), entry("stemmeroverride", MovedToAnalysisCommon.class), entry("stop", StopTokenFilterFactory.class), entry("swedishlightstem", MovedToAnalysisCommon.class), entry("swedishminimalstem", MovedToAnalysisCommon.class), entry("synonym", MovedToAnalysisCommon.class), entry("synonymgraph", MovedToAnalysisCommon.class), entry("telugunormalization", MovedToAnalysisCommon.class), entry("telugustem", MovedToAnalysisCommon.class), entry("trim", MovedToAnalysisCommon.class), entry("truncate", MovedToAnalysisCommon.class), entry("turkishlowercase", MovedToAnalysisCommon.class), entry("type", MovedToAnalysisCommon.class), entry("uppercase", MovedToAnalysisCommon.class), entry("worddelimiter", MovedToAnalysisCommon.class), entry("worddelimitergraph", MovedToAnalysisCommon.class), entry("flattengraph", MovedToAnalysisCommon.class), // TODO: these tokenfilters are not yet exposed: useful? // suggest stop entry("suggeststop", Void.class), // capitalizes tokens entry("capitalization", Void.class), // like length filter (but codepoints) entry("codepointcount", Void.class), // puts hyphenated words back together entry("hyphenatedwords", Void.class), // repeats anything marked as keyword entry("keywordrepeat", Void.class), // like limittokencount, but by offset entry("limittokenoffset", Void.class), // like limittokencount, but by position entry("limittokenposition", Void.class), // ??? entry("numericpayload", Void.class), // removes duplicates at the same position (this should be used by the existing factory) entry("removeduplicates", Void.class), // ??? entry("tokenoffsetpayload", Void.class), // puts the type into the payload entry("typeaspayload", Void.class), // puts the type as a synonym entry("typeassynonym", Void.class), // fingerprint entry("fingerprint", Void.class), // for tee-sinks entry("daterecognizer", Void.class), // for token filters that generate bad offsets, which are now rejected since Lucene 7 entry("fixbrokenoffsets", Void.class), // should we expose it, or maybe think about higher level integration of the // fake term frequency feature (LUCENE-7854) entry("delimitedtermfrequency", Void.class), // LUCENE-8273: ProtectedTermFilterFactory allows analysis chains to skip // particular token filters based on the attributes of the current token. entry("protectedterm", Void.class), // LUCENE-8332 entry("concatenategraph", Void.class), // LUCENE-8936 entry("spanishminimalstem", Void.class), entry("delimitedboost", Void.class), // LUCENE-9574 entry("dropifflagged", Void.class), entry("japanesecompletion", Void.class), // LUCENE-9575 entry("patterntyping", Void.class), // LUCENE-10248 entry("spanishpluralstem", Void.class), // LUCENE-10352 entry("daitchmokotoffsoundex", Void.class), entry("persianstem", Void.class), // not exposed entry("word2vecsynonym", Void.class) ); static final Map> KNOWN_CHARFILTERS = Map.of( "htmlstrip", MovedToAnalysisCommon.class, "mapping", MovedToAnalysisCommon.class, "patternreplace", MovedToAnalysisCommon.class, // TODO: these charfilters are not yet exposed: useful? // handling of zwnj for persian "persian", Void.class, // LUCENE-9413 : it might useful for dictionary-based CJK analyzers "cjkwidth", Void.class ); /** * The plugin being tested. Core uses an "empty" plugin so we don't have to throw null checks all over the place. */ private final AnalysisPlugin plugin; public AnalysisFactoryTestCase(AnalysisPlugin plugin) { this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core"); } protected Map> getCharFilters() { return KNOWN_CHARFILTERS; } protected Map> getTokenFilters() { return KNOWN_TOKENFILTERS; } protected Map> getTokenizers() { return KNOWN_TOKENIZERS; } /** * Map containing pre-configured token filters that should be available * after installing this plugin. The map is from the name of the token * filter to the class of the Lucene {@link TokenFilterFactory} that it * is emulating. If the Lucene {@linkplain TokenFilterFactory} is * {@code null} then the test will look it up for you from the name. If * there is no Lucene {@linkplain TokenFilterFactory} then the right * hand side should be {@link Void}. */ protected Map> getPreConfiguredTokenFilters() { Map> filters = new HashMap<>(); filters.put("lowercase", null); // for old indices filters.put("standard", Void.class); return filters; } /** * Map containing pre-configured tokenizers that should be available * after installing this plugin. The map is from the name of the token * filter to the class of the Lucene {@link TokenizerFactory} that it * is emulating. If the Lucene {@linkplain TokenizerFactory} is * {@code null} then the test will look it up for you from the name. * If there is no Lucene {@linkplain TokenizerFactory} then the right * hand side should be {@link Void}. */ protected Map> getPreConfiguredTokenizers() { Map> tokenizers = new HashMap<>(); // TODO drop this temporary shim when all the old style tokenizers have been migrated to new style for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) { tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), null); } return tokenizers; } public Map> getPreConfiguredCharFilters() { return emptyMap(); } public void testTokenizers() { Set missing = new TreeSet(); missing.addAll( org.apache.lucene.analysis.TokenizerFactory.availableTokenizers() .stream() .map(key -> key.toLowerCase(Locale.ROOT)) .collect(Collectors.toSet()) ); missing.removeAll(getTokenizers().keySet()); assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty()); } public void testCharFilters() { Set missing = new TreeSet(); missing.addAll( org.apache.lucene.analysis.CharFilterFactory.availableCharFilters() .stream() .map(key -> key.toLowerCase(Locale.ROOT)) .collect(Collectors.toSet()) ); missing.removeAll(getCharFilters().keySet()); assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty()); } public void testTokenFilters() { Set missing = new TreeSet<>(); missing.addAll( org.apache.lucene.analysis.TokenFilterFactory.availableTokenFilters() .stream() .map(key -> key.toLowerCase(Locale.ROOT)) .collect(Collectors.toSet()) ); missing.removeAll(getTokenFilters().keySet()); assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing, missing.isEmpty()); } /** * Marker class for components that have moved to the analysis-common modules. This will be * removed when the module is complete and these analysis components aren't available to core. */ protected static final class MovedToAnalysisCommon { private MovedToAnalysisCommon() {} } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy