org.elasticsearch.indices.analysis.PreBuiltTokenizers
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.indices.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.ngram.Lucene43EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.Lucene43NGramTokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.standard.std40.StandardTokenizer40;
import org.apache.lucene.analysis.standard.std40.UAX29URLEmailTokenizer40;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import java.util.Locale;
/**
 * Tokenizers that are pre-built into Elasticsearch. Each constant creates the
 * appropriate Lucene {@link Tokenizer} for a given {@link Version} and caches the
 * resulting {@link TokenizerFactory} according to its {@link CachingStrategy}.
 */
public enum PreBuiltTokenizers {
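    // Constants whose Lucene implementation changed across releases (STANDARD, UAX_URL_EMAIL,
    // NGRAM, EDGE_NGRAM) pick the tokenizer class based on the version passed in; the other
    // constants always return the same implementation.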
    STANDARD(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            if (version.luceneVersion.onOrAfter(org.apache.lucene.util.Version.LUCENE_4_7_0)) {
                return new StandardTokenizer();
            } else {
                return new StandardTokenizer40();
            }
        }
    },

    CLASSIC(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            return new ClassicTokenizer();
        }
    },

    UAX_URL_EMAIL(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            if (version.luceneVersion.onOrAfter(org.apache.lucene.util.Version.LUCENE_4_7_0)) {
                return new UAX29URLEmailTokenizer();
            } else {
                return new UAX29URLEmailTokenizer40();
            }
        }
    },
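
    // PATH_HIERARCHY, KEYWORD, PATTERN and THAI are cached with CachingStrategy.ONE,
    // i.e. a single factory instance is reused for every version.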
    PATH_HIERARCHY(CachingStrategy.ONE) {
        @Override
        protected Tokenizer create(Version version) {
            return new PathHierarchyTokenizer();
        }
    },

    KEYWORD(CachingStrategy.ONE) {
        @Override
        protected Tokenizer create(Version version) {
            return new KeywordTokenizer();
        }
    },

    LETTER(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            return new LetterTokenizer();
        }
    },

    LOWERCASE(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            return new LowerCaseTokenizer();
        }
    },

    WHITESPACE(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            return new WhitespaceTokenizer();
        }
    },

    NGRAM(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            // see NGramTokenizerFactory for an explanation of this logic:
            // 4.4 patch was used before 4.4 was released
            if (version.onOrAfter(org.elasticsearch.Version.V_0_90_2) &&
                    version.luceneVersion.onOrAfter(org.apache.lucene.util.Version.LUCENE_4_3)) {
                return new NGramTokenizer();
            } else {
                return new Lucene43NGramTokenizer();
            }
        }
    },

    EDGE_NGRAM(CachingStrategy.LUCENE) {
        @Override
        protected Tokenizer create(Version version) {
            // see EdgeNGramTokenizerFactory for an explanation of this logic:
            // 4.4 patch was used before 4.4 was released
            if (version.onOrAfter(org.elasticsearch.Version.V_0_90_2) &&
                    version.luceneVersion.onOrAfter(org.apache.lucene.util.Version.LUCENE_4_3)) {
                return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
            } else {
                return new Lucene43EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
            }
        }
    },

    PATTERN(CachingStrategy.ONE) {
        @Override
        protected Tokenizer create(Version version) {
            return new PatternTokenizer(Regex.compile("\\W+", null), -1);
        }
    },

    THAI(CachingStrategy.ONE) {
        @Override
        protected Tokenizer create(Version version) {
            return new ThaiTokenizer();
        }
    };
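
    // Each constant implements create() to build a fresh Tokenizer for the requested version.
    // The TokenizerFactory wrappers are cached per the CachingStrategy passed to the
    // constructor, so repeated lookups for the same version reuse one factory instance.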
    protected abstract Tokenizer create(Version version);

    protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;

    PreBuiltTokenizers(CachingStrategy cachingStrategy) {
        cache = PreBuiltCacheFactory.getCache(cachingStrategy);
    }
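
    // Lazily builds (and caches) an anonymous TokenizerFactory whose name() is the
    // lower-cased enum constant name and whose create() delegates back to this enum.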
    public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
        TokenizerFactory tokenizerFactory = cache.get(version);
        if (tokenizerFactory == null) {
            final String finalName = name();
            tokenizerFactory = new TokenizerFactory() {
                @Override
                public String name() {
                    return finalName.toLowerCase(Locale.ROOT);
                }

                @Override
                public Tokenizer create() {
                    return valueOf(finalName).create(version);
                }
            };
            cache.put(version, tokenizerFactory);
        }
        return tokenizerFactory;
    }
    /**
     * Get a pre-built Tokenizer by its name, falling back to the given default.
     *
     * @param name             tokenizer name (case-insensitive)
     * @param defaultTokenizer default tokenizer to return if the name is not found
     * @return the matching pre-built tokenizer, or {@code defaultTokenizer}
     */
    public static PreBuiltTokenizers getOrDefault(String name, PreBuiltTokenizers defaultTokenizer) {
        try {
            return valueOf(name.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            return defaultTokenizer;
        }
    }
}
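
A minimal usage sketch, assuming this class is on the classpath; the "whitespace" name and Version.CURRENT below are illustrative choices, not part of the listing above.

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

public class PreBuiltTokenizerUsage {
    public static void main(String[] args) {
        // Resolve the pre-built tokenizer by name, falling back to STANDARD for unknown names.
        PreBuiltTokenizers preBuilt = PreBuiltTokenizers.getOrDefault("whitespace", PreBuiltTokenizers.STANDARD);

        // The factory is cached per the constant's CachingStrategy and reused for this version.
        TokenizerFactory factory = preBuilt.getTokenizerFactory(Version.CURRENT);
        Tokenizer tokenizer = factory.create();

        // The Tokenizer is then wired to a Reader and consumed like any Lucene TokenStream.
        System.out.println(factory.name() + " -> " + tokenizer.getClass().getSimpleName());
    }
}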