org.apache.lucene.analysis.util.AbstractAnalysisFactory Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
/**
 * Abstract parent class for analysis factories {@link TokenizerFactory},
 * {@link TokenFilterFactory} and {@link CharFilterFactory}.
 * <p>
 * The typical lifecycle for a factory consumer is:
 * <ol>
 *   <li>Create factory via its constructor (or via XXXFactory.forName)
 *   <li>(Optional) If the factory uses resources such as files,
 *       {@link ResourceLoaderAware#inform(ResourceLoader)} is called to initialize those resources.
 *   <li>Consumer calls create() to obtain instances.
 * </ol>
 */
public abstract class AbstractAnalysisFactory {
/** Name of the argument whose value is parsed as the Lucene match version. */
public static final String LUCENE_MATCH_VERSION_PARAM = "luceneMatchVersion";
/** The original args, before any processing */
private final Map<String,String> originalArgs;
/** the luceneVersion arg */
protected final Version luceneMatchVersion;
/** whether the luceneMatchVersion arg is explicitly specified in the serialized schema */
private boolean isExplicitLuceneMatchVersion = false;
/**
 * Initialize this factory via a set of key-value pairs.
 * <p>
 * An unmodifiable copy of {@code args} is stored before anything is consumed. The
 * {@link #LUCENE_MATCH_VERSION_PARAM} entry and the class-name entry are then removed
 * from {@code args}. When no match version is supplied, {@link Version#LATEST} is used.
 *
 * @param args the factory arguments; entries consumed here are removed from this map
 * @throws IllegalArgumentException if the match version string cannot be parsed
 */
protected AbstractAnalysisFactory(Map<String,String> args) {
  // Snapshot first, so the copy still contains the entries removed below.
  originalArgs = Collections.unmodifiableMap(new HashMap<>(args));
  String version = get(args, LUCENE_MATCH_VERSION_PARAM);
  if (version == null) {
    luceneMatchVersion = Version.LATEST;
  } else {
    try {
      luceneMatchVersion = Version.parseLeniently(version);
    } catch (ParseException pe) {
      throw new IllegalArgumentException(pe);
    }
  }
  args.remove(CLASS_NAME); // consume the class arg
}
/**
 * Returns the unmodifiable copy of the arguments that was taken at construction time,
 * before any of them were consumed.
 */
public final Map<String,String> getOriginalArgs() {
  return originalArgs;
}
/** Returns the Lucene match version that was resolved when this factory was constructed. */
public final Version getLuceneMatchVersion() {
  return luceneMatchVersion;
}
/**
 * Removes the named argument from {@code args} and returns its value.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the non-null value that was mapped to {@code name}
 * @throws IllegalArgumentException if {@code args} has no value for {@code name}
 */
public String require(Map<String,String> args, String name) {
  String s = args.remove(name);
  if (s == null) {
    throw new IllegalArgumentException("Configuration Error: missing parameter '" + name + "'");
  }
  return s;
}
/**
 * Removes the named argument from {@code args} and returns its value, which must
 * match one of {@code allowedValues} case-sensitively.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param allowedValues the permitted values
 * @return the value that was mapped to {@code name}
 * @throws IllegalArgumentException if the argument is missing or not an allowed value
 */
public String require(Map<String,String> args, String name, Collection<String> allowedValues) {
  return require(args, name, allowedValues, true);
}
/**
 * Removes the named argument from {@code args} and returns its value, which must
 * match one of {@code allowedValues}.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param allowedValues the permitted values
 * @param caseSensitive whether the comparison against {@code allowedValues} is case-sensitive
 * @return the value exactly as supplied in {@code args} (not the matching allowed value)
 * @throws IllegalArgumentException if the argument is missing or not an allowed value
 */
public String require(Map<String,String> args, String name, Collection<String> allowedValues, boolean caseSensitive) {
  String s = args.remove(name);
  if (s == null) {
    throw new IllegalArgumentException("Configuration Error: missing parameter '" + name + "'");
  }
  for (String allowedValue : allowedValues) {
    boolean matches = caseSensitive ? s.equals(allowedValue) : s.equalsIgnoreCase(allowedValue);
    if (matches) {
      return s;
    }
  }
  throw new IllegalArgumentException("Configuration Error: '" + name + "' value must be one of " + allowedValues);
}
/**
 * Removes the named argument from {@code args} and returns its value.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the value that was mapped to {@code name}, or {@code null} if there was none
 */
public String get(Map<String,String> args, String name) {
  return args.remove(name); // defaultVal = null
}
/**
 * Removes the named argument from {@code args} and returns its value, falling back
 * to {@code defaultVal} when the argument is absent.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param defaultVal the value to return when {@code args} has no value for {@code name}
 * @return the mapped value, or {@code defaultVal}
 */
public String get(Map<String,String> args, String name, String defaultVal) {
  String s = args.remove(name);
  return s == null ? defaultVal : s;
}
/**
 * Removes the named argument from {@code args} and returns its value, which must
 * match one of {@code allowedValues} case-sensitively; returns {@code null} when
 * the argument is absent.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param allowedValues the permitted values
 * @return the mapped value, or {@code null} if there was none
 * @throws IllegalArgumentException if a value is present but not allowed
 */
public String get(Map<String,String> args, String name, Collection<String> allowedValues) {
  return get(args, name, allowedValues, null); // defaultVal = null
}
/**
 * Removes the named argument from {@code args} and returns its value, which must
 * match one of {@code allowedValues} case-sensitively; returns {@code defaultVal}
 * when the argument is absent.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param allowedValues the permitted values
 * @param defaultVal the value to return when the argument is absent
 * @return the mapped value, or {@code defaultVal}
 * @throws IllegalArgumentException if a value is present but not allowed
 */
public String get(Map<String,String> args, String name, Collection<String> allowedValues, String defaultVal) {
  return get(args, name, allowedValues, defaultVal, true);
}
/**
 * Removes the named argument from {@code args} and returns its value, which must
 * match one of {@code allowedValues}; returns {@code defaultVal} when the argument
 * is absent. The default is returned as-is and is not checked against
 * {@code allowedValues}.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param allowedValues the permitted values
 * @param defaultVal the value to return when the argument is absent
 * @param caseSensitive whether the comparison against {@code allowedValues} is case-sensitive
 * @return the value exactly as supplied in {@code args}, or {@code defaultVal}
 * @throws IllegalArgumentException if a value is present but not allowed
 */
public String get(Map<String,String> args, String name, Collection<String> allowedValues, String defaultVal, boolean caseSensitive) {
  String s = args.remove(name);
  if (s == null) {
    return defaultVal;
  }
  for (String allowedValue : allowedValues) {
    boolean matches = caseSensitive ? s.equals(allowedValue) : s.equalsIgnoreCase(allowedValue);
    if (matches) {
      return s;
    }
  }
  throw new IllegalArgumentException("Configuration Error: '" + name + "' value must be one of " + allowedValues);
}
/**
 * Removes the named argument from {@code args} and parses it as an {@code int}.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the parsed value
 * @throws IllegalArgumentException if the argument is missing
 * @throws NumberFormatException if the value is not a parsable integer
 */
protected final int requireInt(Map<String,String> args, String name) {
  return Integer.parseInt(require(args, name));
}
/**
 * Removes the named argument from {@code args} and parses it as an {@code int},
 * falling back to {@code defaultVal} when the argument is absent.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param defaultVal the value to return when the argument is absent
 * @return the parsed value, or {@code defaultVal}
 * @throws NumberFormatException if a value is present but is not a parsable integer
 */
protected final int getInt(Map<String,String> args, String name, int defaultVal) {
  String s = args.remove(name);
  return s == null ? defaultVal : Integer.parseInt(s);
}
/**
 * Removes the named argument from {@code args} and interprets it with
 * {@link Boolean#parseBoolean(String)}, so any value other than "true"
 * (ignoring case) yields {@code false}.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the parsed value
 * @throws IllegalArgumentException if the argument is missing
 */
protected final boolean requireBoolean(Map<String,String> args, String name) {
  return Boolean.parseBoolean(require(args, name));
}
/**
 * Removes the named argument from {@code args} and interprets it with
 * {@link Boolean#parseBoolean(String)}, falling back to {@code defaultVal} when the
 * argument is absent. Any present value other than "true" (ignoring case) yields
 * {@code false}.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param defaultVal the value to return when the argument is absent
 * @return the parsed value, or {@code defaultVal}
 */
protected final boolean getBoolean(Map<String,String> args, String name, boolean defaultVal) {
  String s = args.remove(name);
  return s == null ? defaultVal : Boolean.parseBoolean(s);
}
/**
 * Removes the named argument from {@code args} and parses it as a {@code float}.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the parsed value
 * @throws IllegalArgumentException if the argument is missing
 * @throws NumberFormatException if the value is not a parsable float
 */
protected final float requireFloat(Map<String,String> args, String name) {
  return Float.parseFloat(require(args, name));
}
/**
 * Removes the named argument from {@code args} and parses it as a {@code float},
 * falling back to {@code defaultVal} when the argument is absent.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param defaultVal the value to return when the argument is absent
 * @return the parsed value, or {@code defaultVal}
 * @throws NumberFormatException if a value is present but is not a parsable float
 */
protected final float getFloat(Map<String,String> args, String name, float defaultVal) {
  String s = args.remove(name);
  return s == null ? defaultVal : Float.parseFloat(s);
}
/**
 * Removes the named argument from {@code args} and returns it as a single character.
 * <p>
 * The value must be exactly one character long, matching the validation performed by
 * {@code getChar}. An empty value would otherwise fail with a
 * {@link StringIndexOutOfBoundsException}, and a longer value would be silently truncated.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the single character supplied for {@code name}
 * @throws IllegalArgumentException if the argument is missing or is not exactly one character
 */
public char requireChar(Map<String,String> args, String name) {
  String s = require(args, name);
  if (s.length() != 1) {
    throw new IllegalArgumentException(name + " should be a char. \"" + s + "\" is invalid");
  }
  return s.charAt(0);
}
/**
 * Removes the named argument from {@code args} and returns it as a single character,
 * falling back to {@code defaultValue} when the argument is absent.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @param defaultValue the value to return when the argument is absent
 * @return the single character supplied for {@code name}, or {@code defaultValue}
 * @throws IllegalArgumentException if a value is present but is not exactly one character
 */
public char getChar(Map<String,String> args, String name, char defaultValue) {
  String s = args.remove(name);
  if (s == null) {
    return defaultValue;
  }
  if (s.length() != 1) {
    throw new IllegalArgumentException(name + " should be a char. \"" + s + "\" is invalid");
  }
  return s.charAt(0);
}
/** Matches one item: a maximal run of characters that are neither commas nor whitespace. */
private static final Pattern ITEM_PATTERN = Pattern.compile("[^,\\s]+");
/**
 * Returns whitespace- and/or comma-separated set of values, or null if none are found.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key
 * @return the distinct items found in the value, or {@code null} when the argument is
 *         absent or its value contains no items
 */
public Set<String> getSet(Map<String,String> args, String name) {
  String s = args.remove(name);
  if (s == null) {
    return null;
  }
  Set<String> set = null;
  Matcher matcher = ITEM_PATTERN.matcher(s);
  while (matcher.find()) {
    // Allocate lazily so that a value with no items still yields null.
    if (set == null) {
      set = new HashSet<>();
    }
    set.add(matcher.group(0));
  }
  return set;
}
/**
 * Compiles a pattern for the value of the specified argument key name.
 *
 * @param args the argument map; the entry for {@code name} is removed
 * @param name the argument key whose value is the regular expression
 * @return the compiled pattern
 * @throws IllegalArgumentException if the argument is missing, or if its value is not a
 *         valid regular expression (the {@link PatternSyntaxException} is kept as the cause)
 */
protected final Pattern getPattern(Map<String,String> args, String name) {
  try {
    return Pattern.compile(require(args, name));
  } catch (PatternSyntaxException e) {
    throw new IllegalArgumentException
      ("Configuration Error: '" + name + "' can not be parsed in " +
       this.getClass().getSimpleName(), e);
  }
}
/**
 * Returns as {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames.
 *
 * @param loader used to open each named resource
 * @param wordFiles the file names, split with {@code splitFileNames}
 * @param ignoreCase passed to the resulting set and to {@code StopFilter.makeStopSet}
 * @return the combined word set, or {@code null} when {@code wordFiles} yields no file names
 * @throws IOException if a resource cannot be read
 */
protected final CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  List<String> files = splitFileNames(wordFiles);
  if (files.isEmpty()) {
    return null;
  }
  // default stopwords list has 35 or so words, but maybe don't make it that
  // big to start
  CharArraySet words = new CharArraySet(files.size() * 10, ignoreCase);
  for (String file : files) {
    List<String> wlist = getLines(loader, file.trim());
    words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
  }
  return words;
}
/**
 * Returns the resource's lines (with content treated as UTF-8).
 *
 * @param loader used to open the resource
 * @param resource the name of the resource to read
 * @return the lines produced by {@code WordlistLoader.getLines}
 * @throws IOException if the resource cannot be opened or read
 */
protected final List<String> getLines(ResourceLoader loader, String resource) throws IOException {
  // NOTE(review): the opened stream is not closed here; presumably
  // WordlistLoader.getLines closes it - confirm.
  return WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
}
/**
 * Same as {@link #getWordSet(ResourceLoader, String, boolean)},
 * except the input is in snowball format.
 * <p>
 * Each file is decoded as UTF-8 with malformed or unmappable input reported as an
 * error rather than replaced.
 *
 * @param loader used to open each named resource
 * @param wordFiles the file names, split with {@code splitFileNames}
 * @param ignoreCase passed to the resulting set
 * @return the combined word set, or {@code null} when {@code wordFiles} yields no file names
 * @throws IOException if a resource cannot be opened or read
 */
protected final CharArraySet getSnowballWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  List<String> files = splitFileNames(wordFiles);
  if (files.isEmpty()) {
    return null;
  }
  // default stopwords list has 35 or so words, but maybe don't make it that
  // big to start
  CharArraySet words = new CharArraySet(files.size() * 10, ignoreCase);
  for (String file : files) {
    InputStream stream = null;
    Reader reader = null;
    try {
      stream = loader.openResource(file.trim());
      CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
          .onMalformedInput(CodingErrorAction.REPORT)
          .onUnmappableCharacter(CodingErrorAction.REPORT);
      reader = new InputStreamReader(stream, decoder);
      WordlistLoader.getSnowballWordSet(reader, words);
    } finally {
      // Deliberately kept instead of try-with-resources, which would change how
      // close failures surface.
      IOUtils.closeWhileHandlingException(reader, stream);
    }
  }
  return words;
}
/**
* Splits file names separated by comma character.
* File names can contain comma characters escaped by backslash '\'
*
* @param fileNames the string containing file names
* @return a list of file names with the escaping backslashed removed
*/
protected final List splitFileNames(String fileNames) {
if (fileNames == null)
return Collections.emptyList();
List result = new ArrayList<>();
for (String file : fileNames.split("(?
© 2015 - 2025 Weber Informatics LLC | Privacy Policy