org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queryparser.analyzing;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/**
* Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
* are also passed through the given analyzer, but wildcard characters *
and
* ?
don't get removed from the search terms.
*
* Warning: This class should only be used with analyzers that do not use stopwords
* or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer
* will turn Häuser
into hau
, but H?user
will
* become h?user
when using this parser and thus no match would be found (i.e.
* using this parser will be no improvement over QueryParser in such cases).
*/
public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
// gobble escaped chars or find a wildcard character
private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
public AnalyzingQueryParser(String field, Analyzer analyzer) {
super(field, analyzer);
setAnalyzeRangeTerms(true);
}
/**
* Called when parser parses an input term that contains one or more wildcard
* characters (like *
), but is not a prefix term (one that has
* just a single *
character at the end).
*
* Example: will be called for H?user
or for H*user
.
*
* Depending on analyzer and settings, a wildcard term may (most probably will)
* be lower-cased automatically. It will go through the default Analyzer.
*
* Overrides super class, by passing terms through analyzer.
*
* @param field Name of the field query will use.
* @param termStr Term that contains one or more wildcard
* characters (? or *), but is not simple prefix term
*
* @return Resulting {@link Query} built for the term
*/
@Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
if ("*".equals(field)) {
if ("*".equals(termStr)) return newMatchAllDocsQuery();
}
if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?")))
throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
Term t = new Term(field, analyzeWildcard(field, termStr));
return newWildcardQuery(t);
}
private BytesRef analyzeWildcard(String field, String termStr) {
// best effort to not pass the wildcard characters and escaped characters through #normalize
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr);
BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;
while (wildcardMatcher.find()){
if (wildcardMatcher.start() > 0) {
String chunk = termStr.substring(last, wildcardMatcher.start());
BytesRef normalized = getAnalyzer().normalize(field, chunk);
sb.append(normalized);
}
//append the matched group - without normalizing
sb.append(new BytesRef(wildcardMatcher.group()));
last = wildcardMatcher.end();
}
if (last < termStr.length()){
String chunk = termStr.substring(last);
BytesRef normalized = getAnalyzer().normalize(field, chunk);
sb.append(normalized);
}
return sb.toBytesRef();
}
/**
* Called when parser parses an input term
* that uses prefix notation; that is, contains a single '*' wildcard
* character as its last character. Since this is a special case
* of generic wildcard term, and such a query can be optimized easily,
* this usually results in a different query object.
*
* Depending on analyzer and settings, a prefix term may (most probably will)
* be lower-cased automatically. It will go through the default Analyzer.
*
* Overrides super class, by passing terms through analyzer.
*
* @param field Name of the field query will use.
* @param termStr Term to use for building term for the query
* (without trailing '*' character!)
*
* @return Resulting {@link Query} built for the term
*/
@Override
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
if (!getAllowLeadingWildcard() && termStr.startsWith("*"))
throw new ParseException("'*' not allowed as first character in PrefixQuery");
if (getLowercaseExpandedTerms()) {
termStr = termStr.toLowerCase(getLocale());
}
BytesRef term = getAnalyzer().normalize(field, termStr);
Term t = new Term(field, term);
return newPrefixQuery(t);
}
/**
* Called when parser parses an input term that has the fuzzy suffix (~) appended.
*
* Depending on analyzer and settings, a fuzzy term may (most probably will)
* be lower-cased automatically. It will go through the default Analyzer.
*
* Overrides super class, by passing terms through analyzer.
*
* @param field Name of the field query will use.
* @param termStr Term to use for building term for the query
*
* @return Resulting {@link Query} built for the term
*/
@Override
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
throws ParseException {
BytesRef term = getAnalyzer().normalize(field, termStr);
Term t = new Term(field, term);
return newFuzzyQuery(t, minSimilarity, getFuzzyPrefixLength());
}
}