All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser Maven / Gradle / Ivy

There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.queryparser.analyzing;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
 * are also passed through the given analyzer, but wildcard characters * and
 * ? don't get removed from the search terms.
 * 
 * 

Warning: This class should only be used with analyzers that do not use stopwords * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer * will turn Häuser into hau, but H?user will * become h?user when using this parser and thus no match would be found (i.e. * using this parser will be no improvement over QueryParser in such cases). */ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser { // gobble escaped chars or find a wildcard character private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); public AnalyzingQueryParser(String field, Analyzer analyzer) { super(field, analyzer); setAnalyzeRangeTerms(true); } /** * Called when parser parses an input term that contains one or more wildcard * characters (like *), but is not a prefix term (one that has * just a single * character at the end). *

* Example: will be called for H?user or for H*user. *

* Depending on analyzer and settings, a wildcard term may (most probably will) * be lower-cased automatically. It will go through the default Analyzer. *

* Overrides super class, by passing terms through analyzer. * * @param field Name of the field query will use. * @param termStr Term that contains one or more wildcard * characters (? or *), but is not simple prefix term * * @return Resulting {@link Query} built for the term */ @Override protected Query getWildcardQuery(String field, String termStr) throws ParseException { if ("*".equals(field)) { if ("*".equals(termStr)) return newMatchAllDocsQuery(); } if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?"))) throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"); Term t = new Term(field, analyzeWildcard(field, termStr)); return newWildcardQuery(t); } private BytesRef analyzeWildcard(String field, String termStr) { // best effort to not pass the wildcard characters and escaped characters through #normalize Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr); BytesRefBuilder sb = new BytesRefBuilder(); int last = 0; while (wildcardMatcher.find()){ if (wildcardMatcher.start() > 0) { String chunk = termStr.substring(last, wildcardMatcher.start()); BytesRef normalized = getAnalyzer().normalize(field, chunk); sb.append(normalized); } //append the matched group - without normalizing sb.append(new BytesRef(wildcardMatcher.group())); last = wildcardMatcher.end(); } if (last < termStr.length()){ String chunk = termStr.substring(last); BytesRef normalized = getAnalyzer().normalize(field, chunk); sb.append(normalized); } return sb.toBytesRef(); } /** * Called when parser parses an input term * that uses prefix notation; that is, contains a single '*' wildcard * character as its last character. Since this is a special case * of generic wildcard term, and such a query can be optimized easily, * this usually results in a different query object. *

* Depending on analyzer and settings, a prefix term may (most probably will) * be lower-cased automatically. It will go through the default Analyzer. *

* Overrides super class, by passing terms through analyzer. * * @param field Name of the field query will use. * @param termStr Term to use for building term for the query * (without trailing '*' character!) * * @return Resulting {@link Query} built for the term */ @Override protected Query getPrefixQuery(String field, String termStr) throws ParseException { if (!getAllowLeadingWildcard() && termStr.startsWith("*")) throw new ParseException("'*' not allowed as first character in PrefixQuery"); if (getLowercaseExpandedTerms()) { termStr = termStr.toLowerCase(getLocale()); } BytesRef term = getAnalyzer().normalize(field, termStr); Term t = new Term(field, term); return newPrefixQuery(t); } /** * Called when parser parses an input term that has the fuzzy suffix (~) appended. *

* Depending on analyzer and settings, a fuzzy term may (most probably will) * be lower-cased automatically. It will go through the default Analyzer. *

* Overrides super class, by passing terms through analyzer. * * @param field Name of the field query will use. * @param termStr Term to use for building term for the query * * @return Resulting {@link Query} built for the term */ @Override protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { BytesRef term = getAnalyzer().normalize(field, termStr); Term t = new Term(field, term); return newFuzzyQuery(t, minSimilarity, getFuzzyPrefixLength()); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy