All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser Maven / Gradle / Ivy

package org.apache.lucene.queryparser.analyzing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
 * are also passed through the given analyzer, but wildcard characters * and
 * ? don't get removed from the search terms.
 * 
 * 

Warning: This class should only be used with analyzers that do not use stopwords * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer * will turn Häuser into hau, but H?user will * become h?user when using this parser and thus no match would be found (i.e. * using this parser will be no improvement over QueryParser in such cases). */ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser { // gobble escaped chars or find a wildcard character private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)"); public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) { super(matchVersion, field, analyzer); setAnalyzeRangeTerms(true); } /** * Called when parser parses an input term that contains one or more wildcard * characters (like *), but is not a prefix term (one that has * just a single * character at the end). *

* Example: will be called for H?user or for H*user. *

* Depending on analyzer and settings, a wildcard term may (most probably will) * be lower-cased automatically. It will go through the default Analyzer. *

* Overrides super class, by passing terms through analyzer. * * @param field Name of the field query will use. * @param termStr Term that contains one or more wildcard * characters (? or *), but is not simple prefix term * * @return Resulting {@link Query} built for the term */ @Override protected Query getWildcardQuery(String field, String termStr) throws ParseException { if (termStr == null){ //can't imagine this would ever happen throw new ParseException("Passed null value as term to getWildcardQuery"); } if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) { throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery" + " unless getAllowLeadingWildcard() returns true"); } Matcher wildcardMatcher = wildcardPattern.matcher(termStr); StringBuilder sb = new StringBuilder(); int last = 0; while (wildcardMatcher.find()){ // continue if escaped char if (wildcardMatcher.group(1) != null){ continue; } if (wildcardMatcher.start() > 0){ String chunk = termStr.substring(last, wildcardMatcher.start()); String analyzed = analyzeSingleChunk(field, termStr, chunk); sb.append(analyzed); } //append the wildcard character sb.append(wildcardMatcher.group(2)); last = wildcardMatcher.end(); } if (last < termStr.length()){ sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last))); } return super.getWildcardQuery(field, sb.toString()); } /** * Called when parser parses an input term * that uses prefix notation; that is, contains a single '*' wildcard * character as its last character. Since this is a special case * of generic wildcard term, and such a query can be optimized easily, * this usually results in a different query object. *

* Depending on analyzer and settings, a prefix term may (most probably will) * be lower-cased automatically. It will go through the default Analyzer. *

* Overrides super class, by passing terms through analyzer. * * @param field Name of the field query will use. * @param termStr Term to use for building term for the query * (without trailing '*' character!) * * @return Resulting {@link Query} built for the term */ @Override protected Query getPrefixQuery(String field, String termStr) throws ParseException { String analyzed = analyzeSingleChunk(field, termStr, termStr); return super.getPrefixQuery(field, analyzed); } /** * Called when parser parses an input term that has the fuzzy suffix (~) appended. *

* Depending on analyzer and settings, a fuzzy term may (most probably will) * be lower-cased automatically. It will go through the default Analyzer. *

* Overrides super class, by passing terms through analyzer. * * @param field Name of the field query will use. * @param termStr Term to use for building term for the query * * @return Resulting {@link Query} built for the term */ @Override protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { String analyzed = analyzeSingleChunk(field, termStr, termStr); return super.getFuzzyQuery(field, analyzed, minSimilarity); } /** * Returns the analyzed form for the given chunk * * If the analyzer produces more than one output token from the given chunk, * a ParseException is thrown. * * @param field The target field * @param termStr The full term from which the given chunk is excerpted * @param chunk The portion of the given termStr to be analyzed * @return The result of analyzing the given chunk * @throws ParseException when analysis returns other than one output token */ protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{ String analyzed = null; TokenStream stream = null; try { stream = getAnalyzer().tokenStream(field, chunk); stream.reset(); CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); // get first and hopefully only output token if (stream.incrementToken()) { analyzed = termAtt.toString(); // try to increment again, there should only be one output token StringBuilder multipleOutputs = null; while (stream.incrementToken()) { if (null == multipleOutputs) { multipleOutputs = new StringBuilder(); multipleOutputs.append('"'); multipleOutputs.append(analyzed); multipleOutputs.append('"'); } multipleOutputs.append(','); multipleOutputs.append('"'); multipleOutputs.append(termAtt.toString()); multipleOutputs.append('"'); } stream.end(); if (null != multipleOutputs) { throw new ParseException( String.format(getLocale(), "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString())); } } else { // nothing returned by analyzer. 
Was it a stop word and the user accidentally // used an analyzer with stop words? stream.end(); throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk)); } } catch (IOException e){ throw new ParseException( String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr)); } finally { IOUtils.closeWhileHandlingException(stream); } return analyzed; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy