All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.queryparser.classic.QueryParserBase Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.queryparser.classic;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;

import java.io.StringReader;
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.charstream.CharStream;
import org.apache.lucene.queryparser.charstream.FastCharStream;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher.TooManyClauses;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.RegExp;

/**
 * This class is overridden by QueryParser in QueryParser.jj and acts to separate the majority of
 * the Java code from the .jj grammar file.
 */
public abstract class QueryParserBase extends QueryBuilder
    implements CommonQueryParserConfiguration {

  static final int CONJ_NONE = 0;
  static final int CONJ_AND = 1;
  static final int CONJ_OR = 2;

  static final int MOD_NONE = 0;
  static final int MOD_NOT = 10;
  static final int MOD_REQ = 11;

  // make it possible to call setDefaultOperator() without accessing
  // the nested class:
  /** Alternative form of QueryParser.Operator.AND */
  public static final Operator AND_OPERATOR = Operator.AND;

  /** Alternative form of QueryParser.Operator.OR */
  public static final Operator OR_OPERATOR = Operator.OR;

  /** The actual operator that parser uses to combine query terms */
  Operator operator = OR_OPERATOR;

  MultiTermQuery.RewriteMethod multiTermRewriteMethod =
      MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE;
  boolean allowLeadingWildcard = false;

  protected String field;
  int phraseSlop = 0;
  float fuzzyMinSim = FuzzyQuery.defaultMaxEdits;
  int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
  Locale locale = Locale.getDefault();
  TimeZone timeZone = TimeZone.getDefault();

  // the default date resolution
  DateTools.Resolution dateResolution = null;
  // maps field names to date resolutions
  Map fieldToDateResolution = null;

  boolean autoGeneratePhraseQueries;
  int determinizeWorkLimit = DEFAULT_DETERMINIZE_WORK_LIMIT;

  // So the generated QueryParser(CharStream) won't error out
  protected QueryParserBase() {
    super(null);
  }

  /**
   * Initializes a query parser. Called by the QueryParser constructor
   *
   * @param f the default field for query terms.
   * @param a used to find terms in the query text.
   */
  public void init(String f, Analyzer a) {
    setAnalyzer(a);
    field = f;
    setAutoGeneratePhraseQueries(false);
  }

  // the generated parser will create these in QueryParser
  public abstract void ReInit(CharStream stream);

  public abstract Query TopLevelQuery(String field) throws ParseException;

  /**
   * Parses a query string, returning a {@link org.apache.lucene.search.Query}.
   *
   * @param query the query string to be parsed.
   * @throws ParseException if the parsing fails
   */
  public Query parse(String query) throws ParseException {
    ReInit(new FastCharStream(new StringReader(query)));
    try {
      // TopLevelQuery is a Query followed by the end-of-input (EOF)
      Query res = TopLevelQuery(field);
      return res != null ? res : newBooleanQuery().build();
    } catch (ParseException | TokenMgrError tme) {
      // rethrow to include the original query:
      ParseException e = new ParseException("Cannot parse '" + query + "': " + tme.getMessage());
      e.initCause(tme);
      throw e;
    } catch (TooManyClauses tmc) {
      ParseException e =
          new ParseException("Cannot parse '" + query + "': too many boolean clauses");
      e.initCause(tmc);
      throw e;
    }
  }

  /**
   * @return Returns the default field.
   */
  public String getField() {
    return field;
  }

  /**
   * @see #setAutoGeneratePhraseQueries(boolean)
   */
  public final boolean getAutoGeneratePhraseQueries() {
    return autoGeneratePhraseQueries;
  }

  /**
   * Set to true if phrase queries will be automatically generated when the analyzer returns more
   * than one term from whitespace delimited text. NOTE: this behavior may not be suitable for all
   * languages.
   *
   * 

Set to false if phrase queries should only be generated when surrounded by double quotes. */ public void setAutoGeneratePhraseQueries(boolean value) { this.autoGeneratePhraseQueries = value; } /** Get the minimal similarity for fuzzy queries. */ @Override public float getFuzzyMinSim() { return fuzzyMinSim; } /** Set the minimum similarity for fuzzy queries. Default is 2f. */ @Override public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; } /** * Get the prefix length for fuzzy queries. * * @return Returns the fuzzyPrefixLength. */ @Override public int getFuzzyPrefixLength() { return fuzzyPrefixLength; } /** * Set the prefix length for fuzzy queries. Default is 0. * * @param fuzzyPrefixLength The fuzzyPrefixLength to set. */ @Override public void setFuzzyPrefixLength(int fuzzyPrefixLength) { this.fuzzyPrefixLength = fuzzyPrefixLength; } /** * Sets the default slop for phrases. If zero, then exact phrase matches are required. Default * value is zero. */ @Override public void setPhraseSlop(int phraseSlop) { this.phraseSlop = phraseSlop; } /** Gets the default slop for phrases. */ @Override public int getPhraseSlop() { return phraseSlop; } /** * Set to true to allow leading wildcard characters. * *

When set, * or ? are allowed as the first character of a * PrefixQuery and WildcardQuery. Note that this can produce very slow queries on big indexes. * *

Default: false. */ @Override public void setAllowLeadingWildcard(boolean allowLeadingWildcard) { this.allowLeadingWildcard = allowLeadingWildcard; } /** * @see #setAllowLeadingWildcard(boolean) */ @Override public boolean getAllowLeadingWildcard() { return allowLeadingWildcard; } /** * Sets the boolean operator of the QueryParser. In default mode (OR_OPERATOR) terms * without any modifiers are considered optional: for example capital of Hungary is * equal to capital OR of OR Hungary.
* In AND_OPERATOR mode terms are considered to be in conjunction: the * above-mentioned query is parsed as capital AND of AND Hungary */ public void setDefaultOperator(Operator op) { this.operator = op; } /** Gets implicit operator setting, which will be either AND_OPERATOR or OR_OPERATOR. */ public Operator getDefaultOperator() { return operator; } @Override public void setMultiTermRewriteMethod(MultiTermQuery.RewriteMethod method) { multiTermRewriteMethod = method; } /** * @see #setMultiTermRewriteMethod */ @Override public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() { return multiTermRewriteMethod; } /** Set locale used by date range parsing, lowercasing, and other locale-sensitive operations. */ @Override public void setLocale(Locale locale) { this.locale = locale; } /** Returns current locale, allowing access by subclasses. */ @Override public Locale getLocale() { return locale; } @Override public void setTimeZone(TimeZone timeZone) { this.timeZone = timeZone; } @Override public TimeZone getTimeZone() { return timeZone; } /** * Sets the default date resolution used by RangeQueries for fields for which no specific date * resolutions has been set. Field specific resolutions can be set with {@link * #setDateResolution(String, org.apache.lucene.document.DateTools.Resolution)}. * * @param dateResolution the default date resolution to set */ @Override public void setDateResolution(DateTools.Resolution dateResolution) { this.dateResolution = dateResolution; } /** * Sets the date resolution used by RangeQueries for a specific field. 
* * @param fieldName field for which the date resolution is to be set * @param dateResolution date resolution to set */ public void setDateResolution(String fieldName, DateTools.Resolution dateResolution) { if (fieldName == null) { throw new IllegalArgumentException("Field must not be null."); } if (fieldToDateResolution == null) { // lazily initialize HashMap fieldToDateResolution = new HashMap<>(); } fieldToDateResolution.put(fieldName, dateResolution); } /** * Returns the date resolution that is used by RangeQueries for the given field. Returns null, if * no default or field specific date resolution has been set for the given field. */ public DateTools.Resolution getDateResolution(String fieldName) { if (fieldName == null) { throw new IllegalArgumentException("Field must not be null."); } if (fieldToDateResolution == null) { // no field specific date resolutions set; return default date resolution instead return this.dateResolution; } DateTools.Resolution resolution = fieldToDateResolution.get(fieldName); if (resolution == null) { // no date resolutions set for the given field; return default date resolution instead resolution = this.dateResolution; } return resolution; } /** * @param determinizeWorkLimit the maximum effort that determinizing a regexp query can spend. If * the query requires more effort, a TooComplexToDeterminizeException is thrown. */ public void setDeterminizeWorkLimit(int determinizeWorkLimit) { this.determinizeWorkLimit = determinizeWorkLimit; } /** * @return the maximum effort that determinizing a regexp query can spend. If the query requires * more effort, a TooComplexToDeterminizeException is thrown. 
*/ public int getDeterminizeWorkLimit() { return determinizeWorkLimit; } protected void addClause(List clauses, int conj, int mods, Query q) { boolean required, prohibited; // If this term is introduced by AND, make the preceding term required, // unless it's already prohibited if (clauses.size() > 0 && conj == CONJ_AND) { BooleanClause c = clauses.get(clauses.size() - 1); if (!c.isProhibited()) clauses.set(clauses.size() - 1, new BooleanClause(c.query(), Occur.MUST)); } if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) { // If this term is introduced by OR, make the preceding term optional, // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b) // notice if the input is a OR b, first term is parsed as required; without // this modification a OR b would be parsed as +a OR b BooleanClause c = clauses.get(clauses.size() - 1); if (!c.isProhibited()) clauses.set(clauses.size() - 1, new BooleanClause(c.query(), Occur.SHOULD)); } // We might have been passed a null query; the term might have been // filtered away by the analyzer. if (q == null) return; if (operator == OR_OPERATOR) { // We set REQUIRED if we're introduced by AND or +; PROHIBITED if // introduced by NOT or -; make sure not to set both. 
prohibited = (mods == MOD_NOT); required = (mods == MOD_REQ); if (conj == CONJ_AND && !prohibited) { required = true; } } else { // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED // if not PROHIBITED and not introduced by OR prohibited = (mods == MOD_NOT); required = (!prohibited && conj != CONJ_OR); } if (required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST)); else if (!required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.SHOULD)); else if (!required && prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST_NOT)); else throw new RuntimeException("Clause cannot be both required and prohibited"); } /** * Adds clauses generated from analysis over text containing whitespace. There are no operators, * so the query's clauses can either be MUST (if the default operator is AND) or SHOULD (default * OR). * *

If all of the clauses in the given Query are TermQuery-s, this method flattens the result by * adding the TermQuery-s individually to the output clause list; otherwise, the given Query is * added as a single clause including its nested clauses. */ protected void addMultiTermClauses(List clauses, Query q) { // We might have been passed a null query; the term might have been // filtered away by the analyzer. if (q == null) { return; } boolean allNestedTermQueries = false; if (q instanceof BooleanQuery) { allNestedTermQueries = true; for (BooleanClause clause : ((BooleanQuery) q).clauses()) { if (!(clause.query() instanceof TermQuery)) { allNestedTermQueries = false; break; } } } if (allNestedTermQueries) { clauses.addAll(((BooleanQuery) q).clauses()); } else { BooleanClause.Occur occur = operator == OR_OPERATOR ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST; if (q instanceof BooleanQuery) { for (BooleanClause clause : ((BooleanQuery) q).clauses()) { clauses.add(newBooleanClause(clause.query(), occur)); } } else { clauses.add(newBooleanClause(q, occur)); } } } /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { return newFieldQuery(getAnalyzer(), field, queryText, quoted); } /** * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws ParseException { BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD; return createFieldQuery( analyzer, occur, field, queryText, quoted || autoGeneratePhraseQueries, phraseSlop); } /** * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}. 
This method may * be overridden, for example, to return a SpanNearQuery instead of a PhraseQuery. * * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getFieldQuery(String field, String queryText, int slop) throws ParseException { Query query = getFieldQuery(field, queryText, true); if (query instanceof PhraseQuery) { query = addSlopToPhrase((PhraseQuery) query, slop); } else if (query instanceof MultiPhraseQuery) { MultiPhraseQuery mpq = (MultiPhraseQuery) query; if (slop != mpq.getSlop()) { query = new MultiPhraseQuery.Builder(mpq).setSlop(slop).build(); } } return query; } /** Rebuild a phrase query with a slop value */ private PhraseQuery addSlopToPhrase(PhraseQuery query, int slop) { PhraseQuery.Builder builder = new PhraseQuery.Builder(); builder.setSlop(slop); org.apache.lucene.index.Term[] terms = query.getTerms(); int[] positions = query.getPositions(); for (int i = 0; i < terms.length; ++i) { builder.add(terms[i], positions[i]); } return builder.build(); } protected Query getRangeQuery( String field, String part1, String part2, boolean startInclusive, boolean endInclusive) throws ParseException { DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale); df.setLenient(true); DateTools.Resolution resolution = getDateResolution(field); try { part1 = DateTools.dateToString(df.parse(part1), resolution); } catch ( @SuppressWarnings("unused") Exception e) { } try { Date d2 = df.parse(part2); if (endInclusive) { // The user can only specify the date, not the time, so make sure // the time is set to the latest possible time of that date to really // include all documents: Calendar cal = Calendar.getInstance(timeZone, locale); cal.setTime(d2); cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); cal.set(Calendar.MILLISECOND, 999); d2 = cal.getTime(); } part2 = DateTools.dateToString(d2, resolution); } catch ( @SuppressWarnings("unused") 
Exception e) { } return newRangeQuery(field, part1, part2, startInclusive, endInclusive); } /** * Builds a new BooleanClause instance * * @param q sub query * @param occur how this clause should occur when matching documents * @return new BooleanClause instance */ protected BooleanClause newBooleanClause(Query q, BooleanClause.Occur occur) { return new BooleanClause(q, occur); } /** * Builds a new PrefixQuery instance * * @param prefix Prefix term * @return new PrefixQuery instance */ protected Query newPrefixQuery(Term prefix) { return new PrefixQuery(prefix, multiTermRewriteMethod); } /** * Builds a new RegexpQuery instance * * @param regexp Regexp term * @return new RegexpQuery instance */ protected Query newRegexpQuery(Term regexp) { return new RegexpQuery( regexp, RegExp.ALL, 0, RegexpQuery.DEFAULT_PROVIDER, determinizeWorkLimit, multiTermRewriteMethod); } /** * Builds a new FuzzyQuery instance * * @param term Term * @param minimumSimilarity minimum similarity * @param prefixLength prefix length * @return new FuzzyQuery Instance */ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { // FuzzyQuery doesn't yet allow constant score rewrite String text = term.text(); int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity, text.codePointCount(0, text.length())); return new FuzzyQuery(term, numEdits, prefixLength); } /** * Builds a new {@link TermRangeQuery} instance * * @param field Field * @param part1 min * @param part2 max * @param startInclusive true if the start of the range is inclusive * @param endInclusive true if the end of the range is inclusive * @return new {@link TermRangeQuery} instance */ protected Query newRangeQuery( String field, String part1, String part2, boolean startInclusive, boolean endInclusive) { final BytesRef start; final BytesRef end; if (part1 == null) { start = null; } else { start = getAnalyzer().normalize(field, part1); } if (part2 == null) { end = null; } else { end = 
getAnalyzer().normalize(field, part2); } return new TermRangeQuery( field, start, end, startInclusive, endInclusive, multiTermRewriteMethod); } /** * Builds a new MatchAllDocsQuery instance * * @return new MatchAllDocsQuery instance */ protected Query newMatchAllDocsQuery() { return new MatchAllDocsQuery(); } /** * Builds a new WildcardQuery instance * * @param t wildcard term * @return new WildcardQuery instance */ protected Query newWildcardQuery(Term t) { return new WildcardQuery(t, determinizeWorkLimit, multiTermRewriteMethod); } /** * Factory method for generating query, given a set of clauses. By default creates a boolean query * composed of clauses passed in. * *

Can be overridden by extending classes, to modify query being returned. * * @param clauses List that contains {@link org.apache.lucene.search.BooleanClause} instances to * join. * @return Resulting {@link org.apache.lucene.search.Query} object. * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getBooleanQuery(List clauses) throws ParseException { if (clauses.isEmpty()) { return null; // all clause words were filtered away by the analyzer. } BooleanQuery.Builder query = newBooleanQuery(); for (final BooleanClause clause : clauses) { query.add(clause); } return query.build(); } /** * Factory method for generating a query. Called when parser parses an input term token that * contains one or more wildcard characters (? and *), but is not a prefix term token (one that * has just a single * character at the end) * *

Depending on settings, prefix term may be lower-cased automatically. It will not go through * the default Analyzer, however, since normal Analyzers are unlikely to work properly with * wildcard templates. * *

Can be overridden by extending classes, to provide custom handling for wildcard queries, * which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token that contains one or more wild card characters (? or *), but is not * simple prefix term * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getWildcardQuery(String field, String termStr) throws ParseException { if ("*".equals(field)) { if ("*".equals(termStr)) return newMatchAllDocsQuery(); } if (!allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?"))) throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"); Term t = new Term(field, analyzeWildcard(field, termStr)); return newWildcardQuery(t); } private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)"); private BytesRef analyzeWildcard(String field, String termStr) { // best effort to not pass the wildcard characters and escaped characters through #normalize Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr); BytesRefBuilder sb = new BytesRefBuilder(); int last = 0; while (wildcardMatcher.find()) { if (wildcardMatcher.start() > 0) { String chunk = termStr.substring(last, wildcardMatcher.start()); BytesRef normalized = getAnalyzer().normalize(field, chunk); sb.append(normalized); } // append the matched group - without normalizing sb.append(new BytesRef(wildcardMatcher.group())); last = wildcardMatcher.end(); } if (last < termStr.length()) { String chunk = termStr.substring(last); BytesRef normalized = getAnalyzer().normalize(field, chunk); sb.append(normalized); } return sb.toBytesRef(); } /** * Factory method for generating a query. Called when parser parses an input term token that * contains a regular expression query. * *

Depending on settings, pattern term may be lower-cased automatically. It will not go through * the default Analyzer, however, since normal Analyzers are unlikely to work properly with * regular expression templates. * *

Can be overridden by extending classes, to provide custom handling for regular expression * queries, which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token that contains a regular expression * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getRegexpQuery(String field, String termStr) throws ParseException { // We need to pass the whole string to #normalize, which will not work with // custom attribute factories for the binary term impl, and may not work // with some analyzers BytesRef term = getAnalyzer().normalize(field, termStr); Term t = new Term(field, term); return newRegexpQuery(t); } /** * Factory method for generating a query (similar to {@link #getWildcardQuery}). Called when * parser parses an input term token that uses prefix notation; that is, contains a single '*' * wildcard character as its last character. Since this is a special case of generic wildcard * term, and such a query can be optimized easily, this usually results in a different query * object. * *

Depending on settings, a prefix term may be lower-cased automatically. It will not go * through the default Analyzer, however, since normal Analyzers are unlikely to work properly * with wildcard templates. * *

Can be overridden by extending classes, to provide custom handling for wild card queries, * which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query (without trailing '*' * character!) * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getPrefixQuery(String field, String termStr) throws ParseException { if (!allowLeadingWildcard && termStr.startsWith("*")) throw new ParseException("'*' not allowed as first character in PrefixQuery"); BytesRef term = getAnalyzer().normalize(field, termStr); Term t = new Term(field, term); return newPrefixQuery(t); } /** * Factory method for generating a query (similar to {@link #getWildcardQuery}). Called when * parser parses an input term token that has the fuzzy suffix (~) appended. * * @param field Name of the field query will use. 
* @param termStr Term token to use for building term for the query * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to * disallow */ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { BytesRef term = getAnalyzer().normalize(field, termStr); Term t = new Term(field, term); return newFuzzyQuery(t, minSimilarity, fuzzyPrefixLength); } // extracted from the .jj grammar Query handleBareTokenQuery( String qfield, Token term, Token fuzzySlop, boolean prefix, boolean wildcard, boolean fuzzy, boolean regexp) throws ParseException { Query q; String termImage = discardEscapeChar(term.image); if (wildcard) { q = getWildcardQuery(qfield, term.image); } else if (prefix) { q = getPrefixQuery( qfield, discardEscapeChar(term.image.substring(0, term.image.length() - 1))); } else if (regexp) { q = getRegexpQuery(qfield, term.image.substring(1, term.image.length() - 1)); } else if (fuzzy) { q = handleBareFuzzy(qfield, fuzzySlop, termImage); } else { q = getFieldQuery(qfield, termImage, false); } return q; } /** * Determines the similarity distance for the given fuzzy token and term string. * *

The default implementation uses the string image of the {@code fuzzyToken} in an attempt to * parse it to a primitive float value. Otherwise, the {@linkplain #getFuzzyMinSim() minimal * similarity} distance is returned. Subclasses can override this method to return a similarity * distance, say based on the {@code termStr}, if the {@code fuzzyToken} does not specify a * distance. * * @param fuzzyToken The Fuzzy token * @param termStr The Term string * @return The similarity distance */ protected float getFuzzyDistance(Token fuzzyToken, String termStr) { try { return Float.parseFloat(fuzzyToken.image.substring(1)); } catch ( @SuppressWarnings("unused") Exception ignored) { } return fuzzyMinSim; } Query handleBareFuzzy(String qfield, Token fuzzySlop, String termImage) throws ParseException { float fms = getFuzzyDistance(fuzzySlop, termImage); if (fms < 0.0f) { throw new ParseException( "Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); } else if (fms >= 1.0f && fms != (int) fms) { throw new ParseException("Fractional edit distances are not allowed!"); } return getFuzzyQuery(qfield, termImage, fms); } // extracted from the .jj grammar Query handleQuotedTerm(String qfield, Token term, Token fuzzySlop) throws ParseException { int s = phraseSlop; // default if (fuzzySlop != null) { try { s = (int) Float.parseFloat(fuzzySlop.image.substring(1)); } catch ( @SuppressWarnings("unused") Exception ignored) { } } return getFieldQuery( qfield, discardEscapeChar(term.image.substring(1, term.image.length() - 1)), s); } // extracted from the .jj grammar Query handleBoost(Query q, Token boost) { if (boost != null) { float f = (float) 1.0; try { f = Float.parseFloat(boost.image); } catch ( @SuppressWarnings("unused") Exception ignored) { /* Should this be handled somehow? 
(defaults to "no boost", if * boost number is invalid) */ } // avoid boosting null queries, such as those caused by stop words if (q != null) { q = new BoostQuery(q, f); } } return q; } /** * Returns a String where the escape char has been removed, or kept only once if there was a * double escape. * *

Supports escaped Unicode characters, e.g. translates {@code \u005Cu0041} to {@code A}. */ String discardEscapeChar(String input) throws ParseException { // Create char array to hold unescaped char sequence char[] output = new char[input.length()]; // The length of the output can be less than the input // due to discarded escape chars. This variable holds // the actual length of the output int length = 0; // We remember whether the last processed character was // an escape character boolean lastCharWasEscapeChar = false; // The multiplier the current unicode digit must be multiplied with. // E.g. the first digit must be multiplied with 16^3, the second with 16^2... int codePointMultiplier = 0; // Used to calculate the codepoint of the escaped unicode character int codePoint = 0; for (int i = 0; i < input.length(); i++) { char curChar = input.charAt(i); if (codePointMultiplier > 0) { codePoint += hexToInt(curChar) * codePointMultiplier; codePointMultiplier >>>= 4; if (codePointMultiplier == 0) { output[length++] = (char) codePoint; codePoint = 0; } } else if (lastCharWasEscapeChar) { if (curChar == 'u') { // found an escaped unicode character codePointMultiplier = 16 * 16 * 16; } else { // this character was escaped output[length] = curChar; length++; } lastCharWasEscapeChar = false; } else { if (curChar == '\\') { lastCharWasEscapeChar = true; } else { output[length] = curChar; length++; } } } if (codePointMultiplier > 0) { throw new ParseException("Truncated Unicode escape sequence."); } if (lastCharWasEscapeChar) { throw new ParseException("Term can not end with escape character."); } return new String(output, 0, length); } /** Returns the numeric value of the hexadecimal character */ static int hexToInt(char c) throws ParseException { if ('0' <= c && c <= '9') { return c - '0'; } else if ('a' <= c && c <= 'f') { return c - 'a' + 10; } else if ('A' <= c && c <= 'F') { return c - 'A' + 10; } else { throw new ParseException("Non-hex character in Unicode escape 
sequence: " + c); } } /** * Returns a String where those characters that QueryParser expects to be escaped are escaped by a * preceding \. */ public static String escape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); // These characters are part of the query syntax and must be escaped if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/') { sb.append('\\'); } sb.append(c); } return sb.toString(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy