org.apache.lucene.queryparser.classic.QueryParser.jj Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-queryparser Show documentation
Lucene QueryParsers module
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

options {
  // Generate the parser with instance (non-static) members, so each
  // QueryParser object carries its own parsing state.
  STATIC=false;
  // Requests Java-style \uXXXX escape processing from JavaCC.
  // NOTE(review): with USER_CHAR_STREAM=true the character stream is
  // user-supplied, so confirm this option still has an effect.
  JAVA_UNICODE_ESCAPE=true;
  // The token manager reads from a caller-supplied CharStream implementation;
  // the QueryParser constructor in this file passes a FastCharStream.
  USER_CHAR_STREAM=true;
}

PARSER_BEGIN(QueryParser)

package org.apache.lucene.queryparser.classic;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;

/**
 * This class is generated by JavaCC.  The most important method is
 * {@link #parse(String)}.
 *
 * The syntax for query strings is as follows:
 * A Query is a series of clauses.
 * A clause may be prefixed by:
 * <ul>
 * <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign, indicating
 * that the clause is required or prohibited respectively; or
 * <li> a term followed by a colon, indicating the field to be searched.
 * This enables one to construct queries which search multiple fields.
 * </ul>
 *
 * A clause may be either:
 * <ul>
 * <li> a term, indicating all the documents that contain this term; or
 * <li> a nested query, enclosed in parentheses.  Note that this may be used
 * with a <code>+</code>/<code>-</code> prefix to require any of a set of
 * terms.
 * </ul>
 *
 * Thus, in BNF, the query grammar is:
 * <pre>
 *   Query  ::= ( Clause )*
 *   Clause ::= ["+", "-"] [&lt;TERM&gt; ":"] ( &lt;TERM&gt; | "(" Query ")" )
 * </pre>
 *
 * 
 * Examples of appropriately formatted queries can be found in the query syntax
 * documentation.
 * 
 *
 * 
 * In {@link TermRangeQuery}s, QueryParser tries to detect date values, e.g.
 * date:[6/1/2005 TO 6/4/2005] produces a range query that searches
 * for "date" fields between 2005-06-01 and 2005-06-04. Note that the format
 * of the accepted input depends on {@link #setLocale(Locale) the locale}.
 * A {@link org.apache.lucene.document.DateTools.Resolution} has to be set,
 * if you want to use {@link DateTools} for date conversion.
 * 
 * 
 * The date resolution that shall be used for RangeQueries can be set
 * using {@link #setDateResolution(DateTools.Resolution)}
 * or {@link #setDateResolution(String, DateTools.Resolution)}. The former
 * sets the default date resolution for all fields, whereas the latter can
 * be used to set field specific date resolutions. Field specific date
 * resolutions take, if set, precedence over the default date resolution.
 * 
 * 
 * If you don't use {@link DateTools} in your index, you can create your own
 * query parser that inherits QueryParser and overwrites
 * {@link #getRangeQuery(String, String, String, boolean, boolean)} to
 * use a different method for date conversion.
 * 
 *
 * Note that QueryParser is not thread-safe. 
 * 
 * NOTE: there is a new QueryParser in contrib, which matches
 * the same syntax as this class, but is more modular,
 * enabling substantial customization to how a query is created.
 */
public class QueryParser extends QueryParserBase {
  /** The default operator for parsing queries.
   * Use {@link QueryParserBase#setDefaultOperator} to change it.
   */
  public enum Operator { OR, AND }

  /** default split on whitespace behavior */
  public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;

  /**
   * Kinds of token that, when they directly follow a term, prevent that term
   * from being folded into a multi-term run (see {@link #allowedPostMultiTerm(int)}).
   */
  private static final Set<Integer> disallowedPostMultiTerm
    = new HashSet<Integer>(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR));

  /** Whether query text is split on whitespace prior to analysis. */
  private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;

  /** Create a query parser.
   *  @param f  the default field for query terms.
   *  @param a   used to find terms in the query text.
   */
  public QueryParser(String f, Analyzer a) {
    // Start on an empty stream; init() then stores the default field and analyzer.
    this(new FastCharStream(new StringReader("")));
    init(f, a);
  }

  /**
   * Set to true if phrase queries will be automatically generated
   * when the analyzer returns more than one term from whitespace
   * delimited text.
   * NOTE: this behavior may not be suitable for all languages.
   * <p>
   * Set to false if phrase queries should only be generated when
   * surrounded by double quotes.
   * <p>
   * The combination splitOnWhitespace=false and autoGeneratePhraseQueries=true
   * is disallowed.  See LUCENE-7533.
   *
   * @param value the new autoGeneratePhraseQueries setting
   * @throws IllegalArgumentException if {@code value} is true while
   *         {@link #getSplitOnWhitespace()} is false
   */
  @Override
  public void setAutoGeneratePhraseQueries(boolean value) {
    if (splitOnWhitespace == false && value) {
      throw new IllegalArgumentException
          ("setAutoGeneratePhraseQueries(true) is disallowed when getSplitOnWhitespace() == false");
    }
    this.autoGeneratePhraseQueries = value;
  }

  /**
   * @return whether query text is split on whitespace prior to analysis
   * @see #setSplitOnWhitespace(boolean)
   */
  public boolean getSplitOnWhitespace() {
    return splitOnWhitespace;
  }

  /**
   * Whether query text should be split on whitespace prior to analysis.
   * Default is <code>{@value #DEFAULT_SPLIT_ON_WHITESPACE}</code>.
   * <p>
   * The combination splitOnWhitespace=false and autoGeneratePhraseQueries=true
   * is disallowed.  See LUCENE-7533.
   *
   * @param splitOnWhitespace the new splitOnWhitespace setting
   * @throws IllegalArgumentException if {@code splitOnWhitespace} is false while
   *         {@link #getAutoGeneratePhraseQueries()} is true
   */
  public void setSplitOnWhitespace(boolean splitOnWhitespace) {
    if (splitOnWhitespace == false && getAutoGeneratePhraseQueries()) {
      throw new IllegalArgumentException
          ("setSplitOnWhitespace(false) is disallowed when getAutoGeneratePhraseQueries() == true");
    }
    this.splitOnWhitespace = splitOnWhitespace;
  }

  /**
   * @param tokenKind the kind of the token that follows a candidate term
   * @return true unless {@code tokenKind} is one of the disallowed kinds
   */
  private static boolean allowedPostMultiTerm(int tokenKind) {
    return disallowedPostMultiTerm.contains(tokenKind) == false;
  }
}

PARSER_END(QueryParser)

/* ***************** */
/* Token Definitions */
/* ***************** */

// Private regular-expression fragments. The "<*>" prefix makes them available
// in every lexical state, and the leading "#" marks each one as private: it
// never produces a token on its own and exists only to be referenced from
// other token definitions.
<*> TOKEN : {
  // a single decimal digit
  <#_NUM_CHAR:        ["0"-"9"] >
| <#_ESCAPED_CHAR:    "\\" ~[] >  // every character that follows a backslash is considered as an escaped character
  // first character of a term: anything except whitespace and the listed
  // query-syntax characters, or any backslash-escaped character
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^",
                           "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ]
                        | <_ESCAPED_CHAR> ) >
  // subsequent term characters additionally allow "-" and "+"
| <#_TERM_CHAR:       ( <_TERM_START_CHAR> | "-" | "+" ) >
  // space, tab, newline, carriage return and U+3000
| <#_WHITESPACE:      ( " " | "\t" | "\n" | "\r" | "\u3000") >
  // inside double quotes: anything but an unescaped quote or backslash
| <#_QUOTED_CHAR:     ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) >
}

 SKIP : {
  < <_WHITESPACE>>
}

 TOKEN : {
  
| 
| 
| 
| 
|  >
| 
| 
| 
| 
|  : Boost
| )* "\"">
|  (<_TERM_CHAR>)*  >
| )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) >
|  (<_TERM_CHAR>)* "*" ) >
|  | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
| 
|  : Range
|  : Range
}

 TOKEN : {
  )+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
}

 TOKEN : {
  
|  : DEFAULT
|  : DEFAULT
| 
| 
}

// *   Query  ::= ( Clause )*
// *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )

/**
 * Parses an optional boolean conjunction.
 * Returns CONJ_AND for an AND token, CONJ_OR for an OR token, and CONJ_NONE
 * when neither is present.
 */
int Conjunction() : {
  int ret = CONJ_NONE;
}
{
  [
    <AND> { ret = CONJ_AND; }
    | <OR>  { ret = CONJ_OR; }
  ]
  { return ret; }
}

/**
 * Parses an optional clause modifier.
 * Returns MOD_REQ for a plus sign, MOD_NOT for a minus sign or a NOT token,
 * and MOD_NONE when no modifier is present.
 */
int Modifiers() : {
  int ret = MOD_NONE;
}
{
  [
     <PLUS> { ret = MOD_REQ; }
    | <MINUS> { ret = MOD_NOT; }
    | <NOT> { ret = MOD_NOT; }
  ]
  { return ret; }
}

// This makes sure that there is no garbage after the query string:
// the whole input must be consumed, so the production requires <EOF>
// right after the query.
Query TopLevelQuery(String field) : {
  Query q;
}
{
  q=Query(field) <EOF>
  { return q; }
}

/**
 * Parses a sequence of clauses against the given default field.
 * Each alternative is either a multi-term run (see MultiTerm) or a single
 * clause with optional conjunction and modifiers. If exactly one clause was
 * collected and a first query was recorded, that query is returned directly;
 * otherwise the collected clauses are combined via getBooleanQuery.
 */
Query Query(String field) :
{
  List<BooleanClause> clauses = new ArrayList<BooleanClause>();
  Query q, firstQuery=null;
  int conj, mods;
}
{
  (
    LOOKAHEAD(2)
    firstQuery=MultiTerm(field, clauses)
    | mods=Modifiers() q=Clause(field)
      {
        addClause(clauses, CONJ_NONE, mods, q);
        // only an unmodified leading clause can stand in for the whole query
        if (mods == MOD_NONE) {
          firstQuery = q;
        }
      }
  )
  (
    LOOKAHEAD(2)
    MultiTerm(field, clauses)
    | conj=Conjunction() mods=Modifiers() q=Clause(field)
      { addClause(clauses, conj, mods, q); }
  )*
  {
    if (clauses.size() == 1 && firstQuery != null) {
      return firstQuery;
    } else {
      return getBooleanQuery(clauses);
    }
  }
}

/**
 * Parses one clause: an optional field prefix followed by either a term or a
 * parenthesised sub-query with an optional boost.
 * A "term:" prefix replaces the field with the unescaped term text; a "*:"
 * prefix replaces it with "*". The boost (if any) is applied via handleBoost.
 */
Query Clause(String field) : {
  Query q;
  Token fieldToken=null, boost=null;
}
{
  [
    LOOKAHEAD(2)
    (
      fieldToken=<TERM> <COLON> {field=discardEscapeChar(fieldToken.image);}
      | <STAR> <COLON> {field="*";}
    )
  ]
  (
    q=Term(field)
    | <LPAREN> q=Query(field) <RPAREN> [ <CARAT> boost=<NUMBER> ]
  )
  { return handleBoost(q, boost); }
}

/**
 * Parses a single term-level expression for the given field. Three forms are
 * accepted:
 *  - a bare token (term, star, prefix, wildcard, regexp, number or bare
 *    operator) with optional boost and fuzzy slop in either order, built by
 *    handleBareTokenQuery;
 *  - a range expression with inclusive or exclusive bounds and an optional
 *    boost, built by getRangeQuery;
 *  - a quoted string with optional boost and slop in either order, built by
 *    handleQuotedTerm.
 * The boost (if any) is applied to the result via handleBoost.
 */
Query Term(String field) : {
  Token term, boost=null, fuzzySlop=null, goop1, goop2;
  boolean prefix = false;
  boolean wildcard = false;
  boolean fuzzy = false;
  boolean regexp = false;
  boolean startInc=false;
  boolean endInc=false;
  Query q;
}
{
  (
    (
      term=<TERM>
      | term=<STAR> { wildcard=true; }
      | term=<PREFIXTERM> { prefix=true; }
      | term=<WILDTERM> { wildcard=true; }
      | term=<REGEXPTERM> { regexp=true; }
      | term=<NUMBER>
      // keep only the first character of the token image
      | term=<BAREOPER> { term.image = term.image.substring(0,1); }
    )
    [
      <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
      | fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } [ <CARAT> boost=<NUMBER> ]
    ]
    { q = handleBareTokenQuery(field, term, fuzzySlop, prefix, wildcard, fuzzy, regexp); }

  | ( <RANGEIN_START> { startInc = true; } | <RANGEEX_START> )
    ( goop1=<RANGE_GOOP> | goop1=<RANGE_QUOTED> | goop1=<RANGE_TO> )
    ( <RANGE_TO> )
    ( goop2=<RANGE_GOOP> | goop2=<RANGE_QUOTED> | goop2=<RANGE_TO> )
    ( <RANGEIN_END> { endInc = true; } | <RANGEEX_END> )
    [ <CARAT> boost=<NUMBER> ]
    {
      boolean startOpen=false;
      boolean endOpen=false;
      // A quoted bound loses its surrounding quotes; an unquoted "*" bound
      // marks that end of the range as open (passed on as null).
      if (goop1.kind == RANGE_QUOTED) {
        goop1.image = goop1.image.substring(1, goop1.image.length()-1);
      } else if ("*".equals(goop1.image)) {
        startOpen=true;
      }
      if (goop2.kind == RANGE_QUOTED) {
        goop2.image = goop2.image.substring(1, goop2.image.length()-1);
      } else if ("*".equals(goop2.image)) {
        endOpen=true;
      }
      q = getRangeQuery(field, startOpen ? null : discardEscapeChar(goop1.image), endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc);
    }

  | term=<QUOTED>
    [
      <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
      | fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } [ <CARAT> boost=<NUMBER> ]
    ]
    { q = handleQuotedTerm(field, term, fuzzySlop); }
  )
  { return handleBoost(q, boost); }
}

/**
 * Parses a run of two or more consecutive plain terms.
 * With splitOnWhitespace=true each term becomes its own clause and the query
 * for the first term is returned. Otherwise the term images are joined with
 * single spaces, sent to getFieldQuery as one text, added through
 * addMultiTermClauses, and that entire produced query is returned.
 */
Query MultiTerm(String field, List<BooleanClause> clauses) : {
  Token text, followingText;
  Query firstQuery = null;
}
{
  text=<TERM>
  {
    if (splitOnWhitespace) {
      firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false);
      addClause(clauses, CONJ_NONE, MOD_NONE, firstQuery);
    }
  }
  // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest
  LOOKAHEAD({ getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind) })
  (
    LOOKAHEAD({ getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind) })
    followingText=<TERM>
    {
      if (splitOnWhitespace) {
        Query q = getFieldQuery(field, discardEscapeChar(followingText.image), false);
        addClause(clauses, CONJ_NONE, MOD_NONE, q);
      } else { // build up the text to send to analysis
        text.image += " " + followingText.image;
      }
    }
  )+
  {
    if (splitOnWhitespace == false) {
      firstQuery = getFieldQuery(field, discardEscapeChar(text.image), false);
      addMultiTermClauses(clauses, firstQuery);
    }
    return firstQuery;
  }
}