// org.apache.solr.parser.QueryParser.jj Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// JavaCC generator options for this grammar.
options {
  // Generate instance (non-static) parser members; the constructor below builds parser instances.
  STATIC=false;
  // NOTE(review): presumably ignored while USER_CHAR_STREAM is true — confirm against the JavaCC docs.
  JAVA_UNICODE_ESCAPE=true;
  // The parser reads from a caller-supplied character stream (QueryParser passes a FastCharStream).
  USER_CHAR_STREAM=true;
}

PARSER_BEGIN(QueryParser)

package org.apache.solr.parser;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Query;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.QParser;


/**
 * Parser class for this grammar. JavaCC adds the generated production methods to this class;
 * the members declared here configure whitespace splitting and field-query construction.
 */
public class QueryParser extends SolrQueryParserBase {
  /** The default operator for parsing queries. */
  public static enum Operator { OR, AND }

  /** default split on whitespace behavior */
  public static final boolean DEFAULT_SPLIT_ON_WHITESPACE = false;

  /**
   * Token kinds that must not directly follow a term for that term to be absorbed into a
   * multi-term sequence (consulted by the semantic lookaheads in {@code MultiTerm}).
   */
  private static final Set<Integer> disallowedPostMultiTerm
      = new HashSet<Integer>(Arrays.asList(COLON, STAR, FUZZY_SLOP, CARAT, AND, OR));

  private boolean splitOnWhitespace = DEFAULT_SPLIT_ON_WHITESPACE;

  /**
   * Creates a parser over an empty input stream and then initializes it via
   * {@code init(defaultField, parser)}.
   *
   * @param defaultField passed through to {@code init}
   * @param parser       passed through to {@code init}
   */
  public QueryParser(String defaultField, QParser parser) {
    this(new FastCharStream(new StringReader("")));
    init(defaultField, parser);
  }

  /**
   * @see #setSplitOnWhitespace(boolean)
   */
  public boolean getSplitOnWhitespace() {
    return splitOnWhitespace;
  }

  /**
   * Whether query text should be split on whitespace prior to analysis.
   * Default is {@value #DEFAULT_SPLIT_ON_WHITESPACE}.
   */
  public void setSplitOnWhitespace(boolean splitOnWhitespace) {
    this.splitOnWhitespace = splitOnWhitespace;
  }

  /**
   * Returns {@code true} unless {@code tokenKind} is one of the kinds listed in
   * {@link #disallowedPostMultiTerm}.
   */
  private static boolean allowedPostMultiTerm(int tokenKind) {
    return !disallowedPostMultiTerm.contains(tokenKind);
  }

  /**
   * Builds a field query, deciding here whether the text is treated as quoted.
   *
   * <p>The multi-term-synonyms phrase flag is set when either the field-level or the parser-level
   * auto-generate-phrase setting is on. When splitting on whitespace, those same settings also
   * cause the text to be treated as quoted; otherwise only an explicitly quoted text is.
   * The superclass is always called with {@code fieldAutoGenPhraseQueries == false}.
   *
   * @throws SyntaxError propagated from the superclass implementation
   */
  @Override
  protected Query newFieldQuery(Analyzer analyzer, String field, String queryText,
                                boolean quoted, boolean fieldAutoGenPhraseQueries, boolean fieldEnableGraphQueries)
      throws SyntaxError {
    boolean autoGenPhrase = fieldAutoGenPhraseQueries || getAutoGeneratePhraseQueries();
    setAutoGenerateMultiTermSynonymsPhraseQuery(autoGenPhrase);
    // Don't auto-quote graph-aware field queries
    boolean treatAsQuoted = getSplitOnWhitespace() ? (quoted || autoGenPhrase) : quoted;
    return super.newFieldQuery(analyzer, field, queryText, treatAsQuoted, false, fieldEnableGraphQueries);
  }
}

PARSER_END(QueryParser)

TOKEN_MGR_DECLS : {
  // Count of currently open "/*" comments: the lexer increments it on "/*",
  // decrements it on "*/", and returns to DEFAULT when it reaches zero.
  int commentNestingDepth = 0;
}

/* ***************** */
/* Token Definitions */
/* ***************** */

// Helper regular expressions, declared for every lexical state (<*>). The "#" prefix marks
// them as private: they are only referenced from other token definitions.
<*> TOKEN : {
  // a single ASCII digit
  <#_NUM_CHAR:        ["0"-"9"] >
| <#_ESCAPED_CHAR:    "\\" ~[] >  // every character that follows a backslash is considered as an escaped character
  // first character of a term: anything except whitespace and the listed syntax characters,
  // or an escaped character
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^",
                           "[", "]", "\"", "{", "}", "~", "*", "?", "\\", "/" ]
                        | <_ESCAPED_CHAR> ) >
  // subsequent term characters additionally allow "-", "+", "/" and "!"
| <#_TERM_CHAR:       ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" | "/" | "!") >
  // space, tab, newline, carriage return, or U+3000
| <#_WHITESPACE:      ( " " | "\t" | "\n" | "\r" | "\u3000") >
  // content of a double-quoted string: anything but an unescaped double quote or backslash
| <#_QUOTED_CHAR:     ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) >
  // content of a single-quoted string: anything but an unescaped single quote or backslash
| <#_SQUOTED_CHAR:    ( ~[ "'", "\\" ] | <_ESCAPED_CHAR> ) >
}

 // Skipped input in the DEFAULT and COMMENT states: whitespace, and the comment opener "/*".
 // "/*" must also be recognised inside COMMENT so that nested comments raise the depth counter.
 // NOTE(review): the state list was missing from this copy and is restored here — confirm.
 <DEFAULT, COMMENT> SKIP : {
   < <_WHITESPACE>>
  | "/*" {commentNestingDepth++;} : COMMENT
 }

 // Skipped input inside a comment: "*/" closes one nesting level (returning to DEFAULT once the
 // depth reaches zero); every other character is discarded.
 // NOTE(review): the <COMMENT> state prefix was missing from this copy and is restored here — confirm.
 <COMMENT> SKIP : {
  // trying to avoid matching end-of-comment in string leads to more problems (incorrectly thinking we are in a string due
  // to the simplistic matching in this state).
  // < ("\"" (<_QUOTED_CHAR>)* "\"") >
  "*/" { commentNestingDepth -= 1; SwitchTo( commentNestingDepth==0 ? DEFAULT : COMMENT ); }
  | < ~[]>
}

 // Whitespace is also skipped between the parts of a range expression.
 // NOTE(review): the state prefix was missing from this copy; <Range> is restored because
 // DEFAULT and COMMENT already skip whitespace above — confirm against the upstream grammar.
 <Range> SKIP : {
  < <_WHITESPACE>>
}

 // Tokens of the DEFAULT lexical state: operators, grouping, terms, wildcards, regexps,
 // range openers (which switch to the Range state), local params and the filter( opener.
 // NOTE(review): the "<NAME: ..." heads of these definitions were missing from this copy and
 // have been restored — confirm each definition against the upstream grammar.
 <DEFAULT> TOKEN : {
  <AND:       ("AND" | "&&") >
| <OR:        ("OR" | "||") >
| <NOT:       ("NOT" | "!") >
| <PLUS:      "+" >
| <MINUS:     "-" >
| <BAREOPER:    ("+"|"-"|"!") <_WHITESPACE> >
| <LPAREN:    "(" >
| <RPAREN:    ")" >
| <COLON:     ":" >
| <STAR:      "*" >
| <CARAT:     "^" > : Boost
| <QUOTED:     "\"" (<_QUOTED_CHAR>)* "\"">
| <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
| <FUZZY_SLOP:     "~" ( (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? )? >
| <PREFIXTERM:  ("*") | ( <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" ) >
| <WILDTERM:  (<_TERM_START_CHAR> | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
| <REGEXPTERM: "/" (~[ "/" ] | "\\/" )* "/" >
| <RANGEIN_START: "[" > : Range
| <RANGEEX_START: "{" > : Range
// TODO: consider using token states instead of inlining SQUOTED
// | <SQUOTED:     "'" (<_SQUOTED_CHAR>)* "'">
// | <LPARAMS:     ("{!" ( (<_WHITESPACE>)* (~["=","}"])+ ( "=" (<QUOTED> | <SQUOTED> | (~[" ","}"])+ )? )? )* "}")+  (~[")"," ","\t","\n","{","^"])*  >
| <LPARAMS:     ("{!" ( (<_WHITESPACE>)* (~["=","}"])+ ( "=" (<QUOTED> | ("'" (<_SQUOTED_CHAR>)* "'") | (~[" ","}"])+ )? )? )* "}")+  (~[")"," ","\t","\n","{","^"])*  >
| <FILTER:     "filter(" >
}

 // Boost state, entered after "^": a single numeric token, after which lexing returns to DEFAULT.
 // NOTE(review): the <Boost> prefix and the head of the NUMBER definition were missing from this
 // copy; the optional "=" and "-" prefixes are restored from memory — confirm against upstream.
 <Boost> TOKEN : {
  <NUMBER:    ("=")?("-")? (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
}

 // Range state, entered after "[" or "{": the TO keyword, the closing bracket/brace (which
 // return to DEFAULT), and quoted or unquoted endpoint text.
 // NOTE(review): the <Range> prefix and these definitions were missing from this copy and have
 // been restored — confirm against the upstream grammar.
 <Range> TOKEN : {
  <RANGE_TO: "TO">
| <RANGEIN_END: "]"> : DEFAULT
| <RANGEEX_END: "}"> : DEFAULT
| <RANGE_QUOTED: "\"" (~["\""] | "\\\"")+ "\"">
| <RANGE_GOOP: (~[ " ", "]", "}" ])+ >
}

// *   Query  ::= ( Clause )*
// *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )

// Parses an optional conjunction operator.
// Returns CONJ_AND for an AND token, CONJ_OR for an OR token, and CONJ_NONE when neither is present.
int Conjunction() : {
  int ret = CONJ_NONE;
}
{
  [
    <AND> { ret = CONJ_AND; }
    | <OR>  { ret = CONJ_OR; }
  ]
  { return ret; }
}

// Parses an optional clause modifier.
// Returns MOD_REQ for a PLUS token, MOD_NOT for a MINUS or NOT token, and MOD_NONE when absent.
int Modifiers() : {
  int ret = MOD_NONE;
}
{
  [
     <PLUS>  { ret = MOD_REQ; }
     | <MINUS> { ret = MOD_NOT; }
     | <NOT>   { ret = MOD_NOT; }
  ]
  { return ret; }
}

// Entry point for a complete query string.
// This makes sure that there is no garbage after the query string: the query must be
// followed directly by end of input (<EOF>).
Query TopLevelQuery(String field) throws SyntaxError : {
  Query q;
}
{
  q=Query(field) <EOF>
  {
    return q;
  }
}

// Parses one or more clauses into a single query.
// Each step is either a multi-term sequence (tried first, with two tokens of lookahead) or a
// single clause with optional conjunction (not for the first clause) and modifier.
// A lone SHOULD clause is returned directly via rawToNormal(), unless it is a RawQuery with more
// than one term; everything else is combined with getBooleanQuery().
Query Query(String field) throws SyntaxError :
{
  List<BooleanClause> clauses = new ArrayList<BooleanClause>();
  Query q;
  int conj, mods;
}
{
  (
    LOOKAHEAD(2)
    MultiTerm(field, clauses)
    | mods=Modifiers() q=Clause(field)
      { addClause(clauses, CONJ_NONE, mods, q); }
  )
  (
    LOOKAHEAD(2)
    MultiTerm(field, clauses)
    | conj=Conjunction() mods=Modifiers() q=Clause(field)
      { addClause(clauses, conj, mods, q); }
  )*
  {
    if (clauses.size() == 1 && clauses.get(0).getOccur() == BooleanClause.Occur.SHOULD) {
      Query firstQuery = clauses.get(0).getQuery();
      if ( ! (firstQuery instanceof RawQuery) || ((RawQuery)firstQuery).getTermCount() == 1) {
        return rawToNormal(firstQuery);
      }
    }
    return getBooleanQuery(clauses);
  }
}

// Parses one clause: an optional "field:" (or "*:") prefix followed by a term, a parenthesised
// sub-query, a filter(...) sub-query, or a local-params expression; the last three accept an
// optional "^boost" suffix. The result is passed through handleBoost().
// NOTE(review): the token references in this production were missing from this copy and have
// been restored from context — confirm against the upstream grammar.
Query Clause(String field) throws SyntaxError : {
  Query q;
  Token fieldToken=null, boost=null;
  Token localParams=null;
  int flags = 0;
}
{
  [
    LOOKAHEAD(2)
    (
      fieldToken=<TERM> <COLON> { field = discardEscapeChar(fieldToken.image); }
      | <STAR> <COLON> { field = "*"; }
    )
  ]
  (
   q=Term(field)
   | <LPAREN> q=Query(field) <RPAREN> [ <CARAT> boost=<NUMBER> ]
   // startFilter() hands back flags that are restored once the filter sub-query has been wrapped
   | (<FILTER> { flags=startFilter(); } q=Query(field) <RPAREN> [ <CARAT> boost=<NUMBER> ] { q=getFilter(q); restoreFlags(flags); } )
   | (localParams = <LPARAMS> [ <CARAT> boost=<NUMBER> ] { q=getLocalParams(field, localParams.image); }  )
  )
  { return handleBoost(q, boost); }
}

// Parses a term-level query. Three alternatives:
//   1. a bare token (plain, "*", prefix, wildcard, regexp, number or bare operator), optionally
//      followed by a boost and/or fuzzy-slop suffix in either order -> handleBareTokenQuery()
//   2. a range "[a TO b]" / "{a TO b}" (brackets inclusive, braces exclusive, mixable), with an
//      optional boost -> getRangeQuery()
//   3. a quoted string with optional boost and/or slop suffix -> handleQuotedTerm()
// The result is passed through handleBoost().
// NOTE(review): the token references in this production were missing from this copy and have
// been restored from context — confirm against the upstream grammar.
Query Term(String field) throws SyntaxError : {
  Token term, boost=null, fuzzySlop=null, goop1, goop2;
  boolean prefix = false;
  boolean wildcard = false;
  boolean fuzzy = false;
  boolean regexp = false;
  boolean startInc=false;
  boolean endInc=false;
  Query q;
}
{
  (
    (
      term=<TERM>
      | term=<STAR> { wildcard=true; }
      | term=<PREFIXTERM> { prefix=true; }
      | term=<WILDTERM> { wildcard=true; }
      | term=<REGEXPTERM> { regexp=true; }
      | term=<NUMBER>
      // keep only the first character of the matched operator text
      | term=<BAREOPER> { term.image = term.image.substring(0,1); }
    )
    [
      <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
      | fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } [ <CARAT> boost=<NUMBER> ]
    ]
    { q = handleBareTokenQuery(getField(field), term, fuzzySlop, prefix, wildcard, fuzzy, regexp); }

  | ( <RANGEIN_START> { startInc = true; } | <RANGEEX_START> )
    ( goop1=<RANGE_GOOP> | goop1=<RANGE_QUOTED> | goop1=<RANGE_TO> )
    ( <RANGE_TO> )
    ( goop2=<RANGE_GOOP> | goop2=<RANGE_QUOTED> | goop2=<RANGE_TO> )
    ( <RANGEIN_END> { endInc = true; } | <RANGEEX_END> )
    [ <CARAT> boost=<NUMBER> ]
    {
      boolean startOpen=false;
      boolean endOpen=false;
      // Quoted endpoints lose their surrounding quotes; an unquoted "*" marks an open end,
      // which is passed to getRangeQuery() as null.
      if (goop1.kind == RANGE_QUOTED) {
        goop1.image = goop1.image.substring(1, goop1.image.length()-1);
      } else if ("*".equals(goop1.image)) {
        startOpen=true;
      }
      if (goop2.kind == RANGE_QUOTED) {
        goop2.image = goop2.image.substring(1, goop2.image.length()-1);
      } else if ("*".equals(goop2.image)) {
        endOpen=true;
      }
      q = getRangeQuery(getField(field),
                        startOpen ? null : discardEscapeChar(goop1.image),
                        endOpen ? null : discardEscapeChar(goop2.image), startInc, endInc);
    }
  | term=<QUOTED>
    [
      <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
      | fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } [ <CARAT> boost=<NUMBER> ]
    ]
    { q = handleQuotedTerm(getField(field), term, fuzzySlop); }
  )
  { return handleBoost(q, boost); }
}

// Parses a run of two or more adjacent TERM tokens and appends the result to `clauses`.
// With splitOnWhitespace enabled, each term becomes its own clause (CONJ_NONE, MOD_NONE);
// otherwise the unescaped term texts are collected and turned into a single query that is
// added via addMultiTermClause().
// A following term only joins the run when the token after it is allowed by
// allowedPostMultiTerm().
void MultiTerm(String field, List<BooleanClause> clauses) throws SyntaxError : {
  Token text;
  List<String> terms = null;
}
{
  text=<TERM>
  {
    if (splitOnWhitespace) {
      Query q = getFieldQuery(getField(field), discardEscapeChar(text.image), false, true);
      addClause(clauses, CONJ_NONE, MOD_NONE, q);
    } else {
      // terms is only allocated (and only used) when not splitting on whitespace
      terms = new ArrayList<String>();
      terms.add(discardEscapeChar(text.image));
    }
  }
  // Both lookaheads are required; the first lookahead vets the first following term and the second lookahead vets the rest
  LOOKAHEAD({ getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind) })
  (
    LOOKAHEAD({ getToken(1).kind == TERM && allowedPostMultiTerm(getToken(2).kind) })
    text=<TERM>
    {
      if (splitOnWhitespace) {
        Query q = getFieldQuery(getField(field), discardEscapeChar(text.image), false, true);
        addClause(clauses, CONJ_NONE, MOD_NONE, q);
      } else {
        terms.add(discardEscapeChar(text.image));
      }
    }
  )+
  {
    if (splitOnWhitespace == false) {
      Query q = getFieldQuery(getField(field), terms, true);
      addMultiTermClause(clauses, q);
    }
  }
}