All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.pageseeder.flint.lucene.query.Queries Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2015 Allette Systems (Australia)
 * http://www.allette.com.au
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.pageseeder.flint.lucene.query;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.pageseeder.flint.lucene.search.Fields;
import org.pageseeder.flint.lucene.search.Terms;
import org.pageseeder.flint.lucene.util.Beta;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A set of utility methods related to query objects in Lucene.
 *
 * @author  Christophe Lauret (Weborganic)
 * @version 13 August 2010
 */
public final class Queries {

  /**
   * Text that matches this pattern is considered a phrase.
   */
  private static final Pattern IS_A_PHRASE = Pattern.compile("\"[^\"]+\"");

  /**
   * Prevents creation of instances.
   */
  private Queries() {
  }

  /**
   * Returns a boolean query combining all the specified queries in {@link Occur#MUST} clauses
   * as if it were an AND operator.
   *
   * @param queries the queries to combine with an AND.
   * @return The combined queries, may be empty if no arguments/empty argument provided.
   */
  public static Query and(Query... queries) {
    if (queries.length == 1) return queries[0];
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query q : queries) {
      query.add(q, Occur.MUST);
    }
    return query.build();
  }

  /**
   * Returns a boolean query combining all the specified queries in {@link Occur#MUST} clauses
   * as if it were an OR operator.
   *
   * @param queries the queries to combine with an OR.
   * @return The combined queries, may be empty if no arguments/empty argument provided.
   */
  public static Query or(Query... queries) {
    if (queries.length == 1) return queries[0];
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query q : queries) {
      query.add(q, Occur.SHOULD);
    }
    return query.build();
  }

  /**
   * Returns a boolean query combining all the specified queries using 'OR' or 'AND' operator.
   *
   * @param withOR  whether to use 'OR' or 'AND' between queries
   * @param queries the queries to combine with an OR.
   * @return The combined queries, may be empty if no arguments/empty argument provided.
   */
  public static Query combine(boolean withOR, List queries) {
    return withOR ? or(queries.toArray(new Query[]{})) : and(queries.toArray(new Query[]{}));
  }

  /**
   * Returns the list of similar queries by substituting one term only in the query.
   *
   * @param query  The original query
   * @param reader A reader to extract the similar terms.
   *
   * @return A list of similar queries to the specified one.
   *
   * @throws IOException If thrown by the reader while extracting fuzzy terms.
   */
  @Beta
  public static List similar(Query query, Collection terms, IndexReader reader) throws IOException {
    List similar = new ArrayList<>();
    // Extract the list of similar terms
    for (Term t : terms) {
      List fuzzy = Terms.fuzzy(reader, t);
      for (String f : fuzzy) {
        Query sq = substitute(query, t, new Term(t.field(), f));
        similar.add(sq);
      }
    }
    return similar;
  }

  public static boolean isAPhrase(String text) {
    return IS_A_PHRASE.matcher(text).matches();
  }

  /**
   * Build a query from the term provided, could be wildcard query or term query.
   * @param field             the term field name
   * @param text              the term value
   * @param supportWildcards  if wildcards are supported
   * @return the query
   */
  public static Query termQuery(String field, String text, boolean supportWildcards) {
    Term t = new Term(field, text);
    return supportWildcards && hasWildcards(text) ? new WildcardQuery(t) : new TermQuery(t);
  }

  /**
   * Returns the term or phrase query corresponding to the specified text.
   *
   * 

If the text is surrounded by double quotes, this method will * return a {@link PhraseQuery} otherwise, it will return a simple {@link TermQuery}. * *

Note: Quotation marks are thrown away. * * @param field the field to construct the terms. * @param text the text to construct the query from. * @return the corresponding query. */ @Beta public static Query toTermOrPhraseQuery(String field, String text) { return toTermOrPhraseQuery(field, text, false); } /** * Returns the term or phrase query corresponding to the specified text. * *

If the text is surrounded by double quotes, this method will * return a {@link PhraseQuery} otherwise, it will return a simple {@link TermQuery}. * *

Note: Quotation marks are thrown away. * * @param field the field to construct the terms. * @param text the text to construct the query from. * @param supportWildcards if wildcards are supported. * @return the corresponding query. */ @Beta public static Query toTermOrPhraseQuery(String field, String text, boolean supportWildcards) { if (field == null) throw new NullPointerException("field"); if (text == null) throw new NullPointerException("text"); boolean isPhrase = isAPhrase(text); if (isPhrase) { PhraseQuery.Builder phrase = new PhraseQuery.Builder(); String[] terms = text.substring(1, text.length()-1).split("\\s+"); for (String t : terms) { phrase.add(new Term(field, t)); } return phrase.build(); } else { return termQuery(field, text, supportWildcards); } } /** * Returns the term or phrase query corresponding to the specified text. * *

If the text is surrounded by double quotes, this method will * return a {@link PhraseQuery} otherwise, it will return a simple {@link TermQuery}. * *

Note: Quotation marks are thrown away. * * @param field the field to construct the terms. * @param text the text to construct the query from. * @param analyzer used to analyze the text * * @return the corresponding query. */ @Beta public static List toTermOrPhraseQueries(String field, String text, Analyzer analyzer) { return toTermOrPhraseQueries(field, text, false, analyzer); } /** * Returns the term or phrase query corresponding to the specified text. * *

If the text is surrounded by double quotes, this method will * return a {@link PhraseQuery} otherwise, it will return a simple {@link TermQuery}. * *

Note: Quotation marks are thrown away. * * @param field the field to construct the terms. * @param text the text to construct the query from. * @param supportWildcards if wildcards are supported. * @param analyzer used to analyze the text * * @return the corresponding query. */ @Beta public static List toTermOrPhraseQueries(String field, String text, boolean supportWildcards, Analyzer analyzer) { if (field == null) throw new NullPointerException("field"); if (text == null) throw new NullPointerException("text"); if (analyzer == null) return Collections.singletonList(toTermOrPhraseQuery(field, text, supportWildcards)); boolean isPhrase = isAPhrase(text); if (isPhrase && isTokenized(field, analyzer)) { PhraseQuery.Builder phrase = new PhraseQuery.Builder(); addTermsToPhrase(field, text.substring(1, text.length() - 1), analyzer, phrase); return Collections.singletonList(phrase.build()); } else if (supportWildcards && hasWildcards(text)) { boolean lowercase = isLowercase(field, analyzer); if (isTokenized(field, analyzer)) { List q = new ArrayList<>(); for (String t : text.split("\\s+")) { q.add(termQuery(field, lowercase ? t.toLowerCase() : t, true)); } return q; } else { return Collections.singletonList(termQuery(field, lowercase ? text.toLowerCase() : text, true)); } } else { List q = new ArrayList<>(); for (String t : Fields.toTerms(field, text, analyzer)) { q.add(termQuery(field, t, supportWildcards)); } return q; } } /** * Returns the query corresponding to the specified text after parsing it. *

Supported operators are AND and OR, parentheses are also handled. * *

The examples below show the resulting query as a Lucene predicate from the text specified using "field" as the field name: *

   * |Big|             => field:Big
   * |Big Bang|        => field:Big field:Bang
   * |   Big   bang |  => field:Big field:Bang
   * |"Big Bang"|      => field:"Big Bang"
   * |Big AND Bang|    => +field:Big +field:Bang
   * |Big OR Bang|     => field:Big field:Bang
   * |"Big AND Bang"|  => field:"Big AND Bang"
   * |First "Big Bang"|  => field:First field:"Big bang"
   * |First "Big Bang|   => field:First field:"Big field:Bang
   * |First AND (Big Bang)|  => +field:First +(field:Big field:Bang)
   * 
* * @param field the field to construct the terms. * @param text the text to construct the query from. * @param analyzer used to analyze the text * * @return the corresponding query. */ @Beta public static Query parseToQuery(String field, String text, Analyzer analyzer) { return parseToQuery(field, text, analyzer, true); } /** * Returns the query corresponding to the specified text after parsing it. *

Supported operators are AND and OR, parentheses are also handled. * *

The examples below show the resulting query as a Lucene predicate from the text specified using "field" as the field name: *

   * |Big|             => field:Big
   * |Big Bang|        => field:Big field:Bang
   * |   Big   bang |  => field:Big field:Bang
   * |"Big Bang"|      => field:"Big Bang"
   * |Big AND Bang|    => +field:Big +field:Bang
   * |Big OR Bang|     => field:Big field:Bang
   * |"Big AND Bang"|  => field:"Big AND Bang"
   * |First "Big Bang"|  => field:First field:"Big bang"
   * |First "Big Bang|   => field:First field:"Big field:Bang
   * |First AND (Big Bang)|  => +field:First +(field:Big field:Bang)
   * 
* * @param field the field to construct the terms. * @param text the text to construct the query from. * @param analyzer used to analyze the text * @param defaultOperatorOR if the operator between terms is 'OR' or 'AND' * * @return the corresponding query. */ @Beta public static Query parseToQuery(String field, String text, Analyzer analyzer, boolean defaultOperatorOR) { return parseToQuery(field, text, analyzer, defaultOperatorOR, false); } /** * Returns the query corresponding to the specified text after parsing it. *

Supported operators are AND and OR, parentheses are also handled. * *

The examples below show the resulting query as a Lucene predicate from the text specified using "field" as the field name: *

   * |Big|             => field:Big
   * |Big Bang|        => field:Big field:Bang
   * |   Big   bang |  => field:Big field:Bang
   * |"Big Bang"|      => field:"Big Bang"
   * |Big AND Bang|    => +field:Big +field:Bang
   * |Big OR Bang|     => field:Big field:Bang
   * |"Big AND Bang"|  => field:"Big AND Bang"
   * |First "Big Bang"|  => field:First field:"Big bang"
   * |First "Big Bang|   => field:First field:"Big field:Bang
   * |First AND (Big Bang)|  => +field:First +(field:Big field:Bang)
   * 
* * @param field the field to construct the terms. * @param text the text to construct the query from. * @param analyzer used to analyze the text * @param defaultOperatorOR if the operator between terms is 'OR' or 'AND' * * @return the corresponding query. */ @Beta public static Query parseToQuery(String field, String text, Analyzer analyzer, boolean defaultOperatorOR, boolean supportWildcards) { if (field == null) throw new NullPointerException("field"); if (text == null) throw new NullPointerException("text"); // shortcut for single word or single sentence if (!text.trim().matches(".*?\\s.*?") || isAPhrase(text) || (analyzer != null && !isTokenized(field, analyzer) && hasNoOperators(text))) { if (analyzer == null) return toTermOrPhraseQuery(field, text, supportWildcards); return combine(defaultOperatorOR, toTermOrPhraseQueries(field, text, supportWildcards, analyzer)); } // get last query Query query = null; boolean lastIsAND = !defaultOperatorOR; // parse text Pattern p = Pattern.compile("(\\([^\\(]+\\))|(\\S+)"); Matcher m = p.matcher(text); while (m.find()) { // compute query for this item Query thisQuery = null; String g = m.group().trim(); if (g.charAt(0) == '(' && g.charAt(g.length()-1) == ')') { // parentheses? thisQuery = parseToQuery(field, g.substring(1, g.length()-1), analyzer, true, supportWildcards); } else if ("AND".equals(g)) { // AND? lastIsAND = true; } else if ("OR".equals(g)) { // OR? 
lastIsAND = false; } else if (analyzer == null) { // phrase or normal word then thisQuery = toTermOrPhraseQuery(field, g, supportWildcards); } else { // phrase or normal word then List combined = toTermOrPhraseQueries(field, g, supportWildcards, analyzer); // check if no resulting queries (word is a stop word for example) if (!combined.isEmpty()) thisQuery = combine(defaultOperatorOR, combined); } if (thisQuery != null) { if (query == null) { query = thisQuery; } else if (lastIsAND) { query = and(query, thisQuery); } else { query = or(query, thisQuery); } lastIsAND = !defaultOperatorOR; } } return query; } /** * @param text the text * @return true if the text has no " OR " and no " AND " (in or out of quotes) */ public static boolean hasNoOperators(String text) { return text != null && !text.contains(" OR ") && !text.contains(" AND "); } /** * Returns the terms for a field * * @param field The field * @param text The text to analyze * @param analyzer The analyzer */ private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery.Builder phrase) { try { TokenStream stream = analyzer.tokenStream(field, text); PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); int position = -1; stream.reset(); while (stream.incrementToken()) { position += increment.getPositionIncrement(); Term term = new Term(field, attribute.toString()); phrase.add(term, position); } stream.end(); stream.close(); } catch (IOException ex) { // Should not occur since we use a StringReader } } private static boolean isTokenized(String field, Analyzer analyzer) { // try to load terms for a phrase and return true if more than one term TokenStream stream = null; try { stream = analyzer.tokenStream(field, "word1 word2"); stream.reset(); if (stream.incrementToken()) { return stream.incrementToken(); } } catch (IOException ex) { // Should not occur since we use 
a StringReader } finally { if (stream != null) try { stream.end(); stream.close(); } catch (IOException ex) { // Should not occur since we use a StringReader } } return false; } private static boolean isLowercase(String field, Analyzer analyzer) { // try to load terms for a phrase and return true if it is set to lower case TokenStream stream = null; try { stream = analyzer.tokenStream(field, "WORD"); CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); stream.reset(); if (stream.incrementToken()) { return "word".equals(attribute.toString()); } } catch (IOException ex) { // Should not occur since we use a StringReader } finally { if (stream != null) try { stream.end(); stream.close(); } catch (IOException ex) { // Should not occur since we use a StringReader } } return false; } private static boolean hasWildcards(String text) { return text != null && (text.indexOf('?') != -1 || text.indexOf('*') != -1); } // Substitutions // ============================================================================================== /** * Substitutes one term in the query for another. * *

This method only creates new query object if required; it does not modify the given query. * *

This method simply delegates to the appropriate substitute method based * on the query class. Only query types for which there is an applicable substitute * method can be substituted. * * @param query the query where the substitution should occur. * @param original the original term to replace. * @param replacement the term it should be replaced with. * * @return A new query where the term has been substituted; * or the same query if no substitution was required or possible. */ @Beta public static Query substitute(Query query, Term original, Term replacement) { if (query instanceof TermQuery) return substitute((TermQuery)query, original, replacement); else if (query instanceof PhraseQuery) return substitute((PhraseQuery)query, original, replacement); else if (query instanceof BooleanQuery) return substitute((BooleanQuery)query, original, replacement); else if (query instanceof BoostQuery) return substitute((BoostQuery)query, original, replacement); else return query; } /** * Substitutes one term in the term query for another. * *

This method only creates new query object if required; it does not modify the given query. * * @param query the query where the substitution should occur. * @param original the original term to replace. * @param replacement the term it should be replaced with. * * @return A new term query where the term has been substituted; * or the same query if no substitution was needed. */ @Beta public static Query substitute(BooleanQuery query, Term original, Term replacement) { BooleanQuery.Builder q = new BooleanQuery.Builder(); for (BooleanClause clause : query.clauses()) { Query qx = substitute(clause.getQuery(), original, replacement); q.add(qx, clause.getOccur()); } return q.build(); } /** * Substitutes one term in the term query for another. * *

This method only creates new query object if required; it does not modify the given query. * * @param query the query where the substitution should occur. * @param original the original term to replace. * @param replacement the term it should be replaced with. * * @return A new term query where the term has been substituted; * or the same query if no substitution was needed. */ @Beta public static Query substitute(BoostQuery query, Term original, Term replacement) { return new BoostQuery(substitute(query.getQuery(), original, replacement), query.getBoost()); } /** * Substitutes one term in the term query for another. * *

This method only creates new query object if required; it does not modify the given query. * * @param query the query where the substitution should occur. * @param original the original term to replace. * @param replacement the term it should be replaced with. * * @return A new term query where the term has been substituted; * or the same query if no substitution was needed. */ @Beta public static TermQuery substitute(TermQuery query, Term original, Term replacement) { Term t = query.getTerm(); if (t.equals(original)) return new TermQuery(replacement); else return query; } /** * Substitutes one term in the phrase query for another. * *

In a phrase query the replacement term must be on the same field as the original term. * *

This method only creates new query object if required; it does not modify the given query. * * @param query the query where the substitution should occur. * @param original the original term to replace. * @param replacement the term it should be replaced with. * * @return A new term query where the term has been substituted; * or the same query if no substitution was needed. * * @throws IllegalArgumentException if the replacement term is not on the same field as the original term. */ @Beta public static PhraseQuery substitute(PhraseQuery query, Term original, Term replacement) throws IllegalArgumentException { boolean doSubstitute = false; // Check if we need to substitute for (Term t : query.getTerms()) { if (t.equals(original)) { doSubstitute = true; break; } } // Substitute if required if (doSubstitute) { PhraseQuery.Builder q = new PhraseQuery.Builder(); for (Term t : query.getTerms()) { q.add(t.equals(original)? replacement : t); } q.setSlop(query.getSlop()); return q.build(); // No substitution return the query } else return query; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy