org.apache.lucene.monitor.RegexpQueryHandler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-monitor Show documentation
Apache Lucene (module: monitor)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.monitor;

import java.util.Collections;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.util.BytesRef;

/**
 * A query handler implementation that matches Regexp queries by indexing regex terms by their
 * longest static substring, and generates ngrams from Document tokens to match them.
 *
 * <p>This implementation will filter out more wildcard queries than TermFilteredPresearcher, at
 * the expense of longer document build times. Which one is more performant will depend on the type
 * and number of queries registered in the Monitor, and the size of documents to be monitored.
 * Profiling is recommended.
 */
public class RegexpQueryHandler implements CustomQueryHandler {

  /** The default suffix with which to mark ngrams */
  public static final String DEFAULT_NGRAM_SUFFIX = "XX";

  /** The default maximum length of an input token before wildcard tokens are generated */
  public static final int DEFAULT_MAX_TOKEN_SIZE = 30;

  /** The default token to emit if a term is longer than MAX_TOKEN_SIZE */
  public static final String DEFAULT_WILDCARD_TOKEN = "__WILDCARD__";

  /**
   * Delimiters used to cut a regular expression into candidate literal runs: a {@code .}, a
   * {@code *}, or any single character followed by {@code ?}. Compiled once because {@link
   * #handleQuery} runs for every registered regexp query.
   *
   * <p>NOTE(review): other regexp operators (for example alternation, {@code +}, character
   * classes and groups) are not delimiters here, so they stay inside the selected substring.
   * Whether that can make the presearcher skip a matching query should be confirmed separately.
   */
  private static final Pattern REGEXP_OPERATORS = Pattern.compile("\\.|\\*|.\\?");

  private final String ngramSuffix;

  private final String wildcardToken;
  private final BytesRef wildcardTokenBytes;

  private final int maxTokenSize;

  private final Set<String> excludedFields;

  /**
   * Creates a new RegexpQueryHandler
   *
   * @param ngramSuffix the suffix with which to mark ngrams
   * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
   * @param wildcardToken the token to emit if a token is longer than maxTokenSize in length
   * @param excludedFields a Set of fields to ignore when generating ngrams; {@code null} is
   *     treated as an empty set
   */
  public RegexpQueryHandler(
      String ngramSuffix, int maxTokenSize, String wildcardToken, Set<String> excludedFields) {
    this.ngramSuffix = ngramSuffix;
    this.maxTokenSize = maxTokenSize;
    this.wildcardTokenBytes = new BytesRef(wildcardToken);
    this.wildcardToken = wildcardToken;
    this.excludedFields = excludedFields == null ? Collections.emptySet() : excludedFields;
  }

  /** Creates a new RegexpQueryHandler using default settings */
  public RegexpQueryHandler() {
    this(DEFAULT_NGRAM_SUFFIX, DEFAULT_MAX_TOKEN_SIZE, DEFAULT_WILDCARD_TOKEN, null);
  }

  /**
   * Creates a new RegexpQueryHandler with a maximum token size
   *
   * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
   */
  public RegexpQueryHandler(int maxTokenSize) {
    this(DEFAULT_NGRAM_SUFFIX, maxTokenSize, DEFAULT_WILDCARD_TOKEN, null);
  }

  /**
   * Wraps a document field's token stream so that suffixed ngrams are generated for it.
   *
   * @param field the name of the field the stream belongs to
   * @param ts the token stream to wrap
   * @return {@code ts} itself if {@code field} is one of the excluded fields, otherwise a {@link
   *     SuffixingNGramTokenFilter} over {@code ts} configured with this handler's ngram suffix,
   *     wildcard token and maximum token size
   */
  @Override
  public TokenStream wrapTermStream(String field, TokenStream ts) {
    if (excludedFields.contains(field)) {
      return ts;
    }
    return new SuffixingNGramTokenFilter(ts, ngramSuffix, wildcardToken, maxTokenSize);
  }

  /**
   * Builds a {@link QueryTree} for a {@link RegexpQuery}, keyed on the longest literal run of its
   * regular expression with the ngram suffix appended.
   *
   * @param q the query to handle
   * @param termWeightor used to weight the selected ngram term
   * @return a leaf QueryTree for the query, or {@code null} if {@code q} is not a RegexpQuery
   */
  @Override
  public QueryTree handleQuery(Query q, TermWeightor termWeightor) {
    if (q instanceof RegexpQuery == false) {
      return null;
    }
    RegexpQuery query = (RegexpQuery) q;
    // Read the expression straight from the query rather than parsing it back out of
    // toString(""): the printed "field:/regexp/" form is ambiguous when the field name is empty
    // and the expression itself contains ':' (parsing it could throw or truncate the expression).
    String regexp = query.getRegexp().text();
    String selected = selectLongestSubstring(regexp);
    Term term = new Term(query.getField(), selected + ngramSuffix);
    double weight = termWeightor.applyAsDouble(term);
    return new QueryTree() {
      @Override
      public double weight() {
        return weight;
      }

      @Override
      public void collectTerms(BiConsumer<String, BytesRef> termCollector) {
        // Report both the suffixed ngram term and the wildcard token for the query's field.
        termCollector.accept(term.field(), term.bytes());
        termCollector.accept(term.field(), wildcardTokenBytes);
      }

      @Override
      public boolean advancePhase(double minWeight) {
        // This tree always reports that it did not advance.
        return false;
      }

      @Override
      public String toString(int depth) {
        return space(depth) + "WILDCARD_NGRAM[" + term.toString() + "]^" + weight;
      }
    };
  }

  /**
   * Returns the longest piece of {@code regexp} left after splitting it on {@link
   * #REGEXP_OPERATORS}. The first piece wins a tie in length, and the empty string is returned
   * when no piece is non-empty.
   */
  private static String selectLongestSubstring(String regexp) {
    String selected = "";
    for (String substr : REGEXP_OPERATORS.split(regexp)) {
      if (substr.length() > selected.length()) {
        selected = substr;
      }
    }
    return selected;
  }
}