// org.apache.lucene.monitor.RegexpQueryHandler (from the lucene-monitor artifact)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.monitor;
import java.util.Collections;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.util.BytesRef;
/**
 * A query handler implementation that matches Regexp queries by indexing regex terms by their
 * longest static substring, and generates ngrams from Document tokens to match them.
 *
 * <p>This implementation will filter out more wildcard queries than TermFilteredPresearcher, at the
 * expense of longer document build times. Which one is more performant will depend on the type and
 * number of queries registered in the Monitor, and the size of documents to be monitored. Profiling
 * is recommended.
 */
public class RegexpQueryHandler implements CustomQueryHandler {

  /** The default suffix with which to mark ngrams */
  public static final String DEFAULT_NGRAM_SUFFIX = "XX";

  /** The default maximum length of an input token before wildcard tokens are generated */
  public static final int DEFAULT_MAX_TOKEN_SIZE = 30;

  /** The default token to emit if a term is longer than MAX_TOKEN_SIZE */
  public static final String DEFAULT_WILDCARD_TOKEN = "__WILDCARD__";

  /**
   * Delimiters used to cut a regexp into candidate static substrings. The quantified forms come
   * first so that a character made optional by {@code ?} or {@code *} is removed together with its
   * quantifier instead of being left inside a "static" piece; a bare {@code .}, {@code *} or
   * {@code ?} is also treated as a delimiter.
   */
  private static final Pattern NON_LITERAL = Pattern.compile(".\\?|.\\*|\\.|\\*|\\?");

  private final String ngramSuffix;
  private final String wildcardToken;
  private final BytesRef wildcardTokenBytes;
  private final int maxTokenSize;
  private final Set<String> excludedFields;

  /**
   * Creates a new RegexpQueryHandler
   *
   * @param ngramSuffix the suffix with which to mark ngrams
   * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
   * @param wildcardToken the token to emit if a token is longer than maxTokenSize in length
   * @param excludedFields a Set of fields to ignore when generating ngrams; {@code null} is
   *     treated as an empty set
   */
  public RegexpQueryHandler(
      String ngramSuffix, int maxTokenSize, String wildcardToken, Set<String> excludedFields) {
    this.ngramSuffix = ngramSuffix;
    this.maxTokenSize = maxTokenSize;
    this.wildcardTokenBytes = new BytesRef(wildcardToken);
    this.wildcardToken = wildcardToken;
    this.excludedFields = excludedFields == null ? Collections.emptySet() : excludedFields;
  }

  /** Creates a new RegexpQueryHandler using default settings */
  public RegexpQueryHandler() {
    this(DEFAULT_NGRAM_SUFFIX, DEFAULT_MAX_TOKEN_SIZE, DEFAULT_WILDCARD_TOKEN, null);
  }

  /**
   * Creates a new RegexpQueryHandler with a maximum token size
   *
   * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
   */
  public RegexpQueryHandler(int maxTokenSize) {
    this(DEFAULT_NGRAM_SUFFIX, maxTokenSize, DEFAULT_WILDCARD_TOKEN, null);
  }

  /**
   * Wraps the token stream of a field in a {@link SuffixingNGramTokenFilter}, unless the field is
   * one of the excluded fields, in which case the stream is returned unchanged.
   */
  @Override
  public TokenStream wrapTermStream(String field, TokenStream ts) {
    if (excludedFields.contains(field)) {
      return ts;
    }
    return new SuffixingNGramTokenFilter(ts, ngramSuffix, wildcardToken, maxTokenSize);
  }

  /**
   * Builds a QueryTree for a {@link RegexpQuery} from the longest static substring of its regexp,
   * marked with the ngram suffix.
   *
   * @return the QueryTree, or {@code null} if the query is not a RegexpQuery
   */
  @Override
  public QueryTree handleQuery(Query q, TermWeightor termWeightor) {
    if (!(q instanceof RegexpQuery)) {
      return null;
    }
    RegexpQuery query = (RegexpQuery) q;
    String field = query.getField();
    String regexp = parseOutRegexp(query.toString(""), field);
    String selected = selectLongestSubstring(regexp);
    Term term = new Term(field, selected + ngramSuffix);
    double weight = termWeightor.applyAsDouble(term);
    return new QueryTree() {
      @Override
      public double weight() {
        return weight;
      }

      @Override
      public void collectTerms(BiConsumer<String, BytesRef> termCollector) {
        termCollector.accept(term.field(), term.bytes());
        // The wildcard token is registered as well, so the query is also selected through the
        // token that replaces over-long input tokens.
        termCollector.accept(term.field(), wildcardTokenBytes);
      }

      @Override
      public boolean advancePhase(double minWeight) {
        return false;
      }

      @Override
      public String toString(int depth) {
        return space(depth) + "WILDCARD_NGRAM[" + term + "]^" + weight;
      }
    };
  }

  /**
   * Extracts the regexp text from the string form of a RegexpQuery, i.e. the text between the
   * slash that opens the regexp and the last slash.
   *
   * <p>The field name is used to skip an exact {@code field:} prefix, so that a ':' or '/' inside
   * the field name or the regexp itself cannot be mistaken for the separator.
   *
   * @param rep the query as printed by {@code toString("")}
   * @param field the field the query runs against
   */
  private static String parseOutRegexp(String rep, String field) {
    int firstSlash;
    if (field == null || field.isEmpty()) {
      // Nothing that needs skipping can precede the opening slash, so any ':' in rep is part of
      // the regexp and must not be used as a separator.
      firstSlash = rep.indexOf('/');
    } else if (rep.startsWith(field + ":/")) {
      firstSlash = field.length() + 1;
    } else {
      // Unexpected layout: fall back to searching from the first ':'.
      firstSlash = rep.indexOf('/', rep.indexOf(':'));
    }
    int lastSlash = rep.lastIndexOf('/');
    return rep.substring(firstSlash + 1, lastSlash);
  }

  /**
   * Returns the longest piece of the regexp that lies between wildcard constructs; the first such
   * piece wins on ties, and the empty string is returned if there is none.
   */
  private static String selectLongestSubstring(String regexp) {
    String selected = "";
    for (String substr : NON_LITERAL.split(regexp)) {
      if (substr.length() > selected.length()) {
        selected = substr;
      }
    }
    return selected;
  }
}