All downloads are free. Search and download functionality uses the official Maven repository.

uk.co.flax.luwak.presearcher.WildcardNGramPresearcherComponent Maven / Gradle / Ivy

package uk.co.flax.luwak.presearcher;

import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.BytesRef;
import uk.co.flax.luwak.analysis.SuffixingNGramTokenFilter;
import uk.co.flax.luwak.termextractor.QueryTerm;
import uk.co.flax.luwak.termextractor.treebuilder.RegexpNGramTermQueryTreeBuilder;

/*
 * Copyright (c) 2013 Lemur Consulting Ltd.
 * 

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *

* http://www.apache.org/licenses/LICENSE-2.0 *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A Presearcher implementation that matches Wildcard queries by indexing regex * terms by their longest static substring, and generates ngrams from InputDocument * tokens to match them. * * This implementation will filter out more wildcard queries than TermFilteredPresearcher, * at the expense of longer document build times. Which one is more performant will depend * on the type and number of queries registered in the Monitor, and the size of documents * to be monitored. Profiling is recommended. */ public class WildcardNGramPresearcherComponent extends PresearcherComponent { /** The default suffix with which to mark ngrams */ public static final String DEFAULT_NGRAM_SUFFIX = "XX"; /** The default maximum length of an input token before ANYTOKENS are generated */ public static final int DEFAULT_MAX_TOKEN_SIZE = 30; /** The default token to emit if a term is longer than MAX_TOKEN_SIZE */ public static final String DEFAULT_WILDCARD_TOKEN = "__WILDCARD__"; private final String ngramSuffix; private final String wildcardToken; private final int maxTokenSize; private final Set excludedFields; /** * Create a new WildcardNGramPresearcherComponent * @param ngramSuffix the suffix with which to mark ngrams * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated * @param wildcardToken the token to emit if a token is longer than maxTokenSize in length * @param excludedFields a Set of fields to ignore when generating ngrams */ public WildcardNGramPresearcherComponent(String ngramSuffix, int maxTokenSize, String wildcardToken, Set excludedFields) { super(new RegexpNGramTermQueryTreeBuilder(ngramSuffix, wildcardToken)); 
this.ngramSuffix = ngramSuffix; this.maxTokenSize = maxTokenSize; this.wildcardToken = wildcardToken; this.excludedFields = excludedFields == null ? new HashSet<>() : excludedFields; } /** * Create a new WildcardNGramPresearcherComponent using default settings */ public WildcardNGramPresearcherComponent() { this(DEFAULT_NGRAM_SUFFIX, DEFAULT_MAX_TOKEN_SIZE, DEFAULT_WILDCARD_TOKEN, null); } /** * Create a new WildcardNGramPresearcherComponent with a maximum token size * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated */ public WildcardNGramPresearcherComponent(int maxTokenSize) { this(DEFAULT_NGRAM_SUFFIX, maxTokenSize, DEFAULT_WILDCARD_TOKEN, null); } @Override public TokenStream filterDocumentTokens(String field, TokenStream ts) { if (excludedFields.contains(field)) return ts; return new SuffixingNGramTokenFilter(ts, ngramSuffix, wildcardToken, maxTokenSize); } @Override public BytesRef extraToken(QueryTerm term) { if (term.type == QueryTerm.Type.CUSTOM && wildcardToken.equals(term.payload)) return new BytesRef(wildcardToken); return null; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy