// org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import java.io.Closeable;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldCollectorManager;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
// TODO:
// - a PostingsFormat that stores super-high-freq terms as
// a bitset should be a win for the prefix terms?
// (LUCENE-5052)
// - we could offer a better integration with
// DocumentDictionary and NRT? so that your suggester
// "automatically" keeps in sync w/ your index
/**
 * Analyzes the input text and then suggests matches based on prefix matches to any tokens in the
 * indexed text. This also highlights the tokens that match.
 *
 * <p>This suggester supports payloads. Matches are sorted only by the suggest weight; it would be
 * nice to support blended score + weight sort in the future. This means this suggester best applies
 * when there is a strong a-priori ranking of all the suggestions.
 *
 * <p>This suggester supports contexts, including arbitrary binary terms.
 *
 * @lucene.experimental
 */
public class AnalyzingInfixSuggester extends Lookup implements Closeable {
/**
 * Field name used for the edge n-grams of the indexed text. It lets short prefixes be matched
 * with a plain term lookup instead of a PrefixQuery, and is only populated when {@linkplain
 * #minPrefixChars} is greater than zero.
 */
protected static final String TEXTGRAMS_FIELD_NAME = "textgrams";
/** Field name used for the indexed (analyzed) text and for its binary doc values. */
protected static final String TEXT_FIELD_NAME = "text";
/** Field name used for the indexed text, as a StringField, for exact lookup. */
protected static final String EXACT_TEXT_FIELD_NAME = "exacttext";
/**
 * Field name used for the indexed context, as a StringField and a SortedSetDVField, for
 * filtering.
 */
protected static final String CONTEXTS_FIELD_NAME = "contexts";
/** Analyzer used at search time */
protected final Analyzer queryAnalyzer;
/** Analyzer used at index time */
protected final Analyzer indexAnalyzer;
// Directory holding the suggester's private index; closed by close().
private final Directory dir;
// Tokens shorter than this many characters are looked up in the n-gram field.
final int minPrefixChars;
// Default for "all query terms must match" used by the four-argument lookup.
private final boolean allTermsRequired;
// Default for "highlight matches" used by the four-argument lookup.
private final boolean highlight;
// Whether build() commits once indexing has finished.
private final boolean commitOnBuild;
// Whether build() closes (and nulls out) the writer once indexing has finished.
private final boolean closeIndexWriterOnBuild;
/**
 * Used for ongoing NRT additions/updates. May be null depending on the {@code
 * closeIndexWriterOnBuild} constructor arg.
 */
protected IndexWriter writer;
/** Used to manage concurrent access to writer */
protected final Object writerLock = new Object();
/**
 * {@link IndexSearcher} used for lookups. May be null if {@link Directory} did not exist on
 * instantiation and neither {@link #build}, {@link #add}, or {@link #update} have been called
 */
protected SearcherManager searcherMgr;
/** Used to manage concurrent access to searcherMgr */
protected final ReadWriteLock searcherMgrLock = new ReentrantReadWriteLock();
// Read side: taken while acquiring a searcher from the current manager.
private final Lock searcherMgrReadLock = searcherMgrLock.readLock();
// Write side: taken while swapping in a new manager and closing the old one.
private final Lock searcherMgrWriteLock = searcherMgrLock.writeLock();
/** Default minimum number of leading characters before PrefixQuery is used (4). */
public static final int DEFAULT_MIN_PREFIX_CHARS = 4;
/** Default boolean clause option for multiple terms matching (all terms required). */
public static final boolean DEFAULT_ALL_TERMS_REQUIRED = true;
/** Default highlighting option. */
public static final boolean DEFAULT_HIGHLIGHT = true;
/** Default option to close the IndexWriter once the index has been built. */
protected static final boolean DEFAULT_CLOSE_INDEXWRITER_ON_BUILD = true;
/** How we sort the postings and search results: by the "weight" field, in reverse order. */
private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));
/**
 * Creates a suggester over {@code dir}, opening a previously built AnalyzingInfixSuggester index
 * if one already exists there. The directory must be private to the infix suggester (i.e., not
 * an external Lucene index); {@link #close} closes it as well.
 *
 * <p>The single analyzer is used both at index and at query time. {@link
 * #DEFAULT_MIN_PREFIX_CHARS}, {@link #DEFAULT_ALL_TERMS_REQUIRED} and {@link #DEFAULT_HIGHLIGHT}
 * are applied, and {@code false} is passed for {@code commitOnBuild}.
 *
 * @param dir directory holding (or about to hold) the suggester index
 * @param analyzer analyzer used for both indexing and querying
 */
public AnalyzingInfixSuggester(Directory dir, Analyzer analyzer) throws IOException {
  this(
      dir, analyzer, analyzer,
      DEFAULT_MIN_PREFIX_CHARS,
      false,
      DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT);
}
/**
 * Creates a suggester over {@code dir}, opening a previously built AnalyzingInfixSuggester index
 * if one already exists there. The directory must be private to the infix suggester (i.e., not
 * an external Lucene index); {@link #close} closes it as well.
 *
 * <p>{@link #DEFAULT_ALL_TERMS_REQUIRED} and {@link #DEFAULT_HIGHLIGHT} are applied.
 *
 * @param dir directory holding (or about to hold) the suggester index
 * @param indexAnalyzer analyzer used at index time
 * @param queryAnalyzer analyzer used at search time
 * @param minPrefixChars Minimum number of leading characters before PrefixQuery is used (default
 *     4). Prefixes shorter than this are indexed as character ngrams (increasing index size but
 *     making lookups faster).
 * @param commitOnBuild Call commit after the index has finished building. This would persist the
 *     suggester index to disk and future instances of this suggester can use this pre-built
 *     dictionary.
 */
public AnalyzingInfixSuggester(
    Directory dir,
    Analyzer indexAnalyzer,
    Analyzer queryAnalyzer,
    int minPrefixChars,
    boolean commitOnBuild)
    throws IOException {
  this(
      dir, indexAnalyzer, queryAnalyzer,
      minPrefixChars, commitOnBuild,
      DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT);
}
/**
 * Creates a suggester over {@code dir}, opening a previously built AnalyzingInfixSuggester index
 * if one already exists there. The directory must be private to the infix suggester (i.e., not
 * an external Lucene index); {@link #close} closes it as well.
 *
 * <p>{@link #DEFAULT_CLOSE_INDEXWRITER_ON_BUILD} is applied.
 *
 * @param dir directory holding (or about to hold) the suggester index
 * @param indexAnalyzer analyzer used at index time
 * @param queryAnalyzer analyzer used at search time
 * @param minPrefixChars Minimum number of leading characters before PrefixQuery is used (default
 *     4). Prefixes shorter than this are indexed as character ngrams (increasing index size but
 *     making lookups faster).
 * @param commitOnBuild Call commit after the index has finished building. This would persist the
 *     suggester index to disk and future instances of this suggester can use this pre-built
 *     dictionary.
 * @param allTermsRequired All terms in the suggest query must be matched.
 * @param highlight Highlight suggest query in suggestions.
 */
public AnalyzingInfixSuggester(
    Directory dir,
    Analyzer indexAnalyzer,
    Analyzer queryAnalyzer,
    int minPrefixChars,
    boolean commitOnBuild,
    boolean allTermsRequired,
    boolean highlight)
    throws IOException {
  this(
      dir, indexAnalyzer, queryAnalyzer,
      minPrefixChars, commitOnBuild,
      allTermsRequired, highlight,
      DEFAULT_CLOSE_INDEXWRITER_ON_BUILD);
}
/**
* Create a new instance, loading from a previously built AnalyzingInfixSuggester directory, if it
* exists. This directory must be private to the infix suggester (i.e., not an external Lucene
* index). Note that {@link #close} will also close the provided directory.
*
* @param minPrefixChars Minimum number of leading characters before PrefixQuery is used (default
* 4). Prefixes shorter than this are indexed as character ngrams (increasing index size but
* making lookups faster).
* @param commitOnBuild Call commit after the index has finished building. This would persist the
* suggester index to disk and future instances of this suggester can use this pre-built
* dictionary.
* @param allTermsRequired All terms in the suggest query must be matched.
* @param highlight Highlight suggest query in suggestions.
* @param closeIndexWriterOnBuild If true, the IndexWriter will be closed after the index has
* finished building.
*/
public AnalyzingInfixSuggester(
Directory dir,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer,
int minPrefixChars,
boolean commitOnBuild,
boolean allTermsRequired,
boolean highlight,
boolean closeIndexWriterOnBuild)
throws IOException {
if (minPrefixChars < 0) {
throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
}
this.queryAnalyzer = queryAnalyzer;
this.indexAnalyzer = indexAnalyzer;
this.dir = dir;
this.minPrefixChars = minPrefixChars;
this.commitOnBuild = commitOnBuild;
this.allTermsRequired = allTermsRequired;
this.highlight = highlight;
this.closeIndexWriterOnBuild = closeIndexWriterOnBuild;
if (DirectoryReader.indexExists(dir)) {
// Already built; open it:
searcherMgr = new SearcherManager(dir, null);
}
}
/**
 * Installs {@code newSearcherMgr} as the current searcher manager and closes the one it
 * replaces, if any. Both steps happen while holding the searcher-manager write lock.
 */
private void setAndCloseOldSearcherManager(final SearcherManager newSearcherMgr)
    throws IOException {
  searcherMgrWriteLock.lock();
  try {
    final SearcherManager previous = searcherMgr;
    searcherMgr = newSearcherMgr;
    if (previous == null) {
      return;
    }
    previous.close();
  } finally {
    searcherMgrWriteLock.unlock();
  }
}
/**
 * Builds the {@link IndexWriterConfig} used for every writer this suggester opens. Override this
 * to customize index settings, e.g. which codec to use.
 *
 * @param indexAnalyzer analyzer handed to the writer configuration
 * @param openMode how the writer should open the directory
 * @return configuration with the open mode and the weight index sort applied
 */
protected IndexWriterConfig getIndexWriterConfig(
    Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
  // Index sort: all merged segments will be sorted at merge time, which allows for
  // per-segment early termination when those segments are searched.
  return new IndexWriterConfig(indexAnalyzer).setOpenMode(openMode).setIndexSort(SORT);
}
/**
 * Opens the {@link Directory} for the given path via {@link FSDirectory#open}. Subclass can
 * override to choose a specific {@link Directory} implementation.
 *
 * @param path file-system location of the index
 * @return the opened directory
 */
protected Directory getDirectory(Path path) throws IOException {
  final Directory opened = FSDirectory.open(path);
  return opened;
}
/**
 * Rebuilds the suggester index from the entries of {@code iter}: any open writer is closed, a
 * new writer is opened in CREATE mode, every entry is added, and a new searcher manager is
 * installed. On failure the writer is rolled back.
 */
@Override
public void build(InputIterator iter) throws IOException {
  synchronized (writerLock) {
    // Get rid of a writer left open by an earlier build/add/update.
    if (writer != null) {
      writer.close();
      writer = null;
    }

    boolean built = false;
    try {
      // First pass: build a temporary normal Lucene index,
      // just indexing the suggestions as they iterate:
      final IndexWriterConfig config =
          getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE);
      writer = new IndexWriter(dir, config);

      // TODO: use threads?
      for (BytesRef text = iter.next(); text != null; text = iter.next()) {
        final BytesRef payload = iter.hasPayloads() ? iter.payload() : null;
        add(text, iter.contexts(), iter.weight(), payload);
      }

      // Closing the writer later on requires the data to be committed first.
      if (commitOnBuild || closeIndexWriterOnBuild) {
        commit();
      }
      setAndCloseOldSearcherManager(new SearcherManager(writer, null));
      built = true;
    } finally {
      if (!built) {
        // failure: discard whatever was written so far
        if (writer != null) {
          writer.rollback();
          writer = null;
        }
      } else if (closeIndexWriterOnBuild) {
        writer.close();
        writer = null;
      }
    }
  }
}
/**
 * Commits all pending changes made to this suggester to disk.
 *
 * <p>When no writer is open this is a no-op, provided the index has been built and the writer
 * was closed on build; otherwise an {@link IllegalStateException} is thrown.
 *
 * @see IndexWriter#commit
 */
public void commit() throws IOException {
  if (writer != null) {
    writer.commit();
    return;
  }

  // No writer: acceptable only if it was committed and closed after the index was built, in
  // which case there is nothing left to commit.
  final boolean builtAndClosed = searcherMgr != null && closeIndexWriterOnBuild;
  if (!builtAndClosed) {
    throw new IllegalStateException("Cannot commit on an closed writer. Add documents first");
  }
}
/**
 * Returns an analyzer that delegates to {@link #indexAnalyzer} for every field and, for the
 * {@link #TEXTGRAMS_FIELD_NAME} field only, appends an edge n-gram filter producing grams of 1 to
 * {@code minPrefixChars} characters.
 */
private Analyzer getGramAnalyzer() {
  return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
      return indexAnalyzer;
    }

    @Override
    protected TokenStreamComponents wrapComponents(
        String fieldName, TokenStreamComponents components) {
      final boolean isGramField = fieldName.equals(TEXTGRAMS_FIELD_NAME);
      assert !(isGramField && minPrefixChars == 0)
          : "no need \"textgrams\" when minPrefixChars=" + minPrefixChars;

      // Every other field keeps the plain index-time analysis chain.
      if (!isGramField || minPrefixChars <= 0) {
        return components;
      }

      // TODO: should use an EdgeNGramTokenFilterFactory here
      final TokenFilter grams =
          new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
      return new TokenStreamComponents(components.getSource(), grams);
    }
  };
}
/**
 * Makes sure a writer is open. When none is, one is opened on the directory (appending to an
 * existing index, creating one otherwise) and a searcher manager on top of it replaces the
 * current one.
 */
private void ensureOpen() throws IOException {
  synchronized (writerLock) {
    if (writer != null) {
      return;
    }

    // Already built? Then append to it; otherwise start a fresh index.
    final IndexWriterConfig.OpenMode mode =
        DirectoryReader.indexExists(dir)
            ? IndexWriterConfig.OpenMode.APPEND
            : IndexWriterConfig.OpenMode.CREATE;
    writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), mode));
    setAndCloseOldSearcherManager(new SearcherManager(writer, null));
  }
}
/**
 * Adds a new suggestion. Be sure to use {@link #update} instead if you want to replace a previous
 * suggestion. After adding or updating a batch of new suggestions, you must call {@link #refresh}
 * in the end in order to see the suggestions in {@link #lookup}
 *
 * @param text the suggestion text
 * @param contexts contexts to associate with the suggestion; may be null
 * @param weight the suggestion weight
 * @param payload payload to store with the suggestion; may be null
 */
public void add(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload)
    throws IOException {
  // Opens the writer on first use.
  ensureOpen();
  writer.addDocument(buildDocument(text, contexts, weight, payload));
}
/**
 * Updates a previous suggestion, matching the exact same text as before. Use this to change the
 * weight or payload of an already added suggestion. If you know this text is not already present
 * you can use {@link #add} instead. After adding or updating a batch of new suggestions, you must
 * call {@link #refresh} in the end in order to see the suggestions in {@link #lookup}
 *
 * @param text the suggestion text; also the key used to find the document to replace
 * @param contexts contexts to associate with the suggestion; may be null
 * @param weight the suggestion weight
 * @param payload payload to store with the suggestion; may be null
 */
public void update(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload)
    throws IOException {
  // Opens the writer on first use.
  ensureOpen();
  writer.updateDocument(
      new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()),
      buildDocument(text, contexts, weight, payload));
}
/**
 * Builds the index document for one suggestion: the analyzed text (plus its n-gram variant when
 * {@code minPrefixChars > 0}), the exact text for updates, doc values for text, weight and
 * optional payload, and one indexed term plus one sorted-set doc value per context.
 *
 * @param text the suggestion text
 * @param contexts contexts to index; skipped when null
 * @param weight the suggestion weight
 * @param payload payload to store; skipped when null
 */
private Document buildDocument(
    BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
  String textString = text.utf8ToString();
  Document doc = new Document();
  FieldType ft = getTextFieldType();
  doc.add(new Field(TEXT_FIELD_NAME, textString, ft));
  if (minPrefixChars > 0) {
    // The gram analyzer turns this field into edge n-grams.
    doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft));
  }
  doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO));
  doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
  doc.add(new NumericDocValuesField("weight", weight));
  if (payload != null) {
    doc.add(new BinaryDocValuesField("payloads", payload));
  }
  if (contexts != null) {
    for (BytesRef context : contexts) {
      // Indexed term for filtering, doc value for returning the contexts with each result.
      doc.add(new StringField(CONTEXTS_FIELD_NAME, context, Field.Store.NO));
      doc.add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context));
    }
  }
  return doc;
}
/**
 * Reopens the underlying searcher so that recent additions/updates become visible; it's best to
 * "batch up" many additions/updates, and then call refresh once in the end.
 *
 * @throws IllegalStateException if no searcher manager exists yet
 */
public void refresh() throws IOException {
  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }
  if (writer == null) {
    // no-op: writer was committed and closed after the index was built
    // and before searchMgr was constructed, so refresh is unnecessary
    return;
  }
  searcherMgr.maybeRefreshBlocking();
}
/**
 * Returns the field type used for the text and n-gram fields: a copy of {@link
 * TextField#TYPE_NOT_STORED} with index options set to {@link IndexOptions#DOCS} and norms
 * omitted. Subclass can override this method to change the field type of the text field e.g. to
 * change the index options.
 */
protected FieldType getTextFieldType() {
  final FieldType textType = new FieldType(TextField.TYPE_NOT_STORED);
  textType.setOmitNorms(true);
  textType.setIndexOptions(IndexOptions.DOCS);
  return textType;
}
/**
 * Looks up suggestions for {@code key} restricted to {@code contexts}, using the {@code
 * allTermsRequired} and {@code highlight} settings this suggester was constructed with. The
 * {@code onlyMorePopular} argument is not used.
 */
@Override
public List<LookupResult> lookup(
    CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num)
    throws IOException {
  return lookup(key, contexts, num, allTermsRequired, highlight);
}
/**
 * Lookup, without any context.
 *
 * @param key the keyword being looked for
 * @param num number of items to return
 * @param allTermsRequired all searched terms must match or not
 * @param doHighlight if true, the matching term will be highlighted in the search result
 */
public List<LookupResult> lookup(
    CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  // The cast selects the BooleanQuery overload for the null context filter.
  return lookup(key, (BooleanQuery) null, num, allTermsRequired, doHighlight);
}
/**
 * Lookup, with context but without booleans. Context booleans default to SHOULD, so each
 * suggestion must have at least one of the contexts.
 *
 * @param key the keyword being looked for
 * @param contexts contexts to filter by; null or empty means no filtering
 * @param num number of items to return
 * @param allTermsRequired all searched terms must match or not
 * @param doHighlight if true, the matching term will be highlighted in the search result
 */
public List<LookupResult> lookup(
    CharSequence key,
    Set<BytesRef> contexts,
    int num,
    boolean allTermsRequired,
    boolean doHighlight)
    throws IOException {
  return lookup(key, toQuery(contexts), num, allTermsRequired, doHighlight);
}
/**
 * This is called if the last token isn't ended (e.g. user did not type a space after it). Return
 * an appropriate Query clause to add to the BooleanQuery.
 *
 * <p>Tokens shorter than {@code minPrefixChars} become a term query on the n-gram field; longer
 * ones become a prefix query on the text field.
 */
protected Query getLastTokenQuery(String token) throws IOException {
  // Short tokens: the leading ngram was directly indexed.
  final boolean coveredByNgrams = token.length() < minPrefixChars;
  final String field = coveredByNgrams ? TEXTGRAMS_FIELD_NAME : TEXT_FIELD_NAME;
  final Term term = new Term(field, token);
  return coveredByNgrams ? new TermQuery(term) : new PrefixQuery(term);
}
/**
 * Retrieve suggestions, specifying whether all terms must match ({@code allTermsRequired}) and
 * whether the hits should be highlighted ({@code doHighlight}).
 *
 * @param key the keyword being looked for
 * @param contextInfo contexts to filter by, each mapped to the boolean clause it is added with;
 *     null or empty means no filtering
 * @param num number of items to return
 * @param allTermsRequired all searched terms must match or not
 * @param doHighlight if true, the matching term will be highlighted in the search result
 */
public List<LookupResult> lookup(
    CharSequence key,
    Map<BytesRef, BooleanClause.Occur> contextInfo,
    int num,
    boolean allTermsRequired,
    boolean doHighlight)
    throws IOException {
  return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight);
}
/**
 * Turns a context-to-clause map into a context filter query, one clause per entry.
 *
 * @return the filter query, or null when {@code contextInfo} is null or empty
 */
private BooleanQuery toQuery(Map<BytesRef, BooleanClause.Occur> contextInfo) {
  if (contextInfo == null || contextInfo.isEmpty()) {
    return null;
  }

  BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
  for (Map.Entry<BytesRef, BooleanClause.Occur> entry : contextInfo.entrySet()) {
    addContextToQuery(contextFilter, entry.getKey(), entry.getValue());
  }
  return contextFilter.build();
}
/**
 * Turns a set of contexts into a context filter query in which every context is a SHOULD clause.
 *
 * @return the filter query, or null when {@code contextInfo} is null or empty
 */
private BooleanQuery toQuery(Set<BytesRef> contextInfo) {
  if (contextInfo == null || contextInfo.isEmpty()) {
    return null;
  }

  BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
  for (BytesRef context : contextInfo) {
    addContextToQuery(contextFilter, context, BooleanClause.Occur.SHOULD);
  }
  return contextFilter.build();
}
/**
 * Adds a term clause on the contexts field to {@code query}. This method is handy as callers do
 * not need access to internal fields such as CONTEXTS_FIELD_NAME in order to build queries.
 * However, here may not be its best location.
 *
 * @param query the {@link BooleanQuery.Builder} to add the clause to
 * @param context the context
 * @param clause one of {@link Occur}
 */
public void addContextToQuery(
    BooleanQuery.Builder query, BytesRef context, BooleanClause.Occur clause) {
  // NOTE: we "should" wrap this in
  // ConstantScoreQuery, or maybe send this as a
  // Filter instead to search.

  // TODO: if we had a BinaryTermField we could fix
  // this "must be valid utf8" limitation:
  final Term contextTerm = new Term(CONTEXTS_FIELD_NAME, context);
  query.add(new TermQuery(contextTerm), clause);
}
/**
 * This is an advanced method providing the capability to send down to the suggester any arbitrary
 * lucene query to be used to filter the result of the suggester
 *
 * @param key the keyword being looked for
 * @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester.
 *     {@link #addContextToQuery} could be used to build this contextQuery.
 * @param num number of items to return
 * @param allTermsRequired all searched terms must match or not
 * @param doHighlight if true, the matching term will be highlighted in the search result
 * @return the result of the suggester
 * @throws IOException if there is an IO exception while reading data from the index
 * @throws IllegalStateException if the suggester was not built
 */
@Override
public List<LookupResult> lookup(
    CharSequence key,
    BooleanQuery contextQuery,
    int num,
    boolean allTermsRequired,
    boolean doHighlight)
    throws IOException {

  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }

  final BooleanClause.Occur occur;
  if (allTermsRequired) {
    occur = BooleanClause.Occur.MUST;
  } else {
    occur = BooleanClause.Occur.SHOULD;
  }

  BooleanQuery.Builder query;
  // Tokens that must match exactly; also drives whole-token highlighting.
  Set<String> matchedTokens;
  // The unfinished trailing token, if any; drives prefix highlighting.
  String prefixToken = null;

  try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    String lastToken = null;
    query = new BooleanQuery.Builder();
    int maxEndOffset = -1;
    matchedTokens = new HashSet<>();
    while (ts.incrementToken()) {
      // Each token is added one iteration late, so the final token can be treated separately.
      if (lastToken != null) {
        matchedTokens.add(lastToken);
        query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
      }
      lastToken = termAtt.toString();
      if (lastToken != null) {
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
      }
    }
    ts.end();

    if (lastToken != null) {
      Query lastQuery;
      if (maxEndOffset == offsetAtt.endOffset()) {
        // Use PrefixQuery (or the ngram equivalent) when
        // there was no trailing discarded chars in the
        // string (e.g. whitespace), so that if query does
        // not end with a space we show prefix matches for
        // that token:
        lastQuery = getLastTokenQuery(lastToken);
        prefixToken = lastToken;
      } else {
        // Use TermQuery for an exact match if there were
        // trailing discarded chars (e.g. whitespace), so
        // that if query ends with a space we only show
        // exact matches for that term:
        matchedTokens.add(lastToken);
        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
      }

      if (lastQuery != null) {
        query.add(lastQuery, occur);
      }
    }

    if (contextQuery != null) {
      boolean allMustNot = true;
      for (BooleanClause clause : contextQuery.clauses()) {
        if (clause.occur() != BooleanClause.Occur.MUST_NOT) {
          allMustNot = false;
          break;
        }
      }

      if (allMustNot) {
        // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query)
        for (BooleanClause clause : contextQuery.clauses()) {
          query.add(clause);
        }
      } else if (allTermsRequired == false) {
        // We must carefully upgrade the query clauses to MUST:
        BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
        newQuery.add(query.build(), BooleanClause.Occur.MUST);
        newQuery.add(contextQuery, BooleanClause.Occur.MUST);
        query = newQuery;
      } else {
        // Add contextQuery as sub-query
        query.add(contextQuery, BooleanClause.Occur.MUST);
      }
    }
  }

  // TODO: we could allow blended sort here, combining
  // weight w/ score. Now we ignore score and sort only
  // by weight:

  Query finalQuery = finishQuery(query, allTermsRequired);

  // Sort by weight, descending:
  List<LookupResult> results = null;
  SearcherManager mgr;
  IndexSearcher searcher;
  searcherMgrReadLock.lock();
  try {
    mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
    searcher = mgr.acquire();
  } finally {
    searcherMgrReadLock.unlock();
  }
  try {
    TopFieldCollectorManager c = new TopFieldCollectorManager(SORT, num, null, 1);
    TopFieldDocs hits = searcher.search(finalQuery, c);

    // Slower way if postings are not pre-sorted by weight:
    // hits = searcher.search(query, null, num, SORT);
    results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
  } finally {
    mgr.release(searcher);
  }

  return results;
}
/**
 * Create the results based on the search hits. Can be overridden by subclass to add particular
 * behavior (e.g. weight transformation). Note that there is no prefix token (the {@code
 * prefixToken} argument will be null) whenever the final token in the incoming request was in
 * fact finished (had trailing characters, such as white-space).
 *
 * @param searcher searcher the hits were collected from
 * @param hits the hits, each carrying its weight as the first sort field
 * @param num number of items requested (not used by this implementation)
 * @param charSequence the lookup key (not used by this implementation)
 * @param doHighlight whether to compute a highlighted form for each result
 * @param matchedTokens tokens that matched as whole terms
 * @param prefixToken the unfinished trailing token, or null
 * @return one result per hit, in hit order
 * @throws IOException If there are problems reading fields from the underlying Lucene index.
 */
protected List<LookupResult> createResults(
    IndexSearcher searcher,
    TopFieldDocs hits,
    int num,
    CharSequence charSequence,
    boolean doHighlight,
    Set<String> matchedTokens,
    String prefixToken)
    throws IOException {

  List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
  List<LookupResult> results = new ArrayList<>();
  for (int i = 0; i < hits.scoreDocs.length; i++) {
    FieldDoc fd = (FieldDoc) hits.scoreDocs[i];
    // NOTE(review): the doc values are re-fetched for every hit, presumably because hits are
    // ordered by weight rather than doc id and the iterator cannot move backwards - confirm
    // before hoisting this out of the loop.
    BinaryDocValues textDV =
        MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
    textDV.advance(fd.doc);
    BytesRef term = textDV.binaryValue();
    String text = term.utf8ToString();
    // The only sort field is the weight.
    long score = (Long) fd.fields[0];

    // This will just be null if app didn't pass payloads to build():
    // TODO: maybe just stored fields? they compress...
    BinaryDocValues payloadsDV =
        MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");

    BytesRef payload;
    if (payloadsDV != null) {
      if (payloadsDV.advance(fd.doc) == fd.doc) {
        payload = BytesRef.deepCopyOf(payloadsDV.binaryValue());
      } else {
        // Payloads exist in the index, but not for this document.
        payload = new BytesRef(BytesRef.EMPTY_BYTES);
      }
    } else {
      payload = null;
    }

    // Must look up sorted-set by segment:
    int segment = ReaderUtil.subIndex(fd.doc, leaves);
    SortedSetDocValues contextsDV =
        leaves.get(segment).reader().getSortedSetDocValues(CONTEXTS_FIELD_NAME);
    Set<BytesRef> contexts;
    if (contextsDV != null) {
      contexts = new HashSet<>();
      // Translate the top-level doc id into the segment-local one.
      int targetDocID = fd.doc - leaves.get(segment).docBase;
      if (contextsDV.advance(targetDocID) == targetDocID) {
        for (int j = 0; j < contextsDV.docValueCount(); j++) {
          BytesRef context = BytesRef.deepCopyOf(contextsDV.lookupOrd(contextsDV.nextOrd()));
          contexts.add(context);
        }
      }
    } else {
      contexts = null;
    }

    LookupResult result;
    if (doHighlight) {
      result =
          new LookupResult(
              text, highlight(text, matchedTokens, prefixToken), score, payload, contexts);
    } else {
      result = new LookupResult(text, score, payload, contexts);
    }

    results.add(result);
  }

  return results;
}
/**
 * Turns the assembled builder into the query that is actually searched; this implementation
 * simply builds it and does not look at {@code allTermsRequired}. Subclass can override this to
 * tweak the Query before searching.
 */
protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) {
  final Query built = in.build();
  return built;
}
/**
 * Override this method to customize the Object representing a single highlighted suggestions; the
 * result is set on each {@link org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey}
 * member.
 *
 * <p>This implementation re-analyzes {@code text} with the query analyzer and rebuilds it as a
 * String, routing each chunk through {@link #addNonMatch}, {@link #addWholeMatch} or {@link
 * #addPrefixMatch}.
 *
 * @param text the suggestion's surface text
 * @param matchedTokens tokens that matched as whole terms
 * @param prefixToken the unfinished trailing query token, or null
 */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken)
    throws IOException {
  try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    StringBuilder sb = new StringBuilder();
    // Offset in text up to which output has already been produced.
    int upto = 0;
    while (ts.incrementToken()) {
      String token = termAtt.toString();
      int startOffset = offsetAtt.startOffset();
      int endOffset = offsetAtt.endOffset();
      if (upto < startOffset) {
        addNonMatch(sb, text.substring(upto, startOffset));
        upto = startOffset;
      } else if (upto > startOffset) {
        // Token starts inside text that was already emitted; skip it.
        continue;
      }

      if (matchedTokens.contains(token)) {
        // Token matches.
        addWholeMatch(sb, text.substring(startOffset, endOffset), token);
        upto = endOffset;
      } else if (prefixToken != null && token.startsWith(prefixToken)) {
        addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
        upto = endOffset;
      }
    }
    ts.end();
    // Flush whatever text remains after the last emitted chunk.
    int endOffset = offsetAtt.endOffset();
    if (upto < endOffset) {
      addNonMatch(sb, text.substring(upto));
    }
    return sb.toString();
  }
}
/**
 * Highlighting hook for text that did not match: appends the chunk to the builder unchanged.
 *
 * @param sb The {@code StringBuilder} to append to
 * @param text The text chunk to add
 */
protected void addNonMatch(StringBuilder sb, String text) {
  // Non-matching text is copied through verbatim.
  sb.append(text);
}
/**
 * Called while highlighting a single result, to append the whole matched token to the provided
 * fragments list. The surface text is wrapped in {@code <b>} and {@code </b>}.
 *
 * @param sb The {@code StringBuilder} to append to
 * @param surface The surface form (original) text
 * @param analyzed The analyzed token corresponding to the surface form text
 */
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
  sb.append("<b>");
  sb.append(surface);
  sb.append("</b>");
}

/**
 * Called while highlighting a single result, to append a matched prefix token, to the provided
 * fragments list. The first {@code prefixToken.length()} characters of the surface text are
 * wrapped in {@code <b>} and {@code </b>}; the rest is appended as is.
 *
 * @param sb The {@code StringBuilder} to append to
 * @param surface The fragment of the surface form (indexed during {@link #build}), corresponding
 *     to this match
 * @param analyzed The analyzed token that matched
 * @param prefixToken The prefix of the token that matched
 */
protected void addPrefixMatch(
    StringBuilder sb, String surface, String analyzed, String prefixToken) {
  // TODO: apps can try to invert their analysis logic
  // here, e.g. downcase the two before checking prefix:
  if (prefixToken.length() >= surface.length()) {
    // The prefix covers the entire surface form, so highlight all of it.
    addWholeMatch(sb, surface, analyzed);
    return;
  }
  sb.append("<b>");
  sb.append(surface, 0, prefixToken.length());
  sb.append("</b>");
  sb.append(surface.substring(prefixToken.length()));
}
/**
 * Always returns {@code false} without writing anything to the given output.
 *
 * <p>NOTE(review): presumably {@code false} signals that storing is not supported here - confirm
 * against the {@code Lookup#store} contract.
 */
@Override
public boolean store(DataOutput in) throws IOException {
return false;
}
/**
 * Always returns {@code false} without reading anything from the given input.
 *
 * <p>NOTE(review): presumably {@code false} signals that loading is not supported here - confirm
 * against the {@code Lookup#load} contract.
 */
@Override
public boolean load(DataInput out) throws IOException {
return false;
}
/**
 * Closes the searcher manager, the writer (if open) and the directory, in that order. Each
 * resource is closed even when closing an earlier one throws; if several closes fail, the
 * exception from the last failing close is the one that propagates.
 */
@Override
public void close() throws IOException {
  try {
    if (searcherMgr != null) {
      searcherMgr.close();
      searcherMgr = null;
    }
  } finally {
    try {
      if (writer != null) {
        writer.close();
        writer = null;
      }
    } finally {
      // Reached even if an earlier close failed, so the directory is not leaked.
      if (dir != null) {
        dir.close();
      }
    }
  }
}
/** Always reports {@code 0}; no memory accounting is performed by this implementation. */
@Override
public long ramBytesUsed() {
return 0L;
}
/**
 * Returns the number of documents seen by the current searcher, or 0 when no searcher manager
 * exists yet.
 */
@Override
public long getCount() throws IOException {
  if (searcherMgr == null) {
    return 0;
  }

  SearcherManager manager;
  IndexSearcher acquired;
  searcherMgrReadLock.lock();
  try {
    // Keep a local reference so acquire and release hit the same SearcherManager.
    manager = searcherMgr;
    acquired = manager.acquire();
  } finally {
    searcherMgrReadLock.unlock();
  }

  try {
    return acquired.getIndexReader().numDocs();
  } finally {
    manager.release(acquired);
  }
}
}