org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import java.io.Closeable;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldCollectorManager;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

// TODO:
//   - a PostingsFormat that stores super-high-freq terms as
//     a bitset should be a win for the prefix terms?
//     (LUCENE-5052)
//   - we could offer a better integration with
//     DocumentDictionary and NRT?  so that your suggester
//     "automatically" keeps in sync w/ your index

/**
 * Analyzes the input text and then suggests matches based on prefix matches to any tokens in the
 * indexed text. This also highlights the tokens that match.
 *
 * <p>This suggester supports payloads. Matches are sorted only by the suggest weight; it would be
 * nice to support blended score + weight sort in the future. This means this suggester best
 * applies when there is a strong a-priori ranking of all the suggestions.
 *
 * <p>This suggester supports contexts, including arbitrary binary terms.
 *
 * @lucene.experimental
 */
public class AnalyzingInfixSuggester extends Lookup implements Closeable {

  /**
   * edgegrams for searching short prefixes without Prefix Query that's controlled by {@linkplain
   * #minPrefixChars}
   */
  protected static final String TEXTGRAMS_FIELD_NAME = "textgrams";

  /** Field name used for the indexed text. */
  protected static final String TEXT_FIELD_NAME = "text";

  /** Field name used for the indexed text, as a StringField, for exact lookup. */
  protected static final String EXACT_TEXT_FIELD_NAME = "exacttext";

  /**
   * Field name used for the indexed context, as a StringField and a SortedSetDVField, for
   * filtering.
   */
  protected static final String CONTEXTS_FIELD_NAME = "contexts";

  /** Analyzer used at search time */
  protected final Analyzer queryAnalyzer;

  /** Analyzer used at index time */
  protected final Analyzer indexAnalyzer;

  private final Directory dir;
  final int minPrefixChars;

  private final boolean allTermsRequired;
  private final boolean highlight;

  private final boolean commitOnBuild;
  private final boolean closeIndexWriterOnBuild;

  /**
   * Used for ongoing NRT additions/updates. May be null depending on closeIndexWriterOnBuild
   * constructor arg
   */
  protected IndexWriter writer;

  /** Used to manage concurrent access to writer */
  protected final Object writerLock = new Object();

  /**
   * {@link IndexSearcher} used for lookups. May be null if {@link Directory} did not exist on
   * instantiation and neither {@link #build}, {@link #add}, or {@link #update} have been called
   */
  protected SearcherManager searcherMgr;

  /** Used to manage concurrent access to searcherMgr */
  protected final ReadWriteLock searcherMgrLock = new ReentrantReadWriteLock();

  private final Lock searcherMgrReadLock = searcherMgrLock.readLock();
  private final Lock searcherMgrWriteLock = searcherMgrLock.writeLock();

  /** Default minimum number of leading characters before PrefixQuery is used (4). */
  public static final int DEFAULT_MIN_PREFIX_CHARS = 4;

  /** Default boolean clause option for multiple terms matching (all terms required). */
  public static final boolean DEFAULT_ALL_TERMS_REQUIRED = true;

  /** Default highlighting option. */
  public static final boolean DEFAULT_HIGHLIGHT = true;

  /** Default option to close the IndexWriter once the index has been built. */
  protected static final boolean DEFAULT_CLOSE_INDEXWRITER_ON_BUILD = true;

  /** How we sort the postings and search results. */
  private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));

  /**
   * Create a new instance, loading from a previously built AnalyzingInfixSuggester directory, if
   * it exists. This directory must be private to the infix suggester (i.e., not an external Lucene
   * index). Note that {@link #close} will also close the provided directory.
   */
  public AnalyzingInfixSuggester(Directory dir, Analyzer analyzer) throws IOException {
    this(
        dir,
        analyzer,
        analyzer,
        DEFAULT_MIN_PREFIX_CHARS,
        false,
        DEFAULT_ALL_TERMS_REQUIRED,
        DEFAULT_HIGHLIGHT);
  }

  /**
   * Create a new instance, loading from a previously built AnalyzingInfixSuggester directory, if
   * it exists. This directory must be private to the infix suggester (i.e., not an external Lucene
   * index). Note that {@link #close} will also close the provided directory.
   *
   * @param minPrefixChars Minimum number of leading characters before PrefixQuery is used (default
   *     4). Prefixes shorter than this are indexed as character ngrams (increasing index size but
   *     making lookups faster).
   * @param commitOnBuild Call commit after the index has finished building. This would persist
   *     the suggester index to disk and future instances of this suggester can use this pre-built
   *     dictionary.
   */
  public AnalyzingInfixSuggester(
      Directory dir,
      Analyzer indexAnalyzer,
      Analyzer queryAnalyzer,
      int minPrefixChars,
      boolean commitOnBuild)
      throws IOException {
    this(
        dir,
        indexAnalyzer,
        queryAnalyzer,
        minPrefixChars,
        commitOnBuild,
        DEFAULT_ALL_TERMS_REQUIRED,
        DEFAULT_HIGHLIGHT);
  }

  /**
   * Create a new instance, loading from a previously built AnalyzingInfixSuggester directory, if
   * it exists. This directory must be private to the infix suggester (i.e., not an external Lucene
   * index). Note that {@link #close} will also close the provided directory.
   *
   * @param minPrefixChars Minimum number of leading characters before PrefixQuery is used (default
   *     4). Prefixes shorter than this are indexed as character ngrams (increasing index size but
   *     making lookups faster).
   * @param commitOnBuild Call commit after the index has finished building. This would persist
   *     the suggester index to disk and future instances of this suggester can use this pre-built
   *     dictionary.
   * @param allTermsRequired All terms in the suggest query must be matched.
   * @param highlight Highlight suggest query in suggestions.
   */
  public AnalyzingInfixSuggester(
      Directory dir,
      Analyzer indexAnalyzer,
      Analyzer queryAnalyzer,
      int minPrefixChars,
      boolean commitOnBuild,
      boolean allTermsRequired,
      boolean highlight)
      throws IOException {
    this(
        dir,
        indexAnalyzer,
        queryAnalyzer,
        minPrefixChars,
        commitOnBuild,
        allTermsRequired,
        highlight,
        DEFAULT_CLOSE_INDEXWRITER_ON_BUILD);
  }

  /**
   * Create a new instance, loading from a previously built AnalyzingInfixSuggester directory, if
   * it exists. This directory must be private to the infix suggester (i.e., not an external Lucene
   * index). Note that {@link #close} will also close the provided directory.
   *
   * @param minPrefixChars Minimum number of leading characters before PrefixQuery is used (default
   *     4). Prefixes shorter than this are indexed as character ngrams (increasing index size but
   *     making lookups faster).
   * @param commitOnBuild Call commit after the index has finished building. This would persist
   *     the suggester index to disk and future instances of this suggester can use this pre-built
   *     dictionary.
   * @param allTermsRequired All terms in the suggest query must be matched.
   * @param highlight Highlight suggest query in suggestions.
   * @param closeIndexWriterOnBuild If true, the IndexWriter will be closed after the index has
   *     finished building.
   */
  public AnalyzingInfixSuggester(
      Directory dir,
      Analyzer indexAnalyzer,
      Analyzer queryAnalyzer,
      int minPrefixChars,
      boolean commitOnBuild,
      boolean allTermsRequired,
      boolean highlight,
      boolean closeIndexWriterOnBuild)
      throws IOException {

    if (minPrefixChars < 0) {
      throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
    }

    this.queryAnalyzer = queryAnalyzer;
    this.indexAnalyzer = indexAnalyzer;
    this.dir = dir;
    this.minPrefixChars = minPrefixChars;
    this.commitOnBuild = commitOnBuild;
    this.allTermsRequired = allTermsRequired;
    this.highlight = highlight;
    this.closeIndexWriterOnBuild = closeIndexWriterOnBuild;

    if (DirectoryReader.indexExists(dir)) {
      // Already built; open it:
      searcherMgr = new SearcherManager(dir, null);
    }
  }

  private void setAndCloseOldSearcherManager(final SearcherManager newSearcherMgr)
      throws IOException {
    searcherMgrWriteLock.lock();
    try {
      final SearcherManager oldSearcherMgr = searcherMgr;
      searcherMgr = newSearcherMgr;
      if (oldSearcherMgr != null) {
        oldSearcherMgr.close();
      }
    } finally {
      searcherMgrWriteLock.unlock();
    }
  }

  /** Override this to customize index settings, e.g. which codec to use. */
  protected IndexWriterConfig getIndexWriterConfig(
      Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
    IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
    iwc.setOpenMode(openMode);

    // This way all merged segments will be sorted at
    // merge time, allow for per-segment early termination
    // when those segments are searched:
    iwc.setIndexSort(SORT);

    return iwc;
  }

  /** Subclass can override to choose a specific {@link Directory} implementation. */
  protected Directory getDirectory(Path path) throws IOException {
    return FSDirectory.open(path);
  }

  @Override
  public void build(InputIterator iter) throws IOException {
    synchronized (writerLock) {
      if (writer != null) {
        writer.close();
        writer = null;
      }

      boolean success = false;
      try {
        // First pass: build a temporary normal Lucene index,
        // just indexing the suggestions as they iterate:
        writer =
            new IndexWriter(
                dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
        // long t0 = System.nanoTime();

        // TODO: use threads?
        BytesRef text;
        while ((text = iter.next()) != null) {
          BytesRef payload;
          if (iter.hasPayloads()) {
            payload = iter.payload();
          } else {
            payload = null;
          }

          add(text, iter.contexts(), iter.weight(), payload);
        }

        // System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + "
        // msec");
        if (commitOnBuild || closeIndexWriterOnBuild) {
          commit();
        }
        setAndCloseOldSearcherManager(new SearcherManager(writer, null));
        success = true;
      } finally {
        if (success) {
          if (closeIndexWriterOnBuild) {
            writer.close();
            writer = null;
          }
        } else { // failure
          if (writer != null) {
            writer.rollback();
            writer = null;
          }
        }
      }
    }
  }

  /**
   * Commits all pending changes made to this suggester to disk.
   *
   * @see IndexWriter#commit
   */
  public void commit() throws IOException {
    if (writer == null) {
      if (searcherMgr == null || closeIndexWriterOnBuild == false) {
        throw new IllegalStateException("Cannot commit on an closed writer. Add documents first");
      }
      // else no-op: writer was committed and closed after the index was built, so commit is
      // unnecessary
    } else {
      writer.commit();
    }
  }

  private Analyzer getGramAnalyzer() {
    return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
      @Override
      protected Analyzer getWrappedAnalyzer(String fieldName) {
        return indexAnalyzer;
      }

      @Override
      protected TokenStreamComponents wrapComponents(
          String fieldName, TokenStreamComponents components) {
        assert !(fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars == 0)
            : "no need \"textgrams\" when minPrefixChars=" + minPrefixChars;
        if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
          // TODO: should use an EdgeNGramTokenFilterFactory here
          TokenFilter filter =
              new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
          return new TokenStreamComponents(components.getSource(), filter);
        } else {
          return components;
        }
      }
    };
  }

  private void ensureOpen() throws IOException {
    synchronized (writerLock) {
      if (writer == null) {
        if (DirectoryReader.indexExists(dir)) {
          // Already built; open it:
          writer =
              new IndexWriter(
                  dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.APPEND));
        } else {
          writer =
              new IndexWriter(
                  dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
        }
        setAndCloseOldSearcherManager(new SearcherManager(writer, null));
      }
    }
  }

  /**
   * Adds a new suggestion. Be sure to use {@link #update} instead if you want to replace a
   * previous suggestion. After adding or updating a batch of new suggestions, you must call {@link
   * #refresh} in the end in order to see the suggestions in {@link #lookup}
   */
  public void add(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload)
      throws IOException {
    ensureOpen();
    writer.addDocument(buildDocument(text, contexts, weight, payload));
  }

  /**
   * Updates a previous suggestion, matching the exact same text as before. Use this to change the
   * weight or payload of an already added suggestion. If you know this text is not already present
   * you can use {@link #add} instead. After adding or updating a batch of new suggestions, you
   * must call {@link #refresh} in the end in order to see the suggestions in {@link #lookup}
   */
  public void update(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload)
      throws IOException {
    ensureOpen();
    writer.updateDocument(
        new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()),
        buildDocument(text, contexts, weight, payload));
  }

  private Document buildDocument(
      BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
    String textString = text.utf8ToString();
    Document doc = new Document();
    FieldType ft = getTextFieldType();
    doc.add(new Field(TEXT_FIELD_NAME, textString, ft));
    if (minPrefixChars > 0) {
      doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft));
    }
    doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO));
    doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
    doc.add(new NumericDocValuesField("weight", weight));
    if (payload != null) {
      doc.add(new BinaryDocValuesField("payloads", payload));
    }
    if (contexts != null) {
      for (BytesRef context : contexts) {
        doc.add(new StringField(CONTEXTS_FIELD_NAME, context, Field.Store.NO));
        doc.add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context));
      }
    }
    return doc;
  }

  /**
   * Reopens the underlying searcher; it's best to "batch up" many additions/updates, and then call
   * refresh once in the end.
   */
  public void refresh() throws IOException {
    if (searcherMgr == null) {
      throw new IllegalStateException("suggester was not built");
    }
    if (writer != null) {
      searcherMgr.maybeRefreshBlocking();
    }
    // else no-op: writer was committed and closed after the index was built
    // and before searchMgr was constructed, so refresh is unnecessary
  }

  /**
   * Subclass can override this method to change the field type of the text field e.g. to change
   * the index options
   */
  protected FieldType getTextFieldType() {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setOmitNorms(true);

    return ft;
  }

  @Override
  public List<LookupResult> lookup(
      CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num)
      throws IOException {
    return lookup(key, contexts, num, allTermsRequired, highlight);
  }

  /** Lookup, without any context. */
  public List<LookupResult> lookup(
      CharSequence key, int num, boolean allTermsRequired, boolean doHighlight)
      throws IOException {
    return lookup(key, (BooleanQuery) null, num, allTermsRequired, doHighlight);
  }

  /**
   * Lookup, with context but without booleans. Context booleans default to SHOULD, so each
   * suggestion must have at least one of the contexts.
   */
  public List<LookupResult> lookup(
      CharSequence key,
      Set<BytesRef> contexts,
      int num,
      boolean allTermsRequired,
      boolean doHighlight)
      throws IOException {
    return lookup(key, toQuery(contexts), num, allTermsRequired, doHighlight);
  }

  /**
   * This is called if the last token isn't ended (e.g. user did not type a space after it). Return
   * an appropriate Query clause to add to the BooleanQuery.
   */
  protected Query getLastTokenQuery(String token) throws IOException {
    if (token.length() < minPrefixChars) {
      // The leading ngram was directly indexed:
      return new TermQuery(new Term(TEXTGRAMS_FIELD_NAME, token));
    }

    return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
  }

  /**
   * Retrieve suggestions, specifying whether all terms must match ({@code allTermsRequired}) and
   * whether the hits should be highlighted ({@code doHighlight}).
   */
  public List<LookupResult> lookup(
      CharSequence key,
      Map<BytesRef, BooleanClause.Occur> contextInfo,
      int num,
      boolean allTermsRequired,
      boolean doHighlight)
      throws IOException {
    return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight);
  }

  private BooleanQuery toQuery(Map<BytesRef, BooleanClause.Occur> contextInfo) {
    if (contextInfo == null || contextInfo.isEmpty()) {
      return null;
    }

    BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
    for (Map.Entry<BytesRef, BooleanClause.Occur> entry : contextInfo.entrySet()) {
      addContextToQuery(contextFilter, entry.getKey(), entry.getValue());
    }

    return contextFilter.build();
  }

  private BooleanQuery toQuery(Set<BytesRef> contextInfo) {
    if (contextInfo == null || contextInfo.isEmpty()) {
      return null;
    }

    BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
    for (BytesRef context : contextInfo) {
      addContextToQuery(contextFilter, context, BooleanClause.Occur.SHOULD);
    }
    return contextFilter.build();
  }

  /**
   * This method is handy as we do not need access to internal fields such as CONTEXTS_FIELD_NAME
   * in order to build queries. However, here may not be its best location.
   *
   * @param query an instance of {@link BooleanQuery}
   * @param context the context
   * @param clause one of {@link Occur}
   */
  public void addContextToQuery(
      BooleanQuery.Builder query, BytesRef context, BooleanClause.Occur clause) {
    // NOTE: we "should" wrap this in
    // ConstantScoreQuery, or maybe send this as a
    // Filter instead to search.
    // TODO: if we had a BinaryTermField we could fix
    // this "must be valid utf8" limitation:
    query.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context)), clause);
  }

  /**
   * This is an advanced method providing the capability to send down to the suggester any
   * arbitrary lucene query to be used to filter the result of the suggester
   *
   * @param key the keyword being looked for
   * @param contextQuery an arbitrary Lucene query to be used to filter the result of the
   *     suggester. {@link #addContextToQuery} could be used to build this contextQuery.
   * @param num number of items to return
   * @param allTermsRequired all searched terms must match or not
   * @param doHighlight if true, the matching term will be highlighted in the search result
   * @return the result of the suggester
   * @throws IOException if there is an IO exception while reading data from the index
   */
  @Override
  public List<LookupResult> lookup(
      CharSequence key,
      BooleanQuery contextQuery,
      int num,
      boolean allTermsRequired,
      boolean doHighlight)
      throws IOException {
    if (searcherMgr == null) {
      throw new IllegalStateException("suggester was not built");
    }

    final BooleanClause.Occur occur;
    if (allTermsRequired) {
      occur = BooleanClause.Occur.MUST;
    } else {
      occur = BooleanClause.Occur.SHOULD;
    }

    BooleanQuery.Builder query;
    Set<String> matchedTokens;
    String prefixToken = null;

    try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
      // long t0 = System.currentTimeMillis();
      ts.reset();
      final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      String lastToken = null;
      query = new BooleanQuery.Builder();
      int maxEndOffset = -1;
      matchedTokens = new HashSet<>();
      while (ts.incrementToken()) {
        if (lastToken != null) {
          matchedTokens.add(lastToken);
          query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
        }
        lastToken = termAtt.toString();
        if (lastToken != null) {
          maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        }
      }
      ts.end();

      if (lastToken != null) {
        Query lastQuery;
        if (maxEndOffset == offsetAtt.endOffset()) {
          // Use PrefixQuery (or the ngram equivalent) when
          // there was no trailing discarded chars in the
          // string (e.g. whitespace), so that if query does
          // not end with a space we show prefix matches for
          // that token:
          lastQuery = getLastTokenQuery(lastToken);
          prefixToken = lastToken;
        } else {
          // Use TermQuery for an exact match if there were
          // trailing discarded chars (e.g. whitespace), so
          // that if query ends with a space we only show
          // exact matches for that term:
          matchedTokens.add(lastToken);
          lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
        }

        if (lastQuery != null) {
          query.add(lastQuery, occur);
        }
      }

      if (contextQuery != null) {
        boolean allMustNot = true;
        for (BooleanClause clause : contextQuery.clauses()) {
          if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
            allMustNot = false;
            break;
          }
        }
        if (allMustNot) {
          // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query)
          for (BooleanClause clause : contextQuery.clauses()) {
            query.add(clause);
          }
        } else if (allTermsRequired == false) {
          // We must carefully upgrade the query clauses to MUST:
          BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
          newQuery.add(query.build(), BooleanClause.Occur.MUST);
          newQuery.add(contextQuery, BooleanClause.Occur.MUST);
          query = newQuery;
        } else {
          // Add contextQuery as sub-query
          query.add(contextQuery, BooleanClause.Occur.MUST);
        }
      }
    }

    // TODO: we could allow blended sort here, combining
    // weight w/ score.
    // Now we ignore score and sort only by weight:
    Query finalQuery = finishQuery(query, allTermsRequired);

    // System.out.println("finalQuery=" + finalQuery);

    // Sort by weight, descending:
    List<LookupResult> results = null;
    SearcherManager mgr;
    IndexSearcher searcher;

    searcherMgrReadLock.lock();
    try {
      mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
      searcher = mgr.acquire();
    } finally {
      searcherMgrReadLock.unlock();
    }

    try {
      TopFieldCollectorManager c =
          new TopFieldCollectorManager(SORT, num, null, 1, searcher.getSlices().length > 1);

      // System.out.println("got searcher=" + searcher);
      TopFieldDocs hits = searcher.search(finalQuery, c);

      // Slower way if postings are not pre-sorted by weight:
      // hits = searcher.search(query, null, num, SORT);
      results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
    } finally {
      mgr.release(searcher);
    }

    // System.out.println((System.currentTimeMillis() - t0) + " ms for infix suggest");
    // System.out.println(results);

    return results;
  }

  /**
   * Create the results based on the search hits. Can be overridden by subclass to add particular
   * behavior (e.g. weight transformation). Note that there is no prefix token (the {@code
   * prefixToken} argument will be null) whenever the final token in the incoming request was in
   * fact finished (had trailing characters, such as white-space).
   *
   * @throws IOException If there are problems reading fields from the underlying Lucene index.
   */
  protected List<LookupResult> createResults(
      IndexSearcher searcher,
      TopFieldDocs hits,
      int num,
      CharSequence charSequence,
      boolean doHighlight,
      Set<String> matchedTokens,
      String prefixToken)
      throws IOException {

    List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
    List<LookupResult> results = new ArrayList<>();
    for (int i = 0; i < hits.scoreDocs.length; i++) {
      FieldDoc fd = (FieldDoc) hits.scoreDocs[i];
      BinaryDocValues textDV =
          MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
      textDV.advance(fd.doc);
      BytesRef term = textDV.binaryValue();
      String text = term.utf8ToString();
      long score = (Long) fd.fields[0];

      // This will just be null if app didn't pass payloads to build():
      // TODO: maybe just stored fields? they compress...
      BinaryDocValues payloadsDV =
          MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");

      BytesRef payload;
      if (payloadsDV != null) {
        if (payloadsDV.advance(fd.doc) == fd.doc) {
          payload = BytesRef.deepCopyOf(payloadsDV.binaryValue());
        } else {
          payload = new BytesRef(BytesRef.EMPTY_BYTES);
        }
      } else {
        payload = null;
      }

      // Must look up sorted-set by segment:
      int segment = ReaderUtil.subIndex(fd.doc, leaves);
      SortedSetDocValues contextsDV =
          leaves.get(segment).reader().getSortedSetDocValues(CONTEXTS_FIELD_NAME);
      Set<BytesRef> contexts;
      if (contextsDV != null) {
        contexts = new HashSet<>();
        int targetDocID = fd.doc - leaves.get(segment).docBase;
        if (contextsDV.advance(targetDocID) == targetDocID) {
          for (int j = 0; j < contextsDV.docValueCount(); j++) {
            BytesRef context = BytesRef.deepCopyOf(contextsDV.lookupOrd(contextsDV.nextOrd()));
            contexts.add(context);
          }
        }
      } else {
        contexts = null;
      }

      LookupResult result;
      if (doHighlight) {
        result =
            new LookupResult(
                text, highlight(text, matchedTokens, prefixToken), score, payload, contexts);
      } else {
        result = new LookupResult(text, score, payload, contexts);
      }

      results.add(result);
    }

    return results;
  }

  /** Subclass can override this to tweak the Query before searching. */
  protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) {
    return in.build();
  }

  /**
   * Override this method to customize the Object representing a single highlighted suggestion;
   * the result is set on each {@link
   * org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member.
   */
  protected Object highlight(String text, Set<String> matchedTokens, String prefixToken)
      throws IOException {
    try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      StringBuilder sb = new StringBuilder();
      int upto = 0;
      while (ts.incrementToken()) {
        String token = termAtt.toString();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        if (upto < startOffset) {
          addNonMatch(sb, text.substring(upto, startOffset));
          upto = startOffset;
        } else if (upto > startOffset) {
          continue;
        }

        if (matchedTokens.contains(token)) {
          // Token matches.
          addWholeMatch(sb, text.substring(startOffset, endOffset), token);
          upto = endOffset;
        } else if (prefixToken != null && token.startsWith(prefixToken)) {
          addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
          upto = endOffset;
        }
      }
      ts.end();
      int endOffset = offsetAtt.endOffset();
      if (upto < endOffset) {
        addNonMatch(sb, text.substring(upto));
      }

      return sb.toString();
    }
  }

  /**
   * Called while highlighting a single result, to append a non-matching chunk of text from the
   * suggestion to the provided fragments list.
   *
   * @param sb The {@code StringBuilder} to append to
   * @param text The text chunk to add
   */
  protected void addNonMatch(StringBuilder sb, String text) {
    sb.append(text);
  }

  /**
   * Called while highlighting a single result, to append the whole matched token to the provided
   * fragments list.
   *
   * @param sb The {@code StringBuilder} to append to
   * @param surface The surface form (original) text
   * @param analyzed The analyzed token corresponding to the surface form text
   */
  protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
    sb.append("<b>");
    sb.append(surface);
    sb.append("</b>");
  }

  /**
   * Called while highlighting a single result, to append a matched prefix token, to the provided
   * fragments list.
   *
   * @param sb The {@code StringBuilder} to append to
   * @param surface The fragment of the surface form (indexed during {@link #build}), corresponding
   *     to this match
   * @param analyzed The analyzed token that matched
   * @param prefixToken The prefix of the token that matched
   */
  protected void addPrefixMatch(
      StringBuilder sb, String surface, String analyzed, String prefixToken) {
    // TODO: apps can try to invert their analysis logic
    // here, e.g.
    // downcase the two before checking prefix:
    if (prefixToken.length() >= surface.length()) {
      addWholeMatch(sb, surface, analyzed);
      return;
    }
    sb.append("<b>");
    sb.append(surface.substring(0, prefixToken.length()));
    sb.append("</b>");
    sb.append(surface.substring(prefixToken.length()));
  }

  @Override
  public boolean store(DataOutput in) throws IOException {
    return false;
  }

  @Override
  public boolean load(DataInput out) throws IOException {
    return false;
  }

  @Override
  public void close() throws IOException {
    if (searcherMgr != null) {
      searcherMgr.close();
      searcherMgr = null;
    }
    if (writer != null) {
      writer.close();
      writer = null;
    }
    if (dir != null) {
      dir.close();
    }
  }

  @Override
  public long ramBytesUsed() {
    return 0L;
  }

  @Override
  public long getCount() throws IOException {
    if (searcherMgr == null) {
      return 0;
    }

    SearcherManager mgr;
    IndexSearcher searcher;
    searcherMgrReadLock.lock();
    try {
      mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
      searcher = mgr.acquire();
    } finally {
      searcherMgrReadLock.unlock();
    }

    try {
      return searcher.getIndexReader().numDocs();
    } finally {
      mgr.release(searcher);
    }
  }
}
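
The listing above is the complete class. The short sketch below is not part of the Lucene source; it is one plausible way to drive the suggester: open a directory, add a few weighted suggestions, refresh, and run an infix lookup. The index path, suggestion texts, and weights are illustrative only.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class AnalyzingInfixSuggesterBasicUsage {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(Paths.get("/tmp/suggest-index")); // illustrative path
    StandardAnalyzer analyzer = new StandardAnalyzer();
    // Note: close() on the suggester also closes the Directory passed in here.
    try (AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(dir, analyzer)) {
      // Weight drives the sort order; contexts and payload are optional (null here).
      suggester.add(new BytesRef("lucene in action"), null, 10, null);
      suggester.add(new BytesRef("lucene suggester internals"), null, 5, null);
      suggester.refresh(); // make the pending additions visible to lookup()

      // Infix lookup: "sugg" matches a token prefix anywhere inside the suggestion text,
      // with allTermsRequired=true and highlighting enabled.
      for (Lookup.LookupResult hit : suggester.lookup("sugg", 5, true, true)) {
        System.out.println(hit.key + " (weight=" + hit.value + ")");
      }
    }
    analyzer.close();
  }
}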





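Context filtering, described in the class javadoc and exposed through addContextToQuery() and the BooleanQuery-based lookup() above, can be exercised the same way. This second sketch is again illustrative (the category names, texts, and path are made up): suggestions are tagged with a context term at add() time, and the lookup is restricted to one context.

import java.nio.file.Paths;
import java.util.Set;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class AnalyzingInfixSuggesterContextUsage {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    try (AnalyzingInfixSuggester suggester =
        new AnalyzingInfixSuggester(FSDirectory.open(Paths.get("/tmp/suggest-ctx")), analyzer)) {
      // Each suggestion carries a context term (here: a content category).
      suggester.add(new BytesRef("lucene for search"), Set.of(new BytesRef("books")), 8, null);
      suggester.add(new BytesRef("lucene faceting"), Set.of(new BytesRef("docs")), 6, null);
      suggester.refresh();

      // Build a context filter with the public addContextToQuery() helper and pass it to the
      // BooleanQuery-based lookup(); only suggestions tagged "books" can match.
      BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
      suggester.addContextToQuery(contextFilter, new BytesRef("books"), BooleanClause.Occur.MUST);
      for (Lookup.LookupResult hit :
          suggester.lookup("luc", contextFilter.build(), 5, true, false)) {
        System.out.println(hit.key + " [contexts=" + hit.contexts + "]");
      }
    }
    analyzer.close();
  }
}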