All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import java.io.Closeable;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

// TODO:
//   - a PostingsFormat that stores super-high-freq terms as
//     a bitset should be a win for the prefix terms?
//     (LUCENE-5052)
//   - we could offer a better integration with
//     DocumentDictionary and NRT?  so that your suggester
//     "automatically" keeps in sync w/ your index

/** Analyzes the input text and then suggests matches based
 *  on prefix matches to any tokens in the indexed text.
 *  This also highlights the tokens that match.
 *
 *  

This suggester supports payloads. Matches are sorted only * by the suggest weight; it would be nice to support * blended score + weight sort in the future. This means * this suggester best applies when there is a strong * a-priori ranking of all the suggestions. * *

This suggester supports contexts, including arbitrary binary * terms. * * @lucene.experimental */ public class AnalyzingInfixSuggester extends Lookup implements Closeable { /** edgegrams for searching short prefixes without Prefix Query * that's controlled by {@linkplain #minPrefixChars} */ protected final static String TEXTGRAMS_FIELD_NAME = "textgrams"; /** Field name used for the indexed text. */ protected final static String TEXT_FIELD_NAME = "text"; /** Field name used for the indexed text, as a * StringField, for exact lookup. */ protected final static String EXACT_TEXT_FIELD_NAME = "exacttext"; /** Field name used for the indexed context, as a * StringField and a SortedSetDVField, for filtering. */ protected final static String CONTEXTS_FIELD_NAME = "contexts"; /** Analyzer used at search time */ protected final Analyzer queryAnalyzer; /** Analyzer used at index time */ protected final Analyzer indexAnalyzer; private final Directory dir; final int minPrefixChars; private final boolean allTermsRequired; private final boolean highlight; private final boolean commitOnBuild; private final boolean closeIndexWriterOnBuild; /** Used for ongoing NRT additions/updates. */ protected IndexWriter writer; /** {@link IndexSearcher} used for lookups. */ protected SearcherManager searcherMgr; /** Used to manage concurrent access to searcherMgr */ protected final Object searcherMgrLock = new Object(); /** Default minimum number of leading characters before * PrefixQuery is used (4). */ public static final int DEFAULT_MIN_PREFIX_CHARS = 4; /** Default boolean clause option for multiple terms matching (all terms required). */ public static final boolean DEFAULT_ALL_TERMS_REQUIRED = true; /** Default higlighting option. */ public static final boolean DEFAULT_HIGHLIGHT = true; /** Default option to close the IndexWriter once the index has been built. */ protected final static boolean DEFAULT_CLOSE_INDEXWRITER_ON_BUILD = true; /** How we sort the postings and search results. */ private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true)); /** Create a new instance, loading from a previously built * AnalyzingInfixSuggester directory, if it exists. This directory must be * private to the infix suggester (i.e., not an external * Lucene index). Note that {@link #close} * will also close the provided directory. */ public AnalyzingInfixSuggester(Directory dir, Analyzer analyzer) throws IOException { this(dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS, false, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT); } /** Create a new instance, loading from a previously built * AnalyzingInfixSuggester directory, if it exists. This directory must be * private to the infix suggester (i.e., not an external * Lucene index). Note that {@link #close} * will also close the provided directory. * * @param minPrefixChars Minimum number of leading characters * before PrefixQuery is used (default 4). * Prefixes shorter than this are indexed as character * ngrams (increasing index size but making lookups * faster). * * @param commitOnBuild Call commit after the index has finished building. This would persist the * suggester index to disk and future instances of this suggester can use this pre-built dictionary. */ public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, boolean commitOnBuild) throws IOException { this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT); } /** Create a new instance, loading from a previously built * AnalyzingInfixSuggester directory, if it exists. This directory must be * private to the infix suggester (i.e., not an external * Lucene index). Note that {@link #close} * will also close the provided directory. * * @param minPrefixChars Minimum number of leading characters * before PrefixQuery is used (default 4). * Prefixes shorter than this are indexed as character * ngrams (increasing index size but making lookups * faster). * * @param commitOnBuild Call commit after the index has finished building. This would persist the * suggester index to disk and future instances of this suggester can use this pre-built dictionary. * * @param allTermsRequired All terms in the suggest query must be matched. * @param highlight Highlight suggest query in suggestions. * */ public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, boolean commitOnBuild, boolean allTermsRequired, boolean highlight) throws IOException { this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, allTermsRequired, highlight, DEFAULT_CLOSE_INDEXWRITER_ON_BUILD); } /** Create a new instance, loading from a previously built * AnalyzingInfixSuggester directory, if it exists. This directory must be * private to the infix suggester (i.e., not an external * Lucene index). Note that {@link #close} * will also close the provided directory. * * @param minPrefixChars Minimum number of leading characters * before PrefixQuery is used (default 4). * Prefixes shorter than this are indexed as character * ngrams (increasing index size but making lookups * faster). * * @param commitOnBuild Call commit after the index has finished building. This would persist the * suggester index to disk and future instances of this suggester can use this pre-built dictionary. * * @param allTermsRequired All terms in the suggest query must be matched. * @param highlight Highlight suggest query in suggestions. * @param closeIndexWriterOnBuild If true, the IndexWriter will be closed after the index has finished building. */ public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, boolean commitOnBuild, boolean allTermsRequired, boolean highlight, boolean closeIndexWriterOnBuild) throws IOException { if (minPrefixChars < 0) { throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars); } this.queryAnalyzer = queryAnalyzer; this.indexAnalyzer = indexAnalyzer; this.dir = dir; this.minPrefixChars = minPrefixChars; this.commitOnBuild = commitOnBuild; this.allTermsRequired = allTermsRequired; this.highlight = highlight; this.closeIndexWriterOnBuild = closeIndexWriterOnBuild; if (DirectoryReader.indexExists(dir)) { // Already built; open it: searcherMgr = new SearcherManager(dir, null); } } /** Override this to customize index settings, e.g. which * codec to use. */ protected IndexWriterConfig getIndexWriterConfig(Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) { IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer); iwc.setOpenMode(openMode); // This way all merged segments will be sorted at // merge time, allow for per-segment early termination // when those segments are searched: iwc.setIndexSort(SORT); return iwc; } /** Subclass can override to choose a specific {@link * Directory} implementation. */ protected Directory getDirectory(Path path) throws IOException { return FSDirectory.open(path); } @Override public void build(InputIterator iter) throws IOException { synchronized (searcherMgrLock) { if (searcherMgr != null) { searcherMgr.close(); searcherMgr = null; } if (writer != null) { writer.close(); writer = null; } boolean success = false; try { // First pass: build a temporary normal Lucene index, // just indexing the suggestions as they iterate: writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE)); //long t0 = System.nanoTime(); // TODO: use threads? BytesRef text; while ((text = iter.next()) != null) { BytesRef payload; if (iter.hasPayloads()) { payload = iter.payload(); } else { payload = null; } add(text, iter.contexts(), iter.weight(), payload); } //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec"); if (commitOnBuild || closeIndexWriterOnBuild) { commit(); } searcherMgr = new SearcherManager(writer, null); success = true; } finally { if (success) { if (closeIndexWriterOnBuild) { writer.close(); writer = null; } } else { // failure if (writer != null) { writer.rollback(); writer = null; } } } } } /** Commits all pending changes made to this suggester to disk. * * @see IndexWriter#commit */ public void commit() throws IOException { if (writer == null) { if (searcherMgr == null || closeIndexWriterOnBuild == false) { throw new IllegalStateException("Cannot commit on an closed writer. Add documents first"); } // else no-op: writer was committed and closed after the index was built, so commit is unnecessary } else { writer.commit(); } } private Analyzer getGramAnalyzer() { return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) { @Override protected Analyzer getWrappedAnalyzer(String fieldName) { return indexAnalyzer; } @Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { assert !(fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars == 0) : "no need \"textgrams\" when minPrefixChars="+minPrefixChars; if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) { // TODO: should use an EdgeNGramTokenFilterFactory here TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false); return new TokenStreamComponents(components.getSource(), filter); } else { return components; } } }; } private synchronized void ensureOpen() throws IOException { if (writer == null) { if (DirectoryReader.indexExists(dir)) { // Already built; open it: writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.APPEND)); } else { writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE)); } synchronized (searcherMgrLock) { SearcherManager oldSearcherMgr = searcherMgr; searcherMgr = new SearcherManager(writer, null); if (oldSearcherMgr != null) { oldSearcherMgr.close(); } } } } /** Adds a new suggestion. Be sure to use {@link #update} * instead if you want to replace a previous suggestion. * After adding or updating a batch of new suggestions, * you must call {@link #refresh} in the end in order to * see the suggestions in {@link #lookup} */ public void add(BytesRef text, Set contexts, long weight, BytesRef payload) throws IOException { ensureOpen(); writer.addDocument(buildDocument(text, contexts, weight, payload)); } /** Updates a previous suggestion, matching the exact same * text as before. Use this to change the weight or * payload of an already added suggestion. If you know * this text is not already present you can use {@link * #add} instead. After adding or updating a batch of * new suggestions, you must call {@link #refresh} in the * end in order to see the suggestions in {@link #lookup} */ public void update(BytesRef text, Set contexts, long weight, BytesRef payload) throws IOException { ensureOpen(); writer.updateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()), buildDocument(text, contexts, weight, payload)); } private Document buildDocument(BytesRef text, Set contexts, long weight, BytesRef payload) throws IOException { String textString = text.utf8ToString(); Document doc = new Document(); FieldType ft = getTextFieldType(); doc.add(new Field(TEXT_FIELD_NAME, textString, ft)); if (minPrefixChars>0) { doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft)); } doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO)); doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text)); doc.add(new NumericDocValuesField("weight", weight)); if (payload != null) { doc.add(new BinaryDocValuesField("payloads", payload)); } if (contexts != null) { for(BytesRef context : contexts) { doc.add(new StringField(CONTEXTS_FIELD_NAME, context, Field.Store.NO)); doc.add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context)); } } return doc; } /** Reopens the underlying searcher; it's best to "batch * up" many additions/updates, and then call refresh * once in the end. */ public void refresh() throws IOException { if (searcherMgr == null) { throw new IllegalStateException("suggester was not built"); } if (writer != null) { searcherMgr.maybeRefreshBlocking(); } // else no-op: writer was committed and closed after the index was built // and before searchMgr was constructed, so refresh is unnecessary } /** * Subclass can override this method to change the field type of the text field * e.g. to change the index options */ protected FieldType getTextFieldType(){ FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); ft.setIndexOptions(IndexOptions.DOCS); ft.setOmitNorms(true); return ft; } @Override public List lookup(CharSequence key, Set contexts, boolean onlyMorePopular, int num) throws IOException { return lookup(key, contexts, num, allTermsRequired, highlight); } /** Lookup, without any context. */ public List lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { return lookup(key, (BooleanQuery)null, num, allTermsRequired, doHighlight); } /** Lookup, with context but without booleans. Context booleans default to SHOULD, * so each suggestion must have at least one of the contexts. */ public List lookup(CharSequence key, Set contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { return lookup(key, toQuery(contexts), num, allTermsRequired, doHighlight); } /** This is called if the last token isn't ended * (e.g. user did not type a space after it). Return an * appropriate Query clause to add to the BooleanQuery. */ protected Query getLastTokenQuery(String token) throws IOException { if (token.length() < minPrefixChars) { // The leading ngram was directly indexed: return new TermQuery(new Term(TEXTGRAMS_FIELD_NAME, token)); } return new PrefixQuery(new Term(TEXT_FIELD_NAME, token)); } /** Retrieve suggestions, specifying whether all terms * must match ({@code allTermsRequired}) and whether the hits * should be highlighted ({@code doHighlight}). */ public List lookup(CharSequence key, Map contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight); } private BooleanQuery toQuery(Map contextInfo) { if (contextInfo == null || contextInfo.isEmpty()) { return null; } BooleanQuery.Builder contextFilter = new BooleanQuery.Builder(); for (Map.Entry entry : contextInfo.entrySet()) { addContextToQuery(contextFilter, entry.getKey(), entry.getValue()); } return contextFilter.build(); } private BooleanQuery toQuery(Set contextInfo) { if (contextInfo == null || contextInfo.isEmpty()) { return null; } BooleanQuery.Builder contextFilter = new BooleanQuery.Builder(); for (BytesRef context : contextInfo) { addContextToQuery(contextFilter, context, BooleanClause.Occur.SHOULD); } return contextFilter.build(); } /** * This method is handy as we do not need access to internal fields such as CONTEXTS_FIELD_NAME in order to build queries * However, here may not be its best location. * * @param query an instance of @See {@link BooleanQuery} * @param context the context * @param clause one of {@link Occur} */ public void addContextToQuery(BooleanQuery.Builder query, BytesRef context, BooleanClause.Occur clause) { // NOTE: we "should" wrap this in // ConstantScoreQuery, or maybe send this as a // Filter instead to search. // TODO: if we had a BinaryTermField we could fix // this "must be valid ut8f" limitation: query.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context)), clause); } /** * This is an advanced method providing the capability to send down to the suggester any * arbitrary lucene query to be used to filter the result of the suggester * * @param key the keyword being looked for * @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester. {@link #addContextToQuery} could be used to build this contextQuery. * @param num number of items to return * @param allTermsRequired all searched terms must match or not * @param doHighlight if true, the matching term will be highlighted in the search result * @return the result of the suggester * @throws IOException f the is IO exception while reading data from the index */ public List lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { if (searcherMgr == null) { throw new IllegalStateException("suggester was not built"); } final BooleanClause.Occur occur; if (allTermsRequired) { occur = BooleanClause.Occur.MUST; } else { occur = BooleanClause.Occur.SHOULD; } BooleanQuery.Builder query; Set matchedTokens; String prefixToken = null; try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) { //long t0 = System.currentTimeMillis(); ts.reset(); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); String lastToken = null; query = new BooleanQuery.Builder(); int maxEndOffset = -1; matchedTokens = new HashSet<>(); while (ts.incrementToken()) { if (lastToken != null) { matchedTokens.add(lastToken); query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur); } lastToken = termAtt.toString(); if (lastToken != null) { maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset()); } } ts.end(); if (lastToken != null) { Query lastQuery; if (maxEndOffset == offsetAtt.endOffset()) { // Use PrefixQuery (or the ngram equivalent) when // there was no trailing discarded chars in the // string (e.g. whitespace), so that if query does // not end with a space we show prefix matches for // that token: lastQuery = getLastTokenQuery(lastToken); prefixToken = lastToken; } else { // Use TermQuery for an exact match if there were // trailing discarded chars (e.g. whitespace), so // that if query ends with a space we only show // exact matches for that term: matchedTokens.add(lastToken); lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)); } if (lastQuery != null) { query.add(lastQuery, occur); } } if (contextQuery != null) { boolean allMustNot = true; for (BooleanClause clause : contextQuery.clauses()) { if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) { allMustNot = false; break; } } if (allMustNot) { // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query) for (BooleanClause clause : contextQuery.clauses()) { query.add(clause); } } else if (allTermsRequired == false) { // We must carefully upgrade the query clauses to MUST: BooleanQuery.Builder newQuery = new BooleanQuery.Builder(); newQuery.add(query.build(), BooleanClause.Occur.MUST); newQuery.add(contextQuery, BooleanClause.Occur.MUST); query = newQuery; } else { // Add contextQuery as sub-query query.add(contextQuery, BooleanClause.Occur.MUST); } } } // TODO: we could allow blended sort here, combining // weight w/ score. Now we ignore score and sort only // by weight: Query finalQuery = finishQuery(query, allTermsRequired); //System.out.println("finalQuery=" + finalQuery); // Sort by weight, descending: TopFieldCollector c = TopFieldCollector.create(SORT, num, 1); List results = null; SearcherManager mgr; IndexSearcher searcher; synchronized (searcherMgrLock) { mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference searcher = mgr.acquire(); } try { //System.out.println("got searcher=" + searcher); searcher.search(finalQuery, c); TopFieldDocs hits = c.topDocs(); // Slower way if postings are not pre-sorted by weight: // hits = searcher.search(query, null, num, SORT); results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken); } finally { mgr.release(searcher); } //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest"); //System.out.println(results); return results; } /** * Create the results based on the search hits. * Can be overridden by subclass to add particular behavior (e.g. weight transformation). * Note that there is no prefix token (the {@code prefixToken} argument will * be null) whenever the final token in the incoming request was in fact finished * (had trailing characters, such as white-space). * * @throws IOException If there are problems reading fields from the underlying Lucene index. */ protected List createResults(IndexSearcher searcher, TopFieldDocs hits, int num, CharSequence charSequence, boolean doHighlight, Set matchedTokens, String prefixToken) throws IOException { List leaves = searcher.getIndexReader().leaves(); List results = new ArrayList<>(); for (int i=0;i contexts; if (contextsDV != null) { contexts = new HashSet(); int targetDocID = fd.doc - leaves.get(segment).docBase; if (contextsDV.advance(targetDocID) == targetDocID) { long ord; while ((ord = contextsDV.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { BytesRef context = BytesRef.deepCopyOf(contextsDV.lookupOrd(ord)); contexts.add(context); } } } else { contexts = null; } LookupResult result; if (doHighlight) { result = new LookupResult(text, highlight(text, matchedTokens, prefixToken), score, payload, contexts); } else { result = new LookupResult(text, score, payload, contexts); } results.add(result); } return results; } /** Subclass can override this to tweak the Query before * searching. */ protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) { return in.build(); } /** Override this method to customize the Object * representing a single highlighted suggestions; the * result is set on each {@link * org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member. */ protected Object highlight(String text, Set matchedTokens, String prefixToken) throws IOException { try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); ts.reset(); StringBuilder sb = new StringBuilder(); int upto = 0; while (ts.incrementToken()) { String token = termAtt.toString(); int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset(); if (upto < startOffset) { addNonMatch(sb, text.substring(upto, startOffset)); upto = startOffset; } else if (upto > startOffset) { continue; } if (matchedTokens.contains(token)) { // Token matches. addWholeMatch(sb, text.substring(startOffset, endOffset), token); upto = endOffset; } else if (prefixToken != null && token.startsWith(prefixToken)) { addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken); upto = endOffset; } } ts.end(); int endOffset = offsetAtt.endOffset(); if (upto < endOffset) { addNonMatch(sb, text.substring(upto)); } return sb.toString(); } } /** Called while highlighting a single result, to append a * non-matching chunk of text from the suggestion to the * provided fragments list. * @param sb The {@code StringBuilder} to append to * @param text The text chunk to add */ protected void addNonMatch(StringBuilder sb, String text) { sb.append(text); } /** Called while highlighting a single result, to append * the whole matched token to the provided fragments list. * @param sb The {@code StringBuilder} to append to * @param surface The surface form (original) text * @param analyzed The analyzed token corresponding to the surface form text */ protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) { sb.append(""); sb.append(surface); sb.append(""); } /** Called while highlighting a single result, to append a * matched prefix token, to the provided fragments list. * @param sb The {@code StringBuilder} to append to * @param surface The fragment of the surface form * (indexed during {@link #build}, corresponding to * this match * @param analyzed The analyzed token that matched * @param prefixToken The prefix of the token that matched */ protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) { // TODO: apps can try to invert their analysis logic // here, e.g. downcase the two before checking prefix: if (prefixToken.length() >= surface.length()) { addWholeMatch(sb, surface, analyzed); return; } sb.append(""); sb.append(surface.substring(0, prefixToken.length())); sb.append(""); sb.append(surface.substring(prefixToken.length())); } @Override public boolean store(DataOutput in) throws IOException { return false; } @Override public boolean load(DataInput out) throws IOException { return false; } @Override public void close() throws IOException { if (searcherMgr != null) { searcherMgr.close(); searcherMgr = null; } if (writer != null) { writer.close(); writer = null; } if (dir != null) { dir.close(); } } @Override public long ramBytesUsed() { long mem = RamUsageEstimator.shallowSizeOf(this); try { if (searcherMgr != null) { SearcherManager mgr; IndexSearcher searcher; synchronized (searcherMgrLock) { mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference searcher = mgr.acquire(); } try { for (LeafReaderContext context : searcher.getIndexReader().leaves()) { LeafReader reader = FilterLeafReader.unwrap(context.reader()); if (reader instanceof SegmentReader) { mem += ((SegmentReader) context.reader()).ramBytesUsed(); } } } finally { mgr.release(searcher); } } return mem; } catch (IOException ioe) { throw new RuntimeException(ioe); } } @Override public Collection getChildResources() { List resources = new ArrayList<>(); try { if (searcherMgr != null) { SearcherManager mgr; IndexSearcher searcher; synchronized (searcherMgrLock) { mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference searcher = mgr.acquire(); } try { for (LeafReaderContext context : searcher.getIndexReader().leaves()) { LeafReader reader = FilterLeafReader.unwrap(context.reader()); if (reader instanceof SegmentReader) { resources.add(Accountables.namedAccountable("segment", (SegmentReader)reader)); } } } finally { mgr.release(searcher); } } return Collections.unmodifiableList(resources); } catch (IOException ioe) { throw new RuntimeException(ioe); } } @Override public long getCount() throws IOException { if (searcherMgr == null) { return 0; } SearcherManager mgr; IndexSearcher searcher; synchronized (searcherMgrLock) { mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference searcher = mgr.acquire(); } try { return searcher.getIndexReader().numDocs(); } finally { mgr.release(searcher); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy