All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.suggest.document.NRTSuggester Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.search.suggest.analyzing.FSTUtil;
import org.apache.lucene.search.suggest.document.CompletionPostingsFormat.FSTLoadMode;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

/**
 * NRTSuggester executes Top N search on a weighted FST specified by a {@link CompletionScorer}
 *
 * 

See {@link #lookup(CompletionScorer, Bits, TopSuggestDocsCollector)} for more implementation * details. * *

FST Format: * *

    *
  • Input: analyzed forms of input terms *
  • Output: Pair<Long, BytesRef> containing weight, surface form and docID *
* *

NOTE: * *

    *
  • having too many deletions or using a very restrictive filter can make the search * inadmissible due to over-pruning of potential paths. See {@link * CompletionScorer#accept(int, Bits)} *
  • when matched documents are arbitrarily filtered ({@link CompletionScorer#filtered} set to * true, it is assumed that the filter will roughly filter out half the number of * documents that match the provided automaton *
  • lookup performance will degrade as more accepted completions lead to filtered out documents *
* * @lucene.experimental */ public final class NRTSuggester implements Accountable { /** * FST: input is the analyzed form, with a null byte between terms and a {@link * NRTSuggesterBuilder#END_BYTE} to denote the end of the input weight is a long surface is the * original, unanalyzed form followed by the docID */ private final FST> fst; /** * Highest number of analyzed paths we saw for any single input surface form. This can be > 1, * when index analyzer creates graphs or if multiple surface form(s) yields the same analyzed form */ private final int maxAnalyzedPathsPerOutput; /** Separator used between surface form and its docID in the FST output */ private final int payloadSep; /** * Maximum queue depth for TopNSearcher * *

NOTE: value should be <= Integer.MAX_VALUE */ private static final long MAX_TOP_N_QUEUE_SIZE = 5000; private NRTSuggester( FST> fst, int maxAnalyzedPathsPerOutput, int payloadSep) { this.fst = fst; this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput; this.payloadSep = payloadSep; } @Override public long ramBytesUsed() { return fst == null ? 0 : fst.ramBytesUsed(); } @Override public Collection getChildResources() { return Collections.emptyList(); } /** * Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions that match the * provided {@link CompletionScorer}. * *

The {@link CompletionScorer#automaton} is intersected with the {@link #fst}. {@link * CompletionScorer#weight} is used to compute boosts and/or extract context for each matched * partial paths. A top N search is executed on {@link #fst} seeded with the matched partial * paths. Upon reaching a completed path, {@link CompletionScorer#accept(int, Bits)} and {@link * CompletionScorer#score(float, float)} is used on the document id, index weight and query boost * to filter and score the entry, before being collected via {@link * TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)} */ public void lookup( final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException { final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc()); if (liveDocsRatio == -1) { return; } final List>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst); // The topN is increased by a factor of # of intersected path // to ensure search admissibility. For example, one suggestion can // have multiple contexts, resulting in num_context paths for the // suggestion instead of 1 in the FST. When queried for the suggestion, // the topN value ensures that all paths to the suggestion are evaluated // (in case of a match all context query). // Note that collectors will early terminate as soon as enough suggestions // have been collected, regardless of the set topN value. This value is the // maximum number of suggestions that can be collected. final int topN = collector.getCountToCollect() * prefixPaths.size(); final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered); final CharsRefBuilder spare = new CharsRefBuilder(); Util.TopNSearcher> searcher = new Util.TopNSearcher<>( fst, topN, queueSize, (o1, o2) -> Long.compare(o1.output1, o2.output1), new ScoringPathComparator(scorer)) { private final ByteArrayDataInput scratchInput = new ByteArrayDataInput(); @Override protected boolean acceptPartialPath(Util.FSTPath> path) { if (collector.doSkipDuplicates()) { // We are removing dups if (path.payload == -1) { // This path didn't yet see the complete surface form; let's see if it just did with // the arc output we just added: BytesRef arcOutput = path.arc.output().output2; BytesRef output = path.output.output2; for (int i = 0; i < arcOutput.length; i++) { if (arcOutput.bytes[arcOutput.offset + i] == payloadSep) { // OK this arc that the path was just extended by contains the payloadSep, so we // now have a full surface form in this path path.payload = output.length - arcOutput.length + i; assert output.bytes[output.offset + path.payload] == payloadSep; break; } } } if (path.payload != -1) { BytesRef output = path.output.output2; spare.copyUTF8Bytes(output.bytes, output.offset, path.payload); if (collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length())) { return false; } } } return true; } @Override protected boolean acceptResult(Util.FSTPath> path) { BytesRef output = path.output.output2; int payloadSepIndex; if (path.payload != -1) { payloadSepIndex = path.payload; spare.copyUTF8Bytes(output.bytes, output.offset, payloadSepIndex); } else { assert collector.doSkipDuplicates() == false; payloadSepIndex = parseSurfaceForm(output, payloadSep, spare); } scratchInput.reset( output.bytes, output.offset + payloadSepIndex + 1, output.length - payloadSepIndex - 1); int docID = scratchInput.readVInt(); if (!scorer.accept(docID, acceptDocs)) { return false; } if (collector.doSkipDuplicates()) { // now record that we've seen this surface form: char[] key = new char[spare.length()]; System.arraycopy(spare.chars(), 0, key, 0, spare.length()); if (collector.seenSurfaceForms.contains(key)) { // we already collected a higher scoring document with this key, in this segment: return false; } collector.seenSurfaceForms.add(key); } try { float score = scorer.score((float) decode(path.output.output1), path.boost); collector.collect(docID, spare.toCharsRef(), path.context, score); return true; } catch (IOException e) { throw new RuntimeException(e); } } }; for (FSTUtil.Path> path : prefixPaths) { scorer.weight.setNextMatch(path.input().get()); BytesRef output = path.output().output2; int payload = -1; if (collector.doSkipDuplicates()) { for (int j = 0; j < output.length; j++) { if (output.bytes[output.offset + j] == payloadSep) { // Important to cache this, else we have a possibly O(N^2) cost where N is the length of // suggestions payload = j; break; } } } searcher.addStartPaths( path.fstNode(), path.output(), false, path.input(), scorer.weight.boost(), scorer.weight.context(), payload); } // hits are also returned by search() // we do not use it, instead collect at acceptResult searcher.search(); // search admissibility is not guaranteed // see comment on getMaxTopNSearcherQueueSize // assert search.isComplete; } /** * Compares partial completion paths using {@link CompletionScorer#score(float, float)}, breaks * ties comparing path inputs */ private record ScoringPathComparator(CompletionScorer scorer) implements Comparator>> { @Override public int compare( Util.FSTPath> first, Util.FSTPath> second) { int cmp = Float.compare( scorer.score((float) decode(second.output.output1), second.boost), scorer.score((float) decode(first.output.output1), first.boost)); return (cmp != 0) ? cmp : first.input.get().compareTo(second.input.get()); } } /** * Simple heuristics to try to avoid over-pruning potential suggestions by the TopNSearcher. Since * suggestion entries can be rejected if they belong to a deleted document, the length of the * TopNSearcher queue has to be increased by some factor, to account for the filtered out * suggestions. This heuristic will try to make the searcher admissible, but the search can still * lead to over-pruning * *

If a filter is applied, the queue size is increased by half the number of live * documents. * *

The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE} */ private int getMaxTopNSearcherQueueSize( int topN, int numDocs, double liveDocsRatio, boolean filterEnabled) { long maxQueueSize = topN * (long) maxAnalyzedPathsPerOutput; // liveDocRatio can be at most 1.0 (if no docs were deleted) assert liveDocsRatio <= 1.0d; maxQueueSize = (long) (maxQueueSize / liveDocsRatio); if (filterEnabled) { maxQueueSize = maxQueueSize + (numDocs / 2); } return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize); } private static double calculateLiveDocRatio(int numDocs, int maxDocs) { return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1; } private static boolean shouldLoadFSTOffHeap(IndexInput input, FSTLoadMode fstLoadMode) { switch (fstLoadMode) { case ON_HEAP: return false; case OFF_HEAP: return true; case AUTO: // TODO: Make this less hacky to maybe expose "off-heap" feature using a marker interface on // the IndexInput return input.getClass().getName().contains(".MemorySegmentIndexInput"); default: throw new IllegalStateException("unknown enum constant: " + fstLoadMode); } } /** * Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput} on or off-heap * depending on the provided fstLoadMode */ public static NRTSuggester load(IndexInput input, FSTLoadMode fstLoadMode) throws IOException { final FST> fst; PairOutputs outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); if (shouldLoadFSTOffHeap(input, fstLoadMode)) { final FST.FSTMetadata> fstMetadata = FST.readMetadata(input, outputs); OffHeapFSTStore store = new OffHeapFSTStore(input, input.getFilePointer(), fstMetadata); fst = FST.fromFSTReader(fstMetadata, store); input.skipBytes(store.size()); } else { fst = new FST<>(FST.readMetadata(input, outputs), input); } /* read some meta info */ int maxAnalyzedPathsPerOutput = input.readVInt(); /* * Label used to denote the end of an input in the FST and * the beginning of dedup bytes */ input.readVInt(); // endByte int payloadSep = input.readVInt(); return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep); } static long encode(long input) { if (input < 0 || input > Integer.MAX_VALUE) { throw new UnsupportedOperationException("cannot encode value: " + input); } return Integer.MAX_VALUE - input; } static long decode(long output) { assert output >= 0 && output <= Integer.MAX_VALUE : "decoded output: " + output + " is not within 0 and Integer.MAX_VALUE"; return Integer.MAX_VALUE - output; } /** Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output */ static final class PayLoadProcessor { private static final int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) { int surfaceFormLen = -1; for (int i = 0; i < output.length; i++) { if (output.bytes[output.offset + i] == payloadSep) { surfaceFormLen = i; break; } } assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form"; spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen); return surfaceFormLen; } static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException { int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP; byte[] buffer = new byte[len]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); output.writeBytes(surface.bytes, surface.length - surface.offset); output.writeByte((byte) payloadSep); output.writeVInt(docID); return new BytesRef(buffer, 0, output.getPosition()); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy