org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
/**
* Suggester that first analyzes the surface form, adds the analyzed form to a weighted FST, and
* then does the same thing at lookup time. This means lookup is based on the analyzed form while
* suggestions are still the surface form(s).
*
* This can result in powerful suggester functionality. For example, if you use an analyzer
* removing stop words, then the partial text "ghost chr..." could see the suggestion "The Ghost of
* Christmas Past". Note that position increments MUST NOT be preserved for this example to work, so
* you should call the constructor with preservePositionIncrements
parameter set to
* false
*
*
If SynonymFilter is used to map wifi and wireless network to hotspot then the partial text
* "wirele..." could suggest "wifi router". Token normalization like stemmers, accent removal, etc.,
* would allow suggestions to ignore such variations.
*
*
When two matching suggestions have the same weight, they are tie-broken by the analyzed form.
* If their analyzed form is the same then the order is undefined.
*
*
There are some limitations:
*
*
* - A lookup from a query like "net" in English won't be any different than "net " (ie, user
* added a trailing space) because analyzers don't reflect when they've seen a token separator
* and when they haven't.
*
- If you're using {@code StopFilter}, and the user will type "fast apple", but so far all
* they've typed is "fast a", again because the analyzer doesn't convey whether it's seen a
* token separator after the "a", {@code StopFilter} will remove that "a" causing far more
* matches than you'd expect.
*
- Lookups with the empty string return no results instead of all results.
*
*
* @lucene.experimental
*/
public class AnalyzingSuggester extends Lookup {
/**
* FST<Weight,Surface>: input is the analyzed form, with a null byte between terms weights
* are encoded as costs: (Integer.MAX_VALUE-weight) surface is the original, unanalyzed form.
*/
private FST> fst = null;
/** Analyzer that will be used for analyzing suggestions at index time. */
private final Analyzer indexAnalyzer;
/** Analyzer that will be used for analyzing suggestions at query time. */
private final Analyzer queryAnalyzer;
/** True if exact match suggestions should always be returned first. */
private final boolean exactFirst;
/** True if separator between tokens should be preserved. */
private final boolean preserveSep;
/**
* Include this flag in the options parameter to {@link
* #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)} to always return
* the exact match first, regardless of score. This has no performance impact but could result in
* low-quality suggestions.
*/
public static final int EXACT_FIRST = 1;
/**
* Include this flag in the options parameter to {@link
* #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)} to preserve token
* separators when matching.
*/
public static final int PRESERVE_SEP = 2;
/** Represents the separation between tokens, if PRESERVE_SEP was specified */
private static final int SEP_LABEL = '\u001F';
/** Marks end of the analyzed input and start of dedup byte. */
private static final int END_BYTE = 0x0;
/** Maximum number of dup surface forms (different surface forms for the same analyzed form). */
private final int maxSurfaceFormsPerAnalyzedForm;
/**
* Maximum graph paths to index for a single analyzed surface form. This only matters if your
* analyzer makes lots of alternate paths (e.g. contains SynonymFilter).
*/
private final int maxGraphExpansions;
private final Directory tempDir;
private final String tempFileNamePrefix;
/**
* Highest number of analyzed paths we saw for any single input surface form. For analyzers that
* never create graphs this will always be 1.
*/
private int maxAnalyzedPathsForOneInput;
private boolean hasPayloads;
private static final int PAYLOAD_SEP = '\u001f';
/** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
/** Number of entries the lookup was built with */
private volatile long count = 0;
/**
* Calls {@link #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)}
*/
public AnalyzingSuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer) {
this(
tempDir, tempFileNamePrefix, analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true);
}
/**
* Calls {@link #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)
* AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)}
*/
public AnalyzingSuggester(
Directory tempDir,
String tempFileNamePrefix,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer) {
this(
tempDir,
tempFileNamePrefix,
indexAnalyzer,
queryAnalyzer,
EXACT_FIRST | PRESERVE_SEP,
256,
-1,
true);
}
/**
* Creates a new suggester.
*
* @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the
* index.
* @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of surface forms to keep for a single
* analyzed form. When there are too many surface forms we discard the lowest weighted ones.
* @param maxGraphExpansions Maximum number of graph paths to expand from the analyzed form. Set
* this to -1 for no limit.
* @param preservePositionIncrements Whether position holes should appear in the automata
*/
public AnalyzingSuggester(
Directory tempDir,
String tempFileNamePrefix,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer,
int options,
int maxSurfaceFormsPerAnalyzedForm,
int maxGraphExpansions,
boolean preservePositionIncrements) {
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
throw new IllegalArgumentException(
"options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
}
this.exactFirst = (options & EXACT_FIRST) != 0;
this.preserveSep = (options & PRESERVE_SEP) != 0;
// NOTE: this is just an implementation limitation; if
// somehow this is a problem we could fix it by using
// more than one byte to disambiguate ... but 256 seems
// like it should be way more then enough.
if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
throw new IllegalArgumentException(
"maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: "
+ maxSurfaceFormsPerAnalyzedForm
+ ")");
}
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
throw new IllegalArgumentException(
"maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
}
this.maxGraphExpansions = maxGraphExpansions;
this.preservePositionIncrements = preservePositionIncrements;
this.tempDir = tempDir;
this.tempFileNamePrefix = tempFileNamePrefix;
}
/** Returns byte size of the underlying FST. */
@Override
public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
@Override
public Collection getChildResources() {
if (fst == null) {
return Collections.emptyList();
} else {
return Collections.singletonList(Accountables.namedAccountable("fst", fst));
}
}
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private Automaton replaceSep(Automaton a) {
int numStates = a.getNumStates();
Automaton.Builder result = new Automaton.Builder(numStates, a.getNumTransitions());
// Copy all states over
result.copyStates(a);
// Go in reverse topo sort so we know we only have to
// make one pass:
Transition t = new Transition();
int[] topoSortStates = Operations.topoSortStates(a);
for (int i = 0; i < topoSortStates.length; i++) {
int state = topoSortStates[topoSortStates.length - 1 - i];
int count = a.initTransition(state, t);
for (int j = 0; j < count; j++) {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
result.addTransition(state, t.dest, SEP_LABEL);
} else {
result.addEpsilon(state, t.dest);
}
} else if (t.min == TokenStreamToAutomaton.HOLE) {
assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
// match another hole at search time. Note that
// it will also match an empty-string token ... if
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
result.addEpsilon(state, t.dest);
} else {
result.addTransition(state, t.dest, t.min, t.max);
}
}
}
return result.finish();
}
/** Used by subclass to change the lookup automaton, if necessary. */
protected Automaton convertAutomaton(Automaton a) {
return a;
}
TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
tsta.setPreservePositionIncrements(preservePositionIncrements);
tsta.setFinalOffsetGapAsHole(true);
return tsta;
}
private static class AnalyzingComparator implements Comparator {
private final boolean hasPayloads;
public AnalyzingComparator(boolean hasPayloads) {
this.hasPayloads = hasPayloads;
}
private final ByteArrayDataInput readerA = new ByteArrayDataInput();
private final ByteArrayDataInput readerB = new ByteArrayDataInput();
private final BytesRef scratchA = new BytesRef();
private final BytesRef scratchB = new BytesRef();
@Override
public int compare(BytesRef a, BytesRef b) {
// First by analyzed form:
readerA.reset(a.bytes, a.offset, a.length);
scratchA.length = readerA.readShort();
scratchA.bytes = a.bytes;
scratchA.offset = readerA.getPosition();
readerB.reset(b.bytes, b.offset, b.length);
scratchB.bytes = b.bytes;
scratchB.length = readerB.readShort();
scratchB.offset = readerB.getPosition();
int cmp = scratchA.compareTo(scratchB);
if (cmp != 0) {
return cmp;
}
readerA.skipBytes(scratchA.length);
readerB.skipBytes(scratchB.length);
// Next by cost:
long aCost = readerA.readInt();
long bCost = readerB.readInt();
assert decodeWeight(aCost) >= 0;
assert decodeWeight(bCost) >= 0;
if (aCost < bCost) {
return -1;
} else if (aCost > bCost) {
return 1;
}
// Finally by surface form:
if (hasPayloads) {
scratchA.length = readerA.readShort();
scratchB.length = readerB.readShort();
scratchA.offset = readerA.getPosition();
scratchB.offset = readerB.getPosition();
} else {
scratchA.offset = readerA.getPosition();
scratchB.offset = readerB.getPosition();
scratchA.length = readerA.length() - readerA.getPosition();
scratchB.length = readerB.length() - readerB.getPosition();
}
assert scratchA.isValid();
assert scratchB.isValid();
return scratchA.compareTo(scratchB);
}
}
@Override
public void build(InputIterator iterator) throws IOException {
if (iterator.hasContexts()) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
hasPayloads = iterator.hasPayloads();
OfflineSorter sorter =
new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
IndexOutput tempInput =
tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
BytesRefBuilder scratch = new BytesRefBuilder();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
String tempSortedFileName = null;
long newCount = 0;
byte[] buffer = new byte[8];
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
LimitedFiniteStringsIterator finiteStrings =
new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
for (IntsRef string; (string = finiteStrings.next()) != null; newCount++) {
Util.toBytesRef(string, scratch);
// length of the analyzed text (FST input)
if (scratch.length() > Short.MAX_VALUE - 2) {
throw new IllegalArgumentException(
"cannot handle analyzed forms > "
+ (Short.MAX_VALUE - 2)
+ " in length (got "
+ scratch.length()
+ ")");
}
short analyzedLength = (short) scratch.length();
// compute the required length:
// analyzed sequence + weight (4) + surface + analyzedLength (short)
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
BytesRef payload;
if (hasPayloads) {
if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
throw new IllegalArgumentException(
"cannot handle surface form > "
+ (Short.MAX_VALUE - 2)
+ " in length (got "
+ surfaceForm.length
+ ")");
}
payload = iterator.payload();
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
} else {
payload = null;
}
buffer = ArrayUtil.growNoCopy(buffer, requiredLength);
output.reset(buffer);
output.writeShort(analyzedLength);
output.writeBytes(scratch.bytes(), 0, scratch.length());
output.writeInt(encodeWeight(iterator.weight()));
if (hasPayloads) {
for (int i = 0; i < surfaceForm.length; i++) {
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
throw new IllegalArgumentException(
"surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
output.writeShort((short) surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
}
assert output.getPosition() == requiredLength
: output.getPosition() + " vs " + requiredLength;
writer.write(buffer, 0, output.getPosition());
}
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
}
CodecUtil.writeFooter(tempInput);
writer.close();
// Sort all input/output pairs (required by FST.Builder):
tempSortedFileName = sorter.sort(tempInput.getName());
// Free disk space:
tempDir.deleteFile(tempInput.getName());
reader =
new OfflineSorter.ByteSequencesReader(
tempDir.openChecksumInput(tempSortedFileName), tempSortedFileName);
PairOutputs outputs =
new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
FSTCompiler> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
// Build FST:
BytesRefBuilder previousAnalyzed = null;
BytesRefBuilder analyzed = new BytesRefBuilder();
BytesRef surface = new BytesRef();
IntsRefBuilder scratchInts = new IntsRefBuilder();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
// still index the hightest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
Set seenSurfaceForms = new HashSet<>();
int dedup = 0;
while (true) {
BytesRef bytes = reader.next();
if (bytes == null) {
break;
}
input.reset(bytes.bytes, bytes.offset, bytes.length);
short analyzedLength = input.readShort();
analyzed.growNoCopy(analyzedLength + 2);
input.readBytes(analyzed.bytes(), 0, analyzedLength);
analyzed.setLength(analyzedLength);
long cost = input.readInt();
surface.bytes = bytes.bytes;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = bytes.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRefBuilder();
previousAnalyzed.copyBytes(analyzed.get());
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.get().equals(previousAnalyzed.get())) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// More than maxSurfaceFormsPerAnalyzedForm
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
// hairy because we'd need to special case the two
// (dup/not dup)...
// NOTE: must be byte 0 so we sort before whatever
// is next
analyzed.append((byte) 0);
analyzed.append((byte) dedup);
Util.toIntsRef(analyzed.get(), scratchInts);
// System.out.println("ADD: " + scratchInts + " -> " + cost + ": " +
// surface.utf8ToString());
if (!hasPayloads) {
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
br.length = br.bytes.length;
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
count = newCount;
// Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {
IOUtils.closeWhileHandlingException(reader, writer);
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
}
}
@Override
public boolean store(DataOutput output) throws IOException {
output.writeVLong(count);
if (fst == null) {
return false;
}
fst.save(output, output);
output.writeVInt(maxAnalyzedPathsForOneInput);
output.writeByte((byte) (hasPayloads ? 1 : 0));
return true;
}
@Override
public boolean load(DataInput input) throws IOException {
count = input.readVLong();
PairOutputs outputs =
new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.fst = new FST<>(FST.readMetadata(input, outputs), input);
maxAnalyzedPathsForOneInput = input.readVInt();
hasPayloads = input.readByte() == 1;
return true;
}
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
LookupResult result;
if (hasPayloads) {
int sepIndex = -1;
for (int i = 0; i < output2.length; i++) {
if (output2.bytes[output2.offset + i] == PAYLOAD_SEP) {
sepIndex = i;
break;
}
}
assert sepIndex != -1;
spare.grow(sepIndex);
final int payloadLen = output2.length - sepIndex - 1;
spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
BytesRef payload = new BytesRef(payloadLen);
System.arraycopy(output2.bytes, sepIndex + 1, payload.bytes, 0, payloadLen);
payload.length = payloadLen;
result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
} else {
spare.grow(output2.length);
spare.copyUTF8Bytes(output2);
result = new LookupResult(spare.toString(), decodeWeight(output1));
}
return result;
}
private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
if (hasPayloads) {
// output2 has at least PAYLOAD_SEP byte:
if (key.length >= output2.length) {
return false;
}
for (int i = 0; i < key.length; i++) {
if (key.bytes[key.offset + i] != output2.bytes[output2.offset + i]) {
return false;
}
}
return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
} else {
return key.bytesEquals(output2);
}
}
@Override
public List lookup(
final CharSequence key, Set contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
return Collections.emptyList();
}
// System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
throw new IllegalArgumentException(
"lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == 0x1F) {
throw new IllegalArgumentException(
"lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
// System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
// final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
// System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
FST.Arc> scratchArc = new FST.Arc<>();
final List results = new ArrayList<>();
List>> prefixPaths =
FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode(), scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher> searcher;
searcher =
new Util.TopNSearcher<>(
fst,
count * maxSurfaceFormsPerAnalyzedForm,
count * maxSurfaceFormsPerAnalyzedForm,
weightComparator);
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
// maxSurfaceFormsPerAnalyzedForm had kicked in and
// pruned our exact match from one of these nodes
// ...:
for (FSTUtil.Path> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode(), scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(
scratchArc,
fst.outputs.add(path.output(), scratchArc.output()),
false,
path.input());
}
}
TopResults> completions = searcher.search();
assert completions.isComplete;
// NOTE: this is rather inefficient: we enumerate
// every matching "exactly the same analyzed form"
// path, and then do linear scan to see if one of
// these exactly matches the input. It should be
// possible (though hairy) to do something similar
// to getByOutput, since the surface form is encoded
// into the FST output, so we more efficiently hone
// in on the exact surface-form match. Still, I
// suspect very little time is spent in this linear
// seach: it's bounded by how many prefix start
// nodes we have and the
// maxSurfaceFormsPerAnalyzedForm:
for (Result> completion : completions) {
BytesRef output2 = completion.output().output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output().output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
Util.TopNSearcher> searcher;
searcher =
new Util.TopNSearcher>(
fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
private final Set seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair output) {
// Dedup: when the input analyzes to a graph we
// can get duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// In exactFirst mode, don't accept any paths
// matching the surface form since that will
// create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// We found exact match, which means we should
// have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path> path : prefixPaths) {
searcher.addStartPaths(path.fstNode(), path.output(), true, path.input());
}
TopResults> completions = searcher.search();
assert completions.isComplete;
for (Result> completion : completions) {
LookupResult result =
getLookupResult(completion.output().output1, completion.output().output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
// System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// In the exactFirst=true case the search may
// produce one extra path
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public long getCount() {
return count;
}
/** Returns all prefix paths to initialize the search. */
protected List>> getFullPrefixPaths(
List>> prefixPaths,
Automaton lookupAutomaton,
FST> fst)
throws IOException {
return prefixPaths;
}
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a)
throws IOException {
// Analyze surface form:
Automaton automaton;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// separator between tokens:
automaton = ts2a.toAutomaton(ts);
}
automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// Get all paths from the automaton (there can be
// more than one path, eg if the analyzer created a
// graph using SynFilter or WDF):
return automaton;
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = Operations.determinize(automaton, DEFAULT_DETERMINIZE_WORK_LIMIT);
return automaton;
}
/** Returns the weight associated with an input string, or null if it does not exist. */
public Object get(CharSequence key) {
throw new UnsupportedOperationException();
}
/** cost -> weight */
private static int decodeWeight(long encoded) {
return (int) (Integer.MAX_VALUE - encoded);
}
/** weight -> cost */
private static int encodeWeight(long value) {
if (value < 0 || value > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("cannot encode value: " + value);
}
return Integer.MAX_VALUE - (int) value;
}
static final Comparator> weightComparator =
new Comparator>() {
@Override
public int compare(Pair left, Pair right) {
return left.output1.compareTo(right.output1);
}
};
}