org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester Maven / Gradle / Ivy
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import com.carrotsearch.hppc.ObjectIntOpenHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.store.*;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.*;
import org.apache.lucene.util.fst.*;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.MinResult;
import org.elasticsearch.common.collect.HppcMaps;
import java.io.*;
import java.util.*;
/**
* Suggester that first analyzes the surface form, adds the
* analyzed form to a weighted FST, and then does the same
* thing at lookup time. This means lookup is based on the
* analyzed form while suggestions are still the surface
* form(s).
*
*
* This can result in powerful suggester functionality. For
* example, if you use an analyzer removing stop words,
* then the partial text "ghost chr..." could see the
* suggestion "The Ghost of Christmas Past". Note that
* position increments MUST NOT be preserved for this example
* to work, so you should call the constructor with
* preservePositionIncrements
parameter set to
* false
*
*
* If SynonymFilter is used to map wifi and wireless network to
* hotspot then the partial text "wirele..." could suggest
* "wifi router". Token normalization like stemmers, accent
* removal, etc., would allow suggestions to ignore such
* variations.
*
*
* When two matching suggestions have the same weight, they
* are tie-broken by the analyzed form. If their analyzed
* form is the same then the order is undefined.
*
*
* There are some limitations:
*
*
* - A lookup from a query like "net" in English won't
* be any different than "net " (ie, user added a
* trailing space) because analyzers don't reflect
* when they've seen a token separator and when they
* haven't.
*
*
- If you're using {@code StopFilter}, and the user will
* type "fast apple", but so far all they've typed is
* "fast a", again because the analyzer doesn't convey whether
* it's seen a token separator after the "a",
* {@code StopFilter} will remove that "a" causing
* far more matches than you'd expect.
*
*
- Lookups with the empty string return no results
* instead of all results.
*
*
* @lucene.experimental
*/
public class XAnalyzingSuggester extends Lookup {
/**
* FST:
* input is the analyzed form, with a null byte between terms
* weights are encoded as costs: (Integer.MAX_VALUE-weight)
* surface is the original, unanalyzed form.
*/
private FST> fst = null;
/**
* Analyzer that will be used for analyzing suggestions at
* index time.
*/
private final Analyzer indexAnalyzer;
/**
* Analyzer that will be used for analyzing suggestions at
* query time.
*/
private final Analyzer queryAnalyzer;
/**
* True if exact match suggestions should always be returned first.
*/
private final boolean exactFirst;
/**
* True if separator between tokens should be preserved.
*/
private final boolean preserveSep;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always
* return the exact match first, regardless of score. This
* has no performance impact but could result in
* low-quality suggestions. */
public static final int EXACT_FIRST = 1;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve
* token separators when matching. */
public static final int PRESERVE_SEP = 2;
/** Represents the separation between tokens, if
* PRESERVE_SEP was specified */
public static final int SEP_LABEL = '\u001F';
/** Marks end of the analyzed input and start of dedup
* byte. */
public static final int END_BYTE = 0x0;
/** Maximum number of dup surface forms (different surface
* forms for the same analyzed form). */
private final int maxSurfaceFormsPerAnalyzedForm;
/** Maximum graph paths to index for a single analyzed
* surface form. This only matters if your analyzer
* makes lots of alternate paths (e.g. contains
* SynonymFilter). */
private final int maxGraphExpansions;
/** Highest number of analyzed paths we saw for any single
* input surface form. For analyzers that never create
* graphs this will always be 1. */
private int maxAnalyzedPathsForOneInput;
private boolean hasPayloads;
private final int sepLabel;
private final int payloadSep;
private final int endByte;
private final int holeCharacter;
public static final int PAYLOAD_SEP = '\u001F';
public static final int HOLE_CHARACTER = '\u001E';
/** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer analyzer) {
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
}
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
* AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
}
/**
* Creates a new suggester.
*
* @param indexAnalyzer Analyzer that will be used for
* analyzing suggestions while building the index.
* @param queryAnalyzer Analyzer that will be used for
* analyzing query text during lookup
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of
* surface forms to keep for a single analyzed form.
* When there are too many surface forms we discard the
* lowest weighted ones.
* @param maxGraphExpansions Maximum number of graph paths
* to expand from the analyzed form. Set this to -1 for
* no limit.
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean preservePositionIncrements, FST> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte, int holeCharacter) {
// SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
this.fst = fst;
this.hasPayloads = hasPayloads;
if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
}
this.exactFirst = (options & EXACT_FIRST) != 0;
this.preserveSep = (options & PRESERVE_SEP) != 0;
// NOTE: this is just an implementation limitation; if
// somehow this is a problem we could fix it by using
// more than one byte to disambiguate ... but 256 seems
// like it should be way more then enough.
if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
}
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
}
this.maxGraphExpansions = maxGraphExpansions;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.preservePositionIncrements = preservePositionIncrements;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
this.holeCharacter = holeCharacter;
}
/** Returns byte size of the underlying FST. */
public long sizeInBytes() {
return fst == null ? 0 : fst.sizeInBytes();
}
private static void copyDestTransitions(State from, State to, List transitions) {
if (to.isAccept()) {
from.setAccept(true);
}
for(Transition t : to.getTransitions()) {
transitions.add(t);
}
}
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static void replaceSep(Automaton a, boolean preserveSep, int replaceSep) {
State[] states = a.getNumberedStates();
// Go in reverse topo sort so we know we only have to
// make one pass:
for(int stateNumber=states.length-1;stateNumber >=0;stateNumber--) {
final State state = states[stateNumber];
List newTransitions = new ArrayList();
for(Transition t : state.getTransitions()) {
assert t.getMin() == t.getMax();
if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
if (preserveSep) {
// Remap to SEP_LABEL:
newTransitions.add(new Transition(replaceSep, t.getDest()));
} else {
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
}
} else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
// match another hole at search time. Note that
// it will also match an empty-string token ... if
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
} else {
newTransitions.add(t);
}
}
state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
}
}
protected Automaton convertAutomaton(Automaton a) {
return a;
}
/** Just escapes the 0xff byte (which we still for SEP). */
private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
final BytesRef spare = new BytesRef();
private char sepLabel;
public EscapingTokenStreamToAutomaton(char sepLabel) {
this.sepLabel = sepLabel;
}
@Override
protected BytesRef changeToken(BytesRef in) {
int upto = 0;
for(int i=0;i {
private final boolean hasPayloads;
public AnalyzingComparator(boolean hasPayloads) {
this.hasPayloads = hasPayloads;
}
private final ByteArrayDataInput readerA = new ByteArrayDataInput();
private final ByteArrayDataInput readerB = new ByteArrayDataInput();
private final BytesRef scratchA = new BytesRef();
private final BytesRef scratchB = new BytesRef();
@Override
public int compare(BytesRef a, BytesRef b) {
// First by analyzed form:
readerA.reset(a.bytes, a.offset, a.length);
scratchA.length = readerA.readShort();
scratchA.bytes = a.bytes;
scratchA.offset = readerA.getPosition();
readerB.reset(b.bytes, b.offset, b.length);
scratchB.bytes = b.bytes;
scratchB.length = readerB.readShort();
scratchB.offset = readerB.getPosition();
int cmp = scratchA.compareTo(scratchB);
if (cmp != 0) {
return cmp;
}
readerA.skipBytes(scratchA.length);
readerB.skipBytes(scratchB.length);
// Next by cost:
long aCost = readerA.readInt();
long bCost = readerB.readInt();
if (aCost < bCost) {
return -1;
} else if (aCost > bCost) {
return 1;
}
// Finally by surface form:
if (hasPayloads) {
scratchA.length = readerA.readShort();
scratchA.offset = readerA.getPosition();
scratchB.length = readerB.readShort();
scratchB.offset = readerB.getPosition();
} else {
scratchA.offset = readerA.getPosition();
scratchA.length = a.length - scratchA.offset;
scratchB.offset = readerB.getPosition();
scratchB.length = b.length - scratchB.offset;
}
return scratchA.compareTo(scratchB);
}
}
@Override
public void build(InputIterator iterator) throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
File tempInput = File.createTempFile(prefix, ".input", directory);
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
hasPayloads = iterator.hasPayloads();
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
Sort.ByteSequencesReader reader = null;
BytesRef scratch = new BytesRef();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
boolean success = false;
byte buffer[] = new byte[8];
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef surfaceForm;
while ((surfaceForm = iterator.next()) != null) {
Set paths = toFiniteStrings(surfaceForm, ts2a);
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
for (IntsRef path : paths) {
Util.toBytesRef(path, scratch);
// length of the analyzed text (FST input)
if (scratch.length > Short.MAX_VALUE-2) {
throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
}
short analyzedLength = (short) scratch.length;
// compute the required length:
// analyzed sequence + weight (4) + surface + analyzedLength (short)
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
BytesRef payload;
if (hasPayloads) {
if (surfaceForm.length > (Short.MAX_VALUE-2)) {
throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
}
payload = iterator.payload();
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
} else {
payload = null;
}
buffer = ArrayUtil.grow(buffer, requiredLength);
output.reset(buffer);
output.writeShort(analyzedLength);
output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
output.writeInt(encodeWeight(iterator.weight()));
if (hasPayloads) {
for(int i=0;i outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST:
BytesRef previousAnalyzed = null;
BytesRef analyzed = new BytesRef();
BytesRef surface = new BytesRef();
IntsRef scratchInts = new IntsRef();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
// still index the hightest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
Set seenSurfaceForms = new HashSet();
int dedup = 0;
while (reader.read(scratch)) {
input.reset(scratch.bytes, scratch.offset, scratch.length);
short analyzedLength = input.readShort();
analyzed.grow(analyzedLength+2);
input.readBytes(analyzed.bytes, 0, analyzedLength);
analyzed.length = analyzedLength;
long cost = input.readInt();
surface.bytes = scratch.bytes;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = scratch.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRef();
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.equals(previousAnalyzed)) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// More than maxSurfaceFormsPerAnalyzedForm
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
// hairy because we'd need to special case the two
// (dup/not dup)...
// NOTE: must be byte 0 so we sort before whatever
// is next
analyzed.bytes[analyzed.offset+analyzed.length] = 0;
analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;
analyzed.length += 2;
Util.toIntsRef(analyzed, scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = scratch.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts, outputs.newPair(cost, br));
}
}
fst = builder.finish();
//PrintWriter pw = new PrintWriter("/tmp/out.dot");
//Util.toDot(fst, pw, true, true);
//pw.close();
success = true;
} finally {
if (success) {
IOUtils.close(reader, writer);
} else {
IOUtils.closeWhileHandlingException(reader, writer);
}
tempInput.delete();
tempSorted.delete();
}
}
@Override
public boolean store(OutputStream output) throws IOException {
DataOutput dataOut = new OutputStreamDataOutput(output);
try {
if (fst == null) {
return false;
}
fst.save(dataOut);
dataOut.writeVInt(maxAnalyzedPathsForOneInput);
dataOut.writeByte((byte) (hasPayloads ? 1 : 0));
} finally {
IOUtils.close(output);
}
return true;
}
@Override
public boolean load(InputStream input) throws IOException {
DataInput dataIn = new InputStreamDataInput(input);
try {
this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
maxAnalyzedPathsForOneInput = dataIn.readVInt();
hasPayloads = dataIn.readByte() == 1;
} finally {
IOUtils.close(input);
}
return true;
}
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {
LookupResult result;
if (hasPayloads) {
int sepIndex = -1;
for(int i=0;i= output2.length) {
return false;
}
for(int i=0;i lookup(final CharSequence key, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (fst == null) {
return Collections.emptyList();
}
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == holeCharacter) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == sepLabel) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRef spare = new CharsRef();
//System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
FST.Arc> scratchArc = new FST.Arc>();
final List results = new ArrayList();
List>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher> searcher;
searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
// maxSurfaceFormsPerAnalyzedForm had kicked in and
// pruned our exact match from one of these nodes
// ...:
for (FSTUtil.Path> path : prefixPaths) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
}
}
MinResult> completions[] = searcher.search();
// NOTE: this is rather inefficient: we enumerate
// every matching "exactly the same analyzed form"
// path, and then do linear scan to see if one of
// these exactly matches the input. It should be
// possible (though hairy) to do something similar
// to getByOutput, since the surface form is encoded
// into the FST output, so we more efficiently hone
// in on the exact surface-form match. Still, I
// suspect very little time is spent in this linear
// seach: it's bounded by how many prefix start
// nodes we have and the
// maxSurfaceFormsPerAnalyzedForm:
for(MinResult> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
Util.TopNSearcher> searcher;
searcher = new Util.TopNSearcher>(fst,
num - results.size(),
num * maxAnalyzedPathsForOneInput,
weightComparator) {
private final Set seen = new HashSet();
@Override
protected boolean acceptResult(IntsRef input, Pair output) {
// Dedup: when the input analyzes to a graph we
// can get duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// In exactFirst mode, don't accept any paths
// matching the surface form since that will
// create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// We found exact match, which means we should
// have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
MinResult> completions[] = searcher.search();
for(MinResult> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
//System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// In the exactFirst=true case the search may
// produce one extra path
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
/** Returns all completion paths to initialize the search. */
protected List>> getFullPrefixPaths(List>> prefixPaths,
Automaton lookupAutomaton,
FST> fst)
throws IOException {
return prefixPaths;
}
public final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
return toFiniteStrings(ts2a, ts);
}
public final Set toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException {
// Analyze surface form:
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// separator between tokens:
Automaton automaton = ts2a.toAutomaton(ts);
ts.close();
replaceSep(automaton, preserveSep, sepLabel);
assert SpecialOperations.isFinite(automaton);
// Get all paths from the automaton (there can be
// more than one path, eg if the analyzer created a
// graph using SynFilter or WDF):
// TODO: we could walk & add simultaneously, so we
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// Turn tokenstream into automaton:
TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
ts.close();
// TODO: we could use the end offset to "guess"
// whether the final token was a partial token; this
// would only be a heuristic ... but maybe an OK one.
// This way we could eg differentiate "net" from "net ",
// which we can't today...
replaceSep(automaton, preserveSep, sepLabel);
// TODO: we can optimize this somewhat by determinizing
// while we convert
BasicOperations.determinize(automaton);
return automaton;
}
/**
* Returns the weight associated with an input string,
* or null if it does not exist.
*/
public Object get(CharSequence key) {
throw new UnsupportedOperationException();
}
/** cost -> weight */
public static int decodeWeight(long encoded) {
return (int)(Integer.MAX_VALUE - encoded);
}
/** weight -> cost */
public static int encodeWeight(long value) {
if (value < 0 || value > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("cannot encode value: " + value);
}
return Integer.MAX_VALUE - (int)value;
}
static final Comparator> weightComparator = new Comparator> () {
@Override
public int compare(Pair left, Pair right) {
return left.output1.compareTo(right.output1);
}
};
public static class XBuilder {
private Builder> builder;
private int maxSurfaceFormsPerAnalyzedForm;
private IntsRef scratchInts = new IntsRef();
private final PairOutputs outputs;
private boolean hasPayloads;
private BytesRef analyzed = new BytesRef();
private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
private int count;
private ObjectIntOpenHashMap seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
private int payloadSep;
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
this.payloadSep = payloadSep;
this.outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs);
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
this.hasPayloads = hasPayloads;
surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];
}
public void startTerm(BytesRef analyzed) {
this.analyzed.copyBytes(analyzed);
this.analyzed.grow(analyzed.length+2);
}
private final static class SurfaceFormAndPayload implements Comparable {
BytesRef payload;
long weight;
public SurfaceFormAndPayload(BytesRef payload, long cost) {
super();
this.payload = payload;
this.weight = cost;
}
@Override
public int compareTo(SurfaceFormAndPayload o) {
int res = compare(weight, o.weight);
if (res == 0 ){
return payload.compareTo(o.payload);
}
return res;
}
public static int compare(long x, long y) {
return (x < y) ? -1 : ((x == y) ? 0 : 1);
}
}
public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException {
int surfaceIndex = -1;
long encodedWeight = cost == -1 ? cost : encodeWeight(cost);
/*
* we need to check if we have seen this surface form, if so only use the
* the surface form with the highest weight and drop the rest no matter if
* the payload differs.
*/
if (count >= maxSurfaceFormsPerAnalyzedForm) {
// More than maxSurfaceFormsPerAnalyzedForm
// dups: skip the rest:
return;
}
BytesRef surfaceCopy;
if (count > 0 && seenSurfaceForms.containsKey(surface)) {
surfaceIndex = seenSurfaceForms.lget();
SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex];
if (encodedWeight >= surfaceFormAndPayload.weight) {
return;
}
surfaceCopy = BytesRef.deepCopyOf(surface);
} else {
surfaceIndex = count++;
surfaceCopy = BytesRef.deepCopyOf(surface);
seenSurfaceForms.put(surfaceCopy, surfaceIndex);
}
BytesRef payloadRef;
if (!hasPayloads) {
payloadRef = surfaceCopy;
} else {
int len = surface.length + 1 + payload.length;
final BytesRef br = new BytesRef(len);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
br.length = len;
payloadRef = br;
}
if (surfaceFormsAndPayload[surfaceIndex] == null) {
surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight);
} else {
surfaceFormsAndPayload[surfaceIndex].payload = payloadRef;
surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight;
}
}
public void finishTerm(long defaultWeight) throws IOException {
ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
int deduplicator = 0;
analyzed.bytes[analyzed.offset + analyzed.length] = 0;
analyzed.length += 2;
for (int i = 0; i < count; i++) {
analyzed.bytes[analyzed.offset + analyzed.length - 1 ] = (byte) deduplicator++;
Util.toIntsRef(analyzed, scratchInts);
SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i];
long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight;
builder.add(scratchInts, outputs.newPair(cost, candiate.payload));
}
seenSurfaceForms.clear();
count = 0;
}
public FST> build() throws IOException {
return builder.finish();
}
public boolean hasPayloads() {
return hasPayloads;
}
public int maxSurfaceFormsPerAnalyzedForm() {
return maxSurfaceFormsPerAnalyzedForm;
}
}
}