// org.apache.lucene.search.suggest.fst.FSTCompletionBuilder Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.fst;
import java.io.Closeable;
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.search.suggest.InMemorySorter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.*;
import org.apache.lucene.util.packed.PackedInts;
/**
* Finite state automata based implementation of "autocomplete" functionality.
*
* Implementation details
*
*
* The construction step in {@link #finalize()} works as follows:
*
* - A set of input terms and their buckets is given.
* - All terms in the input are prefixed with a synthetic pseudo-character
* (code) of the weight bucket the term fell into. For example a term
* {@code abc} with a discretized weight equal to '1' would become
* {@code 1abc}.
* - The terms are then sorted by their raw value of UTF-8 character values
* (including the synthetic bucket code in front).
* - A finite state automaton ({@link FST}) is constructed from the input. The
* root node has arcs labeled with all possible weights. We cache all these
* arcs, highest-weight first.
*
*
*
* At runtime, in {@link FSTCompletion#lookup(CharSequence, int)},
* the automaton is utilized as follows:
*
* - For each possible term weight encoded in the automaton (cached arcs from
* the root above), starting with the highest one, we descend along the path of
* the input key. If the key is not a prefix of a sequence in the automaton
* (path ends prematurely), we exit immediately -- no completions.
* - Otherwise, we have found an internal automaton node that ends the key.
* The entire subautomaton (all paths) starting from this node forms the key's
* completions. We start the traversal of this subautomaton. Every time we
* reach a final state (arc), we add a single suggestion to the list of results
* (the weight of this suggestion is constant and equal to the root path we
* started from). The tricky part is that because automaton edges are sorted and
* we scan depth-first, we can terminate the entire procedure as soon as we
* collect as many suggestions as the user requested.
* - In case the number of suggestions collected in the step above is still
* insufficient, we proceed to the next (smaller) weight leaving the root node
* and repeat the same algorithm again.
*
*
* Runtime behavior and performance characteristics
*
* The algorithm described above is optimized for finding suggestions to short
* prefixes in a top-weights-first order. This is probably the most common use
* case: it allows presenting suggestions early and sorts them by the global
* frequency (and then alphabetically).
*
*
* If there is an exact match in the automaton, it is returned first on the
* results list (even with by-weight sorting).
*
*
* Note that the maximum lookup time for any prefix is the time of
* descending to the subtree, plus traversal of the subtree up to the number of
* requested suggestions (because they are already presorted by weight on the
* root level and alphabetically at any node level).
*
*
* To order alphabetically only (no ordering by priorities), use identical term
* weights for all terms. Alphabetical suggestions are returned even if
* non-constant weights are used, but the algorithm for doing this is
* suboptimal.
*
*
* "alphabetically" in any of the documentation above indicates UTF-8
* representation order, nothing else.
*
*
* NOTE: the FST file format is experimental and subject to sudden
* change, requiring you to rebuild the FST suggest index.
*
* @see FSTCompletion
* @lucene.experimental
*/
public class FSTCompletionBuilder {
/**
* Default number of buckets.
*/
public static final int DEFAULT_BUCKETS = 10;
/**
* The number of separate buckets for weights (discretization). The more
* buckets, the more fine-grained term weights (priorities) can be assigned.
* The speed of lookup will not decrease for prefixes which have
* highly-weighted completions (because these are filled-in first), but will
* decrease significantly for low-weighted terms (but these should be
* infrequent, so it is all right).
*
*
* The number of buckets must be within [1, 255] range.
*/
private final int buckets;
/**
* Finite state automaton encoding all the lookup terms. See class notes for
* details.
*/
FST