All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.suggest.fst.FSTCompletionBuilder Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.fst;

import java.io.Closeable;
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.search.suggest.InMemorySorter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

/**
 * Finite state automata based implementation of "autocomplete" functionality.
 *
 * 

Implementation details

* *

The construction step in {@link #build()} works as follows: * *

    *
  • A set of input terms and their buckets is given. *
  • All terms in the input are prefixed with a synthetic pseudo-character (code) of the weight * bucket the term fell into. For example a term abc with a discretized weight * equal '1' would become 1abc. *
  • The terms are then sorted by their raw value of UTF-8 character values (including the * synthetic bucket code in front). *
  • A finite state automaton ({@link FST}) is constructed from the input. The root node has * arcs labeled with all possible weights. We cache all these arcs, highest-weight first. *
* *

At runtime, in {@link FSTCompletion#lookup(CharSequence, int)}, the automaton is utilized as * follows: * *

    *
  • For each possible term weight encoded in the automaton (cached arcs from the root above), * starting with the highest one, we descend along the path of the input key. If the key is * not a prefix of a sequence in the automaton (path ends prematurely), we exit immediately -- * no completions. *
  • Otherwise, we have found an internal automaton node that ends the key. The entire * subautomaton (all paths) starting from this node form the key's completions. We start * the traversal of this subautomaton. Every time we reach a final state (arc), we add a * single suggestion to the list of results (the weight of this suggestion is constant and * equal to the root path we started from). The tricky part is that because automaton edges * are sorted and we scan depth-first, we can terminate the entire procedure as soon as we * collect enough suggestions the user requested. *
  • In case the number of suggestions collected in the step above is still insufficient, we * proceed to the next (smaller) weight leaving the root node and repeat the same algorithm * again. *
* *

Runtime behavior and performance characteristic

* * The algorithm described above is optimized for finding suggestions to short prefixes in a * top-weights-first order. This is probably the most common use case: it allows presenting * suggestions early and sorts them by the global frequency (and then alphabetically). * *

If there is an exact match in the automaton, it is returned first on the results list (even * with by-weight sorting). * *

Note that the maximum lookup time for any prefix is the time of descending to the * subtree, plus traversal of the subtree up to the number of requested suggestions (because they * are already presorted by weight on the root level and alphabetically at any node level). * *

To order alphabetically only (no ordering by priorities), use identical term weights for all * terms. Alphabetical suggestions are returned even if non-constant weights are used, but the * algorithm for doing this is suboptimal. * *

"alphabetically" in any of the documentation above indicates UTF-8 representation order, * nothing else. * *

NOTE: the FST file format is experimental and subject to suddenly change, requiring you * to rebuild the FST suggest index. * * @see FSTCompletion * @lucene.experimental */ public class FSTCompletionBuilder { /** Default number of buckets. */ public static final int DEFAULT_BUCKETS = 10; /** * The number of separate buckets for weights (discretization). The more buckets, the more * fine-grained term weights (priorities) can be assigned. The speed of lookup will not decrease * for prefixes which have highly-weighted completions (because these are filled-in first), but * will decrease significantly for low-weighted terms (but these should be infrequent, so it is * all right). * *

The number of buckets must be within [1, 255] range. */ private final int buckets; /** Finite state automaton encoding all the lookup terms. See class notes for details. */ FST automaton; /** * FST construction require re-sorting the input. This is the class that collects all the input * entries, their weights and then provides sorted order. */ private final BytesRefSorter sorter; /** Scratch buffer for {@link #add(BytesRef, int)}. */ private final BytesRefBuilder scratch = new BytesRefBuilder(); /** Max tail sharing RAM (MB). */ private final double suffixRAMLimitMB; /** * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match promoted to * first position and {@link InMemorySorter} with a comparator obtained from {@link * Comparator#naturalOrder()}. */ public FSTCompletionBuilder() { this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Double.POSITIVE_INFINITY); } /** * Creates an FSTCompletion with the specified options. * * @param buckets The number of buckets for weight discretization. Buckets are used in {@link * #add(BytesRef, int)} and must be smaller than the number given here. * @param sorter {@link BytesRefSorter} used for re-sorting input for the automaton. For large * inputs, use on-disk sorting implementations. The sorter is closed automatically in {@link * #build()} if it implements {@link Closeable}. * @param suffixRAMLimitMB Max shared suffix RAM size (MB). *

See the description of this parameter in {@link * org.apache.lucene.util.fst.FSTCompiler.Builder}. In general, for very large inputs you'll * want to construct a non-minimal automaton which will be larger, but the construction will * take far less ram. For minimal automata, set it to {@link Double#MAX_VALUE}. */ public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, double suffixRAMLimitMB) { if (buckets < 1 || buckets > 255) { throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: " + buckets); } if (sorter == null) { throw new IllegalArgumentException("BytesRefSorter must not be null."); } this.sorter = sorter; this.buckets = buckets; this.suffixRAMLimitMB = suffixRAMLimitMB; } /** * Appends a single suggestion and its weight to the internal buffers. * * @param utf8 The suggestion (utf8 representation) to be added. The content is copied and the * object can be reused. * @param bucket The bucket to place this suggestion in. Must be non-negative and smaller than the * number of buckets passed in the constructor. Higher numbers indicate suggestions that * should be presented before suggestions placed in smaller buckets. */ public void add(BytesRef utf8, int bucket) throws IOException { if (bucket < 0 || bucket >= buckets) { throw new IllegalArgumentException( "Bucket outside of the allowed range [0, " + buckets + "): " + bucket); } scratch.growNoCopy(utf8.length + 10); scratch.clear(); scratch.append((byte) bucket); scratch.append(utf8); sorter.add(scratch.get()); } /** * Builds the final automaton from a list of added entries. This method may take a longer while as * it needs to build the automaton. */ public FSTCompletion build() throws IOException { this.automaton = buildAutomaton(sorter); if (sorter instanceof Closeable) { ((Closeable) sorter).close(); } return new FSTCompletion(automaton); } /** Builds the final automaton from a list of entries. */ private FST buildAutomaton(BytesRefSorter sorter) throws IOException { // Build the automaton. final Outputs outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); final FSTCompiler fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs) .suffixRAMLimitMB(suffixRAMLimitMB) .build(); BytesRefBuilder scratch = new BytesRefBuilder(); BytesRef entry; final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); int count = 0; BytesRefIterator iter = sorter.iterator(); while ((entry = iter.next()) != null) { count++; if (scratch.get().compareTo(entry) != 0) { fstCompiler.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } return count == 0 ? null : FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); } }