/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.fst;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
/**
* An adapter from the {@link Lookup} API to {@link FSTCompletion}.
*
* <p>This adapter differs from {@link FSTCompletion} in that it attempts to discretize any
* "weights" as returned by {@link InputIterator#weight()} to match the number of buckets. For
* the rationale for bucketing, see {@link FSTCompletion}.
*
* <p><b>Note:</b> Discretization requires an additional sorting pass.
*
* <p>The range of weights for bucketing/discretization is determined by sorting the input by
* weight and then dividing into equal ranges. Then, scores within each range are assigned to that
* bucket.
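*
* <p>For example: with 2 buckets and four inputs whose weights are 1, 2, 5 and 100, the two
* lowest-weighted entries fall into bucket 0 and the two highest into bucket 1; only the rank
* order of the weights matters, not their absolute gaps.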
*
* <p>Note that this means that even large differences in weights may be lost during automaton
* construction, but the overall distinction between "classes" of weights will be preserved
* regardless of the distribution of weights.
*
* <p>For fine-grained control over which weights are assigned to which buckets, use {@link
* FSTCompletion} directly or {@link TSTLookup}, for example.
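*
* <p>Typical usage (a minimal sketch; {@code dir}, {@code prefix} and {@code iterator} are
* placeholder names, not part of this API):
*
* <pre>{@code
* FSTCompletionLookup lookup = new FSTCompletionLookup(dir, prefix);
* lookup.build(iterator); // the iterator must not supply payloads or contexts
* List<LookupResult> top5 = lookup.lookup("ba", null, true, 5); // higher weights first
* }</pre>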
*
* @see FSTCompletion
* @lucene.experimental
*/
public class FSTCompletionLookup extends Lookup {
/**
* An invalid bucket count if we're creating an object of this class from an existing FST.
*
* @see #FSTCompletionLookup(Directory, String, FSTCompletion, boolean)
*/
private static final int INVALID_BUCKETS_COUNT = -1;
/**
* Shared tail length for conflating in the created automaton. Setting this to larger values
* ({@link Integer#MAX_VALUE}) will create smaller (or minimal) automata at the cost of RAM for
* keeping nodes hash in the {@link FST}.
*
* <p>Empirical pick.
*/
private static final int sharedTailLength = 5;
private final Directory tempDir;
private final String tempFileNamePrefix;
private int buckets;
private boolean exactMatchFirst;
/** Automaton used for completions, with higher-weight suggestions reordered first. */
private FSTCompletion higherWeightsCompletion;
/** Automaton used for normal completions. */
private FSTCompletion normalCompletion;
/** Number of entries the lookup was built with. */
private volatile long count = 0;
/**
* This constructor should only be used to read a previously saved suggester.
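*
* <p>A minimal restore sketch ({@code in} is a placeholder for a {@link DataInput} positioned
* at data previously written by {@link #store(DataOutput)}):
*
* <pre>{@code
* FSTCompletionLookup lookup = new FSTCompletionLookup();
* lookup.load(in);
* }</pre>
*/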
public FSTCompletionLookup() {
this(null, null);
}
/**
* This constructor prepares for building the suggester FST using the {@link #build(InputIterator)}
* method. The number of weight discretization buckets is set to {@link
* FSTCompletion#DEFAULT_BUCKETS} and exact matches are promoted to the top of the suggestions
* list.
*/
public FSTCompletionLookup(Directory tempDir, String tempFileNamePrefix) {
this(tempDir, tempFileNamePrefix, FSTCompletion.DEFAULT_BUCKETS, true);
}
/**
* This constructor prepares for building the suggester FST using the {@link #build(InputIterator)}
* method.
*
* @param buckets The number of weight discretization buckets (see {@link FSTCompletion} for
* details).
* @param exactMatchFirst If {@code true}, exact matches are promoted to the top of the
* suggestions list. Otherwise they appear in order of discretized weight, and alphabetically
* within each bucket.
*/
public FSTCompletionLookup(
Directory tempDir, String tempFileNamePrefix, int buckets, boolean exactMatchFirst) {
this.buckets = buckets;
this.exactMatchFirst = exactMatchFirst;
this.tempDir = tempDir;
this.tempFileNamePrefix = tempFileNamePrefix;
}
/**
* This constructor takes a pre-built automaton.
*
* @param completion An instance of {@link FSTCompletion}.
* @param exactMatchFirst If {@code true}, exact matches are promoted to the top of the
* suggestions list. Otherwise they appear in order of discretized weight, and alphabetically
* within each bucket.
*/
public FSTCompletionLookup(
Directory tempDir,
String tempFileNamePrefix,
FSTCompletion completion,
boolean exactMatchFirst) {
this(tempDir, tempFileNamePrefix, INVALID_BUCKETS_COUNT, exactMatchFirst);
this.normalCompletion = new FSTCompletion(completion.getFST(), false, exactMatchFirst);
this.higherWeightsCompletion = new FSTCompletion(completion.getFST(), true, exactMatchFirst);
}
@Override
public void build(InputIterator iterator) throws IOException {
if (iterator.hasPayloads()) {
throw new IllegalArgumentException("this suggester doesn't support payloads");
}
if (iterator.hasContexts()) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
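// Two sorts are involved: 'sorter' orders the weight-prefixed entries below to derive
// buckets, while 'externalSorter' is later handed to FSTCompletionBuilder to sort the
// bucket-prefixed keys before the automaton is built.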
IndexOutput tempInput =
tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
String tempSortedFileName = null;
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
// Push the encoded weights up front before the sequences so that sorting the raw bytes
// orders entries by weight. For now, assume weights are non-negative. If negative weights
// were allowed, some trickery would be needed to preserve their byte order.
try {
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef spare;
int inputLineCount = 0;
while ((spare = iterator.next()) != null) {
if (spare.length + 4 >= buffer.length) {
buffer = ArrayUtil.growNoCopy(buffer, spare.length + 4);
}
output.reset(buffer);
final int encodedWeight = encodeWeight(iterator.weight());
// Write the weight first, byte-reversed so that the sorter's raw-byte comparison orders
// entries by weight (weights are assumed non-negative, see above).
output.writeInt(Integer.reverseBytes(encodedWeight));
output.writeBytes(spare.bytes, spare.offset, spare.length);
writer.write(buffer, 0, output.getPosition());
inputLineCount++;
}
CodecUtil.writeFooter(tempInput);
writer.close();
// We don't know the distribution of scores and we need to bucket them, so we'll sort
// and divide into equal buckets.
tempSortedFileName = sorter.sort(tempInput.getName());
tempDir.deleteFile(tempInput.getName());
FSTCompletionBuilder builder =
new FSTCompletionBuilder(buckets, externalSorter, sharedTailLength);
reader =
new OfflineSorter.ByteSequencesReader(
tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE),
tempSortedFileName);
long line = 0;
int previousBucket = 0;
int previousScore = 0;
ByteArrayDataInput input = new ByteArrayDataInput();
BytesRef tmp2 = new BytesRef();
long newCount = 0;
while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
break;
}
input.reset(scratch.bytes, scratch.offset, scratch.length);
int currentScore = input.readInt();
int bucket;
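// Inputs are sorted by weight, so bucket = line * buckets / inputLineCount splits them into
// (roughly) equal-sized rank ranges; equal scores reuse the previous bucket so that ties
// never straddle a bucket boundary.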
if (line > 0 && currentScore == previousScore) {
bucket = previousBucket;
} else {
bucket = (int) (line * buckets / inputLineCount);
}
previousScore = currentScore;
previousBucket = bucket;
// Only append the input, discard the weight.
tmp2.bytes = scratch.bytes;
tmp2.offset = scratch.offset + input.getPosition();
tmp2.length = scratch.length - input.getPosition();
builder.add(tmp2, bucket);
line++;
newCount++;
}
// The two FSTCompletions share the same automaton.
this.higherWeightsCompletion = builder.build();
this.normalCompletion =
new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
this.count = newCount;
} finally {
IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
}
}
/** Encodes a weight (long) into the 32-bit cost prepended to each key for sorting. */
private static int encodeWeight(long value) {
if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("cannot encode value: " + value);
}
return (int) value;
}
@Override
public List<LookupResult> lookup(
CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
final List<Completion> completions;
if (higherWeightsFirst) {
completions = higherWeightsCompletion.lookup(key, num);
} else {
completions = normalCompletion.lookup(key, num);
}
final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
CharsRefBuilder spare = new CharsRefBuilder();
for (Completion c : completions) {
spare.copyUTF8Bytes(c.utf8);
results.add(new LookupResult(spare.toString(), c.bucket));
}
return results;
}
/**
* Returns the bucket (discretized weight) for the provided key as a Long, or {@code null} if
* the key does not exist.
*/
public Object get(CharSequence key) {
final int bucket = normalCompletion.getBucket(key);
return bucket == -1 ? null : Long.valueOf(bucket);
}
@Override
public synchronized boolean store(DataOutput output) throws IOException {
output.writeVLong(count);
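// The entry count is written unconditionally; load(DataInput) reads it back before the FST.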
if (normalCompletion == null || normalCompletion.getFST() == null) {
return false;
}
normalCompletion.getFST().save(output, output);
return true;
}
@Override
public synchronized boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.higherWeightsCompletion =
new FSTCompletion(new FST<>(readMetadata(input, NoOutputs.getSingleton()), input));
this.normalCompletion =
new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
return true;
}
@Override
public long ramBytesUsed() {
long mem =
RamUsageEstimator.shallowSizeOf(this)
+ RamUsageEstimator.shallowSizeOf(normalCompletion)
+ RamUsageEstimator.shallowSizeOf(higherWeightsCompletion);
if (normalCompletion != null) {
mem += normalCompletion.getFST().ramBytesUsed();
}
if (higherWeightsCompletion != null
&& (normalCompletion == null
|| normalCompletion.getFST() != higherWeightsCompletion.getFST())) {
// the fst should be shared between the 2 completion instances, don't count it twice
mem += higherWeightsCompletion.getFST().ramBytesUsed();
}
return mem;
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
if (normalCompletion != null) {
resources.add(Accountables.namedAccountable("fst", normalCompletion.getFST()));
}
if (higherWeightsCompletion != null
&& (normalCompletion == null
|| normalCompletion.getFST() != higherWeightsCompletion.getFST())) {
resources.add(
Accountables.namedAccountable("higher weights fst", higherWeightsCompletion.getFST()));
}
return Collections.unmodifiableList(resources);
}
@Override
public long getCount() {
return count;
}
}