
org.apache.lucene.search.suggest.fst.FSTCompletionLookup Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.fst;

import static org.apache.lucene.util.fst.FST.readMetadata;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;

/**
 * An adapter from the {@link Lookup} API to {@link FSTCompletion}.
 *
 * <p>This adapter differs from {@link FSTCompletion} in that it attempts to discretize any
 * "weights" passed in via {@link InputIterator#weight()} to match the number of buckets. For
 * the rationale behind bucketing, see {@link FSTCompletion}.
 *
 * <p><b>Note:</b> Discretization requires an additional sorting pass.
 *
 * <p>The range of weights for bucketing/discretization is determined by sorting the input by
 * weight and then dividing into equal ranges. Then, scores within each range are assigned to that
 * bucket.
 *
 * <p>Note that this means that even large differences in weights may be lost during automaton
 * construction, but the overall distinction between "classes" of weights will be preserved
 * regardless of the distribution of weights.
 *
 * <p>For fine-grained control over which weights are assigned to which buckets, use {@link
 * FSTCompletion} directly or {@link TSTLookup}, for example.
 *
 * @see FSTCompletion
 * @lucene.experimental
 */
public class FSTCompletionLookup extends Lookup {
  /**
   * An invalid bucket count if we're creating an object of this class from an existing FST.
   *
   * @see #FSTCompletionLookup(Directory, String, FSTCompletion, boolean)
   */
  private static int INVALID_BUCKETS_COUNT = -1;

  /**
   * Shared tail length for conflating in the created automaton. Setting this to larger values
   * ({@link Integer#MAX_VALUE}) will create smaller (or minimal) automata at the cost of RAM for
   * keeping the nodes hash in the {@link FST}.
   *
   * <p>Empirical pick.
   */
  private static final int sharedTailLength = 5;

  private final Directory tempDir;
  private final String tempFileNamePrefix;

  private int buckets;
  private boolean exactMatchFirst;

  /** Automaton used for completions with higher weights reordering. */
  private FSTCompletion higherWeightsCompletion;

  /** Automaton used for normal completions. */
  private FSTCompletion normalCompletion;

  /** Number of entries the lookup was built with. */
  private volatile long count = 0;

  /** This constructor should only be used to read a previously saved suggester. */
  public FSTCompletionLookup() {
    this(null, null);
  }

  /**
   * This constructor prepares for creating a suggester FST using the {@link #build(InputIterator)}
   * method. The number of weight discretization buckets is set to {@link
   * FSTCompletion#DEFAULT_BUCKETS} and exact matches are promoted to the top of the suggestions
   * list.
   */
  public FSTCompletionLookup(Directory tempDir, String tempFileNamePrefix) {
    this(tempDir, tempFileNamePrefix, FSTCompletion.DEFAULT_BUCKETS, true);
  }

  /**
   * This constructor prepares for creating a suggester FST using the {@link #build(InputIterator)}
   * method.
   *
   * @param buckets The number of weight discretization buckets (see {@link FSTCompletion} for
   *     details).
   * @param exactMatchFirst If <code>true</code> exact matches are promoted to the top of the
   *     suggestions list. Otherwise they appear in the order of discretized weight and
   *     alphabetically within the bucket.
   */
  public FSTCompletionLookup(
      Directory tempDir, String tempFileNamePrefix, int buckets, boolean exactMatchFirst) {
    this.buckets = buckets;
    this.exactMatchFirst = exactMatchFirst;
    this.tempDir = tempDir;
    this.tempFileNamePrefix = tempFileNamePrefix;
  }

  /**
   * This constructor takes a pre-built automaton.
   *
   * @param completion An instance of {@link FSTCompletion}.
   * @param exactMatchFirst If <code>true</code> exact matches are promoted to the top of the
   *     suggestions list. Otherwise they appear in the order of discretized weight and
   *     alphabetically within the bucket.
   */
  public FSTCompletionLookup(
      Directory tempDir,
      String tempFileNamePrefix,
      FSTCompletion completion,
      boolean exactMatchFirst) {
    this(tempDir, tempFileNamePrefix, INVALID_BUCKETS_COUNT, exactMatchFirst);
    this.normalCompletion = new FSTCompletion(completion.getFST(), false, exactMatchFirst);
    this.higherWeightsCompletion = new FSTCompletion(completion.getFST(), true, exactMatchFirst);
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
    ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
    IndexOutput tempInput =
        tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    String tempSortedFileName = null;

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    try {
      byte[] buffer = new byte[0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef spare;
      int inputLineCount = 0;
      while ((spare = iterator.next()) != null) {
        if (spare.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, spare.length + 4);
        }

        output.reset(buffer);
        final int encodedWeight = encodeWeight(iterator.weight());
        // write bytes for comparing in lexicographic order
        output.writeInt(Integer.reverseBytes(encodedWeight));
        output.writeBytes(spare.bytes, spare.offset, spare.length);
        writer.write(buffer, 0, output.getPosition());
        inputLineCount++;
      }
      CodecUtil.writeFooter(tempInput);
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
      tempSortedFileName = sorter.sort(tempInput.getName());
      tempDir.deleteFile(tempInput.getName());

      FSTCompletionBuilder builder =
          new FSTCompletionBuilder(buckets, externalSorter, sharedTailLength);

      reader =
          new OfflineSorter.ByteSequencesReader(
              tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE),
              tempSortedFileName);
      long line = 0;
      int previousBucket = 0;
      int previousScore = 0;
      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef tmp2 = new BytesRef();
      long newCount = 0;
      while (true) {
        BytesRef scratch = reader.next();
        if (scratch == null) {
          break;
        }
        input.reset(scratch.bytes, scratch.offset, scratch.length);
        int currentScore = input.readInt();

        int bucket;
        if (line > 0 && currentScore == previousScore) {
          bucket = previousBucket;
        } else {
          bucket = (int) (line * buckets / inputLineCount);
        }
        previousScore = currentScore;
        previousBucket = bucket;

        // Only append the input, discard the weight.
        tmp2.bytes = scratch.bytes;
        tmp2.offset = scratch.offset + input.getPosition();
        tmp2.length = scratch.length - input.getPosition();
        builder.add(tmp2, bucket);

        line++;
        newCount++;
      }

      // The two FSTCompletions share the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion =
          new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
      this.count = newCount;

    } finally {
      IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
      IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
  }

  /** weight -> cost */
  private static int encodeWeight(long value) {
    if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
      throw new UnsupportedOperationException("cannot encode value: " + value);
    }
    return (int) value;
  }

  @Override
  public List<LookupResult> lookup(
      CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    final List<Completion> completions;
    if (higherWeightsFirst) {
      completions = higherWeightsCompletion.lookup(key, num);
    } else {
      completions = normalCompletion.lookup(key, num);
    }

    final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
    CharsRefBuilder spare = new CharsRefBuilder();
    for (Completion c : completions) {
      spare.copyUTF8Bytes(c.utf8);
      results.add(new LookupResult(spare.toString(), c.bucket));
    }
    return results;
  }

  /**
   * Returns the bucket (weight) as a Long for the provided key if it exists, otherwise null if it
   * does not.
   */
  public Object get(CharSequence key) {
    final int bucket = normalCompletion.getBucket(key);
    return bucket == -1 ? null : Long.valueOf(bucket);
  }

  @Override
  public synchronized boolean store(DataOutput output) throws IOException {
    output.writeVLong(count);
    if (normalCompletion == null || normalCompletion.getFST() == null) {
      return false;
    }
    normalCompletion.getFST().save(output, output);
    return true;
  }

  @Override
  public synchronized boolean load(DataInput input) throws IOException {
    count = input.readVLong();
    this.higherWeightsCompletion =
        new FSTCompletion(new FST<>(readMetadata(input, NoOutputs.getSingleton()), input));
    this.normalCompletion =
        new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    return true;
  }

  @Override
  public long ramBytesUsed() {
    long mem =
        RamUsageEstimator.shallowSizeOf(this)
            + RamUsageEstimator.shallowSizeOf(normalCompletion)
            + RamUsageEstimator.shallowSizeOf(higherWeightsCompletion);
    if (normalCompletion != null) {
      mem += normalCompletion.getFST().ramBytesUsed();
    }
    if (higherWeightsCompletion != null
        && (normalCompletion == null
            || normalCompletion.getFST() != higherWeightsCompletion.getFST())) {
      // the fst should be shared between the 2 completion instances, don't count it twice
      mem += higherWeightsCompletion.getFST().ramBytesUsed();
    }
    return mem;
  }

  @Override
  public Collection<Accountable> getChildResources() {
    List<Accountable> resources = new ArrayList<>();
    if (normalCompletion != null) {
      resources.add(Accountables.namedAccountable("fst", normalCompletion.getFST()));
    }
    if (higherWeightsCompletion != null
        && (normalCompletion == null
            || normalCompletion.getFST() != higherWeightsCompletion.getFST())) {
      resources.add(
          Accountables.namedAccountable("higher weights fst", higherWeightsCompletion.getFST()));
    }
    return Collections.unmodifiableList(resources);
  }

  @Override
  public long getCount() {
    return count;
  }
}
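
For context, here is a minimal usage sketch of the class above. It is not part of the Lucene source: the class name FSTCompletionLookupExample and the sample entries are made up for illustration, and it assumes a Lucene 9.x lucene-suggest jar on the classpath. It uses FileDictionary (which parses tab-separated "term<TAB>weight" lines into an InputIterator) to feed build(), then queries the suggester via the convenience Lookup.lookup(key, higherWeightsFirst, num) method.

import java.io.StringReader;
import java.util.List;
import org.apache.lucene.search.suggest.FileDictionary;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

// Hypothetical example class, not part of Lucene.
public class FSTCompletionLookupExample {
  public static void main(String[] args) throws Exception {
    // In-memory Directory used by the OfflineSorter pass inside build().
    Directory tempDir = new ByteBuffersDirectory();

    // Sample "term<TAB>weight" entries; FileDictionary's default delimiter is TAB.
    String entries = "bar\t10\nbarn\t7\nbarista\t4\nbarium\t2\n";
    FileDictionary dict = new FileDictionary(new StringReader(entries));

    // Default bucket count (FSTCompletion.DEFAULT_BUCKETS), exact matches first.
    FSTCompletionLookup lookup = new FSTCompletionLookup(tempDir, "suggest");
    lookup.build(dict.getEntryIterator());

    // Top 3 completions of "bar", highest-weight buckets first.
    List<LookupResult> results = lookup.lookup("bar", true, 3);
    for (LookupResult r : results) {
      // r.value holds the discretized bucket, not the original weight,
      // as get(CharSequence) in the source above also documents.
      System.out.println(r.key + " -> bucket " + r.value);
    }
  }
}

Note that weights are quantized into buckets during build(), so the returned value reflects a bucket index rather than the original weight; the built suggester can also be persisted and restored with store(DataOutput) and load(DataInput) as shown in the class above.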




