org.apache.lucene.search.suggest.fst.FSTCompletionLookup Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-suggest Show documentation
Apache Lucene (module: suggest)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.fst;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;

/**
 * An adapter from {@link Lookup} API to {@link FSTCompletion}.
 * 
 * This adapter differs from {@link FSTCompletion} in that it attempts
 * to discretize any "weights" as passed from in {@link InputIterator#weight()}
 * to match the number of buckets. For the rationale for bucketing, see
 * {@link FSTCompletion}.
 * 
 * 
Note:Discretization requires an additional sorting pass.
 * 
 * 
The range of weights for bucketing/ discretization is determined 
 * by sorting the input by weight and then dividing into
 * equal ranges. Then, scores within each range are assigned to that bucket. 
 * 
 * 
Note that this means that even large differences in weights may be lost 
 * during automaton construction, but the overall distinction between "classes"
 * of weights will be preserved regardless of the distribution of weights. 
 * 
 * 
For fine-grained control over which weights are assigned to which buckets,
 * use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
 * 
 * @see FSTCompletion
 * @lucene.experimental
 */
public class FSTCompletionLookup extends Lookup implements Accountable {
  /** 
   * An invalid bucket count if we're creating an object
   * of this class from an existing FST.
   * 
   * @see #FSTCompletionLookup(Directory, String, FSTCompletion, boolean)
   */
  private static int INVALID_BUCKETS_COUNT = -1;
  
  /**
   * Shared tail length for conflating in the created automaton. Setting this
   * to larger values ({@link Integer#MAX_VALUE}) will create smaller (or minimal) 
   * automata at the cost of RAM for keeping nodes hash in the {@link FST}. 
   *  
   * Empirical pick.
   */
  private final static int sharedTailLength = 5;

  private final Directory tempDir;
  private final String tempFileNamePrefix;

  private int buckets;
  private boolean exactMatchFirst;

  /**
   * Automaton used for completions with higher weights reordering.
   */
  private FSTCompletion higherWeightsCompletion;

  /**
   * Automaton used for normal completions.
   */
  private FSTCompletion normalCompletion;

  /** Number of entries the lookup was built with */
  private long count = 0;

  /**
   * This constructor should only be used to read a previously saved suggester.
   */
  public FSTCompletionLookup() {
    this(null, null);
  }

  /**
   * This constructor prepares for creating a suggested FST using the
   * {@link #build(InputIterator)} method. The number of weight
   * discretization buckets is set to {@link FSTCompletion#DEFAULT_BUCKETS} and
   * exact matches are promoted to the top of the suggestions list.
   */
  public FSTCompletionLookup(Directory tempDir, String tempFileNamePrefix) {
    this(tempDir, tempFileNamePrefix, FSTCompletion.DEFAULT_BUCKETS, true);
  }

  /**
   * This constructor prepares for creating a suggested FST using the
   * {@link #build(InputIterator)} method.
   * 
   * @param buckets
   *          The number of weight discretization buckets (see
   *          {@link FSTCompletion} for details).
   * 
   * @param exactMatchFirst
   *          If true exact matches are promoted to the top of the
   *          suggestions list. Otherwise they appear in the order of
   *          discretized weight and alphabetical within the bucket.
   */
  public FSTCompletionLookup(Directory tempDir, String tempFileNamePrefix, int buckets, boolean exactMatchFirst) {
    this.buckets = buckets;
    this.exactMatchFirst = exactMatchFirst;
    this.tempDir = tempDir;
    this.tempFileNamePrefix = tempFileNamePrefix;
  }

  /**
   * This constructor takes a pre-built automaton.
   * 
   *  @param completion 
   *          An instance of {@link FSTCompletion}.
   *  @param exactMatchFirst
   *          If true exact matches are promoted to the top of the
   *          suggestions list. Otherwise they appear in the order of
   *          discretized weight and alphabetical within the bucket.
   */
  public FSTCompletionLookup(Directory tempDir, String tempFileNamePrefix, FSTCompletion completion, boolean exactMatchFirst) {
    this(tempDir, tempFileNamePrefix, INVALID_BUCKETS_COUNT, exactMatchFirst);
    this.normalCompletion = new FSTCompletion(
        completion.getFST(), false, exactMatchFirst);
    this.higherWeightsCompletion =  new FSTCompletion(
        completion.getFST(), true, exactMatchFirst);
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
    ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    String tempSortedFileName = null;

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    count = 0;
    try {
      byte [] buffer = new byte [0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef spare;
      int inputLineCount = 0;
      while ((spare = iterator.next()) != null) {
        if (spare.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, spare.length + 4);
        }

        output.reset(buffer);
        output.writeInt(encodeWeight(iterator.weight()));
        output.writeBytes(spare.bytes, spare.offset, spare.length);
        writer.write(buffer, 0, output.getPosition());
        inputLineCount++;
      }
      CodecUtil.writeFooter(tempInput);
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
      tempSortedFileName = sorter.sort(tempInput.getName());
      tempDir.deleteFile(tempInput.getName());

      FSTCompletionBuilder builder = new FSTCompletionBuilder(
          buckets, externalSorter, sharedTailLength);

      reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
      long line = 0;
      int previousBucket = 0;
      int previousScore = 0;
      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef tmp2 = new BytesRef();
      while (true) {
        BytesRef scratch = reader.next();
        if (scratch == null) {
          break;
        }
        input.reset(scratch.bytes, scratch.offset, scratch.length);
        int currentScore = input.readInt();

        int bucket;
        if (line > 0 && currentScore == previousScore) {
          bucket = previousBucket;
        } else {
          bucket = (int) (line * buckets / inputLineCount);
        }
        previousScore = currentScore;
        previousBucket = bucket;

        // Only append the input, discard the weight.
        tmp2.bytes = scratch.bytes;
        tmp2.offset = scratch.offset + input.getPosition();
        tmp2.length = scratch.length - input.getPosition();
        builder.add(tmp2, bucket);

        line++;
        count++;
      }

      // The two FSTCompletions share the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion = new FSTCompletion(
          higherWeightsCompletion.getFST(), false, exactMatchFirst);
      
    } finally {
      IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
      IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
  }
  
  /** weight -> cost */
  private static int encodeWeight(long value) {
    if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
      throw new UnsupportedOperationException("cannot encode value: " + value);
    }
    return (int)value;
  }

  @Override
  public List lookup(CharSequence key, Set contexts, boolean higherWeightsFirst, int num) {
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    final List completions;
    if (higherWeightsFirst) {
      completions = higherWeightsCompletion.lookup(key, num);
    } else {
      completions = normalCompletion.lookup(key, num);
    }
    
    final ArrayList results = new ArrayList<>(completions.size());
    CharsRefBuilder spare = new CharsRefBuilder();
    for (Completion c : completions) {
      spare.copyUTF8Bytes(c.utf8);
      results.add(new LookupResult(spare.toString(), c.bucket));
    }
    return results;
  }

  /**
   * Returns the bucket (weight) as a Long for the provided key if it exists,
   * otherwise null if it does not.
   */
  public Object get(CharSequence key) {
    final int bucket = normalCompletion.getBucket(key);
    return bucket == -1 ? null : Long.valueOf(bucket);
  }


  @Override
  public synchronized boolean store(DataOutput output) throws IOException {
    output.writeVLong(count);
    if (normalCompletion == null || normalCompletion.getFST() == null) {
      return false;
    }
    normalCompletion.getFST().save(output, output);
    return true;
  }

  @Override
  public synchronized boolean load(DataInput input) throws IOException {
    count = input.readVLong();
    this.higherWeightsCompletion = new FSTCompletion(new FST<>(
        input, input, NoOutputs.getSingleton()));
    this.normalCompletion = new FSTCompletion(
        higherWeightsCompletion.getFST(), false, exactMatchFirst);
    return true;
  }

  @Override
  public long ramBytesUsed() {
    long mem = RamUsageEstimator.shallowSizeOf(this) + RamUsageEstimator.shallowSizeOf(normalCompletion) + RamUsageEstimator.shallowSizeOf(higherWeightsCompletion);
    if (normalCompletion != null) {
      mem += normalCompletion.getFST().ramBytesUsed();
    }
    if (higherWeightsCompletion != null && (normalCompletion == null || normalCompletion.getFST() != higherWeightsCompletion.getFST())) {
      // the fst should be shared between the 2 completion instances, don't count it twice
      mem += higherWeightsCompletion.getFST().ramBytesUsed();
    }
    return mem;
  }

  @Override
  public Collection getChildResources() {
    List resources = new ArrayList<>();
    if (normalCompletion != null) {
      resources.add(Accountables.namedAccountable("fst", normalCompletion.getFST()));
    }
    if (higherWeightsCompletion != null && (normalCompletion == null || normalCompletion.getFST() != higherWeightsCompletion.getFST())) {
      resources.add(Accountables.namedAccountable("higher weights fst", higherWeightsCompletion.getFST()));
    }
    return Collections.unmodifiableList(resources);
  }

  @Override
  public long getCount() {
    return count;
  }
}