/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
/**
* Suggester that first analyzes the surface form, adds the analyzed form to a weighted FST, and
* then does the same thing at lookup time. This means lookup is based on the analyzed form while
* suggestions are still the surface form(s).
*
 * <p>This can result in powerful suggester functionality. For example, if you use an analyzer
 * removing stop words, then the partial text "ghost chr..." could see the suggestion "The Ghost of
 * Christmas Past". Note that position increments MUST NOT be preserved for this example to work,
 * so you should call the constructor with the {@code preservePositionIncrements} parameter set to
 * {@code false}.
 *
 * <p>If SynonymFilter is used to map wifi and wireless network to hotspot, then the partial text
 * "wirele..." could suggest "wifi router". Token normalization like stemming, accent removal,
 * etc., would allow suggestions to ignore such variations.
 *
 * <p>When two matching suggestions have the same weight, they are tie-broken by the analyzed form.
 * If their analyzed form is the same then the order is undefined.
 *
 * <p>There are some limitations:
 *
 * <ul>
 *   <li>A lookup from a query like "net" in English won't be any different than "net " (i.e., the
 *       user added a trailing space) because analyzers don't reflect when they've seen a token
 *       separator and when they haven't.
 *   <li>If you're using {@code StopFilter}, and the user will type "fast apple", but so far all
 *       they've typed is "fast a", again because the analyzer doesn't convey whether it's seen a
 *       token separator after the "a", {@code StopFilter} will remove that "a", causing far more
 *       matches than you'd expect.
 *   <li>Lookups with the empty string return no results instead of all results.
 * </ul>
*
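 * <p>A minimal usage sketch; the analyzer choice, file name, and input data below are
 * illustrative assumptions, not part of this class:
 *
 * <pre>{@code
 * Directory tempDir = FSDirectory.open(Path.of("/tmp/suggest"));
 * AnalyzingSuggester suggester =
 *     new AnalyzingSuggester(tempDir, "suggest", new StandardAnalyzer());
 * // suggestions.txt holds "surface form<TAB>weight" lines:
 * suggester.build(new FileDictionary(Files.newInputStream(Path.of("suggestions.txt"))));
 * List<LookupResult> results = suggester.lookup("ghost chr", false, 5);
 * }</pre>
 *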
* @lucene.experimental
*/
public class AnalyzingSuggester extends Lookup {
  /**
   * FST&lt;Weight,Surface&gt;: input is the analyzed form, with a null byte between terms; weights
   * are encoded as costs (Integer.MAX_VALUE - weight); surface is the original, unanalyzed form.
   */
  private FST<Pair<Long, BytesRef>> fst = null;
/** Analyzer that will be used for analyzing suggestions at index time. */
private final Analyzer indexAnalyzer;
/** Analyzer that will be used for analyzing suggestions at query time. */
private final Analyzer queryAnalyzer;
/** True if exact match suggestions should always be returned first. */
private final boolean exactFirst;
/** True if separator between tokens should be preserved. */
private final boolean preserveSep;
/**
* Include this flag in the options parameter to {@link
* #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)} to always return
* the exact match first, regardless of score. This has no performance impact but could result in
* low-quality suggestions.
*/
public static final int EXACT_FIRST = 1;
/**
* Include this flag in the options parameter to {@link
* #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)} to preserve token
* separators when matching.
*/
public static final int PRESERVE_SEP = 2;
/** Represents the separation between tokens, if PRESERVE_SEP was specified */
private static final int SEP_LABEL = '\u001F';
/** Marks end of the analyzed input and start of dedup byte. */
private static final int END_BYTE = 0x0;
/** Maximum number of dup surface forms (different surface forms for the same analyzed form). */
private final int maxSurfaceFormsPerAnalyzedForm;
/**
* Maximum graph paths to index for a single analyzed surface form. This only matters if your
* analyzer makes lots of alternate paths (e.g. contains SynonymFilter).
*/
private final int maxGraphExpansions;
private final Directory tempDir;
private final String tempFileNamePrefix;
/**
* Highest number of analyzed paths we saw for any single input surface form. For analyzers that
* never create graphs this will always be 1.
*/
private int maxAnalyzedPathsForOneInput;
private boolean hasPayloads;
private static final int PAYLOAD_SEP = '\u001f';
/** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
/** Number of entries the lookup was built with */
private volatile long count = 0;
/**
* Calls {@link #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)}
*/
public AnalyzingSuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer) {
this(
tempDir, tempFileNamePrefix, analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true);
}
/**
* Calls {@link #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)
* AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)}
*/
public AnalyzingSuggester(
Directory tempDir,
String tempFileNamePrefix,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer) {
this(
tempDir,
tempFileNamePrefix,
indexAnalyzer,
queryAnalyzer,
EXACT_FIRST | PRESERVE_SEP,
256,
-1,
true);
}
/**
* Creates a new suggester.
*
* @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the
* index.
* @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of surface forms to keep for a single
* analyzed form. When there are too many surface forms we discard the lowest weighted ones.
* @param maxGraphExpansions Maximum number of graph paths to expand from the analyzed form. Set
* this to -1 for no limit.
* @param preservePositionIncrements Whether position holes should appear in the automata
*/
public AnalyzingSuggester(
Directory tempDir,
String tempFileNamePrefix,
Analyzer indexAnalyzer,
Analyzer queryAnalyzer,
int options,
int maxSurfaceFormsPerAnalyzedForm,
int maxGraphExpansions,
boolean preservePositionIncrements) {
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
throw new IllegalArgumentException(
"options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
}
this.exactFirst = (options & EXACT_FIRST) != 0;
this.preserveSep = (options & PRESERVE_SEP) != 0;
// NOTE: this is just an implementation limitation; if
// somehow this is a problem we could fix it by using
// more than one byte to disambiguate ... but 256 seems
    // like it should be way more than enough.
if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
throw new IllegalArgumentException(
"maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: "
+ maxSurfaceFormsPerAnalyzedForm
+ ")");
}
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
throw new IllegalArgumentException(
"maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
}
this.maxGraphExpansions = maxGraphExpansions;
this.preservePositionIncrements = preservePositionIncrements;
this.tempDir = tempDir;
this.tempFileNamePrefix = tempFileNamePrefix;
}
/** Returns byte size of the underlying FST. */
@Override
public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
@Override
  public Collection<Accountable> getChildResources() {
if (fst == null) {
return Collections.emptyList();
} else {
return Collections.singletonList(Accountables.namedAccountable("fst", fst));
}
}
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private Automaton replaceSep(Automaton a) {
int numStates = a.getNumStates();
Automaton.Builder result = new Automaton.Builder(numStates, a.getNumTransitions());
// Copy all states over
result.copyStates(a);
// Go in reverse topo sort so we know we only have to
// make one pass:
Transition t = new Transition();
int[] topoSortStates = Operations.topoSortStates(a);
for (int i = 0; i < topoSortStates.length; i++) {
int state = topoSortStates[topoSortStates.length - 1 - i];
int count = a.initTransition(state, t);
for (int j = 0; j < count; j++) {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
result.addTransition(state, t.dest, SEP_LABEL);
} else {
result.addEpsilon(state, t.dest);
}
} else if (t.min == TokenStreamToAutomaton.HOLE) {
assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
// match another hole at search time. Note that
// it will also match an empty-string token ... if
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
result.addEpsilon(state, t.dest);
} else {
result.addTransition(state, t.dest, t.min, t.max);
}
}
}
return result.finish();
}
/** Used by subclass to change the lookup automaton, if necessary. */
protected Automaton convertAutomaton(Automaton a) {
return a;
}
TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
tsta.setPreservePositionIncrements(preservePositionIncrements);
tsta.setFinalOffsetGapAsHole(true);
return tsta;
}
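  // Layout of each entry written by build() and sorted by this comparator (derived from the
  // write path in build() below):
  //   analyzedLength (short) | analyzed bytes | cost (int) |
  //   [surfaceLength (short), only if hasPayloads] | surface bytes | [payload bytes]
  // Entries are ordered by analyzed form, then cost, then surface form, which is what the
  // dedup logic in build() relies on.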
  private static class AnalyzingComparator implements Comparator<BytesRef> {
private final boolean hasPayloads;
public AnalyzingComparator(boolean hasPayloads) {
this.hasPayloads = hasPayloads;
}
private final ByteArrayDataInput readerA = new ByteArrayDataInput();
private final ByteArrayDataInput readerB = new ByteArrayDataInput();
private final BytesRef scratchA = new BytesRef();
private final BytesRef scratchB = new BytesRef();
@Override
public int compare(BytesRef a, BytesRef b) {
// First by analyzed form:
readerA.reset(a.bytes, a.offset, a.length);
scratchA.length = readerA.readShort();
scratchA.bytes = a.bytes;
scratchA.offset = readerA.getPosition();
readerB.reset(b.bytes, b.offset, b.length);
scratchB.bytes = b.bytes;
scratchB.length = readerB.readShort();
scratchB.offset = readerB.getPosition();
int cmp = scratchA.compareTo(scratchB);
if (cmp != 0) {
return cmp;
}
readerA.skipBytes(scratchA.length);
readerB.skipBytes(scratchB.length);
// Next by cost:
long aCost = readerA.readInt();
long bCost = readerB.readInt();
assert decodeWeight(aCost) >= 0;
assert decodeWeight(bCost) >= 0;
if (aCost < bCost) {
return -1;
} else if (aCost > bCost) {
return 1;
}
// Finally by surface form:
if (hasPayloads) {
scratchA.length = readerA.readShort();
scratchB.length = readerB.readShort();
scratchA.offset = readerA.getPosition();
scratchB.offset = readerB.getPosition();
} else {
scratchA.offset = readerA.getPosition();
scratchB.offset = readerB.getPosition();
scratchA.length = readerA.length() - readerA.getPosition();
scratchB.length = readerB.length() - readerB.getPosition();
}
assert scratchA.isValid();
assert scratchB.isValid();
return scratchA.compareTo(scratchB);
}
}
@Override
public void build(InputIterator iterator) throws IOException {
if (iterator.hasContexts()) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
hasPayloads = iterator.hasPayloads();
OfflineSorter sorter =
new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
IndexOutput tempInput =
tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
OfflineSorter.ByteSequencesReader reader = null;
BytesRefBuilder scratch = new BytesRefBuilder();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
String tempSortedFileName = null;
long newCount = 0;
byte[] buffer = new byte[8];
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
LimitedFiniteStringsIterator finiteStrings =
new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
for (IntsRef string; (string = finiteStrings.next()) != null; newCount++) {
Util.toBytesRef(string, scratch);
// length of the analyzed text (FST input)
if (scratch.length() > Short.MAX_VALUE - 2) {
throw new IllegalArgumentException(
"cannot handle analyzed forms > "
+ (Short.MAX_VALUE - 2)
+ " in length (got "
+ scratch.length()
+ ")");
}
short analyzedLength = (short) scratch.length();
// compute the required length:
// analyzed sequence + weight (4) + surface + analyzedLength (short)
int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
BytesRef payload;
if (hasPayloads) {
if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
throw new IllegalArgumentException(
"cannot handle surface form > "
+ (Short.MAX_VALUE - 2)
+ " in length (got "
+ surfaceForm.length
+ ")");
}
payload = iterator.payload();
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
} else {
payload = null;
}
buffer = ArrayUtil.growNoCopy(buffer, requiredLength);
output.reset(buffer);
output.writeShort(analyzedLength);
output.writeBytes(scratch.bytes(), 0, scratch.length());
output.writeInt(encodeWeight(iterator.weight()));
if (hasPayloads) {
for (int i = 0; i < surfaceForm.length; i++) {
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
throw new IllegalArgumentException(
"surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
output.writeShort((short) surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
}
assert output.getPosition() == requiredLength
: output.getPosition() + " vs " + requiredLength;
writer.write(buffer, 0, output.getPosition());
}
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
}
CodecUtil.writeFooter(tempInput);
writer.close();
// Sort all input/output pairs (required by FST.Builder):
tempSortedFileName = sorter.sort(tempInput.getName());
// Free disk space:
tempDir.deleteFile(tempInput.getName());
reader =
new OfflineSorter.ByteSequencesReader(
tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE),
tempSortedFileName);
      PairOutputs<Long, BytesRef> outputs =
          new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
      FSTCompiler<Pair<Long, BytesRef>> fstCompiler =
          new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
// Build FST:
BytesRefBuilder previousAnalyzed = null;
BytesRefBuilder analyzed = new BytesRefBuilder();
BytesRef surface = new BytesRef();
IntsRefBuilder scratchInts = new IntsRefBuilder();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
      // still index the highest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
      Set<BytesRef> seenSurfaceForms = new HashSet<>();
int dedup = 0;
while (true) {
BytesRef bytes = reader.next();
if (bytes == null) {
break;
}
input.reset(bytes.bytes, bytes.offset, bytes.length);
short analyzedLength = input.readShort();
analyzed.growNoCopy(analyzedLength + 2);
input.readBytes(analyzed.bytes(), 0, analyzedLength);
analyzed.setLength(analyzedLength);
long cost = input.readInt();
surface.bytes = bytes.bytes;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = bytes.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRefBuilder();
previousAnalyzed.copyBytes(analyzed.get());
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.get().equals(previousAnalyzed.get())) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// More than maxSurfaceFormsPerAnalyzedForm
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
// hairy because we'd need to special case the two
// (dup/not dup)...
// NOTE: must be byte 0 so we sort before whatever
// is next
analyzed.append((byte) 0);
analyzed.append((byte) dedup);
Util.toIntsRef(analyzed.get(), scratchInts);
// System.out.println("ADD: " + scratchInts + " -> " + cost + ": " +
// surface.utf8ToString());
if (!hasPayloads) {
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = bytes.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
br.length = br.bytes.length;
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, br));
}
}
fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
count = newCount;
// Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {
IOUtils.closeWhileHandlingException(reader, writer);
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
}
}
@Override
public boolean store(DataOutput output) throws IOException {
output.writeVLong(count);
if (fst == null) {
return false;
}
fst.save(output, output);
output.writeVInt(maxAnalyzedPathsForOneInput);
output.writeByte((byte) (hasPayloads ? 1 : 0));
return true;
}
@Override
public boolean load(DataInput input) throws IOException {
count = input.readVLong();
    PairOutputs<Long, BytesRef> outputs =
        new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.fst = new FST<>(FST.readMetadata(input, outputs), input);
maxAnalyzedPathsForOneInput = input.readVInt();
hasPayloads = input.readByte() == 1;
return true;
}
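  // Illustrative persistence round-trip; "suggest.bin", "dir", and "suggester" are hypothetical
  // names (IndexOutput/IndexInput extend the DataOutput/DataInput these methods accept):
  //
  //   try (IndexOutput out = dir.createOutput("suggest.bin", IOContext.DEFAULT)) {
  //     suggester.store(out);
  //   }
  //   try (IndexInput in = dir.openInput("suggest.bin", IOContext.DEFAULT)) {
  //     suggester.load(in);
  //   }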
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
LookupResult result;
if (hasPayloads) {
int sepIndex = -1;
for (int i = 0; i < output2.length; i++) {
if (output2.bytes[output2.offset + i] == PAYLOAD_SEP) {
sepIndex = i;
break;
}
}
assert sepIndex != -1;
spare.grow(sepIndex);
final int payloadLen = output2.length - sepIndex - 1;
spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
BytesRef payload = new BytesRef(payloadLen);
System.arraycopy(output2.bytes, sepIndex + 1, payload.bytes, 0, payloadLen);
payload.length = payloadLen;
result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
} else {
spare.grow(output2.length);
spare.copyUTF8Bytes(output2);
result = new LookupResult(spare.toString(), decodeWeight(output1));
}
return result;
}
private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
if (hasPayloads) {
// output2 has at least PAYLOAD_SEP byte:
if (key.length >= output2.length) {
return false;
}
for (int i = 0; i < key.length; i++) {
if (key.bytes[key.offset + i] != output2.bytes[output2.offset + i]) {
return false;
}
}
return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
} else {
return key.bytesEquals(output2);
}
}
@Override
  public List<LookupResult> lookup(
      final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
return Collections.emptyList();
}
// System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
throw new IllegalArgumentException(
"lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == 0x1F) {
throw new IllegalArgumentException(
"lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
// System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
// final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
// System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
      FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
      final List<LookupResult> results = new ArrayList<>();
      List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths =
          FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
        Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher =
new Util.TopNSearcher<>(
fst,
count * maxSurfaceFormsPerAnalyzedForm,
count * maxSurfaceFormsPerAnalyzedForm,
weightComparator);
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
// maxSurfaceFormsPerAnalyzedForm had kicked in and
// pruned our exact match from one of these nodes
// ...:
        for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(
scratchArc, fst.outputs.add(path.output, scratchArc.output()), false, path.input);
}
}
        TopResults<Pair<Long, BytesRef>> completions = searcher.search();
assert completions.isComplete;
// NOTE: this is rather inefficient: we enumerate
// every matching "exactly the same analyzed form"
// path, and then do linear scan to see if one of
// these exactly matches the input. It should be
// possible (though hairy) to do something similar
// to getByOutput, since the surface form is encoded
// into the FST output, so we more efficiently hone
// in on the exact surface-form match. Still, I
// suspect very little time is spent in this linear
        // search: it's bounded by how many prefix start
// nodes we have and the
// maxSurfaceFormsPerAnalyzedForm:
        for (Result<Pair<Long, BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
      Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
      searcher =
          new Util.TopNSearcher<Pair<Long, BytesRef>>(
              fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
            private final Set<BytesRef> seen = new HashSet<>();

            @Override
            protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
// Dedup: when the input analyzes to a graph we
// can get duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// In exactFirst mode, don't accept any paths
// matching the surface form since that will
// create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// We found exact match, which means we should
// have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
      TopResults<Pair<Long, BytesRef>> completions = searcher.search();
assert completions.isComplete;
      for (Result<Pair<Long, BytesRef>> completion : completions) {
LookupResult result =
getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
// System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// In the exactFirst=true case the search may
// produce one extra path
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
@Override
public long getCount() {
return count;
}
/** Returns all prefix paths to initialize the search. */
  protected List<FSTUtil.Path<Pair<Long, BytesRef>>> getFullPrefixPaths(
      List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths,
      Automaton lookupAutomaton,
      FST<Pair<Long, BytesRef>> fst)
throws IOException {
return prefixPaths;
}
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a)
throws IOException {
// Analyze surface form:
Automaton automaton;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// separator between tokens:
automaton = ts2a.toAutomaton(ts);
}
automaton = replaceSep(automaton);
automaton = convertAutomaton(automaton);
// TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
// assert SpecialOperations.isFinite(automaton);
// Get all paths from the automaton (there can be
// more than one path, eg if the analyzer created a
// graph using SynFilter or WDF):
return automaton;
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
Automaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
automaton = replaceSep(automaton);
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = Operations.determinize(automaton, DEFAULT_DETERMINIZE_WORK_LIMIT);
return automaton;
}
/** Returns the weight associated with an input string, or null if it does not exist. */
public Object get(CharSequence key) {
throw new UnsupportedOperationException();
}
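  // Weights are stored inverted (cost = Integer.MAX_VALUE - weight) because the FST's top-N
  // search returns paths with *minimal* outputs; e.g. weight 10 becomes cost
  // Integer.MAX_VALUE - 10, so higher-weight suggestions surface first under weightComparator
  // below.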
/** cost -> weight */
private static int decodeWeight(long encoded) {
return (int) (Integer.MAX_VALUE - encoded);
}
/** weight -> cost */
private static int encodeWeight(long value) {
if (value < 0 || value > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("cannot encode value: " + value);
}
return Integer.MAX_VALUE - (int) value;
}
  static final Comparator<Pair<Long, BytesRef>> weightComparator =
      new Comparator<Pair<Long, BytesRef>>() {
        @Override
        public int compare(Pair<Long, BytesRef> left, Pair<Long, BytesRef> right) {
return left.output1.compareTo(right.output1);
}
};
}