// org.apache.lucene.search.spell.SpellChecker Maven / Gradle / Ivy
//
// Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;

/**
 * Spell Checker class (Main class).
 *
 * <p>(initially inspired by the David Spencer code).
 *
 * <p>Example Usage:
 *
 * <pre class="prettyprint">
 *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
 *  // "config" is the IndexWriterConfig used to write the spell index.
 *  // To index a field of a user index:
 *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field), config, false);
 *  // To index a file containing words:
 *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")), config, false);
 *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
 * </pre>
 */
public class SpellChecker implements java.io.Closeable {

  /** The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} . */
  public static final float DEFAULT_ACCURACY = 0.5f;

  /** Field name for each word in the ngram index. */
  public static final String F_WORD = "word";

  /**
   * Field type shared by every "gramN" field. The spellchecker does not use positional queries,
   * but freqs are wanted for scoring these multivalued n-gram fields.
   */
  private static final FieldType GRAM_FIELD_TYPE = new FieldType(StringField.TYPE_NOT_STORED);

  static {
    GRAM_FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    // Frozen so the single shared instance cannot be modified after class initialization.
    GRAM_FIELD_TYPE.freeze();
  }

  /** the spell index */
  // don't modify the directory directly - see #swapSearcher()
  // TODO: why is this package private?
  Directory spellIndex;

  /** Boost value for start grams. */
  private float bStart = 2.0f;

  /** Boost value for end grams. */
  private float bEnd = 1.0f;

  // don't use this searcher directly - see #swapSearcher()
  private IndexSearcher searcher;

  /*
   * this locks all modifications to the current searcher.
   */
  private final Object searcherLock = new Object();

  /*
   * this lock synchronizes all possible modifications to the
   * current index directory. It should not be possible to try modifying
   * the same index concurrently. Note: Do not acquire the searcher lock
   * before acquiring this lock!
   */
  private final Object modifyCurrentIndexLock = new Object();

  private volatile boolean closed = false;

  // minimum score for hits generated by the spell checker query
  private float accuracy = DEFAULT_ACCURACY;

  private StringDistance sd;
  private Comparator<SuggestWord> comparator;

  /**
   * Use the given directory as a spell checker index. The directory is created if it doesn't exist
   * yet.
   *
   * @param spellIndex the spell index directory
   * @param sd the {@link StringDistance} measurement to use
   * @throws IOException if Spellchecker can not open the directory
   */
  public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
    this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
  }

  /**
   * Use the given directory as a spell checker index with a {@link LevenshteinDistance} as the
   * default {@link StringDistance}. The directory is created if it doesn't exist yet.
   *
   * @param spellIndex the spell index directory
   * @throws IOException if spellchecker can not open the directory
   */
  public SpellChecker(Directory spellIndex) throws IOException {
    this(spellIndex, new LevenshteinDistance());
  }

  /**
   * Use the given directory as a spell checker index with the given {@link
   * org.apache.lucene.search.spell.StringDistance} measure and the given {@link
   * java.util.Comparator} for sorting the results.
   *
   * @param spellIndex The spelling index
   * @param sd The distance
   * @param comparator The comparator
   * @throws IOException if there is a problem opening the index
   */
  public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator)
      throws IOException {
    setSpellIndex(spellIndex);
    setStringDistance(sd);
    this.comparator = comparator;
  }

  /**
   * Use a different index as the spell checker index or re-open the existing index if {@code
   * spellIndex} is the same value as given in the constructor.
   *
   * <p>If no index exists in the given directory yet, an empty one is created first.
   *
   * @param spellIndexDir the spell directory to use
   * @throws AlreadyClosedException if the Spellchecker is already closed
   * @throws IOException if spellchecker can not open the directory
   */
  // TODO: we should make this final as it is called in the constructor
  public void setSpellIndex(Directory spellIndexDir) throws IOException {
    // this could be the same directory as the current spellIndex
    // modifications to the directory should be synchronized
    synchronized (modifyCurrentIndexLock) {
      ensureOpen();
      if (!DirectoryReader.indexExists(spellIndexDir)) {
        // Opening and immediately closing a writer creates the (empty) index.
        IndexWriter writer = new IndexWriter(spellIndexDir, new IndexWriterConfig(null));
        writer.close();
      }
      swapSearcher(spellIndexDir);
    }
  }

  /**
   * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
   *
   * @param comparator the comparator
   */
  public void setComparator(Comparator<SuggestWord> comparator) {
    this.comparator = comparator;
  }

  /**
   * Gets the comparator in use for ranking suggestions.
   *
   * @return the comparator handed to the {@link SuggestWordQueue}
   * @see #setComparator(Comparator)
   */
  public Comparator<SuggestWord> getComparator() {
    return comparator;
  }

  /**
   * Sets the {@link StringDistance} implementation for this {@link SpellChecker} instance.
   *
   * @param sd the {@link StringDistance} implementation for this {@link SpellChecker} instance
   */
  public void setStringDistance(StringDistance sd) {
    this.sd = sd;
  }

  /**
   * Returns the {@link StringDistance} instance used by this {@link SpellChecker} instance.
   *
   * @return the {@link StringDistance} instance used by this {@link SpellChecker} instance.
   */
  public StringDistance getStringDistance() {
    return sd;
  }

  /**
   * Sets the accuracy {@code 0 < minScore < 1}; default {@link #DEFAULT_ACCURACY}
   *
   * @param acc The new accuracy
   */
  public void setAccuracy(float acc) {
    this.accuracy = acc;
  }

  /**
   * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String,
   * int, IndexReader, String, SuggestMode, float)}, to decide whether a suggestion is included or
   * not.
   *
   * @return The current accuracy setting
   */
  public float getAccuracy() {
    return accuracy;
  }

  /**
   * Suggest similar words.
   *
   * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms is not the
   * same as the edit distance strategy used to calculate the best matching spell-checked word from
   * the hits that Lucene found, one usually has to retrieve a couple of numSug's in order to get
   * the true best match.
   *
   * <p>I.e. if numSug == 1, don't count on that suggestion being the best one. Thus, you should set
   * this value to <b>at least</b> 5 for a good suggestion.
   *
   * @param word the word you want a spell check done on
   * @param numSug the number of suggested words
   * @throws IOException if the underlying index throws an {@link IOException}
   * @throws AlreadyClosedException if the Spellchecker is already closed
   * @return String[]
   * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
   */
  public String[] suggestSimilar(String word, int numSug) throws IOException {
    return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
  }

  /**
   * Suggest similar words.
   *
   * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms is not the
   * same as the edit distance strategy used to calculate the best matching spell-checked word from
   * the hits that Lucene found, one usually has to retrieve a couple of numSug's in order to get
   * the true best match.
   *
   * <p>I.e. if numSug == 1, don't count on that suggestion being the best one. Thus, you should set
   * this value to <b>at least</b> 5 for a good suggestion.
   *
   * @param word the word you want a spell check done on
   * @param numSug the number of suggested words
   * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in
   *     the results
   * @throws IOException if the underlying index throws an {@link IOException}
   * @throws AlreadyClosedException if the Spellchecker is already closed
   * @return String[]
   * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
   */
  public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
    return this.suggestSimilar(
        word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
  }

  /**
   * Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
   * suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy)}
   */
  public String[] suggestSimilar(
      String word, int numSug, IndexReader ir, String field, SuggestMode suggestMode)
      throws IOException {
    return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy);
  }

  /**
   * Suggest similar words (optionally restricted to a field of an index).
   *
   * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms is not the
   * same as the edit distance strategy used to calculate the best matching spell-checked word from
   * the hits that Lucene found, one usually has to retrieve a couple of numSug's in order to get
   * the true best match.
   *
   * <p>I.e. if numSug == 1, don't count on that suggestion being the best one. Thus, you should set
   * this value to <b>at least</b> 5 for a good suggestion.
   *
   * @param word the word you want a spell check done on
   * @param numSug the number of suggested words
   * @param ir the indexReader of the user index (can be null see field param)
   * @param field the field of the user index: if field is not null, the suggested words are
   *     restricted to the words present in this field.
   * @param suggestMode (NOTE: if indexReader==null and/or field==null, then this is overridden with
   *     SuggestMode.SUGGEST_ALWAYS)
   * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in
   *     the results
   * @throws IOException if the underlying index throws an {@link IOException}
   * @throws AlreadyClosedException if the Spellchecker is already closed
   * @return String[] the sorted list of the suggest words with these 2 criteria: first criteria:
   *     the edit distance, second criteria (only if restricted mode): the popularity of the suggest
   *     words in the field of the user index
   */
  public String[] suggestSimilar(
      String word,
      int numSug,
      IndexReader ir,
      String field,
      SuggestMode suggestMode,
      float accuracy)
      throws IOException {
    // obtainSearcher calls ensureOpen
    final IndexSearcher indexSearcher = obtainSearcher();
    try {
      if (ir == null || field == null) {
        suggestMode = SuggestMode.SUGGEST_ALWAYS;
      }
      if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
        // the user index is never consulted in this mode
        ir = null;
        field = null;
      }

      final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
      final int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
      // if the word exists in the real index and we don't care for word frequency, return the word
      // itself
      if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
        return new String[] {word};
      }

      final Query query = buildGramQuery(word);

      // Fetch more candidates than requested in case the distance filter rejects some of them.
      // The product is computed in long arithmetic so a large numSug cannot overflow int.
      final int maxHits = (int) Math.min(10L * numSug, Integer.MAX_VALUE);

      ScoreDoc[] hits = indexSearcher.search(query, maxHits).scoreDocs;
      SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

      StoredFields storedFields = indexSearcher.storedFields();
      for (ScoreDoc hit : hits) {
        final SuggestWord sugWord = new SuggestWord();
        sugWord.string = storedFields.document(hit.doc).get(F_WORD); // get orig word

        // don't suggest a word for itself, that would be silly
        if (sugWord.string.equals(word)) {
          continue;
        }

        // edit distance
        sugWord.score = sd.getDistance(word, sugWord.string);
        if (sugWord.score < accuracy) {
          continue;
        }

        if (ir != null && field != null) { // use the user index
          sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
          // don't suggest a word that is not present in the field
          if ((suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq)
              || sugWord.freq < 1) {
            continue;
          }
        }
        sugQueue.insertWithOverflow(sugWord);
        if (sugQueue.size() == numSug) {
          // if queue full, maintain the minScore score
          accuracy = sugQueue.top().score;
        }
      }

      // convert to array string, filling from the back in the order the queue pops its entries
      String[] list = new String[sugQueue.size()];
      for (int i = sugQueue.size() - 1; i >= 0; i--) {
        list[i] = sugQueue.pop().string;
      }

      return list;
    } finally {
      releaseSearcher(indexSearcher);
    }
  }

  /**
   * Builds the n-gram query for a word: one optional (SHOULD) clause per gram on the "gramN" field,
   * plus boosted clauses for the first gram on "startN" and the last gram on "endN".
   *
   * @param word the word to look up
   * @return the query over all gram sizes between {@code getMin} and {@code getMax} of the word
   *     length
   */
  private Query buildGramQuery(String word) {
    final int lengthWord = word.length();
    final BooleanQuery.Builder query = new BooleanQuery.Builder();

    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
      final String[] grams = formGrams(word, ng); // form word into ngrams (allow dups too)
      if (grams.length == 0) {
        continue; // word is shorter than this gram size
      }

      if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word
      }
      if (bEnd > 0) { // should we boost suffixes
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
      }

      final String key = "gram" + ng;
      for (String gram : grams) {
        add(query, key, gram);
      }
    }
    return query.build();
  }

  /** Add a boosted optional (SHOULD) term clause to a boolean query. */
  private static void add(BooleanQuery.Builder q, String name, String value, float boost) {
    Query tq = new TermQuery(new Term(name, value));
    q.add(new BooleanClause(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD));
  }

  /** Add an optional (SHOULD) term clause to a boolean query. */
  private static void add(BooleanQuery.Builder q, String name, String value) {
    q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
  }

  /**
   * Form all ngrams for a given word.
   *
   * @param text the word to parse
   * @param ng the ngram length e.g. 3
   * @return an array of all ngrams in the word and note that duplicates are not removed; empty if
   *     the word is shorter than {@code ng}
   */
  private static String[] formGrams(String text, int ng) {
    int len = text.length();
    if (len < ng) {
      return new String[] {};
    }
    String[] res = new String[len - ng + 1];
    for (int i = 0; i < res.length; i++) {
      res[i] = text.substring(i, i + ng);
    }
    return res;
  }

  /**
   * Removes all terms from the spell check index.
   *
   * @throws IOException If there is a low-level I/O error.
   * @throws AlreadyClosedException if the Spellchecker is already closed
   */
  public void clearIndex() throws IOException {
    synchronized (modifyCurrentIndexLock) {
      ensureOpen();
      final Directory dir = this.spellIndex;
      // Opening a writer in CREATE mode and closing it replaces the index with an empty one.
      final IndexWriter writer =
          new IndexWriter(dir, new IndexWriterConfig(null).setOpenMode(OpenMode.CREATE));
      writer.close();
      swapSearcher(dir);
    }
  }

  /**
   * Check whether the word exists in the index.
   *
   * @param word word to check
   * @throws IOException If there is a low-level I/O error.
   * @throws AlreadyClosedException if the Spellchecker is already closed
   * @return true if the word exists in the index
   */
  public boolean exist(String word) throws IOException {
    // obtainSearcher calls ensureOpen
    final IndexSearcher indexSearcher = obtainSearcher();
    try {
      // TODO: we should use ReaderUtil+seekExact, we dont care about the docFreq
      // this is just an existence check
      return indexSearcher.getIndexReader().docFreq(new Term(F_WORD, word)) > 0;
    } finally {
      releaseSearcher(indexSearcher);
    }
  }

  /**
   * Indexes the data from the given {@link Dictionary}.
   *
   * <p>Words shorter than 3 characters and words already present in the spell index are skipped.
   * The writer is always closed before this method returns, including when indexing fails, and
   * the searcher is re-opened afterwards only when indexing succeeded.
   *
   * @param dict Dictionary to index
   * @param config {@link IndexWriterConfig} to use
   * @param fullMerge whether or not the spellcheck index should be fully merged
   * @throws AlreadyClosedException if the Spellchecker is already closed
   * @throws IOException If there is a low-level I/O error.
   */
  public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge)
      throws IOException {
    synchronized (modifyCurrentIndexLock) {
      ensureOpen();
      final Directory dir = this.spellIndex;
      // try-with-resources guarantees the writer is closed on every path, so a failure while
      // reading the dictionary or adding documents does not leave it open.
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        final IndexSearcher indexSearcher = obtainSearcher();
        try {
          // Use the ref-counted searcher obtained above rather than the shared field, which a
          // concurrent close() may null out.
          final IndexReader reader = indexSearcher.getIndexReader();
          final List<TermsEnum> termsEnums = new ArrayList<>();
          if (reader.maxDoc() > 0) {
            for (final LeafReaderContext ctx : reader.leaves()) {
              Terms terms = ctx.reader().terms(F_WORD);
              if (terms != null) {
                termsEnums.add(terms.iterator());
              }
            }
          }

          BytesRefIterator iter = dict.getEntryIterator();
          BytesRef currentTerm;
          while ((currentTerm = iter.next()) != null) {
            String word = currentTerm.utf8ToString();
            int len = word.length();
            if (len < 3) {
              continue; // too short we bail but "too long" is fine...
            }
            if (isAlreadyIndexed(termsEnums, currentTerm)) {
              continue;
            }

            // ok index the word
            Document doc = createDocument(word, getMin(len), getMax(len));
            writer.addDocument(doc);
          }
        } finally {
          releaseSearcher(indexSearcher);
        }
        if (fullMerge) {
          writer.forceMerge(1);
        }
      }
      // TODO: this isn't that great, maybe in the future SpellChecker should take
      // IWC in its ctor / keep its writer open?

      // also re-open the spell index to see our own changes when the next suggestion
      // is fetched:
      swapSearcher(dir);
    }
  }

  /** Returns true if any of the given per-segment term enumerations contains the exact term. */
  private static boolean isAlreadyIndexed(List<TermsEnum> termsEnums, BytesRef term)
      throws IOException {
    for (TermsEnum te : termsEnums) {
      if (te.seekExact(term)) {
        return true;
      }
    }
    return false;
  }

  /** Smallest n-gram size used for a word of length {@code l}. */
  private static int getMin(int l) {
    if (l > 5) {
      return 3;
    }
    if (l == 5) {
      return 2;
    }
    return 1;
  }

  /** Largest n-gram size used for a word of length {@code l}; never larger than {@code l}. */
  private static int getMax(int l) {
    if (l > 5) {
      return 4;
    }
    if (l == 5) {
      return 3;
    }
    return Math.min(l, 2);
  }

  /**
   * Creates the index document for a word: the stored original word plus its n-gram fields for
   * every gram size from {@code ng1} to {@code ng2} inclusive.
   */
  private static Document createDocument(String text, int ng1, int ng2) {
    Document doc = new Document();
    // the word field is never queried on... it's indexed so it can be quickly
    // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
    Field f = new StringField(F_WORD, text, Field.Store.YES);
    doc.add(f); // orig term
    addGram(text, doc, ng1, ng2);
    return doc;
  }

  /**
   * Adds, for each gram size from {@code ng1} to {@code ng2} inclusive, one "gramN" field per
   * n-gram of the text, a "startN" field holding the first gram and an "endN" field holding the
   * last gram.
   */
  private static void addGram(String text, Document doc, int ng1, int ng2) {
    int len = text.length();
    for (int ng = ng1; ng <= ng2; ng++) {
      String key = "gram" + ng;
      String end = null;
      for (int i = 0; i < len - ng + 1; i++) {
        String gram = text.substring(i, i + ng);
        doc.add(new Field(key, gram, GRAM_FIELD_TYPE));
        if (i == 0) {
          // only one term possible in the startXXField, TF/pos and norms aren't needed.
          Field startField = new StringField("start" + ng, gram, Field.Store.NO);
          doc.add(startField);
        }
        end = gram;
      }
      if (end != null) { // absent only when the text is shorter than ng
        // only one term possible in the endXXField, TF/pos and norms aren't needed.
        Field endField = new StringField("end" + ng, end, Field.Store.NO);
        doc.add(endField);
      }
    }
  }

  /**
   * Returns the current searcher after incrementing its reader's reference count. Every call must
   * be paired with {@link #releaseSearcher(IndexSearcher)}.
   *
   * @throws AlreadyClosedException if the Spellchecker is already closed
   */
  private IndexSearcher obtainSearcher() {
    synchronized (searcherLock) {
      ensureOpen();
      searcher.getIndexReader().incRef();
      return searcher;
    }
  }

  /** Decrements the reference count of the reader behind a searcher from obtainSearcher(). */
  private void releaseSearcher(final IndexSearcher aSearcher) throws IOException {
    // don't check if open - always decRef
    // don't decrement the private searcher - could have been swapped
    aSearcher.getIndexReader().decRef();
  }

  /** Throws {@link AlreadyClosedException} if this spellchecker has been closed. */
  private void ensureOpen() {
    if (closed) {
      throw new AlreadyClosedException("Spellchecker has been closed");
    }
  }

  /**
   * Close the IndexSearcher used by this SpellChecker
   *
   * @throws IOException if the close operation causes an {@link IOException}
   * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
   */
  @Override
  public void close() throws IOException {
    synchronized (searcherLock) {
      ensureOpen();
      closed = true;
      if (searcher != null) {
        searcher.getIndexReader().close();
      }
      searcher = null;
    }
  }

  /**
   * Opens a searcher on the given directory and installs it, together with the directory, as the
   * current searcher and spell index, closing the reader of the searcher it replaces.
   *
   * @throws AlreadyClosedException if the Spellchecker was closed in the meantime
   */
  private void swapSearcher(final Directory dir) throws IOException {
    /*
     * opening a searcher is possibly very expensive.
     * We rather close it again if the Spellchecker was closed during
     * this operation than block access to the current searcher while opening.
     */
    final IndexSearcher indexSearcher = createSearcher(dir);
    synchronized (searcherLock) {
      if (closed) {
        indexSearcher.getIndexReader().close();
        throw new AlreadyClosedException("Spellchecker has been closed");
      }
      if (searcher != null) {
        searcher.getIndexReader().close();
      }
      // set the spellindex in the sync block - ensure consistency.
      searcher = indexSearcher;
      this.spellIndex = dir;
    }
  }

  /**
   * Creates a new read-only IndexSearcher
   *
   * @param dir the directory used to open the searcher
   * @return a new read-only IndexSearcher
   * @throws IOException if there is a low-level IO error
   */
  // for testing purposes
  IndexSearcher createSearcher(final Directory dir) throws IOException {
    return new IndexSearcher(DirectoryReader.open(dir));
  }

  /**
   * Returns {@code true} if and only if the {@link SpellChecker} is closed, otherwise {@code
   * false}.
   *
   * @return {@code true} if and only if the {@link SpellChecker} is closed, otherwise {@code
   *     false}.
   */
  boolean isClosed() {
    return closed;
  }
}