// org.apache.lucene.search.spell.CompassSpellChecker Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of compass Show documentation
// Compass Search Engine Framework
// There is a newer version: 2.2.2-ldh
package org.apache.lucene.search.spell;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

/**
 * 
 *   Spell Checker class  (Main class) 

 *  (initially inspired by the David Spencer code).
 * 
 *
 * Example Usage:
 *
 * 
 *  SpellChecker spellcheck = new SpellChecker(spellIndexDirectory);
 *  // To index a field of a user index:
 *  spellcheck.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
 *  // To index a file containing words:
 *  spellcheck.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
 *  String[] suggestions = spellcheck.suggestSimilar("misspelt", 5);
 * 
 *
 *
 * @version 1.0
 */


// Specialized SpellChecker for Compass.

// List of changes: (Mainly to separte between two use cases: one is for indexing and one for searching).

// 1. Added a constructor that accepts a searcher and reader ("searching" spell checker)
// 2. Changed searcher type from IndexSearcher to Searcher
// 3. Added close method
// 4. In indexDictioanry, if the searcher is null, don't reopen it
// 5. Added a constructor that won't open an index searcher ("indexing" spell checker)
// 6. Added indexDictionary that accepts a dictionary and IndexWriter so we can configure it

// LUCENE MONITOR
public class CompassSpellChecker {

  /**
   * Field name for each word in the ngram index.
   */
  public static final String F_WORD = "word";

  /**
   * the spell index
   */
  Directory spellIndex;

  /**
   * Boost value for start and end grams
   */
  private float bStart = 2.0f;
  private float bEnd = 1.0f;

  private IndexReader reader;
  private Searcher searcher;

  // minimum score for hits generated by the spell checker query
  private float minScore = 0.5f;

  private StringDistance sd;


  public CompassSpellChecker(Searcher searcher, IndexReader reader) {
      this.searcher = searcher;
      this.reader = reader;
      setStringDistance(new LevensteinDistance());
  }

  /**
   * Use the given directory as a spell checker index. The directory
   * is created if it doesn't exist yet.
   *
   * @param spellIndex
   * @throws IOException
   */
  public CompassSpellChecker(Directory spellIndex) throws IOException {
    this.setSpellIndex(spellIndex);
    setStringDistance(new LevensteinDistance());
  }

    /**
     */
    public CompassSpellChecker(Directory spellIndex, boolean indexing) throws IOException {
      if (indexing) {
          this.spellIndex = spellIndex;
      } else {
          setSpellIndex(spellIndex);
      }
      setStringDistance(new LevensteinDistance());
    }

    public void close() {
        try {
            searcher.close();
        } catch (IOException e) {
            // do nothing
        }

        try {
            reader.close();
        } catch (IOException e) {
            // do nothing
        }
    }


    public void setStringDistance(StringDistance sd) {
      this.sd = sd;
    }

    public StringDistance getStringDistance() {
      return sd;
    }
    
  /**
   * Use a different index as the spell checker index or re-open
   * the existing index if spellIndex is the same value
   * as given in the constructor.
   *
   * @param spellIndex
   * @throws IOException
   */
  public void setSpellIndex(Directory spellIndex) throws IOException {
    this.spellIndex = spellIndex;
    if (!IndexReader.indexExists(spellIndex)) {
        IndexWriter writer = new IndexWriter(spellIndex, null, true);
        writer.close();
    }
    // close the old searcher, if there was one
    if (searcher != null) {
      searcher.close();
    }
    searcher = new IndexSearcher(this.spellIndex);
  }

  /**
   * Sets the accuracy 0 < minScore < 1; default 0.5
   */
  public void setAccuracy(float minScore) {
    this.minScore = minScore;
  }

    /**
     * Suggest similar words.
     *
     * As the Lucene similarity that is used to fetch the most relevant n-grammed terms
     * is not the same as the edit distance strategy used to calculate the best
     * matching spell-checked word from the hits that Lucene found, one usually has
     * to retrieve a couple of numSug's in order to get the true best match.
     *
     * 
I.e. if numSug == 1, don't count on that suggestion being the best one.
     * Thus, you should set this value to at least 5 for a good suggestion.
     *
     * @param word the word you want a spell check done on
     * @param numSug the number of suggested words
     * @throws IOException
     * @return String[]
     */
    public String[] suggestSimilar(String word, int numSug) throws IOException {
      return this.suggestSimilar(word, numSug, null, null, false);
    }

    /**
     * Suggest similar words (optionally restricted to a field of an index).
     *
     * 
As the Lucene similarity that is used to fetch the most relevant n-grammed terms
     * is not the same as the edit distance strategy used to calculate the best
     * matching spell-checked word from the hits that Lucene found, one usually has
     * to retrieve a couple of numSug's in order to get the true best match.
     *
     * I.e. if numSug == 1, don't count on that suggestion being the best one.
     * Thus, you should set this value to at least 5 for a good suggestion.
     *
     * @param word the word you want a spell check done on
     * @param numSug the number of suggested words
     * @param ir the indexReader of the user index (can be null see field param)
     * @param field the field of the user index: if field is not null, the suggested
     * words are restricted to the words present in this field.
     * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word
     * (only if restricted mode = (indexReader!=null and field!=null)
     * @throws IOException
     * @return String[] the sorted list of the suggest words with these 2 criteria:
     * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
     * of the suggest words in the field of the user index
     */
    public String[] suggestSimilar(String word, int numSug, IndexReader ir,
        String field, boolean morePopular) throws IOException {

      float min = this.minScore;
      final int lengthWord = word.length();

      final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
      final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
      // if the word exists in the real index and we don't care for word frequency, return the word itself
      if (!morePopular && freq > 0) {
        return new String[] { word };
      }

      BooleanQuery query = new BooleanQuery();
      String[] grams;
      String key;

      for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

        key = "gram" + ng; // form key

        grams = formGrams(word, ng); // form word into ngrams (allow dups too)

        if (grams.length == 0) {
          continue; // hmm
        }

        if (bStart > 0) { // should we boost prefixes?
          add(query, "start" + ng, grams[0], bStart); // matches start of word

        }
        if (bEnd > 0) { // should we boost suffixes
          add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word

        }
        for (int i = 0; i < grams.length; i++) {
          add(query, key, grams[i]);
        }
      }

//    System.out.println("Q: " + query);
      Hits hits = searcher.search(query);
//    System.out.println("HITS: " + hits.length());
      SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);

      // go thru more than 'maxr' matches in case the distance filter triggers
      int stop = Math.min(hits.length(), 10 * numSug);
      SuggestWord sugWord = new SuggestWord();
      for (int i = 0; i < stop; i++) {

        sugWord.string = hits.doc(i).get(F_WORD); // get orig word

        // don't suggest a word for itself, that would be silly
        if (sugWord.string.equals(word)) {
          continue;
        }

        // edit distance
        sugWord.score = sd.getDistance(word,sugWord.string);
        if (sugWord.score < min) {
          continue;
        }

        if (ir != null && field != null) { // use the user index
          sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
          // don't suggest a word that is not present in the field
          if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
            continue;
          }
        }
        sugQueue.insert(sugWord);
        if (sugQueue.size() == numSug) {
          // if queue full, maintain the minScore score
          min = ((SuggestWord) sugQueue.top()).score;
        }
        sugWord = new SuggestWord();
      }

      // convert to array string
      String[] list = new String[sugQueue.size()];
      for (int i = sugQueue.size() - 1; i >= 0; i--) {
        list[i] = ((SuggestWord) sugQueue.pop()).string;
      }

      return list;
    }

    /**
     * Add a clause to a boolean query.
     */
    private static void add(BooleanQuery q, String name, String value, float boost) {
      Query tq = new TermQuery(new Term(name, value));
      tq.setBoost(boost);
      q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
    }

    /**
     * Add a clause to a boolean query.
     */
    private static void add(BooleanQuery q, String name, String value) {
      q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
    }

    /**
     * Form all ngrams for a given word.
     * @param text the word to parse
     * @param ng the ngram length e.g. 3
     * @return an array of all ngrams in the word and note that duplicates are not removed
     */
    private static String[] formGrams(String text, int ng) {
      int len = text.length();
      String[] res = new String[len - ng + 1];
      for (int i = 0; i < len - ng + 1; i++) {
        res[i] = text.substring(i, i + ng);
      }
      return res;
    }

    /**
     * Removes all terms from the spell check index.
     * @throws IOException
     */
    public void clearIndex() throws IOException {
      IndexWriter writer = new IndexWriter(spellIndex, null, true);
      writer.close();

        // COMASS: Remove closing the searcher
      //close the old searcher
//      searcher.close();
//      searcher = new IndexSearcher(this.spellIndex);
    }

    /**
     * Check whether the word exists in the index.
     * @param word
     * @throws IOException
     * @return true iff the word exists in the index
     */
    public boolean exist(String word) throws IOException {
        // COMPASS: Adding check for index reader
        if (reader == null) {
          reader = IndexReader.open(spellIndex, true);
        }
      return reader.docFreq(new Term(F_WORD, word)) > 0;
    }

    /**
     * Indexes the data from the given {@link Dictionary}.
     * @param dict Dictionary to index
     * @param mergeFactor mergeFactor to use when indexing
     * @param ramMB the max amount or memory in MB to use
     * @throws IOException
     */
    public void indexDictionary(IndexWriter writer, Dictionary dict) throws IOException {
      Iterator iter = dict.getWordsIterator();
      while (iter.hasNext()) {
        String word = (String) iter.next();

        int len = word.length();
        if (len < 3) {
          continue; // too short we bail but "too long" is fine...
        }

        if (this.exist(word)) { // if the word already exist in the gramindex
          continue;
        }

        // ok index the word
        Document doc = createDocument(word, getMin(len), getMax(len));
        writer.addDocument(doc);
      }
        // close writer (REMOVED IN COMPASS), will do it on close
//    writer.optimize();
//    writer.close();
        // close reader so it will be re-opened (and see the new content) when exist()
        // is called the next time:
        if (reader != null) {
          reader.close();
          reader = null;
        }
        // also re-open the spell index to see our own changes when the next suggestion
        // is fetched:
        if (searcher != null) {
            searcher.close();
            searcher = new IndexSearcher(this.spellIndex);
        }
    }

    private int getMin(int l) {
      if (l > 5) {
        return 3;
      }
      if (l == 5) {
        return 2;
      }
      return 1;
    }

    private int getMax(int l) {
      if (l > 5) {
        return 4;
      }
      if (l == 5) {
        return 3;
      }
      return 2;
    }

    private static Document createDocument(String text, int ng1, int ng2) {
      Document doc = new Document();
      doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
      addGram(text, doc, ng1, ng2);
      return doc;
    }

    private static void addGram(String text, Document doc, int ng1, int ng2) {
      int len = text.length();
      for (int ng = ng1; ng <= ng2; ng++) {
        String key = "gram" + ng;
        String end = null;
        for (int i = 0; i < len - ng + 1; i++) {
          String gram = text.substring(i, i + ng);
          doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
          if (i == 0) {
            doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
          }
          end = gram;
        }
        if (end != null) { // may not be present if len==ng1
          doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
        }
      }
    }
}