All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.spell.SpellChecker Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
package org.apache.lucene.search.spell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.Version;

/**
 * 

* Spell Checker class (Main class)
* (initially inspired by the David Spencer code). *

* *

Example Usage: * *

 *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
 *  // To index a field of a user index:
 *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
 *  // To index a file containing words:
 *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
 *  String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
 * 
* * */ public class SpellChecker implements java.io.Closeable { /** * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} . */ public static final float DEFAULT_ACCURACY = 0.5f; /** * Field name for each word in the ngram index. */ public static final String F_WORD = "word"; /** * the spell index */ // don't modify the directory directly - see #swapSearcher() // TODO: why is this package private? Directory spellIndex; /** * Boost value for start and end grams */ private float bStart = 2.0f; private float bEnd = 1.0f; // don't use this searcher directly - see #swapSearcher() private IndexSearcher searcher; /* * this locks all modifications to the current searcher. */ private final Object searcherLock = new Object(); /* * this lock synchronizes all possible modifications to the * current index directory. It should not be possible to try modifying * the same index concurrently. Note: Do not acquire the searcher lock * before acquiring this lock! */ private final Object modifyCurrentIndexLock = new Object(); private volatile boolean closed = false; // minimum score for hits generated by the spell checker query private float accuracy = DEFAULT_ACCURACY; private StringDistance sd; private Comparator comparator; /** * Use the given directory as a spell checker index. The directory * is created if it doesn't exist yet. * @param spellIndex the spell index directory * @param sd the {@link StringDistance} measurement to use * @throws IOException if Spellchecker can not open the directory */ public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException { this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR); } /** * Use the given directory as a spell checker index with a * {@link LevensteinDistance} as the default {@link StringDistance}. The * directory is created if it doesn't exist yet. * * @param spellIndex * the spell index directory * @throws IOException * if spellchecker can not open the directory */ public SpellChecker(Directory spellIndex) throws IOException { this(spellIndex, new LevensteinDistance()); } /** * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure * and the given {@link java.util.Comparator} for sorting the results. * @param spellIndex The spelling index * @param sd The distance * @param comparator The comparator * @throws IOException if there is a problem opening the index */ public SpellChecker(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { setSpellIndex(spellIndex); setStringDistance(sd); this.comparator = comparator; } /** * Use a different index as the spell checker index or re-open * the existing index if spellIndex is the same value * as given in the constructor. * @param spellIndexDir the spell directory to use * @throws AlreadyClosedException if the Spellchecker is already closed * @throws IOException if spellchecker can not open the directory */ // TODO: we should make this final as it is called in the constructor public void setSpellIndex(Directory spellIndexDir) throws IOException { // this could be the same directory as the current spellIndex // modifications to the directory should be synchronized synchronized (modifyCurrentIndexLock) { ensureOpen(); if (!DirectoryReader.indexExists(spellIndexDir)) { IndexWriter writer = new IndexWriter(spellIndexDir, new IndexWriterConfig(Version.LUCENE_CURRENT, null)); writer.close(); } swapSearcher(spellIndexDir); } } /** * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}. * @param comparator the comparator */ public void setComparator(Comparator comparator) { this.comparator = comparator; } /** * Gets the comparator in use for ranking suggestions. * @see #setComparator(Comparator) */ public Comparator getComparator() { return comparator; } /** * Sets the {@link StringDistance} implementation for this * {@link SpellChecker} instance. * * @param sd the {@link StringDistance} implementation for this * {@link SpellChecker} instance */ public void setStringDistance(StringDistance sd) { this.sd = sd; } /** * Returns the {@link StringDistance} instance used by this * {@link SpellChecker} instance. * * @return the {@link StringDistance} instance used by this * {@link SpellChecker} instance. */ public StringDistance getStringDistance() { return sd; } /** * Sets the accuracy 0 < minScore < 1; default {@link #DEFAULT_ACCURACY} * @param acc The new accuracy */ public void setAccuracy(float acc) { this.accuracy = acc; } /** * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)}, to * decide whether a suggestion is included or not. * @return The current accuracy setting */ public float getAccuracy() { return accuracy; } /** * Suggest similar words. * *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms * is not the same as the edit distance strategy used to calculate the best * matching spell-checked word from the hits that Lucene found, one usually has * to retrieve a couple of numSug's in order to get the true best match. * *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * * @param word the word you want a spell check done on * @param numSug the number of suggested words * @throws IOException if the underlying index throws an {@link IOException} * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] * * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) */ public String[] suggestSimilar(String word, int numSug) throws IOException { return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); } /** * Suggest similar words. * *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms * is not the same as the edit distance strategy used to calculate the best * matching spell-checked word from the hits that Lucene found, one usually has * to retrieve a couple of numSug's in order to get the true best match. * *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * * @param word the word you want a spell check done on * @param numSug the number of suggested words * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results * @throws IOException if the underlying index throws an {@link IOException} * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] * * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) */ public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException { return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy); } /** * Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) * suggestSimilar(word, numSug, ir, suggestMode, field, this.accuracy)} * */ public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, SuggestMode suggestMode) throws IOException { return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy); } /** * Suggest similar words (optionally restricted to a field of an index). * *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms * is not the same as the edit distance strategy used to calculate the best * matching spell-checked word from the hits that Lucene found, one usually has * to retrieve a couple of numSug's in order to get the true best match. * *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * * @param word the word you want a spell check done on * @param numSug the number of suggested words * @param ir the indexReader of the user index (can be null see field param) * @param field the field of the user index: if field is not null, the suggested * words are restricted to the words present in this field. * @param suggestMode * (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results * @throws IOException if the underlying index throws an {@link IOException} * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] the sorted list of the suggest words with these 2 criteria: * first criteria: the edit distance, second criteria (only if restricted mode): the popularity * of the suggest words in the field of the user index * */ public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, SuggestMode suggestMode, float accuracy) throws IOException { // obtainSearcher calls ensureOpen final IndexSearcher indexSearcher = obtainSearcher(); try { if (ir == null || field == null) { suggestMode = SuggestMode.SUGGEST_ALWAYS; } if (suggestMode == SuggestMode.SUGGEST_ALWAYS) { ir = null; field = null; } final int lengthWord = word.length(); final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0; // if the word exists in the real index and we don't care for word frequency, return the word itself if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) { return new String[] { word }; } BooleanQuery query = new BooleanQuery(); String[] grams; String key; for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { key = "gram" + ng; // form key grams = formGrams(word, ng); // form word into ngrams (allow dups too) if (grams.length == 0) { continue; // hmm } if (bStart > 0) { // should we boost prefixes? add(query, "start" + ng, grams[0], bStart); // matches start of word } if (bEnd > 0) { // should we boost suffixes add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word } for (int i = 0; i < grams.length; i++) { add(query, key, grams[i]); } } int maxHits = 10 * numSug; // System.out.println("Q: " + query); ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; // System.out.println("HITS: " + hits.length()); SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator); // go thru more than 'maxr' matches in case the distance filter triggers int stop = Math.min(hits.length, maxHits); SuggestWord sugWord = new SuggestWord(); for (int i = 0; i < stop; i++) { sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word // don't suggest a word for itself, that would be silly if (sugWord.string.equals(word)) { continue; } // edit distance sugWord.score = sd.getDistance(word,sugWord.string); if (sugWord.score < accuracy) { continue; } if (ir != null && field != null) { // use the user index sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index // don't suggest a word that is not present in the field if ((suggestMode==SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) { continue; } } sugQueue.insertWithOverflow(sugWord); if (sugQueue.size() == numSug) { // if queue full, maintain the minScore score accuracy = sugQueue.top().score; } sugWord = new SuggestWord(); } // convert to array string String[] list = new String[sugQueue.size()]; for (int i = sugQueue.size() - 1; i >= 0; i--) { list[i] = sugQueue.pop().string; } return list; } finally { releaseSearcher(indexSearcher); } } /** * Add a clause to a boolean query. */ private static void add(BooleanQuery q, String name, String value, float boost) { Query tq = new TermQuery(new Term(name, value)); tq.setBoost(boost); q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD)); } /** * Add a clause to a boolean query. */ private static void add(BooleanQuery q, String name, String value) { q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD)); } /** * Form all ngrams for a given word. * @param text the word to parse * @param ng the ngram length e.g. 3 * @return an array of all ngrams in the word and note that duplicates are not removed */ private static String[] formGrams(String text, int ng) { int len = text.length(); String[] res = new String[len - ng + 1]; for (int i = 0; i < len - ng + 1; i++) { res[i] = text.substring(i, i + ng); } return res; } /** * Removes all terms from the spell check index. * @throws IOException If there is a low-level I/O error. * @throws AlreadyClosedException if the Spellchecker is already closed */ public void clearIndex() throws IOException { synchronized (modifyCurrentIndexLock) { ensureOpen(); final Directory dir = this.spellIndex; final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( Version.LUCENE_CURRENT, null) .setOpenMode(OpenMode.CREATE)); writer.close(); swapSearcher(dir); } } /** * Check whether the word exists in the index. * @param word word to check * @throws IOException If there is a low-level I/O error. * @throws AlreadyClosedException if the Spellchecker is already closed * @return true if the word exists in the index */ public boolean exist(String word) throws IOException { // obtainSearcher calls ensureOpen final IndexSearcher indexSearcher = obtainSearcher(); try{ // TODO: we should use ReaderUtil+seekExact, we dont care about the docFreq // this is just an existence check return indexSearcher.getIndexReader().docFreq(new Term(F_WORD, word)) > 0; } finally { releaseSearcher(indexSearcher); } } /** * Indexes the data from the given {@link Dictionary}. * @param dict Dictionary to index * @param config {@link IndexWriterConfig} to use * @param fullMerge whether or not the spellcheck index should be fully merged * @throws AlreadyClosedException if the Spellchecker is already closed * @throws IOException If there is a low-level I/O error. */ public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge) throws IOException { synchronized (modifyCurrentIndexLock) { ensureOpen(); final Directory dir = this.spellIndex; final IndexWriter writer = new IndexWriter(dir, config); IndexSearcher indexSearcher = obtainSearcher(); final List termsEnums = new ArrayList(); final IndexReader reader = searcher.getIndexReader(); if (reader.maxDoc() > 0) { for (final AtomicReaderContext ctx : reader.leaves()) { Terms terms = ctx.reader().terms(F_WORD); if (terms != null) termsEnums.add(terms.iterator(null)); } } boolean isEmpty = termsEnums.isEmpty(); try { BytesRefIterator iter = dict.getEntryIterator(); BytesRef currentTerm; terms: while ((currentTerm = iter.next()) != null) { String word = currentTerm.utf8ToString(); int len = word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... } if (!isEmpty) { for (TermsEnum te : termsEnums) { if (te.seekExact(currentTerm)) { continue terms; } } } // ok index the word Document doc = createDocument(word, getMin(len), getMax(len)); writer.addDocument(doc); } } finally { releaseSearcher(indexSearcher); } if (fullMerge) { writer.forceMerge(1); } // close writer writer.close(); // TODO: this isn't that great, maybe in the future SpellChecker should take // IWC in its ctor / keep its writer open? // also re-open the spell index to see our own changes when the next suggestion // is fetched: swapSearcher(dir); } } private static int getMin(int l) { if (l > 5) { return 3; } if (l == 5) { return 2; } return 1; } private static int getMax(int l) { if (l > 5) { return 4; } if (l == 5) { return 3; } return 2; } private static Document createDocument(String text, int ng1, int ng2) { Document doc = new Document(); // the word field is never queried on... its indexed so it can be quickly // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos Field f = new StringField(F_WORD, text, Field.Store.YES); doc.add(f); // orig term addGram(text, doc, ng1, ng2); return doc; } private static void addGram(String text, Document doc, int ng1, int ng2) { int len = text.length(); for (int ng = ng1; ng <= ng2; ng++) { String key = "gram" + ng; String end = null; for (int i = 0; i < len - ng + 1; i++) { String gram = text.substring(i, i + ng); FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); Field ngramField = new Field(key, gram, ft); // spellchecker does not use positional queries, but we want freqs // for scoring these multivalued n-gram fields. doc.add(ngramField); if (i == 0) { // only one term possible in the startXXField, TF/pos and norms aren't needed. Field startField = new StringField("start" + ng, gram, Field.Store.NO); doc.add(startField); } end = gram; } if (end != null) { // may not be present if len==ng1 // only one term possible in the endXXField, TF/pos and norms aren't needed. Field endField = new StringField("end" + ng, end, Field.Store.NO); doc.add(endField); } } } private IndexSearcher obtainSearcher() { synchronized (searcherLock) { ensureOpen(); searcher.getIndexReader().incRef(); return searcher; } } private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{ // don't check if open - always decRef // don't decrement the private searcher - could have been swapped aSearcher.getIndexReader().decRef(); } private void ensureOpen() { if (closed) { throw new AlreadyClosedException("Spellchecker has been closed"); } } /** * Close the IndexSearcher used by this SpellChecker * @throws IOException if the close operation causes an {@link IOException} * @throws AlreadyClosedException if the {@link SpellChecker} is already closed */ @Override public void close() throws IOException { synchronized (searcherLock) { ensureOpen(); closed = true; if (searcher != null) { searcher.getIndexReader().close(); } searcher = null; } } private void swapSearcher(final Directory dir) throws IOException { /* * opening a searcher is possibly very expensive. * We rather close it again if the Spellchecker was closed during * this operation than block access to the current searcher while opening. */ final IndexSearcher indexSearcher = createSearcher(dir); synchronized (searcherLock) { if(closed){ indexSearcher.getIndexReader().close(); throw new AlreadyClosedException("Spellchecker has been closed"); } if (searcher != null) { searcher.getIndexReader().close(); } // set the spellindex in the sync block - ensure consistency. searcher = indexSearcher; this.spellIndex = dir; } } /** * Creates a new read-only IndexSearcher * @param dir the directory used to open the searcher * @return a new read-only IndexSearcher * @throws IOException f there is a low-level IO error */ // for testing purposes IndexSearcher createSearcher(final Directory dir) throws IOException{ return new IndexSearcher(DirectoryReader.open(dir)); } /** * Returns true if and only if the {@link SpellChecker} is * closed, otherwise false. * * @return true if and only if the {@link SpellChecker} is * closed, otherwise false. */ boolean isClosed(){ return closed; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy