// org.apache.lucene.search.spell.SpellChecker
// (source listing taken from the aem-sdk-api artifact)
package org.apache.lucene.search.spell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.Version;
/**
 * <p>
 * Spell Checker class (Main class)
 * (initially inspired by the David Spencer code).
 * </p>
 *
 * <p>Example Usage:</p>
 *
 * <pre>
 * SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
 * // To index a field of a user index:
 * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field),
 *     new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer), true);
 * // To index a file containing words:
 * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")),
 *     new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer), true);
 * String[] suggestions = spellchecker.suggestSimilar("misspelt", 5);
 * </pre>
 */
public class SpellChecker implements java.io.Closeable {
/**
 * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)}.
 */
public static final float DEFAULT_ACCURACY = 0.5f;

/**
 * Field name for each word in the ngram index.
 */
public static final String F_WORD = "word";

/**
 * The spell index directory.
 */
// don't modify the directory directly - see #swapSearcher()
// TODO: why is this package private?
Directory spellIndex;

/**
 * Boost value for the clause matching the first n-gram of the word
 * (the "start" fields). Never reassigned, hence final.
 */
private final float bStart = 2.0f;

/**
 * Boost value for the clause matching the last n-gram of the word
 * (the "end" fields). Never reassigned, hence final.
 */
private final float bEnd = 1.0f;

// don't use this searcher directly - see #swapSearcher()
private IndexSearcher searcher;

/*
 * this locks all modifications to the current searcher.
 */
private final Object searcherLock = new Object();

/*
 * this lock synchronizes all possible modifications to the
 * current index directory. It should not be possible to try modifying
 * the same index concurrently. Note: Do not acquire the searcher lock
 * before acquiring this lock!
 */
private final Object modifyCurrentIndexLock = new Object();

// set once by close(); volatile so ensureOpen() can read it without holding a lock
private volatile boolean closed = false;

// minimum score for hits generated by the spell checker query
private float accuracy = DEFAULT_ACCURACY;

// string distance used to score each candidate against the input word
private StringDistance sd;

// ordering handed to the SuggestWordQueue that ranks the suggestions
private Comparator comparator;
/**
 * Creates a spell checker over the given directory that ranks suggestions
 * with {@link SuggestWordQueue#DEFAULT_COMPARATOR}. All work is delegated to
 * the three-argument constructor.
 *
 * @param spellIndex the spell index directory
 * @param sd the {@link StringDistance} measurement to use
 * @throws IOException if Spellchecker can not open the directory
 */
public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
  this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
}
/**
 * Creates a spell checker over the given directory, using a fresh
 * {@link LevensteinDistance} as the {@link StringDistance}. Delegates to
 * the two-argument constructor.
 *
 * @param spellIndex the spell index directory
 * @throws IOException if spellchecker can not open the directory
 */
public SpellChecker(Directory spellIndex) throws IOException {
  this(spellIndex, new LevensteinDistance());
}
/**
 * Creates a spell checker over the given directory with an explicit
 * {@link org.apache.lucene.search.spell.StringDistance} measure and an explicit
 * {@link java.util.Comparator} for sorting the results.
 *
 * <p>The index is opened first (via {@link #setSpellIndex(Directory)}), so a
 * failure to open it surfaces before the other settings are stored.
 *
 * @param spellIndex The spelling index
 * @param sd The distance
 * @param comparator The comparator
 * @throws IOException if there is a problem opening the index
 */
public SpellChecker(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException {
  // Order matters: open the index, then record the scoring and ranking strategies.
  this.setSpellIndex(spellIndex);
  this.setStringDistance(sd);
  this.comparator = comparator;
}
/**
 * Use a different index as the spell checker index or re-open
 * the existing index if <code>spellIndexDir</code> is the same value
 * as given in the constructor.
 *
 * <p>If {@link DirectoryReader#indexExists(Directory)} reports no index in the
 * directory, an {@link IndexWriter} is opened and closed on it before the
 * searcher is swapped.
 *
 * @param spellIndexDir the spell directory to use
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @throws IOException if spellchecker can not open the directory
 */
// TODO: we should make this final as it is called in the constructor
public void setSpellIndex(Directory spellIndexDir) throws IOException {
  // The directory may be the one already in use, so every modification is
  // serialized on the index-modification lock.
  synchronized (modifyCurrentIndexLock) {
    ensureOpen();
    final boolean indexMissing = !DirectoryReader.indexExists(spellIndexDir);
    if (indexMissing) {
      // presumably open+close leaves an empty index behind for the reader below - verify
      final IndexWriterConfig emptyConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, null);
      final IndexWriter bootstrapWriter = new IndexWriter(spellIndexDir, emptyConfig);
      bootstrapWriter.close();
    }
    swapSearcher(spellIndexDir);
  }
}
/**
 * Replaces the {@link java.util.Comparator} handed to the
 * {@link SuggestWordQueue} when suggestions are ranked.
 *
 * @param comparator the comparator
 */
public void setComparator(Comparator comparator) {
  final Comparator replacement = comparator;
  this.comparator = replacement;
}
/**
 * Returns the comparator currently used for ranking suggestions.
 *
 * @return the comparator last stored by the constructor or by
 *         {@link #setComparator(Comparator)}
 * @see #setComparator(Comparator)
 */
public Comparator getComparator() {
  return this.comparator;
}
/**
 * Replaces the {@link StringDistance} implementation that this
 * {@link SpellChecker} uses to score candidates.
 *
 * @param sd the {@link StringDistance} implementation for this
 *        {@link SpellChecker} instance
 */
public void setStringDistance(StringDistance sd) {
  final StringDistance replacement = sd;
  this.sd = replacement;
}
/**
 * Returns the {@link StringDistance} currently used by this
 * {@link SpellChecker} to score candidates.
 *
 * @return the {@link StringDistance} instance used by this
 *         {@link SpellChecker} instance.
 */
public StringDistance getStringDistance() {
  return this.sd;
}
/**
 * Sets the accuracy (minimum score), intended to satisfy
 * 0 &lt; minScore &lt; 1; default {@link #DEFAULT_ACCURACY}.
 * The value is stored as given; no range check is performed here.
 *
 * @param acc The new accuracy
 */
public void setAccuracy(float acc) {
  final float minScore = acc;
  this.accuracy = minScore;
}
/**
 * The accuracy (minimum score) to be used, unless overridden in
 * {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)}, to
 * decide whether a suggestion is included or not.
 *
 * @return The current accuracy setting
 */
public float getAccuracy() {
  return this.accuracy;
}
/**
 * Suggest similar words.
 *
 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 * is not the same as the edit distance strategy used to calculate the best
 * matching spell-checked word from the hits that Lucene found, one usually has
 * to retrieve a couple of numSug's in order to get the true best match.
 *
 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 *
 * <p>No user index or field is supplied, so the suggestions are not restricted
 * to any field of a user index.
 *
 * @param word the word you want a spell check done on
 * @param numSug the number of suggested words
 * @throws IOException if the underlying index throws an {@link IOException}
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @return String[]
 *
 * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 */
public String[] suggestSimilar(String word, int numSug) throws IOException {
  // Typed nulls: no user index and no field restriction.
  final IndexReader noReader = null;
  final String noField = null;
  return suggestSimilar(word, numSug, noReader, noField, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
}
/**
 * Suggest similar words, using an explicit minimum score.
 *
 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 * is not the same as the edit distance strategy used to calculate the best
 * matching spell-checked word from the hits that Lucene found, one usually has
 * to retrieve a couple of numSug's in order to get the true best match.
 *
 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 *
 * @param word the word you want a spell check done on
 * @param numSug the number of suggested words
 * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
 * @throws IOException if the underlying index throws an {@link IOException}
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @return String[]
 *
 * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 */
public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
  // Typed nulls: no user index and no field restriction.
  final IndexReader noReader = null;
  final String noField = null;
  return suggestSimilar(word, numSug, noReader, noField, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
}
/**
 * Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
 * suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy)},
 * i.e. uses the accuracy currently configured on this instance.
 */
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
    String field, SuggestMode suggestMode) throws IOException {
  final float configuredAccuracy = this.accuracy;
  return suggestSimilar(word, numSug, ir, field, suggestMode, configuredAccuracy);
}
/**
 * Suggest similar words (optionally restricted to a field of an index).
 *
 * <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
 * is not the same as the edit distance strategy used to calculate the best
 * matching spell-checked word from the hits that Lucene found, one usually has
 * to retrieve a couple of numSug's in order to get the true best match.
 *
 * <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
 * Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
 *
 * <p>An empty {@code word} produces a query without clauses instead of failing
 * while its n-grams are formed.
 *
 * @param word the word you want a spell check done on
 * @param numSug the number of suggested words
 * @param ir the indexReader of the user index (can be null see field param)
 * @param field the field of the user index: if field is not null, the suggested
 * words are restricted to the words present in this field.
 * @param suggestMode
 * (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
 * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
 * @throws IOException if the underlying index throws an {@link IOException}
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @return String[] the sorted list of the suggest words with these 2 criteria:
 * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
 * of the suggest words in the field of the user index
 *
 */
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
    String field, SuggestMode suggestMode, float accuracy) throws IOException {
  // obtainSearcher calls ensureOpen
  final IndexSearcher indexSearcher = obtainSearcher();
  try {
    // Without both a reader and a field there is no user index to consult.
    if (ir == null || field == null) {
      suggestMode = SuggestMode.SUGGEST_ALWAYS;
    }
    if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
      ir = null;
      field = null;
    }

    final int lengthWord = word.length();
    final boolean useUserIndex = ir != null && field != null;
    final int freq = useUserIndex ? ir.docFreq(new Term(field, word)) : 0;
    final int goalFreq = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
    // if the word exists in the real index and we don't care for word frequency, return the word itself
    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
      return new String[] { word };
    }

    final BooleanQuery query = buildNGramQuery(word, lengthWord);

    // Fetch more hits than requested: candidates are filtered by distance below.
    final int maxHits = 10 * numSug;
    final ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
    final SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);

    final int stop = Math.min(hits.length, maxHits);
    // The same SuggestWord is reused until it is actually inserted into the queue.
    SuggestWord sugWord = new SuggestWord();
    for (int i = 0; i < stop; i++) {
      sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word

      // don't suggest a word for itself, that would be silly
      if (sugWord.string.equals(word)) {
        continue;
      }

      // edit distance
      sugWord.score = sd.getDistance(word, sugWord.string);
      if (sugWord.score < accuracy) {
        continue;
      }

      if (useUserIndex) { // use the user index
        sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
        // don't suggest a word that is not present in the field
        final boolean lessPopular =
            suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq;
        if (lessPopular || sugWord.freq < 1) {
          continue;
        }
      }

      sugQueue.insertWithOverflow(sugWord);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        accuracy = sugQueue.top().score;
      }
      sugWord = new SuggestWord();
    }

    // convert to array string; the queue pops in reverse order, so fill from the back
    final String[] list = new String[sugQueue.size()];
    for (int i = sugQueue.size() - 1; i >= 0; i--) {
      list[i] = sugQueue.pop().string;
    }
    return list;
  } finally {
    releaseSearcher(indexSearcher);
  }
}

/**
 * Builds the n-gram query for a word: one optional clause per n-gram, plus a
 * boosted clause for the first and for the last n-gram of every gram size
 * between {@code getMin(lengthWord)} and {@code getMax(lengthWord)}.
 */
private BooleanQuery buildNGramQuery(String word, int lengthWord) {
  final BooleanQuery query = new BooleanQuery();
  for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
    if (lengthWord < ng) {
      // A word shorter than ng has no grams of this size. Skipping here also
      // keeps an empty word away from formGrams, whose array size would
      // otherwise be negative.
      continue;
    }
    final String key = "gram" + ng; // form key
    final String[] grams = formGrams(word, ng); // form word into ngrams (allow dups too)

    if (bStart > 0) { // should we boost prefixes?
      add(query, "start" + ng, grams[0], bStart); // matches start of word
    }
    if (bEnd > 0) { // should we boost suffixes
      add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
    }
    for (final String gram : grams) {
      add(query, key, gram);
    }
  }
  return query;
}
/**
 * Adds an optional (SHOULD) term clause with the given boost to a boolean query.
 */
private static void add(BooleanQuery q, String name, String value, float boost) {
  final Term term = new Term(name, value);
  final Query boostedQuery = new TermQuery(term);
  boostedQuery.setBoost(boost);
  final BooleanClause clause = new BooleanClause(boostedQuery, BooleanClause.Occur.SHOULD);
  q.add(clause);
}
/**
 * Adds an optional (SHOULD) term clause, without an explicit boost, to a boolean query.
 */
private static void add(BooleanQuery q, String name, String value) {
  final Term term = new Term(name, value);
  final TermQuery termQuery = new TermQuery(term);
  final BooleanClause clause = new BooleanClause(termQuery, BooleanClause.Occur.SHOULD);
  q.add(clause);
}
/**
 * Form all ngrams for a given word.
 *
 * @param text the word to parse
 * @param ng the ngram length e.g. 3
 * @return an array of all ngrams in the word and note that duplicates are not removed;
 *         empty when the word is shorter than {@code ng}
 */
private static String[] formGrams(String text, int ng) {
  final int len = text.length();
  // A word shorter than ng has no n-grams. Clamp the count so the array size
  // can never be negative (e.g. the empty word with ng == 2).
  final int count = Math.max(0, len - ng + 1);
  final String[] res = new String[count];
  for (int i = 0; i < count; i++) {
    res[i] = text.substring(i, i + ng);
  }
  return res;
}
/**
 * Removes all terms from the spell check index by re-opening it with
 * {@link OpenMode#CREATE} and then swapping in a searcher over the result.
 *
 * @throws IOException If there is a low-level I/O error.
 * @throws AlreadyClosedException if the Spellchecker is already closed
 */
public void clearIndex() throws IOException {
  synchronized (modifyCurrentIndexLock) {
    ensureOpen();
    final Directory dir = this.spellIndex;
    final IndexWriterConfig recreateConfig =
        new IndexWriterConfig(Version.LUCENE_CURRENT, null).setOpenMode(OpenMode.CREATE);
    final IndexWriter recreatingWriter = new IndexWriter(dir, recreateConfig);
    recreatingWriter.close();
    swapSearcher(dir);
  }
}
/**
 * Check whether the word exists in the index, i.e. whether at least one
 * document carries it in the {@link #F_WORD} field.
 *
 * @param word word to check
 * @throws IOException If there is a low-level I/O error.
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @return true if the word exists in the index
 */
public boolean exist(String word) throws IOException {
  // obtainSearcher calls ensureOpen
  final IndexSearcher indexSearcher = obtainSearcher();
  try {
    // TODO: we should use ReaderUtil+seekExact, we dont care about the docFreq
    // this is just an existence check
    final IndexReader reader = indexSearcher.getIndexReader();
    final int docFreq = reader.docFreq(new Term(F_WORD, word));
    return docFreq > 0;
  } finally {
    releaseSearcher(indexSearcher);
  }
}
/**
 * Indexes the data from the given {@link Dictionary}.
 *
 * <p>Words shorter than three characters are skipped, as are words already
 * present in the {@link #F_WORD} field of the current spell index. If indexing
 * fails, the writer is rolled back so that it is not left open.
 *
 * @param dict Dictionary to index
 * @param config {@link IndexWriterConfig} to use
 * @param fullMerge whether or not the spellcheck index should be fully merged
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @throws IOException If there is a low-level I/O error.
 */
public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge) throws IOException {
  synchronized (modifyCurrentIndexLock) {
    ensureOpen();
    final Directory dir = this.spellIndex;
    final IndexWriter writer = new IndexWriter(dir, config);
    boolean success = false;
    try {
      final IndexSearcher indexSearcher = obtainSearcher();
      try {
        // Read through the searcher obtained above (its reader is ref-counted),
        // never through the shared field, which close() may null concurrently.
        final IndexReader reader = indexSearcher.getIndexReader();
        final List<TermsEnum> termsEnums = new ArrayList<TermsEnum>();
        if (reader.maxDoc() > 0) {
          for (final AtomicReaderContext ctx : reader.leaves()) {
            final Terms terms = ctx.reader().terms(F_WORD);
            if (terms != null) {
              termsEnums.add(terms.iterator(null));
            }
          }
        }
        final boolean isEmpty = termsEnums.isEmpty();

        final BytesRefIterator iter = dict.getEntryIterator();
        BytesRef currentTerm;
        nextTerm: while ((currentTerm = iter.next()) != null) {
          final String word = currentTerm.utf8ToString();
          final int len = word.length();
          if (len < 3) {
            continue; // too short we bail but "too long" is fine...
          }

          if (!isEmpty) {
            // skip words that some segment of the spell index already contains
            for (final TermsEnum te : termsEnums) {
              if (te.seekExact(currentTerm)) {
                continue nextTerm;
              }
            }
          }

          // ok index the word
          final Document doc = createDocument(word, getMin(len), getMax(len));
          writer.addDocument(doc);
        }
      } finally {
        releaseSearcher(indexSearcher);
      }

      if (fullMerge) {
        writer.forceMerge(1);
      }
      // close writer
      writer.close();
      success = true;
    } finally {
      if (!success) {
        try {
          // Discard the partial work and close the writer.
          writer.rollback();
        } catch (IOException ignored) {
          // the failure that brought us here is the one worth reporting
        }
      }
    }
    // TODO: this isn't that great, maybe in the future SpellChecker should take
    // IWC in its ctor / keep its writer open?

    // also re-open the spell index to see our own changes when the next suggestion
    // is fetched:
    swapSearcher(dir);
  }
}
/**
 * Returns the smallest n-gram size used for a word of length {@code l}:
 * 3 for more than five characters, 2 for exactly five, otherwise 1.
 */
private static int getMin(int l) {
  return l > 5 ? 3 : (l == 5 ? 2 : 1);
}
/**
 * Returns the largest n-gram size used for a word of length {@code l}:
 * 4 for more than five characters, 3 for exactly five, otherwise 2.
 */
private static int getMax(int l) {
  return l > 5 ? 4 : (l == 5 ? 3 : 2);
}
/**
 * Builds the index document for one word: the stored word itself plus its
 * n-gram fields for every gram size from {@code ng1} to {@code ng2}.
 */
private static Document createDocument(String text, int ng1, int ng2) {
  final Document wordDoc = new Document();
  // the word field is never queried on... its indexed so it can be quickly
  // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
  wordDoc.add(new StringField(F_WORD, text, Field.Store.YES)); // orig term
  addGram(text, wordDoc, ng1, ng2);
  return wordDoc;
}
/**
 * Adds the n-gram fields of {@code text} to {@code doc} for every gram size
 * from {@code ng1} to {@code ng2}: one {@code gramN} field per n-gram, a
 * {@code startN} field for the first n-gram and an {@code endN} field for the
 * last one. Gram sizes longer than the word contribute nothing.
 */
private static void addGram(String text, Document doc, int ng1, int ng2) {
  // spellchecker does not use positional queries, but we want freqs
  // for scoring these multivalued n-gram fields. The type is identical for
  // every gram, so it is built once instead of once per gram.
  final FieldType gramType = new FieldType(StringField.TYPE_NOT_STORED);
  gramType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

  final int len = text.length();
  for (int ng = ng1; ng <= ng2; ng++) {
    final String key = "gram" + ng;
    String end = null;
    for (int i = 0; i < len - ng + 1; i++) {
      final String gram = text.substring(i, i + ng);
      doc.add(new Field(key, gram, gramType));
      if (i == 0) {
        // only one term possible in the startXXField, TF/pos and norms aren't needed.
        doc.add(new StringField("start" + ng, gram, Field.Store.NO));
      }
      end = gram;
    }
    if (end != null) { // stays null when the word is shorter than ng (no grams at all)
      // only one term possible in the endXXField, TF/pos and norms aren't needed.
      doc.add(new StringField("end" + ng, end, Field.Store.NO));
    }
  }
}
/**
 * Returns the current searcher after incrementing the reference count of its
 * reader; every call must be paired with {@code releaseSearcher}.
 *
 * @throws AlreadyClosedException if the Spellchecker is already closed
 */
private IndexSearcher obtainSearcher() {
  synchronized (searcherLock) {
    ensureOpen();
    final IndexSearcher current = searcher;
    current.getIndexReader().incRef();
    return current;
  }
}
/**
 * Decrements the reference count of the reader behind the given searcher.
 */
private void releaseSearcher(final IndexSearcher aSearcher) throws IOException {
  // don't check if open - always decRef
  // don't decrement the private searcher - could have been swapped
  final IndexReader obtainedReader = aSearcher.getIndexReader();
  obtainedReader.decRef();
}
/**
 * Fails if this spell checker has been closed; returns normally otherwise.
 *
 * @throws AlreadyClosedException if the Spellchecker is already closed
 */
private void ensureOpen() {
  if (!closed) {
    return;
  }
  throw new AlreadyClosedException("Spellchecker has been closed");
}
/**
 * Close the IndexSearcher used by this SpellChecker. The instance is marked
 * closed before the reader is closed.
 *
 * @throws IOException if the close operation causes an {@link IOException}
 * @throws AlreadyClosedException if the {@link SpellChecker} is already closed
 */
@Override
public void close() throws IOException {
  synchronized (searcherLock) {
    ensureOpen();
    closed = true;
    final IndexSearcher current = searcher;
    if (current != null) {
      current.getIndexReader().close();
    }
    searcher = null;
  }
}
/**
 * Opens a searcher over {@code dir} and installs it, together with
 * {@code dir} itself, as the current searcher and spell index. The reader of
 * the previously installed searcher, if any, is closed.
 *
 * @throws AlreadyClosedException if the Spellchecker was closed in the meantime;
 *         the freshly opened reader is closed again in that case
 */
private void swapSearcher(final Directory dir) throws IOException {
  /*
   * opening a searcher is possibly very expensive.
   * We rather close it again if the Spellchecker was closed during
   * this operation than block access to the current searcher while opening.
   */
  final IndexSearcher fresh = createSearcher(dir);
  synchronized (searcherLock) {
    if (closed) {
      fresh.getIndexReader().close();
      throw new AlreadyClosedException("Spellchecker has been closed");
    }
    final IndexSearcher previous = searcher;
    if (previous != null) {
      previous.getIndexReader().close();
    }
    // set the spellindex in the sync block - ensure consistency.
    searcher = fresh;
    this.spellIndex = dir;
  }
}
/**
 * Creates a new read-only IndexSearcher over a reader freshly opened on the
 * given directory.
 *
 * @param dir the directory used to open the searcher
 * @return a new read-only IndexSearcher
 * @throws IOException if there is a low-level IO error
 */
// for testing purposes
IndexSearcher createSearcher(final Directory dir) throws IOException {
  final DirectoryReader reader = DirectoryReader.open(dir);
  return new IndexSearcher(reader);
}
/**
 * Returns <code>true</code> if and only if the {@link SpellChecker} is
 * closed, otherwise <code>false</code>.
 *
 * @return <code>true</code> if and only if the {@link SpellChecker} is
 *         closed, otherwise <code>false</code>.
 */
boolean isClosed() {
  return this.closed;
}
}