org.apache.solr.handler.SpellCheckerRequestHandler Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.HighFrequencyDictionary;
import org.apache.solr.util.plugin.SolrCoreAware;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Takes a string (e.g. a query string) as the value of the "q" parameter
* and looks up alternative spelling suggestions in the spellchecker.
* The spellchecker used by this handler is the Lucene contrib SpellChecker.
*
*
* The result identifies the original words by echoing them as an entry with the
* name "words" and the original word value. It
* also indicates whether the requested "words" are contained in the index through
* the "exist" entry, whose value is true or false. Examples of these output
* parameters in the standard output format are as follows:
*
<str name="words">facial</str>
<str name="exist">true</str>
*
* If a query string parameter of "extendedResults" is used, then each word within the
* "q" parameter (separated by a space or +) will
* be iterated through the spell checker and will be wrapped in an
* NamedList. Each word will then get its own set of results: words, exists, and
* suggestions.
* NOTE : Query terms are simply split on whitespace when using extendedResults mode. This may not be adequate.
* See the {@link org.apache.solr.handler.component.SpellCheckComponent} for alternatives.
*
* Also note that multiword queries will be treated as a single term if extendedResults is false. This may or may not make sense
* depending on how the spelling field was indexed.
*
* Examples of the use of the standard output (XML) without and with the
* use of the "extendedResults" parameter are as follows.
*
* The following URL
* examples were configured with the solr.SpellCheckerRequestHandler
* named as "/spellchecker".
*
* Without the use of "extendedResults" and one word
* spelled correctly: facial
* http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial
*
<?xml version="1.0" encoding="UTF-8"?>
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">6</int>
</lst>
<str name="words">facial</str>
<str name="exist">true</str>
<arr name="suggestions">
<str>faciale</str>
<str>faucial</str>
<str>fascial</str>
<str>facing</str>
<str>faciei</str>
<str>facialis</str>
<str>social</str>
<str>facile</str>
<str>spacial</str>
<str>glacial</str>
<str>marcial</str>
<str>facies</str>
<str>facio</str>
</arr>
</response>
*
* Without the use of "extendedResults" and two words,
* one spelled correctly and one misspelled: facial salophosphoprotein
* http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial+salophosphoprotein
*
<?xml version="1.0" encoding="UTF-8"?>
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">18</int>
</lst>
<str name="words">facial salophosphoprotein</str>
<str name="exist">false</str>
<arr name="suggestions">
<str>sialophosphoprotein</str>
</arr>
</response>
*
*
* With the use of "extendedResults" and two words,
* one spelled correctly and one misspelled: facial salophosphoprotein
* http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&extendedResults=true&q=facial+salophosphoprotein
*
<?xml version="1.0" encoding="UTF-8"?>
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">23</int>
</lst>
<lst name="result">
<lst name="facial">
<int name="frequency">1</int>
<lst name="suggestions">
<lst name="faciale"><int name="frequency">1</int></lst>
<lst name="faucial"><int name="frequency">1</int></lst>
<lst name="fascial"><int name="frequency">1</int></lst>
<lst name="facing"><int name="frequency">1</int></lst>
<lst name="faciei"><int name="frequency">1</int></lst>
<lst name="facialis"><int name="frequency">1</int></lst>
<lst name="social"><int name="frequency">1</int></lst>
<lst name="facile"><int name="frequency">1</int></lst>
<lst name="spacial"><int name="frequency">1</int></lst>
<lst name="glacial"><int name="frequency">1</int></lst>
<lst name="marcial"><int name="frequency">1</int></lst>
<lst name="facies"><int name="frequency">1</int></lst>
<lst name="facio"><int name="frequency">1</int></lst>
</lst>
</lst>
<lst name="salophosphoprotein">
<int name="frequency">0</int>
<lst name="suggestions">
<lst name="sialophosphoprotein"><int name="frequency">1</int></lst>
<lst name="phosphoprotein"><int name="frequency">1</int></lst>
<lst name="phosphoproteins"><int name="frequency">1</int></lst>
<lst name="alphalipoprotein"><int name="frequency">1</int></lst>
</lst>
</lst>
</lst>
</response>
*
* @see The Lucene Spellchecker documentation
*
*
* @deprecated Use {@link org.apache.solr.handler.component.SpellCheckComponent} instead.
*
* See also https://issues.apache.org/jira/browse/SOLR-474 and https://issues.apache.org/jira/browse/SOLR-485
*
*/
@Deprecated
public class SpellCheckerRequestHandler extends RequestHandlerBase implements SolrCoreAware {
// Logger for this handler class.
private static Logger log = LoggerFactory.getLogger(SpellCheckerRequestHandler.class);
// Lucene spell checker backing this handler; instantiated in inform(SolrCore).
private SpellChecker spellChecker;
/*
* From http://wiki.apache.org/jakarta-lucene/SpellChecker
* If reader and restrictToField are both not null:
* 1. The returned words are restricted only to the words present in the field
* "restrictToField" of the Lucene Index "reader".
*
* 2. The list is also sorted with a second criterion: the popularity (the
* frequency) of the word in the user field.
*
* 3. If "onlyMorePopular" is true and the misspelled word exists in the user field,
* return only the words more frequent than this.
*
*/
// Directory holding the spell checker index. Starts out as an in-memory
// RAMDirectory and is replaced in inform(SolrCore) when an index directory
// is configured.
protected Directory spellcheckerIndexDir = new RAMDirectory();
// Description of the index location used in log output; becomes the
// absolute path of the index directory when one is configured.
protected String dirDescription = "(ramdir)";
// Value of the term source field init argument, read in inform(SolrCore).
protected String termSourceField;
// Common prefix of the parameter names defined below.
protected static final String PREFIX = "sp.";
// Prefix of the query-time parameter names.
protected static final String QUERY_PREFIX = PREFIX + "query.";
// Prefix of the dictionary-related parameter names.
protected static final String DICTIONARY_PREFIX = PREFIX + "dictionary.";
// Init argument naming the term source field; inform(SolrCore) falls back
// to the unprefixed "termSourceField" when this one is absent.
protected static final String SOURCE_FIELD = DICTIONARY_PREFIX + "termSourceField";
// Init argument naming the index directory; inform(SolrCore) falls back to
// the unprefixed "spellcheckerIndexDir" when this one is absent.
protected static final String INDEX_DIR = DICTIONARY_PREFIX + "indexDir";
// Parameter name for the dictionary threshold.
// NOTE(review): not referenced in the portion of the file reviewed here;
// confirm where it is consumed.
protected static final String THRESHOLD = DICTIONARY_PREFIX + "threshold";
// Request parameter for the spell checker accuracy; the request handling
// code falls back to the unprefixed "accuracy" parameter.
protected static final String ACCURACY = QUERY_PREFIX + "accuracy";
// Request parameter for the number of suggestions; the request handling
// code falls back to the unprefixed "suggestionCount" parameter.
protected static final String SUGGESTIONS = QUERY_PREFIX + "suggestionCount";
// Request parameter for the "only more popular" flag.
protected static final String POPULAR = QUERY_PREFIX + "onlyMorePopular";
// Request parameter for the extended results flag.
protected static final String EXTENDED = QUERY_PREFIX + "extendedResults";
// Accuracy used when the request does not supply one.
protected static final float DEFAULT_ACCURACY = 0.5f;
// Suggestion count used when the request does not supply one.
protected static final int DEFAULT_SUGGESTION_COUNT = 1;
// Value of the "only more popular" flag when the request does not supply one.
protected static final boolean DEFAULT_MORE_POPULAR = false;
// Value of the extended results flag when the request does not supply one.
protected static final boolean DEFAULT_EXTENDED_RESULTS = false;
// Presumably the fallback for the THRESHOLD parameter.
// NOTE(review): not referenced in the portion of the file reviewed here.
protected static final float DEFAULT_DICTIONARY_THRESHOLD = 0.0f;
// Init arguments converted to SolrParams in init(NamedList) and read in
// inform(SolrCore); null until init(NamedList) has run.
protected SolrParams args = null;
/**
 * Initializes the handler: delegates to the superclass and keeps a
 * {@link SolrParams} view of the supplied arguments in the {@code args}
 * field, where {@link #inform(SolrCore)} later reads it.
 *
 * @param args the initialization arguments for this handler
 */
@Override
public void init(NamedList args) {
  super.init(args);
  final SolrParams initParams = SolrParams.toSolrParams(args);
  this.args = initParams;
}
/**
 * Reads the term source field and the index directory from the init
 * arguments and creates the {@link SpellChecker}.
 *
 * When an index directory is configured, a relative path is resolved
 * against the core's data directory and a filesystem directory is opened
 * for it; otherwise the directory already held in
 * {@code spellcheckerIndexDir} is used.
 *
 * @param core the core whose data directory anchors a relative index path
 * @throws RuntimeException if opening the directory or creating the
 *         spell checker fails with an {@link IOException}
 */
public void inform(SolrCore core)
{
  termSourceField = args.get(SOURCE_FIELD, args.get("termSourceField"));
  // The prefixed parameter name wins; the unprefixed name is the fallback.
  final String configuredDir = args.get(INDEX_DIR, args.get("spellcheckerIndexDir"));
  try {
    if (configuredDir == null) {
      log.info("using RAM based spell directory");
    } else {
      File indexLocation = new File(configuredDir);
      if (!indexLocation.isAbsolute()) {
        // Relative paths are resolved against the core's data directory.
        indexLocation = new File(core.getDataDir(), configuredDir);
      }
      dirDescription = indexLocation.getAbsolutePath();
      log.info("using spell directory: " + dirDescription);
      spellcheckerIndexDir = FSDirectory.getDirectory(indexLocation);
    }
    spellChecker = new SpellChecker(spellcheckerIndexDir);
  } catch (IOException e) {
    throw new RuntimeException("Cannot open SpellChecker index", e);
  }
}
/**
* Processes the following query string parameters: q, extendedResults, cmd rebuild,
* cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
*/
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
throws Exception {
SolrParams p = req.getParams();
String words = p.get("q");
String cmd = p.get("cmd");
if (cmd != null) {
cmd = cmd.trim();
if (cmd.equals("rebuild")) {
rebuild(req);
rsp.add("cmdExecuted","rebuild");
} else if (cmd.equals("reopen")) {
reopen();
rsp.add("cmdExecuted","reopen");
} else {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
}
}
// empty query string
if (null == words || "".equals(words.trim())) {
return;
}
IndexReader indexReader = null;
String suggestionField = null;
Float accuracy;
int numSug;
boolean onlyMorePopular;
boolean extendedResults;
try {
accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
spellChecker.setAccuracy(accuracy);
} catch (NumberFormatException e) {
throw new RuntimeException("Accuracy must be a valid positive float", e);
}
try {
numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
} catch (NumberFormatException e) {
throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
}
try {
onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
} catch (SolrException e) {
throw new RuntimeException("'Only more popular' must be a valid boolean", e);
}
try {
extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
} catch (SolrException e) {
throw new RuntimeException("'Extended results' must be a valid boolean", e);
}
// when searching for more popular, a non null index-reader and
// restricted-field are required
if (onlyMorePopular || extendedResults) {
indexReader = req.getSearcher().getReader();
suggestionField = termSourceField;
}
if (extendedResults) {
rsp.add("numDocs", indexReader.numDocs());
SimpleOrderedMap