com.swabunga.spell.SpellChecker Maven / Gradle / Ivy
Show all versions of jazzy Show documentation
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package com.swabunga.spell;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.swabunga.spell.engine.Configuration;
import com.swabunga.spell.engine.SpellDictionary;
import com.swabunga.spell.engine.SpellDictionaryHashMap;
import com.swabunga.spell.engine.Word;
import com.swabunga.spell.event.BasicSpellCheckEvent;
import com.swabunga.spell.event.SpellCheckEvent;
import com.swabunga.spell.event.SpellCheckListener;
import com.swabunga.spell.tokenizer.StringWordTokenizer;
import com.swabunga.spell.tokenizer.WordTokenizer;
import com.swabunga.spell.util.VectorUtility;
/**
* This is the main class for spell checking (using the new event based spell
* checking).
*
* By default, the class makes a user dictionary to accumulate added words.
* Since this user directory has no file assign to persist added words, they
* will be retained for the duration of the spell checker instance. If you set a
* user dictionary like {@link com.swabunga.spell.engine.SpellDictionaryHashMap
* SpellDictionaryHashMap} to persist the added word, the user dictionary will
* have the possibility to grow and be available across differents invocations
* of the spell checker.
*
* @author Jason Height ([email protected]) 19 June 2002
*/
public class SpellChecker {
/**
* Flag indicating that the Spell Check completed without any errors present
*/
public static final int SPELLCHECK_OK = -1;
/** Flag indicating that the Spell Check completed due to user cancellation */
public static final int SPELLCHECK_CANCEL = -2;
private List eventListeners = new ArrayList();
private List dictionaries = new ArrayList();
private SpellDictionary userdictionary;
private Configuration config = Configuration.getConfiguration();
/** This variable holds all of the words that are to be always ignored */
private List ignoredWords = new ArrayList();
private Map autoReplaceWords = new HashMap();
// added caching - bd
// For cached operation a separate user dictionary is required
private Map> cache;
private int threshold = 0;
private int cacheSize = 0;
/**
* Constructs the SpellChecker.
*/
public SpellChecker() {
try {
userdictionary = new SpellDictionaryHashMap();
} catch (IOException e) {
throw new RuntimeException(
"this exception should never happen because we are using null phonetic file");
}
}
/**
* Constructs the SpellChecker. The default threshold is used
*
* @param dictionary The dictionary used for looking up words.
*/
public SpellChecker(SpellDictionary dictionary) {
this();
addDictionary(dictionary);
}
/**
* Constructs the SpellChecker with a threshold
*
* @param dictionary the dictionary used for looking up words.
* @param threshold the cost value above which any suggestions are thrown
* away
*/
public SpellChecker(SpellDictionary dictionary, int threshold) {
this(dictionary);
config.setInteger(Configuration.SPELL_THRESHOLD, threshold);
}
/**
* Accumulates a dictionary at the end of the dictionaries list used for
* looking up words. Adding a dictionary give the flexibility to assign the
* base language dictionary, then a more technical, then...
*
* @param dictionary the dictionary to add at the end of the dictionary
* list.
*/
public void addDictionary(SpellDictionary dictionary) {
if (dictionary == null) {
throw new IllegalArgumentException("dictionary must be non-null");
}
this.dictionaries.add(dictionary);
}
/**
* Registers the user dictionary to which words are added.
*
* @param dictionary the dictionary to use when the user specify a new word
* to add.
*/
public void setUserDictionary(SpellDictionary dictionary) {
userdictionary = dictionary;
}
/**
* Supply the instance of the configuration holding the spell checking
* engine parameters.
*
* @return Current Configuration
*/
public Configuration getConfiguration() {
return config;
}
/**
* Adds a SpellCheckListener to the listeners list.
*
* @param listener The feature to be added to the SpellCheckListener
* attribute
*/
public void addSpellCheckListener(SpellCheckListener listener) {
eventListeners.add(listener);
}
/**
* Removes a SpellCheckListener from the listeners list.
*
* @param listener The listener to be removed from the listeners list.
*/
public void removeSpellCheckListener(SpellCheckListener listener) {
eventListeners.remove(listener);
}
/**
* Fires off a spell check event to the listeners.
*
* @param event The event that need to be processed by the spell checking
* system.
*/
protected void fireSpellCheckEvent(SpellCheckEvent event) {
for (int i = eventListeners.size() - 1; i >= 0; i--) {
((SpellCheckListener) eventListeners.get(i))
.spellingError(event);
}
}
/**
* This method clears the words that are currently being remembered as
* Ignore All
words and Replace All
words.
*/
public void reset() {
ignoredWords = new ArrayList();
autoReplaceWords = new HashMap();
}
/**
* Checks the text string.
*
* Returns the corrected string.
*
* @param text The text that need to be spelled checked
* @return The text after spell checking
* @deprecated use checkSpelling(WordTokenizer)
*/
public String checkString(String text) {
StringWordTokenizer tokens = new StringWordTokenizer(text);
checkSpelling(tokens);
return tokens.getContext();
}
/**
* Verifies if the word that is being spell checked contains elementAtat least a
* digit. Returns true if this word contains a digit.
*
* @param word The word to analyze for digit.
* @return true if the word contains at least a digit.
*/
private final static boolean isDigitWord(String word) {
for (int i = word.length() - 1; i >= 0; i--) {
if (Character.isDigit(word.charAt(i))) {
return true;
}
}
return false;
}
/**
* Verifies if the word that is being spell checked contains an Internet
* address. The method look for typical protocol or the habitual string in
* the word:
*
* - http://
* - ftp://
* - https://
* - ftps://
* - www.
*
*
* One limitation is that this method cannot currently recognize email
* addresses. Since the 'word' that is passed in, may in fact contain the
* rest of the document to be checked, it is not (yet!) a good idea to scan
* for the @ character.
*
* @param word The word to analyze for an Internet address.
* @return true if this word looks like an Internet address.
*/
public final static boolean isINETWord(String word) {
String lowerCaseWord = word.toLowerCase();
return lowerCaseWord.startsWith("http://")
|| lowerCaseWord.startsWith("www.")
|| lowerCaseWord.startsWith("ftp://")
|| lowerCaseWord.startsWith("https://")
|| lowerCaseWord.startsWith("ftps://");
}
/**
* Verifies if the word that is being spell checked contains all uppercases
* characters.
*
* @param word The word to analyze for uppercases characters
* @return true if this word contains all upper case characters
*/
private final static boolean isUpperCaseWord(String word) {
for (int i = word.length() - 1; i >= 0; i--) {
if (Character.isLowerCase(word.charAt(i))) {
return false;
}
}
return true;
}
/**
* Verifies if the word that is being spell checked contains lower and upper
* cased characters. Note that a phrase beginning with an upper cased
* character is not considered a mixed case word.
*
* @param word The word to analyze for mixed cases characters
* @param startsSentence True if this word is at the start of a sentence
* @return true if this word contains mixed case characters
*/
private final static boolean isMixedCaseWord(String word,
boolean startsSentence) {
int strLen = word.length();
boolean isUpper = Character.isUpperCase(word.charAt(0));
// Ignore the first character if this word starts the sentence and the
// first
// character was upper cased, since this is normal behaviour
if ((startsSentence) && isUpper && (strLen > 1))
isUpper = Character.isUpperCase(word.charAt(1));
if (isUpper) {
for (int i = word.length() - 1; i > 0; i--) {
if (Character.isLowerCase(word.charAt(i))) {
return true;
}
}
} else {
for (int i = word.length() - 1; i > 0; i--) {
if (Character.isUpperCase(word.charAt(i))) {
return true;
}
}
}
return false;
}
/**
* This method will fire the spell check event and then handle the event
* action that has been selected by the user.
*
* @param tokenizer Description of the Parameter
* @param event The event to handle
* @return Returns true if the event action is to cancel the current spell
* checking, false if the spell checking should continue
*/
protected boolean fireAndHandleEvent(WordTokenizer tokenizer,
SpellCheckEvent event) {
fireSpellCheckEvent(event);
String word = event.getInvalidWord();
// Work out what to do in response to the event.
switch (event.getAction()) {
case SpellCheckEvent.INITIAL:
break;
case SpellCheckEvent.IGNORE:
break;
case SpellCheckEvent.IGNOREALL:
ignoreAll(word);
break;
case SpellCheckEvent.REPLACE:
tokenizer.replaceWord(event.getReplaceWord());
break;
case SpellCheckEvent.REPLACEALL:
String replaceAllWord = event.getReplaceWord();
if (!autoReplaceWords.containsKey(word)) {
autoReplaceWords.put(word, replaceAllWord);
}
tokenizer.replaceWord(replaceAllWord);
break;
case SpellCheckEvent.ADDTODICT:
String addWord = event.getReplaceWord();
if (!addWord.equals(word))
tokenizer.replaceWord(addWord);
userdictionary.addWord(addWord);
break;
case SpellCheckEvent.CANCEL:
return true;
default:
throw new IllegalArgumentException("Unhandled case.");
}
return false;
}
/**
* Adds a word to the list of ignored words
*
* @param word The text of the word to ignore
*/
public void ignoreAll(String word) {
if (!ignoredWords.contains(word)) {
ignoredWords.add(word);
}
}
/**
* Adds a word to the user dictionary
*
* @param word The text of the word to add
*/
public void addToDictionary(String word) {
if (!userdictionary.isCorrect(word))
userdictionary.addWord(word);
}
/**
* Indicates if a word is in the list of ignored words
*
* @param word The text of the word check
*/
public boolean isIgnored(String word) {
return ignoredWords.contains(word);
}
/**
* Verifies if the word to analyze is contained in dictionaries. The order
* of dictionary lookup is:
*
* - The default user dictionary or the one set through
* {@link SpellChecker#setUserDictionary}
* - The dictionary specified at construction time, if any.
* - Any dictionary in the order they were added through
* {@link SpellChecker#addDictionary}
*
*
* @param word The word to verify that it's spelling is known.
* @return true if the word is in a dictionary.
*/
public boolean isCorrect(String word) {
if (userdictionary.isCorrect(word))
return true;
for (Iterator e = dictionaries.iterator(); e.hasNext();) {
SpellDictionary dictionary = e.next();
if (dictionary.isCorrect(word))
return true;
}
return false;
}
/**
* Produces a list of suggested word after looking for suggestions in
* various dictionaries. The order of dictionary lookup is:
*
* - The default user dictionary or the one set through
* {@link SpellChecker#setUserDictionary}
* - The dictionary specified at construction time, if any.
* - Any dictionary in the order they were added through
* {@link SpellChecker#addDictionary}
*
*
* @param word The word for which we want to gather suggestions
* @param threshold the cost value above which any suggestions are thrown
* away
* @return the list of words suggested
*/
public List getSuggestions(String word, int threshold) {
if (this.threshold != threshold && cache != null) {
this.threshold = threshold;
cache.clear();
}
List suggestions = null;
if (cache != null)
suggestions = cache.get(word);
if (suggestions == null) {
suggestions = new ArrayList(50);
for (Iterator e = dictionaries.iterator(); e.hasNext();) {
SpellDictionary dictionary = e.next();
if (dictionary != userdictionary)
VectorUtility.addAll(suggestions,
dictionary.getSuggestions(word, threshold), false);
}
if (cache != null && cache.size() < cacheSize)
cache.put(word, suggestions);
}
VectorUtility.addAll(suggestions,
userdictionary.getSuggestions(word, threshold), false);
//TODO: suggestions.trimToSize();
return suggestions;
}
/**
* Activates a cache with the maximum number of entries set to 300
*/
public void setCache() {
setCache(300);
}
/**
* Activates a cache with specified size
*
* @param size - max. number of cache entries (0 to disable chache)
*/
public void setCache(int size) {
cacheSize = size;
if (size == 0)
cache = null;
else
cache = new HashMap>((size + 2) / 3 * 4);
}
/**
* This method is called to check the spelling of the words that are
* returned by the WordTokenizer.
*
* For each invalid word the action listeners will be informed with a new
* SpellCheckEvent.
*
*
* @param tokenizer The media containing the text to analyze.
* @return Either SPELLCHECK_OK, SPELLCHECK_CANCEL or the number of errors
* found. The number of errors are those that are found BEFORE any
* corrections are made.
*/
public final int checkSpelling(WordTokenizer tokenizer) {
int errors = 0;
boolean terminated = false;
// Keep track of the previous word
// String previousWord = null;
while (tokenizer.hasMoreWords() && !terminated) {
String word = tokenizer.nextWord();
// Check the spelling of the word
if (!isCorrect(word)) {
if ((config.getBoolean(Configuration.SPELL_IGNOREMIXEDCASE) && isMixedCaseWord(
word, tokenizer.isNewSentence()))
|| (config
.getBoolean(Configuration.SPELL_IGNOREUPPERCASE) && isUpperCaseWord(word))
|| (config
.getBoolean(Configuration.SPELL_IGNOREDIGITWORDS) && isDigitWord(word))
|| (config
.getBoolean(Configuration.SPELL_IGNOREINTERNETADDRESSES) && isINETWord(word))) {
// Null event. Since we are ignoring this word due
// to one of the above cases.
} else {
// We cant ignore this misspelt word
// For this invalid word are we ignoring the misspelling?
if (!isIgnored(word)) {
errors++;
// Is this word being automagically replaced
if (autoReplaceWords.containsKey(word)) {
tokenizer.replaceWord((String) autoReplaceWords
.get(word));
} else {
// JMH Need to somehow capitalise the suggestions if
// ignoreSentenceCapitalisation is not set to true
// Fire the event.
List suggestions = getSuggestions(
word,
config.getInteger(Configuration.SPELL_THRESHOLD));
if (capitalizeSuggestions(word, tokenizer))
suggestions = makeSuggestionsCapitalized(suggestions);
SpellCheckEvent event = new BasicSpellCheckEvent(word, suggestions, tokenizer);
terminated = fireAndHandleEvent(tokenizer, event);
}
}
}
} else {
// This is a correctly spelt word. However perform some extra
// checks
/*
* JMH TBD //Check for multiple words if (!ignoreMultipleWords
* &&) { }
*/
// Check for capitalisation
if (isSupposedToBeCapitalized(word, tokenizer)) {
errors++;
StringBuilder buf = new StringBuilder(word);
buf.setCharAt(0, Character.toUpperCase(word.charAt(0)));
List suggestion = new ArrayList();
suggestion.add(new Word(buf.toString(), 0));
SpellCheckEvent event = new BasicSpellCheckEvent(word, suggestion, tokenizer);
terminated = fireAndHandleEvent(tokenizer, event);
}
}
}
if (terminated)
return SPELLCHECK_CANCEL;
else if (errors == 0)
return SPELLCHECK_OK;
else
return errors;
}
private List makeSuggestionsCapitalized(List suggestions) {
Iterator iterator = suggestions.iterator();
while (iterator.hasNext()) {
Word word = iterator.next();
String suggestion = word.getWord();
StringBuilder StringBuilder = new StringBuilder(suggestion);
StringBuilder.setCharAt(0,
Character.toUpperCase(suggestion.charAt(0)));
word.setWord(StringBuilder.toString());
}
return suggestions;
}
private boolean isSupposedToBeCapitalized(String word,
WordTokenizer wordTokenizer) {
boolean configCapitalize = !config
.getBoolean(Configuration.SPELL_IGNORESENTENCECAPITALIZATION);
return configCapitalize && wordTokenizer.isNewSentence()
&& Character.isLowerCase(word.charAt(0));
}
private boolean capitalizeSuggestions(String word,
WordTokenizer wordTokenizer) {
// if SPELL_IGNORESENTENCECAPITALIZATION and the initial word is
// capitalized, suggestions should also be capitalized
// if !SPELL_IGNORESENTENCECAPITALIZATION, capitalize suggestions only
// for the first word in a sentence
boolean configCapitalize = !config
.getBoolean(Configuration.SPELL_IGNORESENTENCECAPITALIZATION);
boolean uppercase = Character.isUpperCase(word.charAt(0));
return (configCapitalize && wordTokenizer.isNewSentence())
|| (!configCapitalize && uppercase);
}
}