com.swabunga.spell.engine.SpellDictionaryDisk Maven / Gradle / Ivy
Show all versions of jazzy Show documentation
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Created by bgalbs on Jan 30, 2003 at 11:38:39 PM */
package com.swabunga.spell.engine;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * An implementation of <code>SpellDictionary</code> that doesn't cache any
 * words in memory. Avoids the huge footprint of
 * <code>SpellDictionaryHashMap</code> at the cost of relatively minor latency.
 * A future version of this class that implements some caching strategies might
 * be a good idea, if there's any demand for it.
 * <p>
 * This class makes use of the "classic" Java IO library (java.io). However, it
 * could probably benefit from the new IO APIs (java.nio) and it is anticipated
 * that a future version of this class, probably called
 * <code>SpellDictionaryDiskNIO</code>, will appear at some point.
 *
 * @author Ben Galbraith ([email protected])
 * @version 0.1
 * @since 0.5
 */
public class SpellDictionaryDisk extends SpellDictionaryASpell {
/* name of the sub-directory of base that holds the word list files */
private final static String DIRECTORY_WORDS = "words";
/* name of the sub-directory of base that holds the generated database files */
private final static String DIRECTORY_DB = "db";
/*
 * name of the file (inside the db directory) recording "[filename],[size]"
 * for each word file used by the last database build
 */
private final static String FILE_CONTENTS = "contents";
/*
 * name of the word database file (inside the db directory); getWords()
 * reads it as lines of "[code],[word]"
 */
private final static String FILE_DB = "words.db";
/*
 * name of the index file for the word database. NOTE(review): not referenced
 * in the part of the class reviewed here; described by the constructor
 * documentation as an index into words.db
 */
private final static String FILE_INDEX = "words.idx";
/* maximum number of words an index entry can represent */
private final static int INDEX_SIZE_MAX = 200;
/* base directory handed to the constructor */
private File base;
/* base/words: directory containing the word list files */
private File words;
/* base/db: directory containing the contents, database and index files */
private File db;
/*
 * in-memory index of the word database. NOTE(review): presumably populated
 * by loadIndex(); key/value types are not visible here - confirm
 */
private Map index;
/**
 * The flag indicating if the initial preparation or loading of the on disk
 * dictionary is complete.
 * <p>
 * Declared <code>volatile</code> because, when the constructor is called
 * with <code>block == false</code>, the flag is set to <code>true</code> by
 * the background build thread while other threads read it through
 * {@link #isReady()}. Without <code>volatile</code> a reader is not
 * guaranteed to ever observe the update.
 */
protected volatile boolean ready;
/*
 * used at time of creation of index to speed up determining the number of
 * words per index entry
 */
private List indexCodeCache = null;
/**
 * Construct a spell dictionary on disk. The spell dictionary is created
 * from words list(s) contained in file(s). A words list file is a file with
 * one word per line. Words list files are located in a
 * <code>base/words</code> directory where <code>base</code> is the parent
 * path of the <code>words</code> directory. The on disk spell dictionary is
 * created in the <code>base/db</code> directory and contains the files:
 * <ul>
 * <li><code>contents</code> lists the words files used for spelling.</li>
 * <li><code>words.db</code> the content of the words files organized as a
 * database of words.</li>
 * <li><code>words.idx</code> an index file to the <code>words.db</code>
 * file content.</li>
 * </ul>
 * The <code>contents</code> file has a list of <code>filename, size</code>
 * indicating the name and length of each file in the
 * <code>base/words</code> directory. If one of these files was changed,
 * added or deleted before the call to the constructor, the process of
 * producing new or updated <code>words.db</code> and <code>words.idx</code>
 * files is started again.
 * <p>
 * The spellchecking process then works on the <code>words.db</code>
 * and <code>words.idx</code> files.
 * <p>
 * NOTE: Do *not* create two instances of this class pointing to the same
 * <code>base</code> unless you are sure that a new dictionary does not have
 * to be created. In the future, some sort of external locking mechanism may
 * be created that handles this scenario gracefully.
 *
 * @param base the base directory in which <code>SpellDictionaryDisk</code>
 * can expect to find its necessary files.
 * @param phonetic the phonetic file used by the spellchecker.
 * @param block if a new word db needs to be created, there can be a
 * considerable delay before the constructor returns. If block is
 * true, this method will block while the db is created and
 * return when done. If block is false, this method will create a
 * thread to create the new dictionary and return immediately.
 * @throws java.io.FileNotFoundException indicates problems locating the
 * files on the system
 * @throws java.io.IOException indicates problems reading the files
 */
public SpellDictionaryDisk(File base, File phonetic, boolean block)
        throws FileNotFoundException, IOException {
    super(phonetic);
    this.ready = false;
    this.base = base;
    this.words = new File(base, DIRECTORY_WORDS);
    this.db = new File(base, DIRECTORY_DB);

    /* the base and words directories must be supplied by the caller */
    if (!this.base.exists()) {
        throw new FileNotFoundException("Couldn't find required path '"
                + this.base + "'");
    }
    if (!this.words.exists()) {
        throw new FileNotFoundException("Couldn't find required path '"
                + this.words + "'");
    }

    /*
     * the db directory is created on demand. mkdirs() signals failure only
     * through its return value, so check it here instead of failing later
     * with a less helpful error when the database files are written. The
     * isDirectory() re-check tolerates the directory having been created by
     * someone else in the meantime.
     */
    if (!this.db.exists() && !this.db.mkdirs() && !this.db.isDirectory()) {
        throw new IOException("Couldn't create required path '"
                + this.db + "'");
    }

    if (newDictionaryFiles()) {
        if (block) {
            /* rebuild synchronously; the caller gets a ready dictionary */
            buildNewDictionaryDatabase();
            loadIndex();
            ready = true;
        } else {
            /*
             * rebuild in the background; callers must poll isReady(). A
             * failure in the thread is only printed, and ready then stays
             * false.
             */
            Thread t = new Thread() {
                public void run() {
                    try {
                        buildNewDictionaryDatabase();
                        loadIndex();
                        ready = true;
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            };
            // FIXME find bugs points this as an error: the thread is
            // started from inside the constructor, so it may run before a
            // subclass constructor has finished initialising the object.
            t.start();
        }
    } else {
        /* word files unchanged since the last build: reuse the database */
        loadIndex();
        ready = true;
    }
}
/**
 * Builds the words database file and the contents file for the on disk
 * dictionary.
 */
protected void buildNewDictionaryDatabase() throws FileNotFoundException,
        IOException {
    /* combine all dictionary files into one sorted file */
    File sortedFile = buildSortedFile();
    try {
        /* create the db for the sorted file */
        buildCodeDb(sortedFile);
    } finally {
        /*
         * the sorted file is only an intermediate temp file: remove it even
         * when building the db fails, so failed builds do not pile up temp
         * files
         */
        sortedFile.delete();
    }
    /*
     * build contents file; done last so it is only written after the db
     * was built successfully
     */
    buildContentsFile();
}
/**
* Adds another word to the dictionary.
* This method is not yet implemented
* for this class.
*
* @param word The word to add.
*/
public void addWord(String word) {
    // Every call is refused: this method only throws and never touches
    // the dictionary files.
    final String reason = "addWord not yet implemented (sorry)";
    throw new UnsupportedOperationException(reason);
}
/**
* Returns a list of words that have the same phonetic code.
*
* @param code The phonetic code common to the list of words
* @return A list of words having the same phonetic code
*/
public List getWords(String code) {
    List<String> matches = new ArrayList<String>();
    /* posLen[0] = byte offset into the db file, posLen[1] = bytes to read */
    Integer[] posLen = getStartPosAndLen(code);
    if (posLen == null) {
        return matches;
    }
    try {
        int length = posLen[1];
        byte[] bytes = new byte[length];
        int filled = 0;
        InputStream input = new FileInputStream(new File(db, FILE_DB));
        try {
            /* skip() may skip fewer bytes than asked for, so loop */
            long remaining = posLen[0];
            while (remaining > 0) {
                long skipped = input.skip(remaining);
                if (skipped <= 0) {
                    throw new IOException("Couldn't seek to position "
                            + posLen[0] + " in '" + FILE_DB + "'");
                }
                remaining -= skipped;
            }
            /* read() may return fewer bytes than asked for, so loop */
            while (filled < length) {
                int count = input.read(bytes, filled, length - filled);
                if (count < 0) {
                    break; // end of file: use what was read
                }
                filled += count;
            }
        } finally {
            /* always release the file handle, also on failure */
            input.close();
        }
        /* decode only the bytes actually read (platform default charset) */
        String data = new String(bytes, 0, filled);
        String[] lines = split(data, "\n");
        for (int i = 0; i < lines.length; i++) {
            /* each line is expected to be "[code],[word]" */
            String[] s = split(lines[i], ",");
            /*
             * ignore incomplete lines instead of aborting the whole lookup,
             * and keep only the words stored under the requested code
             */
            if (s.length >= 2 && s[0].equals(code)) {
                matches.add(s[1]);
            }
        }
    } catch (Exception e) {
        /* best effort: report the problem and return what was collected */
        e.printStackTrace();
    }
    return matches;
}
/**
* Indicates if the initial preparation or loading of the on disk dictionary
* is complete.
*
* @return the indication that the dictionary initial setup is done.
*/
public boolean isReady() {
    // Snapshot of the flag that the constructor (or the build thread it
    // starts) sets to true once loadIndex() has completed.
    final boolean initialised = this.ready;
    return initialised;
}
/**
 * Determines whether the word files differ from those recorded by the last
 * database build, i.e. whether the database has to be (re)built.
 *
 * @return true if the contents file is missing entries, is unreadable as
 *         "[filename],[size]" lines, or does not match the current files
 *         in the words directory; false otherwise
 * @throws FileNotFoundException if the contents file cannot be opened
 * @throws IOException if the contents file cannot be read or the words
 *         directory cannot be listed
 */
private boolean newDictionaryFiles() throws FileNotFoundException,
        IOException {
    /*
     * load in contents file, which indicates the files and sizes of the
     * last db build
     */
    List contents = new ArrayList();
    File c = new File(db, FILE_CONTENTS);
    if (c.exists()) {
        BufferedReader reader = new BufferedReader(new FileReader(c));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // format of file should be [filename],[size]
                String[] s = split(line, ",");
                if (s.length < 2) {
                    // corrupt contents file: rebuilding is the safe answer
                    return true;
                }
                try {
                    // sizes come from File.length(), which is a long
                    contents.add(new FileSize(s[0],
                            Long.parseLong(s[1].trim())));
                } catch (NumberFormatException e) {
                    // corrupt size field: rebuilding is the safe answer
                    return true;
                }
            }
        } finally {
            reader.close();
        }
    }
    /* compare this to the actual directory */
    File[] wordFiles = words.listFiles();
    if (wordFiles == null) {
        // listFiles() returns null when the path is not a directory or
        // cannot be read
        throw new IOException("Couldn't list files in required path '"
                + words + "'");
    }
    if (contents.size() != wordFiles.length) {
        // if the size of the contents list and the number of word files are
        // different, it means we've definitely got to reindex
        return true;
    }
    // check and make sure that all the word files haven't changed on us
    for (int i = 0; i < wordFiles.length; i++) {
        FileSize fs = new FileSize(wordFiles[i].getName(),
                wordFiles[i].length());
        if (!contents.contains(fs)) {
            return true;
        }
    }
    return false;
}
/**
 * Reads every word file in the words directory and writes all distinct,
 * non-blank, trimmed words in sorted order to a new temporary file, one
 * word per line.
 *
 * @return the temporary file holding the sorted words; the caller is
 *         responsible for deleting it
 * @throws FileNotFoundException if a word file cannot be opened
 * @throws IOException if the words directory cannot be listed or a file
 *         cannot be read or written
 */
private File buildSortedFile() throws FileNotFoundException, IOException {
    File[] wordFiles = words.listFiles();
    if (wordFiles == null) {
        // listFiles() returns null when the path is not a directory or
        // cannot be read
        throw new IOException("Couldn't list files in required path '"
                + words + "'");
    }
    /*
     * read every single word into the list. eeek. if this causes problems,
     * we may wish to explore disk-based sorting or more efficient
     * memory-based storage
     */
    List<String> w = new ArrayList<String>();
    for (int i = 0; i < wordFiles.length; i++) {
        BufferedReader r = new BufferedReader(new FileReader(wordFiles[i]));
        try {
            String line;
            while ((line = r.readLine()) != null) {
                // trim before testing for emptiness so that lines made up
                // of whitespace only do not become empty "words"
                String word = line.trim();
                if (word.length() > 0) {
                    w.add(word);
                }
            }
        } finally {
            r.close();
        }
    }
    Collections.sort(w);
    // FIXME - error handling for running out of disk space would be nice.
    File file = File.createTempFile("jazzy", "sorted");
    boolean complete = false;
    try {
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        try {
            // the list is sorted, so duplicates are adjacent: skip a word
            // when it equals the one written just before it
            String prev = null;
            for (String word : w) {
                if (!word.equals(prev)) {
                    writer.write(word);
                    writer.newLine();
                }
                prev = word;
            }
        } finally {
            writer.close();
        }
        complete = true;
    } finally {
        if (!complete) {
            // do not leave a half-written temp file behind on failure
            file.delete();
        }
    }
    return file;
}
private void buildCodeDb(File sortedWords) throws FileNotFoundException,
IOException {
List codeList = new ArrayList();
BufferedReader reader = new BufferedReader(new FileReader(sortedWords));
String word;
while ((word = reader.readLine()) != null) {
codeList.add(new CodeWord(this.getCode(word), word));
}
reader.close();
Collections.sort(codeList);
List