All downloads are free. The search and download features use the official Maven repository.

com.swabunga.spell.engine.SpellDictionaryDisk Maven / Gradle / Ivy

Go to download

A set of APIs that allow you to add spell checking functionality to Java applications easily. Jazzy is based on most of the algorithms that aspell uses, so the suggestions the two come up with are very similar. Note that this is not an official release from the Jazzy project; it is a release of 0.5.2 with enhancements and bug fixes as required by the RText SpellChecker project hosted on www.fifesoft.com. The SCM URLs in this POM indicate where the enhanced source code is hosted.

There is a newer version: 0.5.2-rtext-1.4.1-2
Show newest version
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
/* Created by bgalbs on Jan 30, 2003 at 11:38:39 PM */
package com.swabunga.spell.engine;

import java.io.*;
import java.util.*;

/**
 * An implementation of SpellDictionary that doesn't cache any words in memory. Avoids the huge
 * footprint of SpellDictionaryHashMap at the cost of relatively minor latency. A version of
 * this class that implements some caching strategies might be a good idea in the future, if
 * there's any demand for it.
 * <p>
 * This class makes use of the "classic" Java IO library (java.io). However, it could probably
 * benefit from the new IO APIs (java.nio) and it is anticipated that a future version of this
 * class, probably called SpellDictionaryDiskNIO, will appear at some point.
 *
 * @author Ben Galbraith
 * @version 0.1
 * @since 0.5
 */
public class SpellDictionaryDisk extends SpellDictionaryASpell {
  private final static String DIRECTORY_WORDS = "words";
  private final static String DIRECTORY_DB = "db";
  private final static String FILE_CONTENTS = "contents";
  private final static String FILE_DB = "words.db";
  private final static String FILE_INDEX = "words.idx";

  /* maximum number of words an index entry can represent */
  private final static int INDEX_SIZE_MAX = 200;

  private File base;
  private File words;
  private File db;
  private Map index;

  /**
   * The flag indicating if the initial preparation or loading of the on
   * disk dictionary is complete. Declared <code>volatile</code> because,
   * when the constructor is called with <code>block == false</code>, the
   * flag is set from the background thread that builds the dictionary.
   */
  protected volatile boolean ready;

  /* used at time of creation of index to speed up determining the number of words per index entry */
  private List indexCodeCache = null;

  /**
   * Construct a spell dictionary on disk.
   * The spell dictionary is created from words list(s) contained in file(s).
   * A words list file is a file with one word per line. Words list files are
   * located in a <code>base/words</code> directory where <code>base</code>
   * is the path to the dictionary. The on disk spell dictionary is created
   * in the <code>base/db</code> directory and contains files:
   * <ul>
   * <li><code>contents</code> lists the words files used for spelling.</li>
   * <li><code>words.db</code> the content of the words files organized as
   * a database of words.</li>
   * <li><code>words.idx</code> an index file to the <code>words.db</code>
   * file content.</li>
   * </ul>
   * The <code>contents</code> file has a list of
   * <code>filename, size</code> indicating the name and length of each file
   * in the <code>base/words</code> directory. If one of these files was
   * changed, added or deleted before the call to the constructor, the process
   * of producing new or updated <code>words.db</code> and
   * <code>words.idx</code> files is started again.
   * <p>
   * The spellchecking process then works upon the <code>words.db</code>
   * and <code>words.idx</code> files.
   * <p>
   * NOTE: Do *not* create two instances of this class pointing to the same <code>base</code> unless
   * you are sure that a new dictionary does not have to be created. In the future, some sort of
   * external locking mechanism may be created that handles this scenario gracefully.
   *
   * @param base the base directory in which <code>SpellDictionaryDisk</code> can expect to find
   * its necessary files.
   * @param phonetic the phonetic file used by the spellchecker.
   * @param block if a new word db needs to be created, there can be a considerable delay before
   * the constructor returns. If block is true, this method will block while the db is created
   * and return when done. If block is false, this method will create a thread to create the new
   * dictionary and return immediately; failures on that thread are only printed to the standard
   * error stream and <code>ready</code> then stays false.
   * @throws java.io.FileNotFoundException indicates problems locating the
   * files on the system
   * @throws java.io.IOException indicates problems reading the files, or that the
   * <code>db</code> directory is missing and could not be created
   */
  public SpellDictionaryDisk(File base, File phonetic, boolean block) throws FileNotFoundException, IOException {
    super(phonetic);
    this.ready = false;

    this.base = base;
    this.words = new File(base, DIRECTORY_WORDS);
    this.db = new File(base, DIRECTORY_DB);

    if (!this.base.exists()) throw new FileNotFoundException("Couldn't find required path '" + this.base + "'");
    if (!this.words.exists()) throw new FileNotFoundException("Couldn't find required path '" + this.words + "'");
    // Fail fast when the db directory cannot be created instead of failing later
    // (possibly on the background thread) while writing into it. The trailing
    // isDirectory() check tolerates another party creating it concurrently.
    if (!this.db.exists() && !this.db.mkdirs() && !this.db.isDirectory())
      throw new IOException("Couldn't create required path '" + this.db + "'");

    if (newDictionaryFiles()) {
      if (block) {
        buildNewDictionaryDatabase();
        loadIndex();
        ready = true;
      } else {
        Thread t = new Thread() {
          public void run() {
            try {
              buildNewDictionaryDatabase();
              loadIndex();
              ready = true;
            } catch (Exception e) {
              // nobody to report to on this thread; ready simply stays false
              e.printStackTrace();
            }
          }
        };
        t.start();
      }
    } else {
      loadIndex();
      ready = true;
    }
  }

  /**
   * Builds the file words database file and the contents file for the on
   * disk dictionary. The intermediate sorted file is deleted once the
   * database has been produced from it.
   *
   * @throws FileNotFoundException propagated from the build steps
   * @throws IOException propagated from the build steps
   */
  protected void buildNewDictionaryDatabase() throws FileNotFoundException, IOException {
    /* combine all dictionary files into one sorted file */
    File sortedFile = buildSortedFile();

    /* create the db for the sorted file */
    buildCodeDb(sortedFile);
    sortedFile.delete();

    /* build contents file */
    buildContentsFile();
  }

  /**
   * Adds another word to the dictionary. This method is not yet implemented
   * for this class.
   * @param word The word to add.
   * @throws UnsupportedOperationException always
   */
  public boolean addWord(String word) {
    throw new UnsupportedOperationException("addWord not yet implemented (sorry)");
  }

  /**
   * Returns a list of words that have the same phonetic code.
   * <p>
   * The byte range to read from the <code>words.db</code> file is looked up
   * through the index; the range is then parsed as <code>code,word</code>
   * lines and only the words whose code equals <code>code</code> are kept.
   * Any failure while reading is printed to the standard error stream and
   * the words collected so far (possibly none) are returned.
   *
   * @param code The phonetic code common to the list of words
   * @return A list of words having the same phonetic code
   */
  public List getWords(String code) {
    Vector matches = new Vector();

    int[] posLen = getStartPosAndLen(code);
    if (posLen != null) {
      try {
        byte[] bytes = new byte[posLen[1]];
        int filled = 0;
        RandomAccessFile input = new RandomAccessFile(new File(db, FILE_DB), "r");
        try {
          input.seek(posLen[0]);
          // a single read() may return fewer bytes than requested, so loop until
          // the whole range is in memory or the file ends
          while (filled < bytes.length) {
            int n = input.read(bytes, filled, bytes.length - filled);
            if (n < 0) break;
            filled += n;
          }
        } finally {
          input.close();
        }

        // platform default charset on purpose: the db file is written with
        // String.getBytes() using the same default
        String data = new String(bytes, 0, filled);
        String[] lines = split(data, "\n");
        for (int i = 0; i < lines.length; i++) {
          String[] s = split(lines[i], ",");
          // skip lines without a word part instead of aborting the whole lookup
          if (s.length > 1 && s[0].equals(code)) matches.addElement(s[1]);
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    return matches;
  }

  // robert: Faster getWords() implementation (buffering and re-using stream,
  // otherwise just micro-optimizations).
/* BufferedInputStream input; byte[] bytes; public List getWords(String code) { Vector words = new Vector(); int[] posLen = getStartPosAndLen(code); if (posLen != null) { try { //InputStream input = new FileInputStream(new File(db, FILE_DB)); if (input==null) { input = new BufferedInputStream(new FileInputStream(new File(db, FILE_DB))); input.mark(Integer.MAX_VALUE); } input.skip(posLen[0]); //byte[] bytes = new byte[posLen[1]]; if (bytes==null || bytes.length 3) thisCode = thisCode.substring(0, 3); thisCode = getIndexCode(thisCode, codeList); String toWrite = cw.getCode() + "," + cw.getWord() + "\n"; byte[] bytes = toWrite.getBytes(); if (currentCode == null) currentCode = thisCode; if (!currentCode.equals(thisCode)) { index.add(new Object[]{currentCode, new int[]{currentPosition, currentLength}}); currentPosition += currentLength; currentLength = bytes.length; currentCode = thisCode; } else { currentLength += bytes.length; } out.write(bytes); } out.close(); // Output the last iteration if (currentCode != null && currentPosition != 0 && currentLength != 0) index.add(new Object[]{currentCode, new int[]{currentPosition, currentLength}}); BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_INDEX))); for (int i = 0; i < index.size(); i++) { Object[] o = (Object[]) index.get(i); writer.write(o[0].toString()); writer.write(","); writer.write(String.valueOf(((int[]) o[1])[0])); writer.write(","); writer.write(String.valueOf(((int[]) o[1])[1])); writer.newLine(); } writer.close(); } private void buildContentsFile() throws IOException { File[] wordFiles = words.listFiles(); if (wordFiles.length > 0) { BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_CONTENTS))); for (int i = 0; i < wordFiles.length; i++) { writer.write(wordFiles[i].getName()); writer.write(","); writer.write(String.valueOf(wordFiles[i].length())); writer.newLine(); } writer.close(); } else { new File(db, FILE_CONTENTS).delete(); } } /** * Loads the index 
file from disk. The index file accelerates words lookup * into the dictionary db file. */ protected void loadIndex() throws IOException { index = new HashMap(); File idx = new File(db, FILE_INDEX); BufferedReader reader = new BufferedReader(new FileReader(idx)); String line; while ((line = reader.readLine()) != null) { String[] fields = split(line, ","); index.put(fields[0], new int[]{Integer.parseInt(fields[1]), Integer.parseInt(fields[2])}); } reader.close(); } private int[] getStartPosAndLen(String code) { while (code.length() > 0) { int[] posLen = (int[]) index.get(code); if (posLen == null) { code = code.substring(0, code.length() - 1); } else { return posLen; } } return null; } private String getIndexCode(String code, List codes) { if (indexCodeCache == null) indexCodeCache = new ArrayList(); if (code.length() <= 1) return code; for (int i = 0; i < indexCodeCache.size(); i++) { String c = (String) indexCodeCache.get(i); if (code.startsWith(c)) return c; } int foundSize = -1; boolean cacheable = false; for (int z = 1; z < code.length(); z++) { String thisCode = code.substring(0, z); int count = 0; for (int i = 0; i < codes.size();) { if (i == 0) { i = Collections.binarySearch(codes, new CodeWord(thisCode, "")); if (i < 0) i = 0; } CodeWord cw = (CodeWord) codes.get(i); if (cw.getCode().startsWith(thisCode)) { count++; if (count > INDEX_SIZE_MAX) break; } else if (cw.getCode().compareTo(thisCode) > 0) break; i++; } if (count <= INDEX_SIZE_MAX) { cacheable = true; foundSize = z; break; } } String newCode = (foundSize == -1) ? 
code : code.substring(0, foundSize); if (cacheable) indexCodeCache.add(newCode); return newCode; } private static String[] split(String input, String delimiter) { StringTokenizer st = new StringTokenizer(input, delimiter); int count = st.countTokens(); String[] out = new String[count]; for (int i = 0; i < count; i++) { out[i] = st.nextToken(); } return out; } private static class CodeWord implements Comparable { // robert: static private String code; private String word; public CodeWord(String code, String word) { this.code = code; this.word = word; } public String getCode() { return code; } public String getWord() { return word; } public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof CodeWord)) return false; final CodeWord codeWord = (CodeWord) o; if (!word.equals(codeWord.word)) return false; return true; } public int hashCode() { return word.hashCode(); } public int compareTo(Object o) { return code.compareTo(((CodeWord) o).getCode()); } } private static class FileSize { // robert: static private String filename; private long size; public FileSize(String filename, long size) { this.filename = filename; this.size = size; } public boolean equals(Object o) { if (this == o) return true; if (o instanceof FileSize) { FileSize fs = (FileSize)o; return size==fs.size && fs.equals(fs.filename); } return false; } public int hashCode() { int result; result = filename.hashCode(); result = (int) (29 * result + size); return result; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy