
com.inet.jortho.Tokenizer Maven / Gradle / Ivy
/*
* JOrtho
*
* Copyright (C) 2005-2008 by i-net software
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA.
*
* Created on 07.11.2005
*/
package com.inet.jortho;
import java.text.BreakIterator;
import java.util.Locale;
import javax.swing.text.AbstractDocument;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.JTextComponent;
import javax.swing.text.Utilities;
/**
* Break the text and words and search for misspelling.
* @author Volker Berlin
*/
class Tokenizer {
private final Document doc;
private final SpellCheckerOptions options;
private final LanguageBundle bundle;
/** start offset of current paragraph */
private int paragraphOffset;
/** end offset of current paragraph */
private int endOffset;
private String phrase;
private final Dictionary dictionary;
private BreakIterator sentences;
private int startSentence, endSentence, startWord, endWord;
private String sentence;
private BreakIterator words;
private int wordOffset;
private boolean isFirstWordInSentence;
/**
* Create a tokenizer for the completely text document.
*/
Tokenizer( JTextComponent jText, Dictionary dictionary, Locale locale, SpellCheckerOptions options ) {
this( jText, dictionary, locale, 0, jText.getDocument().getLength(), options );
}
/**
* Create a Tokenizer for the current paragraph
* @param jText the checking JTextComponent
* @param dictionary the used Dictionary
* @param locale the used Locale, is needed for the word and sentence breaker
* @param offset the current offset.
*/
Tokenizer( JTextComponent jText, Dictionary dictionary, Locale locale, int offset, SpellCheckerOptions options ) {
this( jText, dictionary, locale, Utilities.getParagraphElement( jText, offset ).getStartOffset(),
Utilities.getParagraphElement( jText, offset ).getEndOffset(), options );
}
/**
* Create a tokenizer for the selected range.
*/
Tokenizer( JTextComponent jText, Dictionary dictionary, Locale locale, int startOffset, int endOffset, SpellCheckerOptions options ) {
this.dictionary = dictionary;
doc = jText.getDocument();
bundle = LanguageBundle.get( locale );
this.options = options == null ? SpellChecker.getOptions() : options;
sentences = BreakIterator.getSentenceInstance( locale );
words = BreakIterator.getWordInstance( locale );
paragraphOffset = startOffset;
this.endOffset = endOffset;
//loadSentences();
setSentencesText();
endSentence = sentences.first();
endWord = BreakIterator.DONE;
}
/**
* Get the next misspelling word. If not found then it return null.
*/
String nextInvalidWord() {
isFirstWordInSentence = false;
while( true ) {
if( endWord == BreakIterator.DONE ) {
startSentence = endSentence;
endSentence = sentences.next();
if( endSentence == BreakIterator.DONE ) {
if(!nextParagraph()){
return null;
}
}else{
nextSentence();
}
}
while( endWord != BreakIterator.DONE ) {
String word = sentence.substring( startWord, endWord ).trim();
wordOffset = startSentence + startWord;
startWord = endWord;
endWord = words.next();
//only words with 2 or more characters are checked
if( word.length() > 1 && Character.isLetter( word.charAt( 0 ) )){
boolean exist = bundle.existInDictionary( word, dictionary, options, isFirstWordInSentence );
if( !exist && options.isIgnoreAllCapsWords() && Utils.isAllCapitalized( word ) ){
exist = true;
}
if( !exist && options.isIgnoreWordsWithNumbers() && Utils.isIncludeNumbers( word ) ){
exist = true;
}
if( !exist && startWord + 1 == endWord ) {
char nextChar = sentence.charAt( startWord );
switch( nextChar ) {
case '.':
case '\'':
exist = bundle.existInDictionary( word + nextChar, dictionary, options, isFirstWordInSentence );
}
}
if( !exist && !isWebAddress( word )) {
return word;
}
isFirstWordInSentence = false;
} else {
// Handle a colon like a point. The next word can start with upper case.
if( ":".equals( word ) || "(".equals( word ) ){
isFirstWordInSentence = true;
}
}
}
}
}
/**
* Check if the word is a web address. This means a email address or web page address.
*
* @param word
* the word that should be check. It can not be null and can not include any whitespace.
* @return true if it is a web address.
*/
private boolean isWebAddress( String word ){
if( startWord >= sentence.length() ){
return false;
}
if( sentence.charAt( startWord ) == '@' ){
word += '@';
startWord = endWord;
endWord = words.next();
String domaine = sentence.substring( startWord, endWord ).trim();
if( domaine.length()>3 && domaine.indexOf( '.' ) > 0 ){
startWord = endWord;
endWord = words.next();
return true;
}
return false;
}
if( startWord + 3 < sentence.length() && sentence.charAt( startWord ) == ':' && sentence.charAt( startWord + 1 ) == '/' && sentence.charAt( startWord + 2 ) == '/' ) {
while(startWord < endWord){
String next = sentence.substring( startWord, endWord ).trim();
if( next.length() > 0 ){
word += next;
startWord = endWord;
endWord = words.next();
} else {
break;
}
}
return true;
}
return false;
}
/**
* Was the last invalid word the first word in a sentence.
*
* @return true if it was the first word.
*/
boolean isFirstWordInSentence(){
return isFirstWordInSentence;
}
/**
* Initialize the variables for the next paragraph.
* @return true, if there is a next paragraph
*/
private boolean nextParagraph(){
if(doc instanceof AbstractDocument){
paragraphOffset = ((AbstractDocument)doc).getParagraphElement( paragraphOffset ).getEndOffset();
if(paragraphOffset >= endOffset){
return false;
}
}else{
return false;
}
loadSentences();
return true;
}
/**
* Loads the sentences of the current paragraph.
*/
private void loadSentences(){
setSentencesText();
startSentence = sentences.first();
endSentence = sentences.next();
nextSentence();
}
/**
* Call sentences.setText( String ) based on the current value of paragraphOffset.
*/
private void setSentencesText(){
int end = endOffset;
if(doc instanceof AbstractDocument){
end = ((AbstractDocument)doc).getParagraphElement( paragraphOffset ).getEndOffset();
}
try {
phrase = doc.getText( paragraphOffset, end-paragraphOffset );
phrase = Utils.replaceUnicodeQuotation( phrase );
} catch( BadLocationException e ) {
SpellChecker.getMessageHandler().handleException( e );
}
sentences.setText( phrase );
}
/**
* Load the next Sentence in the word breaker.
*/
private void nextSentence() {
sentence = phrase.substring( startSentence, endSentence );
words.setText( sentence );
startWord = words.first();
endWord = words.next();
isFirstWordInSentence = true;
}
/**
* Get start offset of the last misspelling in the JTextComponent.
*/
int getWordOffset() {
return paragraphOffset + wordOffset;
}
/**
* Update the text after a word was replaced. The changes in the text should be only after the current word offset.
*/
void updatePhrase() {
endOffset = doc.getLength();
setSentencesText();
endSentence = sentences.following( startSentence );
sentence = phrase.substring( startSentence, endSentence );
words.setText( sentence );
startWord = words.following( wordOffset - startSentence );
endWord = words.next();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy