// net.sf.sfac.string.AbstractCharIterator - Maven / Gradle / Ivy
// Show all versions of sfac-utils - Show documentation
/*-------------------------------------------------------------------------
Copyright 2009 Olivier Berlanger
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-------------------------------------------------------------------------*/
package net.sf.sfac.string;
/**
 * Abstract implementation of a {@code CharIterator} where the chars are found in a set of Strings.
 * The way the strings are stored/organized is defined by the concrete subclass (implementing
 * {@link #getFirstString()} and {@link #getNextString()}).
 * <p>
 * Rem: Subclasses have to call {@link #reset()} in their constructor to correctly initialize the
 * iterator state.
 *
 * @author Olivier Berlanger
 */
public abstract class AbstractCharIterator implements CharIterator {

    /** The currently iterated string (never null once a string has been loaded). */
    private String currentString;
    /** The length of the currently iterated string. */
    private int currentStringLen;
    /** The index of the next char in the currently iterated string. */
    private int currentIndex;
    /** true if there is no more available char to iterate. */
    private boolean finished;
    /** true if the last returned char was a space. */
    private boolean lastWhite;

    /**
     * Reset the iterator state so it points to its first char.
     * The first string is fetched again through {@link #getFirstString()}.
     */
    public void reset() {
        finished = false;
        // Behave as if a space had just been returned, so that leading separators are skipped.
        lastWhite = true;
        loadString(getFirstString());
    }

    /**
     * Get the first String available for iteration.
     *
     * @return the first String available for iteration (a null value is handled as an empty string).
     */
    protected abstract String getFirstString();

    /**
     * Get the next String available for iteration.
     * If no next string is available, implementing classes have to call {@link #setFinished()}
     * (then the return value is ignored).
     *
     * @return the next String available for iteration (a null value is handled as an empty string).
     */
    protected abstract String getNextString();

    /**
     * Notify this class that no more characters are available.
     */
    protected void setFinished() {
        finished = true;
    }

    /**
     * Tell whether the end of the iteration has been reached.
     *
     * @return true if no more characters are available.
     */
    protected boolean isFinished() {
        return finished;
    }

    /**
     * Get the next char of this iterator.
     * The returned chars are normalized as explained in the {@code CharIterator} interface javadoc
     * comment: diacritics are removed (through {@code StringUtils.removeDiacritic}), letters are
     * lower-cased, digits are returned as is, and any run of other chars is collapsed to one space.
     * When no more characters are available, a zero char ('\0') is returned.
     * <p>
     * Note: a '\0' obtained from the input (after diacritic removal) cannot be distinguished from
     * the end marker, so it also ends the iteration.
     *
     * @return the next char of this iterator or '\0' if no more characters are available.
     */
    public char nextNormalizedChar() {
        if (finished) {
            return '\0';
        }
        while (true) {
            char ch = StringUtils.removeDiacritic(nextChar());
            if (ch == '\0') {
                finished = true;
                return '\0';
            }
            if (Character.isLetter(ch)) {
                lastWhite = false;
                return Character.toLowerCase(ch);
            }
            if (Character.isDigit(ch)) {
                lastWhite = false;
                return ch;
            }
            if (!lastWhite) {
                // First separator after a letter/digit: report it as a single space.
                lastWhite = true;
                return ' ';
            }
            // Separator following another separator (or at the very start): skip it and read on.
        }
    }

    /**
     * Get the next non-normalised char.
     * If the end of the current string is reached, this method will query the next one with
     * {@link #getNextString()} until a char is found or the iterator is marked as finished.
     *
     * @return the next non-normalised char or '\0' if no more characters are available.
     */
    public char nextChar() {
        while (!finished) {
            if (currentIndex < currentStringLen) {
                char ch = currentString.charAt(currentIndex);
                currentIndex++;
                return ch;
            }
            // Current string exhausted: move to the next one (which may flag the end via setFinished()).
            loadString(getNextString());
        }
        return '\0';
    }

    /**
     * Get the content of this char iterator as a normalized string.
     * The iterator is {@link #reset()} first, so the whole content is always processed.
     * The result will be:
     * <ul>
     * <li>Accentuated chars replaced by the corresponding non-accentuated char.</li>
     * <li>All lowercase.</li>
     * <li>All non-letter or digit chars replaced by space.</li>
     * <li>Any suite of white chars replaced by a single space.</li>
     * <li>Trimmed.</li>
     * </ul>
     *
     * @return The content of this char iterator as a normalized string.
     */
    public String getNormalizedString() {
        reset();
        // The buffer is local to this call, so the unsynchronized StringBuilder is sufficient.
        StringBuilder sb = new StringBuilder();
        for (char ch = nextNormalizedChar(); ch != '\0'; ch = nextNormalizedChar()) {
            sb.append(ch);
        }
        // A trailing separator may have produced one final space: remove it.
        return sb.toString().trim();
    }

    /**
     * Make the given string the currently iterated one and rewind the read index to its start.
     *
     * @param str the string to iterate; null is handled as an empty string.
     */
    private void loadString(String str) {
        currentString = (str == null) ? "" : str;
        currentStringLen = currentString.length();
        currentIndex = 0;
    }
}