
org.eobjects.analyzer.beans.stringpattern.DefaultTokenizer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of AnalyzerBeans-pattern-finder
Show all versions of AnalyzerBeans-pattern-finder
Pattern finder component for AnalyzerBeans
The newest version!
/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.beans.stringpattern;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import org.eobjects.analyzer.util.CharIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DefaultTokenizer implements Serializable, Tokenizer {
private static final long serialVersionUID = 1L;
private static final Logger logger = LoggerFactory.getLogger(DefaultTokenizer.class);
private final TokenizerConfiguration _configuration;
private final boolean _predefinedTokens;
public DefaultTokenizer() {
this(new TokenizerConfiguration());
}
public DefaultTokenizer(TokenizerConfiguration configuration) {
if (configuration == null) {
throw new NullPointerException("configuration argument cannot be null");
}
_configuration = configuration;
final List predefinedTokens = _configuration.getPredefinedTokens();
_predefinedTokens = !predefinedTokens.isEmpty() && _configuration.isTokenTypeEnabled(TokenType.PREDEFINED);
if (_predefinedTokens) {
logger.debug("Predefined tokens are turned ON, using PredefinedTokenTokenizer");
} else {
logger.debug("Predefined tokens are turned OFF, using tokenizeInternal");
}
}
public List tokenize(String string) {
if (string == null) {
return Arrays.asList(NullToken.INSTANCE);
}
if ("".equals(string)) {
return Arrays.asList(BlankToken.INSTANCE);
}
List tokens;
if (_predefinedTokens) {
final List predefinedTokens = _configuration.getPredefinedTokens();
PredefinedTokenTokenizer tokenizer = new PredefinedTokenTokenizer(predefinedTokens);
tokens = tokenizer.tokenize(string);
for (ListIterator it = tokens.listIterator(); it.hasNext();) {
Token token = it.next();
TokenType tokenType = token.getType();
logger.debug("Next token type is: {}", tokenType);
if (tokenType == TokenType.UNDEFINED) {
List replacementTokens = tokenizeInternal(token.getString());
boolean replace = true;
if (replacementTokens.size() == 1) {
if (token.equals(replacementTokens.get(0))) {
replace = false;
}
}
if (replace) {
it.remove();
for (SimpleToken replacementToken : replacementTokens) {
it.add(replacementToken);
}
}
}
}
} else {
tokens = new ArrayList();
tokens.addAll(tokenizeInternal(string));
}
return tokens;
}
private List tokenizeInternal(String string) {
List tokens = preliminaryTokenize(string, _configuration);
if (_configuration.isTokenTypeEnabled(TokenType.MIXED)) {
tokens = flattenMixedTokens(tokens);
}
return tokens;
}
protected static List preliminaryTokenize(final String string, final TokenizerConfiguration configuration) {
LinkedList result = new LinkedList();
SimpleToken lastToken = null;
CharIterator ci = new CharIterator(string);
while (ci.hasNext()) {
char c = ci.next();
if (ci.is(configuration.getThousandsSeparator()) || ci.is(configuration.getDecimalSeparator())) {
boolean treatAsSeparator = false;
if (lastToken != null && lastToken.getType() == TokenType.NUMBER) {
// there's a previous NUMBER token
if (ci.hasNext()) {
char next = ci.next();
if (ci.isDigit()) {
// the next token is also a NUMBER
// now we're ready to assume that this is a
// separator
treatAsSeparator = true;
lastToken = registerChar(result, lastToken, c, TokenType.NUMBER);
lastToken = registerChar(result, lastToken, next, TokenType.NUMBER);
} else {
ci.previous();
}
}
}
if (!treatAsSeparator) {
// the thousand separator is treated as a delim
lastToken = registerChar(result, lastToken, c, TokenType.DELIM);
}
} else if (ci.is(configuration.getMinusSign())) {
// the meaning of minus sign is dependent on the next token
// (maybe it's the negative number operator)
boolean treatAsMinus = false;
if (lastToken == null || lastToken.getType() != TokenType.NUMBER) {
if (ci.hasNext()) {
char next = ci.next();
if (ci.isDigit()) {
// the minus sign was the number operator
treatAsMinus = true;
lastToken = registerChar(result, null, c, TokenType.NUMBER);
lastToken = registerChar(result, lastToken, next, TokenType.NUMBER);
} else {
ci.previous();
}
}
}
if (!treatAsMinus) {
// the minus sign is treated as a delim
lastToken = registerChar(result, lastToken, c, TokenType.DELIM);
}
} else if (ci.isDigit()) {
lastToken = registerChar(result, lastToken, c, TokenType.NUMBER);
} else if (ci.isLetter()) {
if (configuration.isDiscriminateTextCase()) {
if (lastToken != null && lastToken.getType() == TokenType.TEXT) {
// if we need to discriminate on case then we should
// check the previous token and make sure that we only
// append to that if they share the same case.
char charFromPreviousToken = lastToken.getString().charAt(0);
if (Character.isUpperCase(charFromPreviousToken) != Character.isUpperCase(c)) {
lastToken = null;
}
}
}
lastToken = registerChar(result, lastToken, c, TokenType.TEXT);
} else if (ci.isWhitespace()) {
lastToken = registerChar(result, lastToken, c, TokenType.WHITESPACE);
} else {
lastToken = registerChar(result, lastToken, c, TokenType.DELIM);
}
}
return result;
}
private static SimpleToken registerChar(List result, SimpleToken lastToken, char c, TokenType tokenType) {
if (lastToken == null) {
logger.debug("Creating new {} token", tokenType);
lastToken = new SimpleToken(tokenType, c);
result.add(lastToken);
} else if (lastToken.getType() == tokenType) {
logger.debug("Appending to previous token", tokenType);
lastToken.appendChar(c);
} else {
logger.debug("Creating new {} token", tokenType);
lastToken = new SimpleToken(tokenType, c);
result.add(lastToken);
}
logger.debug("{} registered as {}", c, tokenType);
return lastToken;
}
public static List flattenMixedTokens(List tokens) {
SimpleToken previousToken = null;
for (ListIterator it = tokens.listIterator(); it.hasNext();) {
SimpleToken token = it.next();
if (previousToken == null) {
previousToken = token;
} else {
boolean mix = false;
TokenType previousType = previousToken.getType();
TokenType currentType = token.getType();
if (previousType != currentType) {
if (isMixedCandidate(previousType) && isMixedCandidate(currentType)) {
mix = true;
previousToken.appendString(token.getString());
previousToken.setType(TokenType.MIXED);
it.remove();
}
}
if (!mix) {
previousToken = token;
}
}
}
return tokens;
}
private static boolean isMixedCandidate(TokenType type) {
return type == TokenType.MIXED || type == TokenType.NUMBER || type == TokenType.TEXT;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy