// NOTE: web-scrape residue from a Maven artifact browsing site (not original source):
// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
// org.eobjects.analyzer.beans.stringpattern.DefaultTokenizer Maven / Gradle / Ivy
// The newest version!
/**
 * AnalyzerBeans
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.eobjects.analyzer.beans.stringpattern;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

import org.eobjects.analyzer.util.CharIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Default {@link Tokenizer} implementation. Splits a string into tokens of
 * type TEXT, NUMBER, WHITESPACE, DELIM and (optionally) MIXED and PREDEFINED,
 * driven by a {@link TokenizerConfiguration}.
 *
 * <p>Note: generic type parameters were restored here — the scraped copy had
 * them stripped (e.g. {@code List} instead of {@code List<Token>}), which
 * does not compile (raw {@code it.next()} returns {@code Object}).
 */
public class DefaultTokenizer implements Serializable, Tokenizer {

	private static final long serialVersionUID = 1L;

	private static final Logger logger = LoggerFactory.getLogger(DefaultTokenizer.class);

	private final TokenizerConfiguration _configuration;
	// true when the configuration holds predefined tokens AND the PREDEFINED
	// token type is enabled; decided once at construction time.
	private final boolean _predefinedTokens;

	public DefaultTokenizer() {
		this(new TokenizerConfiguration());
	}

	/**
	 * @param configuration
	 *            the tokenizer configuration to apply, never null
	 * @throws NullPointerException
	 *             if configuration is null
	 */
	public DefaultTokenizer(TokenizerConfiguration configuration) {
		if (configuration == null) {
			throw new NullPointerException("configuration argument cannot be null");
		}
		_configuration = configuration;

		final List<PredefinedToken> predefinedTokens = _configuration.getPredefinedTokens();
		_predefinedTokens = !predefinedTokens.isEmpty() && _configuration.isTokenTypeEnabled(TokenType.PREDEFINED);

		if (_predefinedTokens) {
			logger.debug("Predefined tokens are turned ON, using PredefinedTokenTokenizer");
		} else {
			logger.debug("Predefined tokens are turned OFF, using tokenizeInternal");
		}
	}

	/**
	 * Tokenizes a string.
	 *
	 * @param string
	 *            the string to tokenize; null yields a single {@link NullToken},
	 *            the empty string yields a single {@link BlankToken}
	 * @return the list of tokens representing the string
	 */
	public List<Token> tokenize(String string) {
		if (string == null) {
			return Arrays.<Token> asList(NullToken.INSTANCE);
		}

		if ("".equals(string)) {
			return Arrays.<Token> asList(BlankToken.INSTANCE);
		}

		List<Token> tokens;

		if (_predefinedTokens) {
			final List<PredefinedToken> predefinedTokens = _configuration.getPredefinedTokens();
			PredefinedTokenTokenizer tokenizer = new PredefinedTokenTokenizer(predefinedTokens);
			tokens = tokenizer.tokenize(string);
			// any UNDEFINED gaps between predefined matches are re-tokenized
			// with the regular rules and spliced in place.
			for (ListIterator<Token> it = tokens.listIterator(); it.hasNext();) {
				Token token = it.next();
				TokenType tokenType = token.getType();
				logger.debug("Next token type is: {}", tokenType);
				if (tokenType == TokenType.UNDEFINED) {
					List<SimpleToken> replacementTokens = tokenizeInternal(token.getString());
					boolean replace = true;
					if (replacementTokens.size() == 1) {
						if (token.equals(replacementTokens.get(0))) {
							// single identical token — keep the original
							replace = false;
						}
					}

					if (replace) {
						it.remove();
						for (SimpleToken replacementToken : replacementTokens) {
							it.add(replacementToken);
						}
					}
				}
			}
		} else {
			tokens = new ArrayList<Token>();
			tokens.addAll(tokenizeInternal(string));
		}

		return tokens;
	}

	// Applies the character-by-character tokenization and, when enabled,
	// collapses adjacent TEXT/NUMBER runs into MIXED tokens.
	private List<SimpleToken> tokenizeInternal(String string) {
		List<SimpleToken> tokens = preliminaryTokenize(string, _configuration);

		if (_configuration.isTokenTypeEnabled(TokenType.MIXED)) {
			tokens = flattenMixedTokens(tokens);
		}

		return tokens;
	}

	/**
	 * Performs the character-level tokenization pass: classifies each char as
	 * NUMBER, TEXT, WHITESPACE or DELIM, with look-ahead handling for
	 * thousand/decimal separators and the minus sign inside numbers.
	 */
	protected static List<SimpleToken> preliminaryTokenize(final String string, final TokenizerConfiguration configuration) {
		LinkedList<SimpleToken> result = new LinkedList<SimpleToken>();
		SimpleToken lastToken = null;

		CharIterator ci = new CharIterator(string);
		while (ci.hasNext()) {
			char c = ci.next();

			if (ci.is(configuration.getThousandsSeparator()) || ci.is(configuration.getDecimalSeparator())) {
				boolean treatAsSeparator = false;
				if (lastToken != null && lastToken.getType() == TokenType.NUMBER) {
					// there's a previous NUMBER token

					if (ci.hasNext()) {
						char next = ci.next();
						if (ci.isDigit()) {
							// the next token is also a NUMBER

							// now we're ready to assume that this is a
							// separator
							treatAsSeparator = true;
							lastToken = registerChar(result, lastToken, c, TokenType.NUMBER);
							lastToken = registerChar(result, lastToken, next, TokenType.NUMBER);
						} else {
							// not a digit — undo the look-ahead
							ci.previous();
						}
					}
				}

				if (!treatAsSeparator) {
					// the thousand separator is treated as a delim
					lastToken = registerChar(result, lastToken, c, TokenType.DELIM);
				}
			} else if (ci.is(configuration.getMinusSign())) {
				// the meaning of minus sign is dependent on the next token
				// (maybe it's the negative number operator)
				boolean treatAsMinus = false;

				if (lastToken == null || lastToken.getType() != TokenType.NUMBER) {
					if (ci.hasNext()) {
						char next = ci.next();
						if (ci.isDigit()) {
							// the minus sign was the number operator; start a
							// fresh NUMBER token (hence the null lastToken)
							treatAsMinus = true;
							lastToken = registerChar(result, null, c, TokenType.NUMBER);
							lastToken = registerChar(result, lastToken, next, TokenType.NUMBER);
						} else {
							// not a digit — undo the look-ahead
							ci.previous();
						}
					}
				}

				if (!treatAsMinus) {
					// the minus sign is treated as a delim
					lastToken = registerChar(result, lastToken, c, TokenType.DELIM);
				}
			} else if (ci.isDigit()) {
				lastToken = registerChar(result, lastToken, c, TokenType.NUMBER);
			} else if (ci.isLetter()) {
				if (configuration.isDiscriminateTextCase()) {
					if (lastToken != null && lastToken.getType() == TokenType.TEXT) {
						// if we need to discriminate on case then we should
						// check the previous token and make sure that we only
						// append to that if they share the same case.
						char charFromPreviousToken = lastToken.getString().charAt(0);
						if (Character.isUpperCase(charFromPreviousToken) != Character.isUpperCase(c)) {
							// case boundary — force a new TEXT token
							lastToken = null;
						}
					}
				}
				lastToken = registerChar(result, lastToken, c, TokenType.TEXT);
			} else if (ci.isWhitespace()) {
				lastToken = registerChar(result, lastToken, c, TokenType.WHITESPACE);
			} else {
				lastToken = registerChar(result, lastToken, c, TokenType.DELIM);
			}
		}

		return result;
	}

	/**
	 * Appends char {@code c} to {@code lastToken} when it continues a token of
	 * the same type; otherwise starts (and registers) a new token.
	 *
	 * @return the token that now ends the result list
	 */
	private static SimpleToken registerChar(List<SimpleToken> result, SimpleToken lastToken, char c, TokenType tokenType) {
		if (lastToken != null && lastToken.getType() == tokenType) {
			// fixed: the original format string lacked a {} placeholder, so
			// the tokenType argument was silently dropped by SLF4J
			logger.debug("Appending to previous {} token", tokenType);
			lastToken.appendChar(c);
		} else {
			logger.debug("Creating new {} token", tokenType);
			lastToken = new SimpleToken(tokenType, c);
			result.add(lastToken);
		}
		logger.debug("{} registered as {}", c, tokenType);
		return lastToken;
	}

	/**
	 * Merges adjacent TEXT/NUMBER/MIXED tokens of differing types into single
	 * MIXED tokens, in place, and returns the (same) list.
	 */
	public static List<SimpleToken> flattenMixedTokens(List<SimpleToken> tokens) {
		SimpleToken previousToken = null;
		for (ListIterator<SimpleToken> it = tokens.listIterator(); it.hasNext();) {
			SimpleToken token = it.next();
			if (previousToken == null) {
				previousToken = token;
			} else {
				boolean mix = false;

				TokenType previousType = previousToken.getType();
				TokenType currentType = token.getType();
				if (previousType != currentType) {
					if (isMixedCandidate(previousType) && isMixedCandidate(currentType)) {
						// absorb the current token into the previous one
						mix = true;
						previousToken.appendString(token.getString());
						previousToken.setType(TokenType.MIXED);
						it.remove();
					}
				}

				if (!mix) {
					previousToken = token;
				}
			}
		}
		return tokens;
	}

	// TEXT, NUMBER and MIXED tokens may participate in a MIXED merge.
	private static boolean isMixedCandidate(TokenType type) {
		return type == TokenType.MIXED || type == TokenType.NUMBER || type == TokenType.TEXT;
	}
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (web-scrape footer, not original source)