org.datacleaner.beans.stringpattern.TokenizerConfiguration Maven / Gradle / Ivy

Go to download
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans.stringpattern;

import java.io.Serializable;
import java.text.DecimalFormatSymbols;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;

/**
 * Mutable, serializable set of options describing which token types are
 * enabled and how tokens are discriminated from each other when matching.
 * 
 * Instances are plain holders of state: getters return the internal
 * collections directly (no defensive copies) and setters store the passed
 * references as-is. The class does no synchronization of its own.
 */
public class TokenizerConfiguration implements Serializable {

	private static final long serialVersionUID = 1L;

	private EnumSet<TokenType> _tokenTypes;
	private EnumMap<TokenType, Boolean> _discriminateTokenLength;

	private boolean _discriminateTextCase;
	private boolean _discriminateWhiteSpaces;
	private boolean _discriminateDecimalNumbers;
	private boolean _discriminateNegativeNumbers;
	private boolean _upperCaseExpandable;
	private boolean _lowerCaseExpandable;
	private Character _thousandsSeparator;
	private Character _decimalSeparator;
	private Character _minusSign;

	// NOTE(review): the element type of this list is not visible in this
	// file, so it is deliberately left as a raw type - confirm against the
	// callers before parameterizing it.
	@SuppressWarnings("rawtypes")
	private List _predefinedTokens = new LinkedList();

	/**
	 * Creates a configuration with all token types enabled (including
	 * {@link TokenType#MIXED}) and number symbols taken from the default
	 * locale.
	 */
	public TokenizerConfiguration() {
		this(true);
	}

	/**
	 * Creates a configuration whose decimal separator, thousands separator
	 * and minus sign are taken from
	 * {@link DecimalFormatSymbols#getInstance()}.
	 * 
	 * @param enableMixedTokens
	 *            whether the {@link TokenType#MIXED} token type is enabled
	 */
	public TokenizerConfiguration(boolean enableMixedTokens) {
		this(enableMixedTokens, DecimalFormatSymbols.getInstance());
	}

	/**
	 * Helper constructor so that the locale symbols are looked up once and
	 * all three characters come from the same symbols instance.
	 */
	private TokenizerConfiguration(boolean enableMixedTokens, DecimalFormatSymbols symbols) {
		this(enableMixedTokens, symbols.getDecimalSeparator(), symbols.getGroupingSeparator(), symbols.getMinusSign());
	}

	/**
	 * Creates a configuration with explicit number symbols and the default
	 * values for all other options: token length is discriminated only for
	 * WHITESPACE and DELIM tokens; text case, whitespace type and decimal
	 * numbers are discriminated; negative numbers are not; lower case text
	 * is expandable and upper case text is not.
	 * 
	 * @param enableMixed
	 *            whether the {@link TokenType#MIXED} token type is enabled
	 * @param decimalSeparator
	 *            character used for decimal separation in numbers
	 * @param thousandsSeparator
	 *            character used as thousands separator in numbers
	 * @param minusSign
	 *            character used as minus sign in numbers
	 */
	public TokenizerConfiguration(boolean enableMixed, Character decimalSeparator, Character thousandsSeparator,
			Character minusSign) {
		_tokenTypes = EnumSet.allOf(TokenType.class);
		if (!enableMixed) {
			_tokenTypes.remove(TokenType.MIXED);
		}

		// set default values
		_discriminateTokenLength = new EnumMap<TokenType, Boolean>(TokenType.class);
		_discriminateTokenLength.put(TokenType.TEXT, Boolean.FALSE);
		_discriminateTokenLength.put(TokenType.NUMBER, Boolean.FALSE);
		_discriminateTokenLength.put(TokenType.MIXED, Boolean.FALSE);
		_discriminateTokenLength.put(TokenType.PREDEFINED, Boolean.FALSE);
		_discriminateTokenLength.put(TokenType.WHITESPACE, Boolean.TRUE);
		_discriminateTokenLength.put(TokenType.DELIM, Boolean.TRUE);

		_discriminateTextCase = true;
		_discriminateWhiteSpaces = true;
		_discriminateDecimalNumbers = true;
		_discriminateNegativeNumbers = false;

		_upperCaseExpandable = false;
		_lowerCaseExpandable = true;

		_decimalSeparator = decimalSeparator;
		_thousandsSeparator = thousandsSeparator;
		_minusSign = minusSign;
	}

	/**
	 * Sets which token types are enabled. The set is stored by reference,
	 * not copied.
	 * 
	 * @param tokenTypes
	 *            the enabled token types
	 */
	public void setTokenTypes(EnumSet<TokenType> tokenTypes) {
		_tokenTypes = tokenTypes;
	}

	/**
	 * Which token types are enabled.
	 * 
	 * @return the internal (live) set of enabled token types
	 */
	public EnumSet<TokenType> getTokenTypes() {
		return _tokenTypes;
	}

	/**
	 * Should tokens be discriminated (when matching) based on length. For
	 * example, if "hello" and "hi" should be matched, then length
	 * discrimination should be false. If only "hello" and "world", but not "hi"
	 * should be matched then length discrimination should be true.
	 * 
	 * @return the internal (live) map from token type to its length
	 *         discrimination flag
	 */
	public EnumMap<TokenType, Boolean> getDiscriminateTokenLength() {
		return _discriminateTokenLength;
	}

	/**
	 * Should tokens of the given type be discriminated (when matching) based
	 * on length. For example, if "hello" and "hi" should be matched, then
	 * length discrimination should be false. If only "hello" and "world", but
	 * not "hi" should be matched then length discrimination should be true.
	 * 
	 * @param tokenType
	 *            the token type to look up
	 * @return the configured flag, or false if nothing is configured for the
	 *         token type (or no map is set at all)
	 */
	public boolean isDistriminateTokenLength(TokenType tokenType) {
		// the map can be replaced by null through the map setter
		if (_discriminateTokenLength == null) {
			return false;
		}
		Boolean discriminateTokenLength = _discriminateTokenLength.get(tokenType);
		if (discriminateTokenLength == null) {
			return false;
		}
		return discriminateTokenLength.booleanValue();
	}

	/**
	 * Sets which tokens should be discriminated (when matching) based on
	 * length. For example, if "hello" and "hi" should be matched, then length
	 * discrimination should be false. If only "hello" and "world", but not "hi"
	 * should be matched then length discrimination should be true.
	 * 
	 * @param discriminateTokenLength
	 *            map from token type to its flag; stored by reference
	 */
	public void setDistriminateTokenLength(EnumMap<TokenType, Boolean> discriminateTokenLength) {
		_discriminateTokenLength = discriminateTokenLength;
	}

	/**
	 * Sets whether tokens of a single type should be discriminated (when
	 * matching) based on length. For example, if "hello" and "hi" should be
	 * matched, then length discrimination should be false. If only "hello"
	 * and "world", but not "hi" should be matched then length discrimination
	 * should be true.
	 * 
	 * @param tokenType
	 *            the token type to configure
	 * @param discriminateTokenLength
	 *            the new flag for that token type
	 */
	public void setDistriminateTokenLength(TokenType tokenType, boolean discriminateTokenLength) {
		// recreate the map if it was cleared through the map setter
		if (_discriminateTokenLength == null) {
			_discriminateTokenLength = new EnumMap<TokenType, Boolean>(TokenType.class);
		}
		_discriminateTokenLength.put(tokenType, Boolean.valueOf(discriminateTokenLength));
	}

	/**
	 * Discriminate the case of characters in TEXT tokens
	 */
	public boolean isDiscriminateTextCase() {
		return _discriminateTextCase;
	}

	/**
	 * Sets whether to discriminate the case of characters in TEXT tokens
	 */
	public void setDiscriminateTextCase(boolean discriminateTextCase) {
		_discriminateTextCase = discriminateTextCase;
	}

	/**
	 * Discriminate the type of whitespaces (space, tab etc.)
	 */
	public boolean isDiscriminateWhiteSpaces() {
		return _discriminateWhiteSpaces;
	}

	/**
	 * Sets whether to discriminate the type of whitespaces (space, tab etc.)
	 */
	public void setDiscriminateWhiteSpaces(boolean discriminateWhiteSpaces) {
		_discriminateWhiteSpaces = discriminateWhiteSpaces;
	}

	/**
	 * The predefined tokens of this configuration.
	 * 
	 * @return the internal (live) list; initially an empty list
	 */
	@SuppressWarnings("rawtypes")
	public List getPredefinedTokens() {
		return _predefinedTokens;
	}

	/**
	 * Replaces the predefined tokens of this configuration. The list is
	 * stored by reference, not copied.
	 * 
	 * @param predefinedTokens
	 *            the new list of predefined tokens
	 */
	@SuppressWarnings("rawtypes")
	public void setPredefinedTokens(List predefinedTokens) {
		_predefinedTokens = predefinedTokens;
	}

	/**
	 * Discriminate decimal numbers from integers when matching
	 */
	public boolean isDiscriminateDecimalNumbers() {
		return _discriminateDecimalNumbers;
	}

	/**
	 * Sets whether to discriminate decimal numbers from integers when matching
	 */
	public void setDiscriminateDecimalNumbers(boolean discriminateDecimalNumbers) {
		_discriminateDecimalNumbers = discriminateDecimalNumbers;
	}

	/**
	 * Character to use for thousands separator in numbers (typically ',')
	 */
	public Character getThousandsSeparator() {
		return _thousandsSeparator;
	}

	/**
	 * Sets the character to use for thousands separator in numbers (typically
	 * ',')
	 */
	public void setThousandsSeparator(Character thousandSeparator) {
		_thousandsSeparator = thousandSeparator;
	}

	/**
	 * Character to use for decimal separation in numbers (typically '.')
	 */
	public Character getDecimalSeparator() {
		return _decimalSeparator;
	}

	/**
	 * Sets the character to use for decimal separation in numbers (typically
	 * '.')
	 */
	public void setDecimalSeparator(Character decimalSeparator) {
		_decimalSeparator = decimalSeparator;
	}

	/**
	 * Character to use for minus sign in numbers (typically '-')
	 */
	public Character getMinusSign() {
		return _minusSign;
	}

	/**
	 * Sets the character to use for minus sign in numbers (typically '-')
	 */
	public void setMinusSign(Character minusSign) {
		_minusSign = minusSign;
	}

	/**
	 * Discriminate negative numbers from positive numbers
	 */
	public boolean isDiscriminateNegativeNumbers() {
		return _discriminateNegativeNumbers;
	}

	/**
	 * Sets whether to discriminate negative numbers from positive numbers
	 */
	public void setDiscriminateNegativeNumbers(boolean discriminateNegativeNumbers) {
		_discriminateNegativeNumbers = discriminateNegativeNumbers;
	}

	/**
	 * Are upper case TEXT tokens expandable (i.e. "ABC" and "ABCD" are
	 * treated as a single "AAAA" pattern) or not
	 */
	public boolean isUpperCaseExpandable() {
		return _upperCaseExpandable;
	}

	/**
	 * Sets whether or not to make upper case TEXT tokens expandable
	 * 
	 * @param upperCaseExpandable
	 *            true to make upper case TEXT tokens expandable
	 */
	public void setUpperCaseExpandable(boolean upperCaseExpandable) {
		_upperCaseExpandable = upperCaseExpandable;
	}

	/**
	 * Are lower case TEXT tokens expandable (i.e. "hello" and "hi" are
	 * treated as a single "aaaaa" pattern) or not
	 */
	public boolean isLowerCaseExpandable() {
		return _lowerCaseExpandable;
	}

	/**
	 * Sets whether or not to make lower case TEXT tokens expandable
	 * 
	 * @param lowerCaseExpandable
	 *            true to make lower case TEXT tokens expandable
	 */
	public void setLowerCaseExpandable(boolean lowerCaseExpandable) {
		_lowerCaseExpandable = lowerCaseExpandable;
	}

	/**
	 * Checks whether a token type is among the enabled token types.
	 * 
	 * @param tokenType
	 *            the token type to check
	 * @return true if the token type is enabled; false otherwise, including
	 *         when no set of token types is configured
	 */
	public boolean isTokenTypeEnabled(TokenType tokenType) {
		// the set can be replaced by null through setTokenTypes
		return _tokenTypes != null && _tokenTypes.contains(tokenType);
	}

}