// org.eobjects.analyzer.beans.CharacterSetDistributionAnalyzerColumnDelegate (Maven / Gradle / Ivy)
//
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of AnalyzerBeans-i18n
// Internationalization components for transliteration and character set identification
// The newest version!
/**
 * AnalyzerBeans
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.eobjects.analyzer.beans;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.storage.RowAnnotation;
import org.eobjects.analyzer.storage.RowAnnotationFactory;
import org.eobjects.analyzer.util.CharIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.UnicodeSet;

/**
 * Performs character set distribution analysis for a single column. Used by the
 * {@link CharacterSetDistributionAnalyzer} for splitting up work.
 * 
 * 
 */
final class CharacterSetDistributionAnalyzerColumnDelegate {

	private static final Logger logger = LoggerFactory.getLogger(CharacterSetDistributionAnalyzerColumnDelegate.class);
	private final RowAnnotationFactory _annotationFactory;
	private final Map _unicodeSets;
	private final Map _annotations;

	public CharacterSetDistributionAnalyzerColumnDelegate(RowAnnotationFactory annotationFactory,
			Map unicodeSets) {
		_annotationFactory = annotationFactory;
		_unicodeSets = unicodeSets;
		_annotations = new HashMap();
		for (String name : unicodeSets.keySet()) {
			_annotations.put(name, _annotationFactory.createAnnotation());
		}
	}

	public RowAnnotation getAnnotation(String unicodeSetName) {
		return _annotations.get(unicodeSetName);
	}

	public synchronized void run(String value, InputRow row, int distinctCount) {
		final List> unicodeSetsRemaining = new ArrayList>(
				_unicodeSets.entrySet());
		CharIterator charIterator = new CharIterator(value);
		while (charIterator.hasNext()) {
			Character c = charIterator.next();
			if (charIterator.isWhitespace() || charIterator.isDigit()) {
				logger.debug("Skipping whitespace/digit char: {}", c);
			} else {

				Iterator> it = unicodeSetsRemaining.iterator();
				while (it.hasNext()) {
					Entry unicodeSet = it.next();
					if (unicodeSet.getValue().contains(c)) {
						String name = unicodeSet.getKey();
						RowAnnotation annotation = _annotations.get(name);
						_annotationFactory.annotate(row, distinctCount, annotation);

						// remove this unicode set from the remaining checks on
						// this value.
						it.remove();
					}
				}
			}
		}
	}

}