// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.datacleaner.beans.StringAnalyzer (Maven / Gradle / Ivy)

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans;

import java.util.HashMap;
import java.util.Map;

import javax.inject.Named;

import org.datacleaner.api.Analyzer;
import org.datacleaner.api.ColumnProperty;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Provided;
import org.datacleaner.result.AnnotatedRowsResult;
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
import org.datacleaner.storage.InMemoryRowAnnotationFactory;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.util.AverageBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An analyzer for various typical String measures.
 * 
 * 
 */
@Named("String analyzer")
@Description("The String analyzer is used to collect a variety of typical metrics on string values.\nMetrics include statistics on character case, words, diacritics, white-spaces and more...")
@Concurrent(true)
public class StringAnalyzer implements Analyzer {

	public static final String DIMENSION_MEASURES = "Measures";
	public static final String DIMENSION_COLUMN = "Column";
	
	public static final String MEASURE_MIN_WORDS = "Min words";
	public static final String MEASURE_MAX_WORDS = "Max words";
	public static final String MEASURE_WORD_COUNT = "Word count";
	public static final String MEASURE_NON_LETTER_CHARS = "Non-letter chars";
	public static final String MEASURE_DIACRITIC_CHARS = "Diacritic chars";
	public static final String MEASURE_DIGIT_CHARS = "Digit chars";
	public static final String MEASURE_LOWERCASE_CHARS = "Lowercase chars";
	public static final String MEASURE_UPPERCASE_CHARS_EXCL_FIRST_LETTERS = "Uppercase chars (excl. first letters)";
	public static final String MEASURE_UPPERCASE_CHARS = "Uppercase chars";
	public static final String MEASURE_AVG_WHITE_SPACES = "Avg white spaces";
	public static final String MEASURE_MIN_WHITE_SPACES = "Min white spaces";
	public static final String MEASURE_MAX_WHITE_SPACES = "Max white spaces";
	public static final String MEASURE_AVG_CHARS = "Avg chars";
	public static final String MEASURE_MIN_CHARS = "Min chars";
	public static final String MEASURE_MAX_CHARS = "Max chars";
	public static final String MEASURE_TOTAL_CHAR_COUNT = "Total char count";
	public static final String MEASURE_ENTIRELY_LOWERCASE_COUNT = "Entirely lowercase count";
	public static final String MEASURE_ENTIRELY_UPPERCASE_COUNT = "Entirely uppercase count";
	public static final String MEASURE_BLANK_COUNT = "Blank count";
	public static final String MEASURE_NULL_COUNT = "Null count";
	public static final String MEASURE_ROW_COUNT = "Row count";

	private static final Logger logger = LoggerFactory.getLogger(StringAnalyzer.class);

	private final Map, StringAnalyzerColumnDelegate> _columnDelegates = new HashMap, StringAnalyzerColumnDelegate>();

	@Configured
	@ColumnProperty(escalateToMultipleJobs=true)
	InputColumn[] _columns;

	@Provided
	RowAnnotationFactory _annotationFactory;

	public StringAnalyzer() {
	}

	@SafeVarargs
	public StringAnalyzer(InputColumn... columns) {
		_columns = columns;
		_annotationFactory = new InMemoryRowAnnotationFactory();
		init();
	}

	@Initialize
	public void init() {
		for (InputColumn column : _columns) {
			_columnDelegates.put(column, new StringAnalyzerColumnDelegate(_annotationFactory));
		}
	}

	@Override
	public void run(InputRow row, int distinctCount) {
		for (InputColumn column : _columns) {
			String value = row.getValue(column);

			StringAnalyzerColumnDelegate delegate = _columnDelegates.get(column);
			delegate.run(row, value, distinctCount);
		}
	}

	@Override
	public StringAnalyzerResult getResult() {
		logger.info("getResult()");
		CrosstabDimension measureDimension = new CrosstabDimension(DIMENSION_MEASURES);
		measureDimension.addCategory(MEASURE_ROW_COUNT);
		measureDimension.addCategory(MEASURE_NULL_COUNT);
		measureDimension.addCategory(MEASURE_BLANK_COUNT);
		measureDimension.addCategory(MEASURE_ENTIRELY_UPPERCASE_COUNT);
		measureDimension.addCategory(MEASURE_ENTIRELY_LOWERCASE_COUNT);
		measureDimension.addCategory(MEASURE_TOTAL_CHAR_COUNT);
		measureDimension.addCategory(MEASURE_MAX_CHARS);
		measureDimension.addCategory(MEASURE_MIN_CHARS);
		measureDimension.addCategory(MEASURE_AVG_CHARS);
		measureDimension.addCategory(MEASURE_MAX_WHITE_SPACES);
		measureDimension.addCategory(MEASURE_MIN_WHITE_SPACES);
		measureDimension.addCategory(MEASURE_AVG_WHITE_SPACES);
		measureDimension.addCategory(MEASURE_UPPERCASE_CHARS);
		measureDimension.addCategory(MEASURE_UPPERCASE_CHARS_EXCL_FIRST_LETTERS);
		measureDimension.addCategory(MEASURE_LOWERCASE_CHARS);
		measureDimension.addCategory(MEASURE_DIGIT_CHARS);
		measureDimension.addCategory(MEASURE_DIACRITIC_CHARS);
		measureDimension.addCategory(MEASURE_NON_LETTER_CHARS);
		measureDimension.addCategory(MEASURE_WORD_COUNT);
		measureDimension.addCategory(MEASURE_MAX_WORDS);
		measureDimension.addCategory(MEASURE_MIN_WORDS);

		CrosstabDimension columnDimension = new CrosstabDimension(DIMENSION_COLUMN);

		Crosstab crosstab = new Crosstab(Number.class, columnDimension, measureDimension);

		for (InputColumn column : _columns) {
			String columnName = column.getName();

			StringAnalyzerColumnDelegate delegate = _columnDelegates.get(column);

			columnDimension.addCategory(columnName);

			final Integer numRows = delegate.getNumRows();
			final Integer numNull = delegate.getNumNull();
			final Integer numBlank = delegate.getNumBlank();
			final Integer numEntirelyUppercase = delegate.getNumEntirelyUppercase();
			final Integer numEntirelyLowercase = delegate.getNumEntirelyLowercase();
			final Integer numChars = delegate.getNumChars();
			final Integer maxChars = delegate.getMaxChars();
			final Integer minChars = delegate.getMinChars();
			final Integer numWords = delegate.getNumWords();
			final Integer maxWords = delegate.getMaxWords();
			final Integer minWords = delegate.getMinWords();
			final Integer maxWhitespace = delegate.getMaxWhitespace();
			final Integer minWhitespace = delegate.getMinWhitespace();
			final Integer numUppercase = delegate.getNumUppercase();
			final Integer numUppercaseExclFirstLetter = delegate.getNumUppercaseExclFirstLetter();
			final Integer numLowercase = delegate.getNumLowercase();
			final Integer numDigits = delegate.getNumDigit();
			final Integer numDiacritics = delegate.getNumDiacritics();
			final Integer numNonLetter = delegate.getNumNonLetter();
			final AverageBuilder charAverageBuilder = delegate.getCharAverageBuilder();
			final AverageBuilder blanksAverageBuilder = delegate.getWhitespaceAverageBuilder();

			Double avgChars = null;
			if (charAverageBuilder.getNumValues() > 0) {
				avgChars = charAverageBuilder.getAverage();
			}
			Double avgBlanks = null;
			if (blanksAverageBuilder.getNumValues() > 0) {
				avgBlanks = blanksAverageBuilder.getAverage();
			}

			// begin entering numbers into the crosstab
			CrosstabNavigator nav = crosstab.where(columnDimension, columnName);

			nav.where(measureDimension, MEASURE_ROW_COUNT).put(numRows);

			nav.where(measureDimension, MEASURE_NULL_COUNT).put(numNull);
			if (numNull > 0) {
				addAttachment(nav, delegate.getNullAnnotation(), column);
			}
			
			nav.where(measureDimension, MEASURE_BLANK_COUNT).put(numBlank);
            if (numBlank > 0) {
                addAttachment(nav, delegate.getBlankAnnotation(), column);
            }

			nav.where(measureDimension, MEASURE_ENTIRELY_UPPERCASE_COUNT).put(numEntirelyUppercase);
			if (numEntirelyUppercase > 0) {
				addAttachment(nav, delegate.getEntirelyUppercaseAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_ENTIRELY_LOWERCASE_COUNT).put(numEntirelyLowercase);
			if (numEntirelyLowercase > 0) {
				addAttachment(nav, delegate.getEntirelyLowercaseAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_TOTAL_CHAR_COUNT).put(numChars);

			nav.where(measureDimension, MEASURE_MAX_CHARS).put(maxChars);
			if (maxChars != null) {
				addAttachment(nav, delegate.getMaxCharsAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_MIN_CHARS).put(minChars);
			if (minChars != null) {
				addAttachment(nav, delegate.getMinCharsAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_AVG_CHARS).put(avgChars);
			nav.where(measureDimension, MEASURE_MAX_WHITE_SPACES).put(maxWhitespace);
			if (maxWhitespace != null) {
				addAttachment(nav, delegate.getMaxWhitespaceAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_MIN_WHITE_SPACES).put(minWhitespace);
			if (minWhitespace != null) {
				addAttachment(nav, delegate.getMinWhitespaceAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_AVG_WHITE_SPACES).put(avgBlanks);
			nav.where(measureDimension, MEASURE_UPPERCASE_CHARS).put(numUppercase);
			nav.where(measureDimension, MEASURE_UPPERCASE_CHARS_EXCL_FIRST_LETTERS).put(numUppercaseExclFirstLetter);
			if (numUppercaseExclFirstLetter > 0) {
				addAttachment(nav, delegate.getUppercaseExclFirstLetterAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_LOWERCASE_CHARS).put(numLowercase);
			nav.where(measureDimension, MEASURE_DIGIT_CHARS).put(numDigits);
			if (numDigits > 0) {
				addAttachment(nav, delegate.getDigitAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_DIACRITIC_CHARS).put(numDiacritics);
			if (numDiacritics > 0) {
				addAttachment(nav, delegate.getDiacriticAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_NON_LETTER_CHARS).put(numNonLetter);
			nav.where(measureDimension, MEASURE_WORD_COUNT).put(numWords);

			nav.where(measureDimension, MEASURE_MAX_WORDS).put(maxWords);
			if (maxWords != null) {
				addAttachment(nav, delegate.getMaxWordsAnnotation(), column);
			}

			nav.where(measureDimension, MEASURE_MIN_WORDS).put(minWords);
			if (minWords != null) {
				addAttachment(nav, delegate.getMinWordsAnnotation(), column);
			}
		}

		return new StringAnalyzerResult(_columns, crosstab);
	}

	private void addAttachment(CrosstabNavigator nav, RowAnnotation annotation, InputColumn column) {
		nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
	}
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy