All downloads are free. Search and download functionality uses the official Maven repository.

org.datacleaner.beans.StringAnalyzerColumnDelegate Maven / Gradle / Ivy

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.beans;

import java.util.StringTokenizer;

import org.datacleaner.api.InputRow;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.util.AverageBuilder;
import org.datacleaner.util.CharIterator;

/**
 * Helper class for the String Analyzer. This class collects all the statistics
 * for a single column. The String Analyzer then consists of a number of these
 * delegates.
 *
 * <p>
 * Values are fed in through {@link #run(InputRow, String, int)}, which is
 * synchronized. The counters are volatile and are read by the getters without
 * taking the lock.
 *
 * <p>
 * Every value is reported together with a "distinct count", i.e. the number of
 * occurrences the row represents. Totals, row annotations and averages are all
 * weighted by that count, while the minimum/maximum statistics describe a
 * single value.
 */
final class StringAnalyzerColumnDelegate {

    private final RowAnnotationFactory _annotationFactory;
    private final AverageBuilder _charAverageBuilder = new AverageBuilder();
    private final AverageBuilder _whitespaceAverageBuilder = new AverageBuilder();
    private final RowAnnotation _nullAnnotation;
    private final RowAnnotation _blankAnnotation;
    private final RowAnnotation _entirelyUppercaseAnnotation;
    private final RowAnnotation _entirelyLowercaseAnnotation;
    private final RowAnnotation _maxCharsAnnotation;
    private final RowAnnotation _minCharsAnnotation;
    private final RowAnnotation _maxWhitespaceAnnotation;
    private final RowAnnotation _minWhitespaceAnnotation;
    private final RowAnnotation _uppercaseExclFirstLetterAnnotation;
    private final RowAnnotation _digitAnnotation;
    private final RowAnnotation _diacriticAnnotation;
    private final RowAnnotation _maxWordsAnnotation;
    private final RowAnnotation _minWordsAnnotation;
    private volatile int _numRows;
    private volatile int _numEntirelyUppercase;
    private volatile int _numEntirelyLowercase;
    private volatile int _numChars;
    // The min/max fields stay null until the first non-null value is seen.
    private volatile Integer _minChars;
    private volatile Integer _maxChars;
    private volatile Integer _minWhitespace;
    private volatile Integer _maxWhitespace;
    private volatile int _numUppercase;
    private volatile int _numUppercaseExclFirstLetter;
    private volatile int _numLowercase;
    private volatile int _numDigit;
    private volatile int _numDiacritics;
    private volatile int _numNonLetter;
    private volatile int _numWords;
    private volatile Integer _maxWords;
    private volatile Integer _minWords;

    /**
     * Creates a delegate for a single column and allocates one row annotation
     * per tracked category from the given factory.
     *
     * @param annotationFactory
     *            the factory used to create, populate and reset the row
     *            annotations of this column
     */
    public StringAnalyzerColumnDelegate(RowAnnotationFactory annotationFactory) {
        _annotationFactory = annotationFactory;
        _nullAnnotation = annotationFactory.createAnnotation();
        _blankAnnotation = annotationFactory.createAnnotation();
        _entirelyUppercaseAnnotation = annotationFactory.createAnnotation();
        _entirelyLowercaseAnnotation = annotationFactory.createAnnotation();
        _maxCharsAnnotation = annotationFactory.createAnnotation();
        _minCharsAnnotation = annotationFactory.createAnnotation();
        _maxWhitespaceAnnotation = annotationFactory.createAnnotation();
        _minWhitespaceAnnotation = annotationFactory.createAnnotation();
        _uppercaseExclFirstLetterAnnotation = annotationFactory.createAnnotation();
        _digitAnnotation = annotationFactory.createAnnotation();
        _diacriticAnnotation = annotationFactory.createAnnotation();
        _maxWordsAnnotation = annotationFactory.createAnnotation();
        _minWordsAnnotation = annotationFactory.createAnnotation();
    }

    /**
     * Registers a single value of the column and updates all statistics and
     * row annotations accordingly.
     *
     * <p>
     * A null value only increments the row count and is annotated as null; no
     * other statistic is touched. A zero-length value is annotated as blank
     * and is otherwise processed like any other string.
     *
     * @param row
     *            the row the value was read from; it is attached to every
     *            annotation the value qualifies for
     * @param value
     *            the string value of the column, possibly null
     * @param distinctCount
     *            the number of occurrences that this row represents
     */
    public synchronized void run(InputRow row, final String value, int distinctCount) {
        _numRows += distinctCount;

        if (value == null) {
            _annotationFactory.annotate(row, distinctCount, _nullAnnotation);
            return;
        }

        final int numChars = value.length();

        if (numChars == 0) {
            _annotationFactory.annotate(row, distinctCount, _blankAnnotation);
        }

        final int totalChars = numChars * distinctCount;
        // words are the tokens produced by StringTokenizer's default
        // (whitespace) delimiters
        final int numWords = new StringTokenizer(value).countTokens();
        final int totalWords = numWords * distinctCount;

        // numWhitespace is a per-value figure (it feeds the min/max and
        // average statistics); the remaining counters are totals and are
        // therefore weighted by distinctCount while scanning.
        int numWhitespace = 0;
        int numDigits = 0;
        int numDiacritics = 0;
        int numLetters = 0;
        int numNonLetters = 0;
        int numUppercase = 0;
        int numUppercaseExclFirstLetter = 0;
        int numLowercase = 0;

        // true until the first letter of the value has been seen, and again
        // after each '.' character
        boolean firstLetter = true;
        final CharIterator it = new CharIterator(value);
        while (it.hasNext()) {
            it.next();
            if (it.isLetter()) {
                numLetters += distinctCount;
                if (it.isUpperCase()) {
                    numUppercase += distinctCount;
                    if (!firstLetter) {
                        numUppercaseExclFirstLetter += distinctCount;
                    }
                } else {
                    numLowercase += distinctCount;
                }
                if (it.isDiacritic()) {
                    numDiacritics += distinctCount;
                }
                firstLetter = false;
            } else {
                numNonLetters += distinctCount;
                if (it.isDigit()) {
                    numDigits += distinctCount;
                }
                if (it.isWhitespace()) {
                    numWhitespace++;
                }
                if (it.is('.')) {
                    firstLetter = true;
                }
            }
        }

        _numUppercase += numUppercase;
        if (numUppercaseExclFirstLetter > 0) {
            _annotationFactory.annotate(row, distinctCount, _uppercaseExclFirstLetterAnnotation);
            _numUppercaseExclFirstLetter += numUppercaseExclFirstLetter;
        }
        _numLowercase += numLowercase;
        _numNonLetter += numNonLetters;

        if (_minChars == null) {
            // This is the first time we encounter a non-null value, so
            // we just set all counters
            _minChars = numChars;
            _maxChars = numChars;
            _minWords = numWords;
            _maxWords = numWords;
            _minWhitespace = numWhitespace;
            _maxWhitespace = numWhitespace;
        }

        _numChars += totalChars;
        _numWords += totalWords;

        if (numDiacritics > 0) {
            _numDiacritics += numDiacritics;
            _annotationFactory.annotate(row, distinctCount, _diacriticAnnotation);
        }

        if (numDigits > 0) {
            _numDigit += numDigits;
            _annotationFactory.annotate(row, distinctCount, _digitAnnotation);
        }

        // For each min/max statistic: a new extreme resets the annotation
        // (the previously annotated rows no longer hold the extreme), and
        // every row matching the current extreme is annotated. The right-hand
        // operands are primitives, so the boxed fields are unboxed and
        // compared numerically.
        if (_maxChars < numChars) {
            _annotationFactory.reset(_maxCharsAnnotation);
            _maxChars = numChars;
        }
        if (_maxChars == numChars) {
            _annotationFactory.annotate(row, distinctCount, _maxCharsAnnotation);
        }

        if (_minChars > numChars) {
            _annotationFactory.reset(_minCharsAnnotation);
            _minChars = numChars;
        }
        if (_minChars == numChars) {
            _annotationFactory.annotate(row, distinctCount, _minCharsAnnotation);
        }

        if (_maxWords < numWords) {
            _maxWords = numWords;
            _annotationFactory.reset(_maxWordsAnnotation);
        }
        if (_maxWords == numWords) {
            _annotationFactory.annotate(row, distinctCount, _maxWordsAnnotation);
        }

        if (_minWords > numWords) {
            _minWords = numWords;
            _annotationFactory.reset(_minWordsAnnotation);
        }
        if (_minWords == numWords) {
            _annotationFactory.annotate(row, distinctCount, _minWordsAnnotation);
        }

        if (_maxWhitespace < numWhitespace) {
            _maxWhitespace = numWhitespace;
            _annotationFactory.reset(_maxWhitespaceAnnotation);
        }
        if (_maxWhitespace == numWhitespace) {
            _annotationFactory.annotate(row, distinctCount, _maxWhitespaceAnnotation);
        }

        if (_minWhitespace > numWhitespace) {
            _minWhitespace = numWhitespace;
            _annotationFactory.reset(_minWhitespaceAnnotation);
        }
        if (_minWhitespace == numWhitespace) {
            _annotationFactory.annotate(row, distinctCount, _minWhitespaceAnnotation);
        }

        // Values without any letters are neither "entirely uppercase" nor
        // "entirely lowercase".
        if (numLetters > 0) {
            if (isEntirelyUpperCase(value)) {
                _numEntirelyUppercase += distinctCount;
                _annotationFactory.annotate(row, distinctCount, _entirelyUppercaseAnnotation);
            }

            if (isEntirelyLowerCase(value)) {
                _numEntirelyLowercase += distinctCount;
                _annotationFactory.annotate(row, distinctCount, _entirelyLowercaseAnnotation);
            }
        }

        // Register the value once per occurrence so that the averages are
        // weighted by distinctCount, consistently with the totals above.
        for (int i = 0; i < distinctCount; i++) {
            _charAverageBuilder.addValue(numChars);
            _whitespaceAverageBuilder.addValue(numWhitespace);
        }
    }

    /**
     * Determines whether lower-casing the value leaves it unchanged. The case
     * mapping of {@link String#toLowerCase()} (JVM default locale) is used.
     *
     * @param value
     *            the value to check, not null
     * @return true if the value equals its lower-cased form
     */
    protected static boolean isEntirelyLowerCase(String value) {
        return value.equals(value.toLowerCase());
    }

    /**
     * Determines whether upper-casing the value leaves it unchanged. The case
     * mapping of {@link String#toUpperCase()} (JVM default locale) is used.
     *
     * @param value
     *            the value to check, not null
     * @return true if the value equals its upper-cased form
     */
    protected static boolean isEntirelyUpperCase(String value) {
        return value.equals(value.toUpperCase());
    }

    /**
     * @return the number of rows registered (including nulls), weighted by
     *         distinct count
     */
    public int getNumRows() {
        return _numRows;
    }

    /**
     * @return the row count of the null annotation
     */
    public int getNumNull() {
        return _nullAnnotation.getRowCount();
    }

    /**
     * @return the number of values containing at least one letter that equal
     *         their upper-cased form, weighted by distinct count
     */
    public int getNumEntirelyUppercase() {
        return _numEntirelyUppercase;
    }

    /**
     * @return the number of values containing at least one letter that equal
     *         their lower-cased form, weighted by distinct count
     */
    public int getNumEntirelyLowercase() {
        return _numEntirelyLowercase;
    }

    /**
     * @return the total number of characters of all non-null values, weighted
     *         by distinct count
     */
    public int getNumChars() {
        return _numChars;
    }

    /**
     * @return the smallest character count of a single value, or null if no
     *         non-null value has been registered yet
     */
    public Integer getMinChars() {
        return _minChars;
    }

    /**
     * @return the largest character count of a single value, or null if no
     *         non-null value has been registered yet
     */
    public Integer getMaxChars() {
        return _maxChars;
    }

    /**
     * @return the smallest whitespace count of a single value, or null if no
     *         non-null value has been registered yet
     */
    public Integer getMinWhitespace() {
        return _minWhitespace;
    }

    /**
     * @return the largest whitespace count of a single value, or null if no
     *         non-null value has been registered yet
     */
    public Integer getMaxWhitespace() {
        return _maxWhitespace;
    }

    /**
     * @return the total number of uppercase letters, weighted by distinct
     *         count
     */
    public int getNumUppercase() {
        return _numUppercase;
    }

    /**
     * @return the total number of uppercase letters that are neither the
     *         first letter of their value nor the first letter following a
     *         '.' character, weighted by distinct count
     */
    public int getNumUppercaseExclFirstLetter() {
        return _numUppercaseExclFirstLetter;
    }

    /**
     * @return the total number of letters that are not uppercase, weighted by
     *         distinct count
     */
    public int getNumLowercase() {
        return _numLowercase;
    }

    /**
     * @return the total number of digit characters, weighted by distinct
     *         count
     */
    public int getNumDigit() {
        return _numDigit;
    }

    /**
     * @return the total number of letters reported as diacritics, weighted by
     *         distinct count
     */
    public int getNumDiacritics() {
        return _numDiacritics;
    }

    /**
     * @return the total number of characters that are not letters, weighted
     *         by distinct count
     */
    public int getNumNonLetter() {
        return _numNonLetter;
    }

    /**
     * @return the total number of words (whitespace separated tokens),
     *         weighted by distinct count
     */
    public int getNumWords() {
        return _numWords;
    }

    /**
     * @return the smallest word count of a single value, or null if no
     *         non-null value has been registered yet
     */
    public Integer getMinWords() {
        return _minWords;
    }

    /**
     * @return the largest word count of a single value, or null if no
     *         non-null value has been registered yet
     */
    public Integer getMaxWords() {
        return _maxWords;
    }

    /**
     * @return the builder that received the character count of every non-null
     *         value, once per occurrence
     */
    public AverageBuilder getCharAverageBuilder() {
        return _charAverageBuilder;
    }

    /**
     * @return the builder that received the whitespace count of every
     *         non-null value, once per occurrence
     */
    public AverageBuilder getWhitespaceAverageBuilder() {
        return _whitespaceAverageBuilder;
    }

    /**
     * @return the annotation of rows whose value is null
     */
    public RowAnnotation getNullAnnotation() {
        return _nullAnnotation;
    }

    /**
     * @return the annotation of rows whose value is a zero-length string
     */
    public RowAnnotation getBlankAnnotation() {
        return _blankAnnotation;
    }

    /**
     * @return the annotation of rows counted by
     *         {@link #getNumEntirelyUppercase()}
     */
    public RowAnnotation getEntirelyUppercaseAnnotation() {
        return _entirelyUppercaseAnnotation;
    }

    /**
     * @return the annotation of rows counted by
     *         {@link #getNumEntirelyLowercase()}
     */
    public RowAnnotation getEntirelyLowercaseAnnotation() {
        return _entirelyLowercaseAnnotation;
    }

    /**
     * @return the annotation of rows whose character count equals the
     *         current maximum
     */
    public RowAnnotation getMaxCharsAnnotation() {
        return _maxCharsAnnotation;
    }

    /**
     * @return the annotation of rows whose character count equals the
     *         current minimum
     */
    public RowAnnotation getMinCharsAnnotation() {
        return _minCharsAnnotation;
    }

    /**
     * @return the annotation of rows whose whitespace count equals the
     *         current maximum
     */
    public RowAnnotation getMaxWhitespaceAnnotation() {
        return _maxWhitespaceAnnotation;
    }

    /**
     * @return the annotation of rows whose whitespace count equals the
     *         current minimum
     */
    public RowAnnotation getMinWhitespaceAnnotation() {
        return _minWhitespaceAnnotation;
    }

    /**
     * @return the annotation of rows containing at least one uppercase letter
     *         that is not a "first letter" (see
     *         {@link #getNumUppercaseExclFirstLetter()})
     */
    public RowAnnotation getUppercaseExclFirstLetterAnnotation() {
        return _uppercaseExclFirstLetterAnnotation;
    }

    /**
     * @return the annotation of rows containing at least one digit
     */
    public RowAnnotation getDigitAnnotation() {
        return _digitAnnotation;
    }

    /**
     * @return the annotation of rows containing at least one diacritic letter
     */
    public RowAnnotation getDiacriticAnnotation() {
        return _diacriticAnnotation;
    }

    /**
     * @return the annotation of rows whose word count equals the current
     *         maximum
     */
    public RowAnnotation getMaxWordsAnnotation() {
        return _maxWordsAnnotation;
    }

    /**
     * @return the annotation of rows whose word count equals the current
     *         minimum
     */
    public RowAnnotation getMinWordsAnnotation() {
        return _minWordsAnnotation;
    }

    /**
     * @return the row count of the blank annotation
     */
    public Integer getNumBlank() {
        return _blankAnnotation.getRowCount();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy