org.eobjects.analyzer.beans.valuedist.ValueDistributionAnalyzer Maven / Gradle / Ivy

Go to download
/**
 * AnalyzerBeans
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.eobjects.analyzer.beans.valuedist;

import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import javax.inject.Inject;

import org.eobjects.analyzer.beans.api.Analyzer;
import org.eobjects.analyzer.beans.api.AnalyzerBean;
import org.eobjects.analyzer.beans.api.ColumnProperty;
import org.eobjects.analyzer.beans.api.Concurrent;
import org.eobjects.analyzer.beans.api.Configured;
import org.eobjects.analyzer.beans.api.Description;
import org.eobjects.analyzer.beans.api.Provided;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.storage.CollectionFactory;
import org.eobjects.analyzer.storage.CollectionFactoryImpl;
import org.eobjects.analyzer.storage.InMemoryRowAnnotationFactory;
import org.eobjects.analyzer.storage.InMemoryStorageProvider;
import org.eobjects.analyzer.storage.RowAnnotationFactory;
import org.eobjects.analyzer.util.NullTolerableComparator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@AnalyzerBean("Value distribution")
@Description("Gets the distributions of values that occur in a dataset.\nOften used as an initial way to see if a lot of repeated values are to be expected, if nulls occur and if a few un-repeated values add exceptions to the typical usage-pattern.")
@Concurrent(true)
public class ValueDistributionAnalyzer implements Analyzer {

    private static final Logger logger = LoggerFactory.getLogger(ValueDistributionAnalyzer.class);

    @Inject
    @Configured(value = "Column", order = 1)
    @ColumnProperty(escalateToMultipleJobs = true)
    InputColumn _column;

    @Inject
    @Configured(value = "Group column", required = false, order = 2)
    InputColumn _groupColumn;

    @Inject
    @Configured(value = "Record unique values", required = false, order = 3)
    boolean _recordUniqueValues = true;

    @Inject
    @Configured(value = "Record drill-down information", required = false, order = 4)
    @Description("Record extra information to allow drilling to the records that represent a particular value in the distribution")
    boolean _recordDrillDownInformation = true;

    @Inject
    @Configured(value = "Top n most frequent values", required = false, order = 5)
    @Deprecated
    Integer _topFrequentValues;

    @Inject
    @Configured(value = "Bottom n most frequent values", required = false, order = 6)
    @Deprecated
    Integer _bottomFrequentValues;

    @Inject
    @Provided
    CollectionFactory _collectionFactory;

    @Inject
    @Provided
    RowAnnotationFactory _annotationFactory;

    private final Map _valueDistributionGroups;

    /**
     * Constructor used for testing and ad-hoc purposes
     * 
     * @param column
     * @param recordUniqueValues
     * @param topFrequentValues
     * @param bottomFrequentValues
     */
    public ValueDistributionAnalyzer(InputColumn column, boolean recordUniqueValues, Integer topFrequentValues,
            Integer bottomFrequentValues) {
        this(column, null, recordUniqueValues, topFrequentValues, bottomFrequentValues);
    }

    /**
     * Constructor used for testing and ad-hoc purposes
     * 
     * @param column
     * @param groupColumn
     * @param recordUniqueValues
     * @param topFrequentValues
     * @param bottomFrequentValues
     */
    public ValueDistributionAnalyzer(InputColumn column, InputColumn groupColumn,
            boolean recordUniqueValues, Integer topFrequentValues, Integer bottomFrequentValues) {
        this();
        _column = column;
        _groupColumn = groupColumn;
        _recordUniqueValues = recordUniqueValues;
        _topFrequentValues = topFrequentValues;
        _bottomFrequentValues = bottomFrequentValues;
        _collectionFactory = new CollectionFactoryImpl(new InMemoryStorageProvider());
        _annotationFactory = new InMemoryRowAnnotationFactory();
    }

    /**
     * Main constructor
     */
    public ValueDistributionAnalyzer() {
        _valueDistributionGroups = new TreeMap(
                NullTolerableComparator.get(String.class));
    }

    @Override
    public void run(InputRow row, int distinctCount) {
        final Object value = row.getValue(_column);
        if (_groupColumn == null) {
            runInternal(row, value, distinctCount);
        } else {
            final String group = row.getValue(_groupColumn);
            runInternal(row, value, group, distinctCount);
        }
    }

    public void runInternal(InputRow row, Object value, int distinctCount) {
        runInternal(row, value, _column.getName(), distinctCount);
    }

    public void runInternal(InputRow row, Object value, String group, int distinctCount) {
        final ValueDistributionGroup valueDistributionGroup = getValueDistributionGroup(group);
        final String stringValue;
        if (value == null) {
            logger.debug("value is null");
            stringValue = null;
        } else {
            stringValue = value.toString();
        }
        valueDistributionGroup.run(row, stringValue, distinctCount);
    }

    private ValueDistributionGroup getValueDistributionGroup(String group) {
        ValueDistributionGroup valueDistributionGroup = _valueDistributionGroups.get(group);
        if (valueDistributionGroup == null) {
            synchronized (this) {
                valueDistributionGroup = _valueDistributionGroups.get(group);
                if (valueDistributionGroup == null) {
                    final InputColumn[] inputColumns;
                    if (_groupColumn == null) {
                        inputColumns = new InputColumn[] { _column };
                    } else {
                        inputColumns = new InputColumn[] { _column, _groupColumn };
                    }
                    valueDistributionGroup = new ValueDistributionGroup(group, _collectionFactory, _annotationFactory,
                            _recordDrillDownInformation, inputColumns);
                    _valueDistributionGroups.put(group, valueDistributionGroup);
                }
            }
        }
        return valueDistributionGroup;
    }

    @Override
    public ValueDistributionAnalyzerResult getResult() {
        if (_groupColumn == null) {
            logger.info("getResult() invoked, processing single group");
            final ValueDistributionGroup valueDistributionGroup = getValueDistributionGroup(_column.getName());
            final SingleValueDistributionResult ungroupedResult = valueDistributionGroup.createResult(
                    _topFrequentValues, _bottomFrequentValues, _recordUniqueValues);
            return ungroupedResult;
        } else {
            logger.info("getResult() invoked, processing {} groups", _valueDistributionGroups.size());

            final SortedSet groupedResults = new TreeSet();
            for (String group : _valueDistributionGroups.keySet()) {
                final ValueDistributionGroup valueDistributibutionGroup = getValueDistributionGroup(group);
                final SingleValueDistributionResult result = valueDistributibutionGroup.createResult(
                        _topFrequentValues, _bottomFrequentValues, _recordUniqueValues);
                groupedResults.add(result);
            }
            return new GroupedValueDistributionResult(_column, _groupColumn, groupedResults);
        }
    }

    public void setAnnotationFactory(RowAnnotationFactory annotationFactory) {
        _annotationFactory = annotationFactory;
    }

    public void setCollectionFactory(CollectionFactory collectionFactory) {
        _collectionFactory = collectionFactory;
    }

    /**
     * 
     * @param bottomFrequentValues
     * @deprecated use of this property is no longer adviced. It will be phased
     *             out in later versions of AnalyzerBeans
     */
    @Deprecated
    public void setBottomFrequentValues(Integer bottomFrequentValues) {
        _bottomFrequentValues = bottomFrequentValues;
    }

    public void setColumn(InputColumn column) {
        _column = column;
    }

    public void setGroupColumn(InputColumn groupColumn) {
        _groupColumn = groupColumn;
    }

    public void setRecordDrillDownInformation(boolean recordDrillDownInformation) {
        _recordDrillDownInformation = recordDrillDownInformation;
    }

    public void setRecordUniqueValues(boolean recordUniqueValues) {
        _recordUniqueValues = recordUniqueValues;
    }

    /**
     * 
     * @param topFrequentValues
     * 
     * @deprecated use of this property is no longer adviced. It will be phased
     *             out in later versions of AnalyzerBeans
     */
    @Deprecated
    public void setTopFrequentValues(Integer topFrequentValues) {
        _topFrequentValues = topFrequentValues;
    }
}