// org.datacleaner.beans.valuedist.ValueDistributionAnalyzer Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.valuedist;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.inject.Inject;
import javax.inject.Named;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.ColumnProperty;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;
import org.datacleaner.api.ExternalDocumentation.DocumentationType;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Provided;
import org.datacleaner.storage.CollectionFactory;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.storage.RowAnnotations;
import org.datacleaner.util.NullTolerableComparator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Named("Value distribution")
@Description("Gets the distributions of values that occur in a dataset.\nOften used as an initial way to see if a lot of repeated values are to be expected, if nulls occur and if a few un-repeated values add exceptions to the typical usage-pattern.")
@ExternalDocumentation({ @DocumentationLink(title = "Analyzer rundown", url = "https://www.youtube.com/watch?v=hZWxB_eu_A0", type = DocumentationType.VIDEO, version = "4.0") })
@Concurrent(true)
public class ValueDistributionAnalyzer implements Analyzer {
public static final String PROPERTY_COLUMN = "Column";
public static final String PROPERTY_GROUP_COLUMN = "Group column";
public static final String PROPERTY_RECORD_UNIQUE_VALUES = "Record unique values";
public static final String PROPERTY_RECORD_DRILL_DOWN_INFORMATION = "Record drill-down information";
private static final Logger logger = LoggerFactory.getLogger(ValueDistributionAnalyzer.class);
@Inject
@Configured(value = PROPERTY_COLUMN, order = 1)
@ColumnProperty(escalateToMultipleJobs = true)
InputColumn> _column;
@Inject
@Configured(value = PROPERTY_GROUP_COLUMN, required = false, order = 2)
InputColumn _groupColumn;
@Inject
@Configured(value = PROPERTY_RECORD_UNIQUE_VALUES, required = false, order = 3)
boolean _recordUniqueValues = true;
@Inject
@Configured(value = PROPERTY_RECORD_DRILL_DOWN_INFORMATION, required = false, order = 4)
@Description("Record extra information to allow drilling to the records that represent a particular value in the distribution")
boolean _recordDrillDownInformation = true;
@Inject
@Configured(value = "Top n most frequent values", required = false, order = 5)
@Deprecated
Integer _topFrequentValues;
@Inject
@Configured(value = "Bottom n most frequent values", required = false, order = 6)
@Deprecated
Integer _bottomFrequentValues;
@Inject
@Provided
RowAnnotationFactory _annotationFactory;
private final Map _valueDistributionGroups;
/**
* Constructor used for testing and ad-hoc purposes
*
* @param column
* @param recordUniqueValues
* @param topFrequentValues
* @param bottomFrequentValues
*/
public ValueDistributionAnalyzer(InputColumn> column, boolean recordUniqueValues) {
this(column, null, recordUniqueValues);
}
/**
* Constructor used for testing and ad-hoc purposes
*
* @param column
* @param groupColumn
* @param recordUniqueValues
* @param topFrequentValues
* @param bottomFrequentValues
*/
public ValueDistributionAnalyzer(InputColumn> column, InputColumn groupColumn, boolean recordUniqueValues) {
this();
_column = column;
_groupColumn = groupColumn;
_recordUniqueValues = recordUniqueValues;
_annotationFactory = RowAnnotations.getDefaultFactory();
}
/**
* Main constructor
*/
public ValueDistributionAnalyzer() {
_valueDistributionGroups = new TreeMap(
NullTolerableComparator.get(String.class));
}
@Override
public void run(InputRow row, int distinctCount) {
final Object value = row.getValue(_column);
if (_groupColumn == null) {
runInternal(row, value, distinctCount);
} else {
final String group = row.getValue(_groupColumn);
runInternal(row, value, group, distinctCount);
}
}
public void runInternal(InputRow row, Object value, int distinctCount) {
runInternal(row, value, _column.getName(), distinctCount);
}
public void runInternal(InputRow row, Object value, String group, int distinctCount) {
final ValueDistributionGroup valueDistributionGroup = getValueDistributionGroup(group);
final String stringValue;
if (value == null) {
logger.debug("value is null");
stringValue = null;
} else {
stringValue = value.toString();
}
valueDistributionGroup.run(row, stringValue, distinctCount);
}
private ValueDistributionGroup getValueDistributionGroup(String group) {
ValueDistributionGroup valueDistributionGroup = _valueDistributionGroups.get(group);
if (valueDistributionGroup == null) {
synchronized (this) {
valueDistributionGroup = _valueDistributionGroups.get(group);
if (valueDistributionGroup == null) {
final InputColumn>[] inputColumns;
if (_groupColumn == null) {
inputColumns = new InputColumn[] { _column };
} else {
inputColumns = new InputColumn[] { _column, _groupColumn };
}
valueDistributionGroup = new ValueDistributionGroup(group, _annotationFactory,
_recordDrillDownInformation, inputColumns);
_valueDistributionGroups.put(group, valueDistributionGroup);
}
}
}
return valueDistributionGroup;
}
@Override
public ValueDistributionAnalyzerResult getResult() {
if (_groupColumn == null) {
logger.info("getResult() invoked, processing single group");
final ValueDistributionGroup valueDistributionGroup = getValueDistributionGroup(_column.getName());
final SingleValueDistributionResult ungroupedResult = valueDistributionGroup
.createResult(_recordUniqueValues);
return ungroupedResult;
} else {
logger.info("getResult() invoked, processing {} groups", _valueDistributionGroups.size());
final SortedSet groupedResults = new TreeSet();
for (String group : _valueDistributionGroups.keySet()) {
final ValueDistributionGroup valueDistributibutionGroup = getValueDistributionGroup(group);
final SingleValueDistributionResult result = valueDistributibutionGroup
.createResult(_recordUniqueValues);
groupedResults.add(result);
}
return new GroupedValueDistributionResult(_column, _groupColumn, groupedResults);
}
}
public void setAnnotationFactory(RowAnnotationFactory annotationFactory) {
_annotationFactory = annotationFactory;
}
/**
*
* @param collectionFactory
* @deprecated use of this property is no longer adviced. It will be phased
* out in later versions of DataCleaner
*/
@Deprecated
public void setCollectionFactory(CollectionFactory collectionFactory) {
// do nothing
}
public void setColumn(InputColumn> column) {
_column = column;
}
public void setGroupColumn(InputColumn groupColumn) {
_groupColumn = groupColumn;
}
public void setRecordDrillDownInformation(boolean recordDrillDownInformation) {
_recordDrillDownInformation = recordDrillDownInformation;
}
public void setRecordUniqueValues(boolean recordUniqueValues) {
_recordUniqueValues = recordUniqueValues;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy