org.datacleaner.beans.ReferenceDataMatcherAnalyzer Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans;
import java.util.ArrayList;
import java.util.List;
import javax.inject.Named;
import org.datacleaner.api.Alias;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Close;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Distributed;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Validate;
import org.datacleaner.beans.transform.DictionaryMatcherTransformer;
import org.datacleaner.beans.transform.StringPatternMatcherTransformer;
import org.datacleaner.components.convert.ConvertToStringTransformer;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.data.MockInputColumn;
import org.datacleaner.data.MockInputRow;
import org.datacleaner.reference.Dictionary;
import org.datacleaner.reference.StringPattern;
import org.datacleaner.reference.SynonymCatalog;
import org.datacleaner.reference.SynonymCatalogConnection;
@Named("Reference data matcher")
@Alias("Matching analyzer")
@Description("Check your data values against multiple forms of reference data in one simple analyzer step.\n"
+ "This analyzer provides a handy shortcut for doing matching with dictionaries, synonym lookups or "
+ "string patterns matching, retrieving matching matrices for all matches.")
@Distributed(reducer = BooleanAnalyzerReducer.class)
public class ReferenceDataMatcherAnalyzer implements Analyzer {
@Configured(order = 1)
InputColumn>[] columns;
@Configured(order = 2, required = false)
Dictionary[] dictionaries;
@Configured(order = 3, required = false)
SynonymCatalog[] synonymCatalogs;
@Configured(order = 4, required = false)
StringPattern[] stringPatterns;
@Provided
DataCleanerConfiguration configuration;
private BooleanAnalyzer _booleanAnalyzer;
private DictionaryMatcherTransformer[] _dictionaryMatchers;
private StringPatternMatcherTransformer[] _stringPatternMatchers;
private SynonymCatalogConnection[] _synonymCatalogConnections;
private List> _matchColumns;
public ReferenceDataMatcherAnalyzer(final InputColumn>[] columns, final Dictionary[] dictionaries,
final SynonymCatalog[] synonymCatalogs, final StringPattern[] stringPatterns,
final DataCleanerConfiguration configuration) {
this();
this.columns = columns;
this.dictionaries = dictionaries;
this.stringPatterns = stringPatterns;
this.synonymCatalogs = synonymCatalogs;
this.configuration = configuration;
}
public ReferenceDataMatcherAnalyzer() {
}
@Validate
public void validate() {
if (!isDictionaryMatchingEnabled() && !isSynonymCatalogLookupEnabled() && !isStringPatternMatchingEnabled()) {
throw new IllegalStateException("No dictionaries, synonym catalogs or string patterns selected");
}
}
@Initialize
public void init() {
_dictionaryMatchers = new DictionaryMatcherTransformer[columns.length];
_stringPatternMatchers = new StringPatternMatcherTransformer[columns.length];
_matchColumns = new ArrayList<>();
OutputColumns outputColumns;
for (int i = 0; i < columns.length; i++) {
if (isDictionaryMatchingEnabled()) {
// create matcher for dictionaries
final DictionaryMatcherTransformer dictionaryMatcher =
new DictionaryMatcherTransformer(columns[i], dictionaries, configuration);
dictionaryMatcher.init();
outputColumns = dictionaryMatcher.getOutputColumns();
addMatchColumns(outputColumns);
_dictionaryMatchers[i] = dictionaryMatcher;
}
if (isSynonymCatalogLookupEnabled()) {
outputColumns = new OutputColumns(synonymCatalogs.length, Boolean.class);
_synonymCatalogConnections = new SynonymCatalogConnection[synonymCatalogs.length];
for (int j = 0; j < synonymCatalogs.length; j++) {
final SynonymCatalog synonymCatalog = synonymCatalogs[j];
_synonymCatalogConnections[j] = synonymCatalog.openConnection(configuration);
outputColumns.setColumnName(j, columns[i].getName() + " in " + synonymCatalog.getName());
}
addMatchColumns(outputColumns);
}
if (isStringPatternMatchingEnabled()) {
// create matcher for string patterns
final StringPatternMatcherTransformer stringPatternMatcher =
new StringPatternMatcherTransformer(columns[i], stringPatterns, configuration);
stringPatternMatcher.init();
outputColumns = stringPatternMatcher.getOutputColumns();
addMatchColumns(outputColumns);
_stringPatternMatchers[i] = stringPatternMatcher;
}
}
@SuppressWarnings("unchecked") final InputColumn[] columnArray =
_matchColumns.toArray(new InputColumn[_matchColumns.size()]);
_booleanAnalyzer = new BooleanAnalyzer(columnArray);
_booleanAnalyzer.init();
}
@Close
public void close() {
if (isDictionaryMatchingEnabled() && _dictionaryMatchers != null) {
for (final DictionaryMatcherTransformer matcher : _dictionaryMatchers) {
matcher.close();
}
_dictionaryMatchers = null;
}
if (isStringPatternMatchingEnabled() && _stringPatternMatchers != null) {
for (final StringPatternMatcherTransformer matcher : _stringPatternMatchers) {
matcher.close();
}
_stringPatternMatchers = null;
}
if (isSynonymCatalogLookupEnabled() && _synonymCatalogConnections != null) {
for (final SynonymCatalogConnection connection : _synonymCatalogConnections) {
connection.close();
}
_synonymCatalogConnections = null;
}
}
private boolean isStringPatternMatchingEnabled() {
return stringPatterns != null && stringPatterns.length > 0;
}
private boolean isSynonymCatalogLookupEnabled() {
return synonymCatalogs != null && synonymCatalogs.length > 0;
}
private boolean isDictionaryMatchingEnabled() {
return dictionaries != null && dictionaries.length > 0;
}
private void addMatchColumns(final OutputColumns outputColumns) {
final int count = outputColumns.getColumnCount();
for (int i = 0; i < count; i++) {
final String columnName = outputColumns.getColumnName(i);
final InputColumn col = new MockInputColumn<>(columnName, Boolean.class);
_matchColumns.add(col);
}
}
@Override
public void run(final InputRow row, final int distinctCount) {
final MockInputRow mockInputRow = new MockInputRow();
int matchColumnIndex = 0;
for (int i = 0; i < columns.length; i++) {
final Object value = row.getValue(columns[i]);
final String stringValue = ConvertToStringTransformer.transformValue(value);
mockInputRow.put(columns[i], value);
if (isDictionaryMatchingEnabled()) {
final Object[] matches = _dictionaryMatchers[i].transform(row);
for (final Object match : matches) {
assert match instanceof Boolean;
final InputColumn matchColumn = _matchColumns.get(matchColumnIndex);
matchColumnIndex++;
mockInputRow.put(matchColumn, match);
}
}
if (isSynonymCatalogLookupEnabled()) {
for (final SynonymCatalogConnection synonymCatalogConnection : _synonymCatalogConnections) {
final InputColumn matchColumn = _matchColumns.get(matchColumnIndex);
matchColumnIndex++;
final String masterTerm = synonymCatalogConnection.getMasterTerm(stringValue);
if (masterTerm == null) {
// no match
mockInputRow.put(matchColumn, Boolean.FALSE);
} else {
mockInputRow.put(matchColumn, Boolean.TRUE);
}
}
}
if (isStringPatternMatchingEnabled()) {
final Object[] matches = _stringPatternMatchers[i].transform(row);
for (final Object match : matches) {
assert match instanceof Boolean;
final InputColumn matchColumn = _matchColumns.get(matchColumnIndex);
matchColumnIndex++;
mockInputRow.put(matchColumn, match);
}
}
}
_booleanAnalyzer.run(mockInputRow, distinctCount);
}
@Override
public BooleanAnalyzerResult getResult() {
return _booleanAnalyzer.getResult();
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy