
// org.datacleaner.beans.stringpattern.PatternFinderResultReducer Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.stringpattern;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import org.datacleaner.api.AnalyzerResultReducer;
import org.datacleaner.api.InputColumn;
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
/**
* Result reducer for {@link PatternFinderResult}s
*/
public class PatternFinderResultReducer implements AnalyzerResultReducer {
@Override
public PatternFinderResult reduce(Collection extends PatternFinderResult> results) {
final PatternFinderResult firstResult = results.iterator().next();
final InputColumn column = firstResult.getColumn();
final TokenizerConfiguration tokenizerConfiguration = firstResult.getTokenizerConfiguration();
if (!firstResult.isGroupingEnabled()) {
// a single list of patterns
final List> crosstabs = new ArrayList>(results.size());
for (PatternFinderResult result : results) {
Crosstab> crosstab = result.getSingleCrosstab();
crosstabs.add(crosstab);
}
final Crosstab> crosstab = reduce(crosstabs, tokenizerConfiguration);
return new PatternFinderResult(column, crosstab, tokenizerConfiguration);
} else {
// groups of lists of patterns
final Map>> groupedCrosstabs = new HashMap>>();
for (PatternFinderResult result : results) {
final Set>> entries = result.getGroupedCrosstabs().entrySet();
for (Entry> entry : entries) {
final String group = entry.getKey();
List> crosstabsInGroup = groupedCrosstabs.get(group);
if (crosstabsInGroup == null) {
crosstabsInGroup = new ArrayList>();
groupedCrosstabs.put(group, crosstabsInGroup);
}
crosstabsInGroup.add(entry.getValue());
}
}
final Map> crosstabs = new TreeMap>();
final Set>>> entries = groupedCrosstabs.entrySet();
for (Entry>> entry : entries) {
final String group = entry.getKey();
final List> crosstabInGroup = entry.getValue();
final Crosstab> crosstab = reduce(crosstabInGroup, tokenizerConfiguration);
crosstabs.put(group, crosstab);
}
final InputColumn groupColumn = firstResult.getGroupColumn();
return new PatternFinderResult(column, groupColumn, crosstabs, tokenizerConfiguration);
}
}
private Crosstab> reduce(List> crosstabs, TokenizerConfiguration tokenizerConfiguration) {
if (crosstabs.size() == 1) {
return crosstabs.get(0);
}
final ReversePatternFinder patternFinder = new ReversePatternFinder(tokenizerConfiguration);
for (Crosstab> crosstab : crosstabs) {
final CrosstabDimension patternDimension = crosstab
.getDimension(PatternFinderAnalyzer.DIMENSION_NAME_PATTERN);
final List patterns = patternDimension.getCategories();
for (String pattern : patterns) {
final CrosstabNavigator> navigator = crosstab.where(PatternFinderAnalyzer.DIMENSION_NAME_PATTERN,
pattern);
final Number matchCount = (Number) navigator.where(PatternFinderAnalyzer.DIMENSION_NAME_MEASURES,
PatternFinderAnalyzer.MEASURE_MATCH_COUNT).get();
final String sample = (String) navigator.where(PatternFinderAnalyzer.DIMENSION_NAME_MEASURES,
PatternFinderAnalyzer.MEASURE_SAMPLE).get();
patternFinder.run(sample, pattern, matchCount.intValue());
}
}
final Set> entries = patternFinder.getPatternCounts().entrySet();
// sort the entries so that the ones with the highest amount of
// matches are at the top
final Set> sortedEntrySet = new TreeSet>(
new Comparator>() {
public int compare(Entry o1, Entry o2) {
int result = o2.getValue().get() - o1.getValue().get();
if (result == 0) {
result = o1.getKey().toSymbolicString().compareTo(o2.getKey().toSymbolicString());
}
return result;
}
});
sortedEntrySet.addAll(entries);
final Crosstab crosstab = PatternFinderAnalyzer.createCrosstab();
for (Entry entry : sortedEntrySet) {
final CrosstabNavigator nav = crosstab.navigate();
final TokenPattern pattern = entry.getKey();
nav.where(PatternFinderAnalyzer.DIMENSION_NAME_PATTERN, pattern.toSymbolicString());
nav.where(PatternFinderAnalyzer.DIMENSION_NAME_MEASURES, PatternFinderAnalyzer.MEASURE_MATCH_COUNT);
final AtomicInteger count = entry.getValue();
nav.put(count, true);
nav.where(PatternFinderAnalyzer.DIMENSION_NAME_MEASURES, PatternFinderAnalyzer.MEASURE_SAMPLE);
final String sample = patternFinder.getSample(pattern);
nav.put(sample, true);
}
return crosstab;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy